From 3a37277060e51a023887d0226eef01ed81aba90d Mon Sep 17 00:00:00 2001 From: Andre Ramnitz Date: Thu, 23 Apr 2026 08:24:35 +0200 Subject: [PATCH] Bump everything --- ...nt-customizable-terminal-title.patch.skip} | 0 ...nboard_fix_addressable_only_mainboard.diff | 33 - ...001-messageParser-fix-varInt-decoding.skip | 49 + ...ull-wl_output-in-QWaylandScreen-surf.patch | 52 + ..._package-to-resolve-GuiPrivate-not-b.patch | 24 + gui-apps/noctalia-shell/mod_theme_foot.patch | 29 + .../noctalia-shell/mod_theme_hyprland.patch | 43 + gui-wm/hyprland/5874-enable_LTO.patch | 10 - media-gfx/krita/0001-krita-use_wayland.patch | 110 + .../0001-fix_build_with_gcc13+.patch | 10 - net-misc/networkmanager/1966.patch | 67 - sys-kernel/gentoo-sources | 2 +- .../{0000-bore.patch.skip => 0001-bore.patch} | 0 .../gentoo-sources-6.10.3/0003-block.patch | 485 - .../0001-eevdf-next.patch | 4444 +++ .../gentoo-sources-6.11.3+/0003-bbr3.patch | 3386 ++ .../gentoo-sources-6.11.3+/0007-ksm.patch | 433 + .../gentoo-sources-6.11.3+/0012-zstd.patch | 18652 ++++++++++ .../gentoo-sources-6.11/0001-eevdf-next.patch | 4374 +++ .../0002-bbr3.patch} | 83 +- .../0007-ntsync.patch} | 23 +- .../0008-perf-per-core.patch} | 14 +- .../0010-zstd.patch | 7 +- ...ed-additional-timer-tick-frequencies.patch | 55 + .../0001-preempt-lazy.patch | 958 + .../gentoo-sources-6.12/0002-amd-pstate.patch | 902 + .../gentoo-sources-6.12/0004-bbr3.patch | 3386 ++ .../gentoo-sources-6.12/0006-crypto.patch | 1606 + .../gentoo-sources-6.12/0007-fixes.patch | 955 + .../0008-ntsync.patch} | 611 +- .../0009-perf-per-core.patch | 997 + .../gentoo-sources-6.12/0010-pksm.patch | 433 + .../gentoo-sources-6.12/0012-zstd.patch | 18652 ++++++++++ ...e_increase_maximum_concurrency_limit.patch | 11 + .../gentoo-sources-6.13/0001-amd-pstate.patch | 885 + .../0002-amd-tlb-broadcast.patch | 1350 + .../gentoo-sources-6.13/0003-bbr3.patch | 3386 ++ .../gentoo-sources-6.13/0005-crypto.patch | 774 + .../0007-itmt-core-ranking.patch | 365 + .../gentoo-sources-6.13/0008-ntsync.patch | 3050 ++ .../0009-perf-per-core.patch | 898 + .../gentoo-sources-6.13/0010-pksm.patch | 433 + .../gentoo-sources-6.13/0012-zstd.patch | 23530 +++++++++++++ .../gentoo-sources-6.14/0001-bore.patch | 1006 + .../gentoo-sources-6.14/0004-bbr3.patch | 3387 ++ .../gentoo-sources-6.14/0006-crypto.patch | 2495 ++ .../gentoo-sources-6.14/0009-zstd.patch | 23554 +++++++++++++ .../gentoo-sources-6.14/gentoo-sources-6.15 | 1 + .../0001-amd-pstate.patch.skip | 402 + .../gentoo-sources-6.15/0004-bbr3.patch | 3404 ++ .../gentoo-sources-6.15/0005-block.patch.skip | 288 + ...-20-sched-Cache-aware-load-balancing.patch | 803 + ...ral-fixes-for-cache-aware-scheduling.patch | 230 + ...k-migration-within-its-preferred-LLC.patch | 112 + ...-cpumask-if-the-system-is-overloaded.patch | 122 + ...sis-to-switch-a-task-s-preferred-LLC.patch | 157 + ...on-for-better-cache-aware-scheduling.patch | 195 + ...ther-to-allow-cache-aware-scheduling.patch | 279 + ...h-v3-08-20-sched-Set-up-LLC-indexing.patch | 224 + ...d-Introduce-task-preferred-LLC-field.patch | 148 + ...at-have-LLC-preference-on-a-runqueue.patch | 238 + ...runqueue-task-LLC-preference-counter.patch | 180 + ...ferred-LLC-tasks-during-load-balance.patch | 139 + ...nce-if-it-has-tasks-prefer-other-LLC.patch | 169 + ...th-groups-having-preferred-LLC-tasks.patch | 173 + ...track-the-preferred-LLC-load-balance.patch | 183 + ...ider-LLC-locality-for-active-balance.patch | 182 + ...hen-picking-tasks-from-busiest-queue.patch | 193 + 
...t-is-moving-out-of-its-preferred-LLC.patch | 155 + ...-to-control-cache-aware-load-balance.patch | 185 + ...o-control-LLC-aggregation-on-wake-up.patch | 136 + .../gentoo-sources-6.16/0001-bore.patch.skip | 1032 + .../gentoo-sources-6.16/0002-bbr3.patch | 3404 ++ .../gentoo-sources-6.16/0003-block.patch | 288 + .../gentoo-sources-6.16/0005-fixes.patch | 59 + .../gentoo-sources-6.16/0006-s5-power.patch | 329 + ...-28-sched-Cache-aware-load-balancing.patch | 810 + ...ral-fixes-for-cache-aware-scheduling.patch | 318 + ...k-migration-within-its-preferred-LLC.patch | 117 + ...-cpumask-if-the-system-is-overloaded.patch | 131 + ...sis-to-switch-a-task-s-preferred-LLC.patch | 165 + ...on-for-better-cache-aware-scheduling.patch | 200 + ...ther-to-allow-cache-aware-scheduling.patch | 293 + ...H-v4-08-28-sched-Set-up-LLC-indexing.patch | 232 + ...d-Introduce-task-preferred-LLC-field.patch | 156 + ...at-have-LLC-preference-on-a-runqueue.patch | 255 + ...runqueue-task-LLC-preference-counter.patch | 217 + ...ferred-LLC-tasks-during-load-balance.patch | 147 + ...nce-if-it-has-tasks-prefer-other-LLC.patch | 177 + ...th-groups-having-preferred-LLC-tasks.patch | 181 + ...track-the-preferred-LLC-load-balance.patch | 191 + ...ider-LLC-locality-for-active-balance.patch | 190 + ...hen-picking-tasks-from-busiest-queue.patch | 201 + ...t-is-moving-out-of-its-preferred-LLC.patch | 163 + ...-to-control-cache-aware-load-balance.patch | 194 + ...o-control-LLC-aggregation-on-wake-up.patch | 145 + ...able-cache-aware-only-for-multi-LLCs.patch | 299 + ...-and-EPOCH_OLD-into-tunnable-debugfs.patch | 164 + ...k-s-preferred-node-for-preferred-LLC.patch | 171 + ...number-of-runninhg-tasks-per-process.patch | 169 + ...-the-process-has-many-active-threads.patch | 160 + ...cheduling-for-process-with-large-RSS.patch | 196 + ...-the-scale-factor-for-RSS-comparison.patch | 303 + ...load-balance-and-hottest-CPU-changes.patch | 307 + .../gentoo-sources-6.17/0002-bbr3.patch | 0 .../gentoo-sources-6.17/0003-block.patch | 0 .../gentoo-sources-6.17/0005-fixes.patch.skip | 0 ...cture-for-cache-aware-load-balancing.patch | 654 + ...ide-cache-aware-scheduling-decisions.patch | 227 + ...ions-to-enforce-LLC-migration-policy.patch | 335 + ...able-cache-aware-only-for-multi-LLCs.patch | 208 + ...-fair-Add-LLC-index-mapping-for-CPUs.patch | 291 + ...Assign-preferred-LLC-ID-to-processes.patch | 156 + ...ack-LLC-preferred-tasks-per-runqueue.patch | 257 + ...runqueue-task-LLC-preference-counter.patch | 194 + ...-prefering-each-LLC-in-a-sched-group.patch | 143 + ...ing-destination-LLC-during-balancing.patch | 187 + ...d_group-for-LLC-aware-load-balancing.patch | 184 + ...ation-type-for-cache-aware-balancing.patch | 185 + ...le-tasks-to-from-their-preferred-LLC.patch | 208 + ...n-selecting-tasks-for-load-balancing.patch | 201 + ...ference-in-task-migration-and-detach.patch | 156 + ...-threads-from-cache-aware-scheduling.patch | 172 + ...or-processes-with-high-thread-counts.patch | 170 + ...cheduling-for-memory-heavy-processes.patch | 246 + ...-tolerance-of-cache-aware-scheduling.patch | 366 + .../gentoo-sources-6.18/0001-amd-pstate.patch | 120 + ...ed-additional-timer-tick-frequencies.patch | 0 .../gentoo-sources-6.18/0004-bbr3.patch | 3394 ++ .../gentoo-sources-6.18/0005-block.patch | 214 + .../gentoo-sources-6.18/0007-crypto.patch | 3441 ++ .../gentoo-sources-6.18/0010-sched-ext.patch | 708 + ...-for-cache-aware-load-balancing.patch.skip | 654 + ...ache-aware-scheduling-decisions.patch.skip | 227 + ...to-enforce-LLC-migration-policy.patch.skip 
| 335 + ...cache-aware-only-for-multi-LLCs.patch.skip | 208 + ...-Add-LLC-index-mapping-for-CPUs.patch.skip | 291 + ...n-preferred-LLC-ID-to-processes.patch.skip | 156 + ...LC-preferred-tasks-per-runqueue.patch.skip | 257 + ...eue-task-LLC-preference-counter.patch.skip | 194 + ...ering-each-LLC-in-a-sched-group.patch.skip | 143 + ...estination-LLC-during-balancing.patch.skip | 187 + ...up-for-LLC-aware-load-balancing.patch.skip | 184 + ...-type-for-cache-aware-balancing.patch.skip | 185 + ...sks-to-from-their-preferred-LLC.patch.skip | 208 + ...ecting-tasks-for-load-balancing.patch.skip | 201 + ...ce-in-task-migration-and-detach.patch.skip | 156 + ...ads-from-cache-aware-scheduling.patch.skip | 172 + ...ocesses-with-high-thread-counts.patch.skip | 170 + ...ling-for-memory-heavy-processes.patch.skip | 246 + ...rance-of-cache-aware-scheduling.patch.skip | 366 + ...-for-cache-aware-load-balancing.patch.skip | 637 + ...ache-aware-scheduling-decisions.patch.skip | 229 + ...to-enforce-LLC-migration-policy.patch.skip | 333 + ...ed-cache-Make-LLC-id-continuous.patch.skip | 257 + ...n-preferred-LLC-ID-to-processes.patch.skip | 172 + ...LC-preferred-tasks-per-runqueue.patch.skip | 289 + ...eue-task-LLC-preference-counter.patch.skip | 293 + ...er-runqueue-task-LLC-preference.patch.skip | 142 + ...estination-LLC-in-a-sched-group.patch.skip | 160 + ...only-once-in-update_sg_lb_stats.patch.skip | 142 + ...estination-LLC-during-balancing.patch.skip | 276 + ...-type-for-cache-aware-balancing.patch.skip | 191 + ...sks-to-from-their-preferred-LLC.patch.skip | 195 + ...ecting-tasks-for-load-balancing.patch.skip | 206 + ...ce-in-task-migration-and-detach.patch.skip | 251 + ...duling-for-multi-LLCs-NUMA-node.patch.skip | 192 + ...cess-for-cache-aware-scheduling.patch.skip | 172 + ...ocesses-with-high-thread-counts.patch.skip | 175 + ...ling-for-memory-heavy-processes.patch.skip | 258 + ...eters-of-cache-aware-scheduling.patch.skip | 478 + ...-for-cache-aware-load-balancing.patch.skip | 174 + ...ack-the-load-balance-statistics.patch.skip | 172 + ...cy-for-each-process-via-proc-fs.patch.skip | 323 + ...uce_kvfree_rcu_barrier_on_cache.patch.skip | 259 + .../gentoo-sources-6.19/0002-bbr3.patch | 3395 ++ .../gentoo-sources-6.19/0005-hdmi.patch | 1729 + .../gentoo-sources-6.19/0006-r8125.patch | 29360 ++++++++++++++++ .../0007-vesa-dsc-bpp.patch | 392 + .../gentoo-sources-6.19/0008-vmscape.patch | 366 + ...d-additional-timer-tick-frequencies.patch} | 0 sys-kernel/gentoo-sources-6.6/0001-bbr3.patch | 3352 ++ sys-kernel/gentoo-sources-6.6/0001-bore.patch | 825 + sys-kernel/gentoo-sources-6.6/0005-zstd.patch | 13833 ++++++++ .../gentoo-sources-6.6/0010-sched-ext.patch | 19747 ----------- ...d-uarches-for-kernel-6.1.79-6.8-rc3.patch} | 418 +- ...r_multi-llc_select_idle_sibling.patch.skip | 94 - ...Tune-ondemand-governor-for-interacti.patch | 75 - ...ve-schedutil-dependency-on-Intel-AMD.patch | 36 - ...e-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch | 47 - ...-Disable-AVX2-and-tree-vectorization.patch | 29 - ...dm-crypt-Disable-workqueues-for-cryp.patch | 36 - .../0215-ZEN-Add-VHBA-driver.patch | 1199 - .../0301-amd-pstate_preferred_core_V12.patch | 54 - .../0302-amd-pstate_preferred_core_V12.patch | 92 - .../0303-amd-pstate_preferred_core_V12.patch | 322 - .../0304-amd-pstate_preferred_core_V12.patch | 120 - .../0305-amd-pstate_preferred_core_V12.patch | 182 - .../0306-amd-pstate_preferred_core_V12.patch | 125 - .../0307-amd-pstate_preferred_core_V12.patch | 57 - sys-kernel/gentoo-sources-7.0/0001-bore.patch | 1217 + 
...d-additional-timer-tick-frequencies.patch} | 0 sys-kernel/git-sources/0001-asus.patch | 6038 ++++ sys-kernel/git-sources/0002-bbr3.patch | 3404 ++ sys-kernel/git-sources/0003-cachy.patch | 9540 +++++ sys-kernel/git-sources/0004-fixes.patch | 107 + sys-kernel/git-sources/0005-sched-ext.patch | 21992 ------------ ....14-builtin-preserve-enum-value.patch.skip | 13 + 208 files changed, 241363 insertions(+), 45437 deletions(-) rename app-editors/kakoune/{0007-Implement-customizable-terminal-title.patch => 0007-Implement-customizable-terminal-title.patch.skip} (100%) delete mode 100644 app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff create mode 100644 dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip create mode 100644 dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch create mode 100644 gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch create mode 100644 gui-apps/noctalia-shell/mod_theme_foot.patch create mode 100644 gui-apps/noctalia-shell/mod_theme_hyprland.patch delete mode 100644 gui-wm/hyprland/5874-enable_LTO.patch create mode 100644 media-gfx/krita/0001-krita-use_wayland.patch delete mode 100644 media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch delete mode 100644 net-misc/networkmanager/1966.patch rename sys-kernel/gentoo-sources-6.10.3/{0000-bore.patch.skip => 0001-bore.patch} (100%) delete mode 100644 sys-kernel/gentoo-sources-6.10.3/0003-block.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch create mode 100644 sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch create mode 100644 sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch rename sys-kernel/{git-sources/0003-bbr3.patch => gentoo-sources-6.11/0002-bbr3.patch} (98%) rename sys-kernel/{git-sources/0009-ntsync.patch => gentoo-sources-6.11/0007-ntsync.patch} (99%) rename sys-kernel/{gentoo-sources-6.10.3/0010-perf-per-core.patch => gentoo-sources-6.11/0008-perf-per-core.patch} (99%) rename sys-kernel/{git-sources => gentoo-sources-6.11}/0010-zstd.patch (99%) create mode 100644 sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0006-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0007-fixes.patch rename sys-kernel/{gentoo-sources-6.10.3/0009-ntsync.patch => gentoo-sources-6.12/0008-ntsync.patch} (84%) create mode 100644 sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0010-pksm.patch create mode 100644 sys-kernel/gentoo-sources-6.12/0012-zstd.patch create mode 100644 sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0003-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0005-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0008-ntsync.patch create mode 100644 
sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0010-pksm.patch create mode 100644 sys-kernel/gentoo-sources-6.13/0012-zstd.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0001-bore.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0006-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.14/0009-zstd.patch create mode 120000 sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 create mode 100644 sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.15/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.15/0005-block.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip create mode 100644 
sys-kernel/gentoo-sources-6.16/0002-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0003-block.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0005-fixes.patch create mode 100644 sys-kernel/gentoo-sources-6.16/0006-s5-power.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch create mode 100644 
sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch create mode 100644 sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch create mode 100644 sys-kernel/gentoo-sources-6.17/0002-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.17/0003-block.patch create mode 100644 sys-kernel/gentoo-sources-6.17/0005-fixes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch create mode 100644 sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch rename sys-kernel/{gentoo-sources-6.6 => gentoo-sources-6.18}/0002-glitched-additional-timer-tick-frequencies.patch (100%) create mode 100644 
sys-kernel/gentoo-sources-6.18/0004-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0005-block.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0007-crypto.patch create mode 100644 sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip create mode 100644 
sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip create mode 100644 sys-kernel/gentoo-sources-6.19/0002-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0005-hdmi.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0006-r8125.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch create mode 100644 sys-kernel/gentoo-sources-6.19/0008-vmscape.patch rename sys-kernel/{gentoo-sources-6.10.3/0100-glitched-additional-timer-tick-frequencies.patch => gentoo-sources-6.19/0101-glitched-additional-timer-tick-frequencies.patch} (100%) create mode 100644 sys-kernel/gentoo-sources-6.6/0001-bbr3.patch create mode 100644 sys-kernel/gentoo-sources-6.6/0001-bore.patch 
create mode 100644 sys-kernel/gentoo-sources-6.6/0005-zstd.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch rename sys-kernel/{git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch => gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch} (61%) delete mode 100644 sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip delete mode 100644 sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch delete mode 100644 sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch create mode 100644 sys-kernel/gentoo-sources-7.0/0001-bore.patch rename sys-kernel/{git-sources/0002-glitched-additional-timer-tick-frequencies.patch => gentoo-sources-7.0/0101-glitched-additional-timer-tick-frequencies.patch} (100%) create mode 100644 sys-kernel/git-sources/0001-asus.patch create mode 100644 sys-kernel/git-sources/0002-bbr3.patch create mode 100644 sys-kernel/git-sources/0003-cachy.patch create mode 100644 sys-kernel/git-sources/0004-fixes.patch delete mode 100644 sys-kernel/git-sources/0005-sched-ext.patch create mode 100644 sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip diff --git a/app-editors/kakoune/0007-Implement-customizable-terminal-title.patch b/app-editors/kakoune/0007-Implement-customizable-terminal-title.patch.skip similarity index 100% rename from app-editors/kakoune/0007-Implement-customizable-terminal-title.patch rename to app-editors/kakoune/0007-Implement-customizable-terminal-title.patch.skip diff --git a/app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff b/app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff deleted file mode 100644 index 8fee101..0000000 --- a/app-misc/openrgb/asus_mainboard_fix_addressable_only_mainboard.diff +++ /dev/null @@ -1,33 +0,0 @@ -diff --git a/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp b/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp -index 64b28d6d46a689fb21a4497b103a695c03103e5c..a18f2b4af4fc55ff638633a7ee7bc138985b4a0b 100644 ---- a/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp -+++ b/Controllers/AsusAuraUSBController/AsusAuraMainboardController.cpp -@@ -15,6 +15,7 @@ AuraMainboardController::AuraMainboardController(hid_device* dev_handle, const c - unsigned char num_total_mainboard_leds = config_table[0x1B]; - unsigned char num_rgb_headers = config_table[0x1D]; 
- unsigned char num_addressable_headers = config_table[0x02]; -+ unsigned char effect_channel = 0; - - if(num_total_mainboard_leds < num_rgb_headers) - { -@@ -24,14 +25,18 @@ AuraMainboardController::AuraMainboardController(hid_device* dev_handle, const c - /*-----------------------------------------------------*\ - | Add mainboard device | - \*-----------------------------------------------------*/ -- device_info.push_back({0x00, 0x04, num_total_mainboard_leds, num_rgb_headers, AuraDeviceType::FIXED}); -+ if(num_total_mainboard_leds > 0) -+ { -+ device_info.push_back({effect_channel, 0x04, num_total_mainboard_leds, num_rgb_headers, AuraDeviceType::FIXED}); -+ effect_channel++; -+ } - - /*-----------------------------------------------------*\ - | Add addressable devices | - \*-----------------------------------------------------*/ - for(int i = 0; i < num_addressable_headers; i++) - { -- device_info.push_back({0x01, (unsigned char)i, 0x01, 0, AuraDeviceType::ADDRESSABLE}); -+ device_info.push_back({effect_channel, (unsigned char)i, 0x01, 0, AuraDeviceType::ADDRESSABLE}); - } - } - diff --git a/dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip b/dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip new file mode 100644 index 0000000..08dcd3f --- /dev/null +++ b/dev-libs/hyprwire/0001-messageParser-fix-varInt-decoding.skip @@ -0,0 +1,49 @@ +From a181269e49f039b55d1b5fd1509339b5ba06e6be Mon Sep 17 00:00:00 2001 +From: Vaxry +Date: Sun, 16 Nov 2025 20:48:56 +0000 +Subject: messageParser: fix varInt decoding + +fixes https://github.com/hyprwm/hyprlauncher/issues/65 +--- + src/core/message/MessageParser.cpp | 2 +- + src/core/wireObject/IWireObject.cpp | 4 ++-- + 2 files changed, 3 insertions(+), 3 deletions(-) + +diff --git a/src/core/message/MessageParser.cpp b/src/core/message/MessageParser.cpp +index 7a51be1..7a2c155 100644 +--- a/src/core/message/MessageParser.cpp ++++ b/src/core/message/MessageParser.cpp +@@ -224,7 +224,7 @@ std::pair CMessageParser::parseVarInt(const std::span(data[i] << 1) >> 1) << (i++ * 7)); +- } while (i < LEN && data[i] & 0x80); ++ } while (i < LEN && (data[i - 1] & 0x80)); + + return {rolling, i}; + } +diff --git a/src/core/wireObject/IWireObject.cpp b/src/core/wireObject/IWireObject.cpp +index 00b4406..7c81e58 100644 +--- a/src/core/wireObject/IWireObject.cpp ++++ b/src/core/wireObject/IWireObject.cpp +@@ -101,7 +101,7 @@ uint32_t IWireObject::call(uint32_t id, ...) { + case HW_MESSAGE_MAGIC_TYPE_VARCHAR: { + data.emplace_back(HW_MESSAGE_MAGIC_TYPE_VARCHAR); + auto str = va_arg(va, const char*); +- data.append_range(g_messageParser->encodeVarInt(std::strlen(str))); ++ data.append_range(g_messageParser->encodeVarInt(std::string_view(str).size())); + data.append_range(std::string_view(str)); + break; + } +@@ -129,7 +129,7 @@ uint32_t IWireObject::call(uint32_t id, ...) 
{ + case HW_MESSAGE_MAGIC_TYPE_VARCHAR: { + for (size_t i = 0; i < arrayLen; ++i) { + const char* element = rc(arrayData)[i]; +- data.append_range(g_messageParser->encodeVarInt(std::strlen(element))); ++ data.append_range(g_messageParser->encodeVarInt(std::string_view(element).size())); + data.append_range(std::string_view(element)); + } + break; +-- +2.51.0 + diff --git a/dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch b/dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch new file mode 100644 index 0000000..a4bcc0a --- /dev/null +++ b/dev-qt/qtbase/0001-Wayland-Ignore-null-wl_output-in-QWaylandScreen-surf.patch @@ -0,0 +1,52 @@ +From 0285fce0ce0db7b9446389870fe6c076310eb28e Mon Sep 17 00:00:00 2001 +From: Igor Khanin +Date: Thu, 26 Feb 2026 10:50:58 +0200 +Subject: Wayland: Ignore null wl_output in QWaylandScreen surface enter/leave + +Misbehaving compositors (as observed with some Smithay based +compositors) may send wl_surface.enter and wl_surface.leave messages +referring to an output which was already removed from the registry, and +therefore its' proxy object was already destroyed. This manifests as +the listener method being invoked with a null wl_output pointer, which +Qt then dereferences - leading to a crash. + +To avoid crashing, simply just ignore such events. + +Pick-to: 6.11 6.10 +Change-Id: Ib217366b5aff1b39dcc6f42e52165b94ea7a1018 +Reviewed-by: David Edmundson +--- + src/plugins/platforms/wayland/qwaylandsurface.cpp | 8 ++++++-- + 1 file changed, 6 insertions(+), 2 deletions(-) + +diff --git a/src/plugins/platforms/wayland/qwaylandsurface.cpp b/src/plugins/platforms/wayland/qwaylandsurface.cpp +index bd7c358e42a..ffccbefe61c 100644 +--- a/src/plugins/platforms/wayland/qwaylandsurface.cpp ++++ b/src/plugins/platforms/wayland/qwaylandsurface.cpp +@@ -56,8 +56,10 @@ void QWaylandSurface::handleScreenRemoved(QScreen *qScreen) + + void QWaylandSurface::surface_enter(wl_output *output) + { +- auto addedScreen = QWaylandScreen::fromWlOutput(output); ++ if (!output) ++ return; + ++ auto addedScreen = QWaylandScreen::fromWlOutput(output); + if (!addedScreen) + return; + +@@ -76,8 +78,10 @@ void QWaylandSurface::surface_enter(wl_output *output) + + void QWaylandSurface::surface_leave(wl_output *output) + { +- auto *removedScreen = QWaylandScreen::fromWlOutput(output); ++ if (!output) ++ return; + ++ auto *removedScreen = QWaylandScreen::fromWlOutput(output); + if (!removedScreen) + return; + +-- +2.52.0 + diff --git a/gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch b/gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch new file mode 100644 index 0000000..918a129 --- /dev/null +++ b/gui-apps/hyprqt6engine/0001-cmake-Added-find_package-to-resolve-GuiPrivate-not-b.patch @@ -0,0 +1,24 @@ +From 9063e6837b4e282a73b052a0c0371916daccf50a Mon Sep 17 00:00:00 2001 +From: VuaTech +Date: Fri, 12 Sep 2025 12:32:44 -0400 +Subject: cmake: Added find_package to resolve GuiPrivate not being found (#4) + +--- + hyprqtplugin/CMakeLists.txt | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/hyprqtplugin/CMakeLists.txt b/hyprqtplugin/CMakeLists.txt +index bbdb33a..c427670 100644 +--- a/hyprqtplugin/CMakeLists.txt ++++ b/hyprqtplugin/CMakeLists.txt +@@ -1,5 +1,7 @@ + project(hyprqtplugin) + ++find_package(Qt6 REQUIRED COMPONENTS Gui GuiPrivate) ++ + set(app_SRCS + main.cpp + PlatformTheme.cpp +-- +2.51.2 + diff --git a/gui-apps/noctalia-shell/mod_theme_foot.patch 
b/gui-apps/noctalia-shell/mod_theme_foot.patch new file mode 100644 index 0000000..810e396 --- /dev/null +++ b/gui-apps/noctalia-shell/mod_theme_foot.patch @@ -0,0 +1,29 @@ +--- noctalia-release/Assets/Templates/foot_predefined.backup 2026-04-23 08:13:04.228820613 +0200 ++++ noctalia-release/Assets/Templates/terminal/foot-predefined 2026-04-23 08:17:51.671131130 +0200 +@@ -20,3 +20,26 @@ + selection-foreground={{colors.terminal_selection_fg.default.hex_stripped}} + selection-background={{colors.terminal_selection_bg.default.hex_stripped}} + cursor={{colors.terminal_cursor_text.default.hex_stripped}} {{colors.terminal_cursor.default.hex_stripped}} ++ ++[colors-light] ++foreground={{colors.terminal_background.default.hex_stripped}} ++background={{colors.terminal_foreground.default.hex_stripped}} ++regular0={{colors.terminal_normal_black.default.hex_stripped}} ++regular1={{colors.terminal_normal_red.default.hex_stripped}} ++regular2={{colors.terminal_normal_green.default.hex_stripped}} ++regular3={{colors.terminal_normal_yellow.default.hex_stripped}} ++regular4={{colors.terminal_normal_blue.default.hex_stripped}} ++regular5={{colors.terminal_normal_magenta.default.hex_stripped}} ++regular6={{colors.terminal_normal_cyan.default.hex_stripped}} ++regular7={{colors.terminal_normal_white.default.hex_stripped}} ++bright0={{colors.terminal_bright_black.default.hex_stripped}} ++bright1={{colors.terminal_bright_red.default.hex_stripped}} ++bright2={{colors.terminal_bright_green.default.hex_stripped}} ++bright3={{colors.terminal_bright_yellow.default.hex_stripped}} ++bright4={{colors.terminal_bright_blue.default.hex_stripped}} ++bright5={{colors.terminal_bright_magenta.default.hex_stripped}} ++bright6={{colors.terminal_bright_cyan.default.hex_stripped}} ++bright7={{colors.terminal_bright_white.default.hex_stripped}} ++selection-foreground={{colors.terminal_selection_bg.default.hex_stripped}} ++selection-background={{colors.terminal_selection_fg.default.hex_stripped}} ++cursor={{colors.terminal_cursor_text.default.hex_stripped}} {{colors.terminal_cursor.default.hex_stripped}} diff --git a/gui-apps/noctalia-shell/mod_theme_hyprland.patch b/gui-apps/noctalia-shell/mod_theme_hyprland.patch new file mode 100644 index 0000000..b30286b --- /dev/null +++ b/gui-apps/noctalia-shell/mod_theme_hyprland.patch @@ -0,0 +1,43 @@ +*** noctalia-release/Assets/Templates/hyprland.conf 2026-04-02 06:45:57.794000629 +0200 +--- noctalia-release/Assets/Templates/hyprland_b.conf 2026-04-02 06:37:39.309812237 +0200 +*************** +*** 8,25 **** + general { + col.active_border = $primary + col.inactive_border = $surface + } + + group { + col.border_active = $secondary + col.border_inactive = $surface +! col.border_locked_active = $error + col.border_locked_inactive = $surface + + groupbar { + col.active = $secondary + col.inactive = $surface +! col.locked_active = $error + col.locked_inactive = $surface + } + } +--- 8,27 ---- + general { + col.active_border = $primary + col.inactive_border = $surface ++ col.nogroup_border = $surface ++ col.nogroup_border_active = $error + } + + group { + col.border_active = $secondary + col.border_inactive = $surface +! col.border_locked_active = $tertiary + col.border_locked_inactive = $surface + + groupbar { + col.active = $secondary + col.inactive = $surface +! 
col.locked_active = $tertiary + col.locked_inactive = $surface + } + } diff --git a/gui-wm/hyprland/5874-enable_LTO.patch b/gui-wm/hyprland/5874-enable_LTO.patch deleted file mode 100644 index 25394f4..0000000 --- a/gui-wm/hyprland/5874-enable_LTO.patch +++ /dev/null @@ -1,10 +0,0 @@ -diff --git a/meson.build b/meson.build -@@ -5,6 +5,7 @@ project('Hyprland', 'cpp', 'c', - 'default_library=static', - 'optimization=3', - 'buildtype=release', - + 'b_lto=true', - 'debug=false' - # 'cpp_std=c++23' # not yet supported by meson, as of version 0.63.0 - ]) --- diff --git a/media-gfx/krita/0001-krita-use_wayland.patch b/media-gfx/krita/0001-krita-use_wayland.patch new file mode 100644 index 0000000..f7734ce --- /dev/null +++ b/media-gfx/krita/0001-krita-use_wayland.patch @@ -0,0 +1,110 @@ +diff --git a/CMakeLists.txt b/CMakeLists.txt +index 18fb3c1..a1aca83 100644 +--- a/CMakeLists.txt ++++ b/CMakeLists.txt +@@ -544,8 +544,6 @@ endif() + + if (NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) + +- find_package(Qt5 ${MIN_QT_VERSION} REQUIRED X11Extras) +- + find_package(Qt5DBus ${MIN_QT_VERSION}) + set(HAVE_DBUS ${Qt5DBus_FOUND}) + set_package_properties(Qt5DBus PROPERTIES +@@ -562,10 +560,8 @@ if (NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) + TYPE OPTIONAL + PURPOSE "Optionally used to provide crash reporting on Linux") + +- find_package(X11 REQUIRED COMPONENTS Xinput) +- set(HAVE_X11 TRUE) +- add_definitions(-DHAVE_X11) +- ++ set(HAVE_X11 FALSE) ++ set(HAVE_XCB FALSE) + else() + set(HAVE_DBUS FALSE) + set(HAVE_X11 FALSE) +diff --git a/krita/main.cc b/krita/main.cc +index c7ff996..a063d1a 100644 +--- a/krita/main.cc ++++ b/krita/main.cc +@@ -568,13 +568,6 @@ extern "C" MAIN_EXPORT int MAIN_FN(int argc, char **argv) + + installTranslators(app); + +- if (KisApplication::platformName() == "wayland") { +- QMessageBox::critical(nullptr, +- i18nc("@title:window", "Fatal Error"), +- i18n("Krita does not support the Wayland platform. Use XWayland to run Krita on Wayland. 
Krita will close now.")); +- return -1; +- } +- + KisUsageLogger::writeHeader(); + KisOpenGL::initialize(); + +diff --git a/libs/ui/CMakeLists.txt b/libs/ui/CMakeLists.txt +index 91e7de3..c2b3e5b 100644 +--- a/libs/ui/CMakeLists.txt ++++ b/libs/ui/CMakeLists.txt +@@ -712,11 +712,6 @@ if (ANDROID) + target_link_libraries(kritaui PRIVATE Qt5::AndroidExtras) + endif() + +-if (NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) +- target_link_libraries(kritaui PRIVATE ${X11_X11_LIB} +- ${X11_Xinput_LIB}) +-endif() +- + if (HAIKU) + target_link_libraries(kritaui PRIVATE network expat iconv intl) + endif() +@@ -736,15 +731,6 @@ if(OpenEXR_FOUND) + target_link_libraries(kritaui PUBLIC ${LINK_OPENEXR_LIB}) + endif() + +-# Add VSync disable workaround +-if(NOT WIN32 AND NOT APPLE AND NOT ANDROID AND NOT HAIKU) +- target_link_libraries(kritaui PRIVATE ${CMAKE_DL_LIBS} Qt5::X11Extras) +-endif() +- +-if(X11_FOUND) +- target_link_libraries(kritaui PRIVATE Qt5::X11Extras ${X11_LIBRARIES}) +-endif() +- + target_include_directories(kritaui + PUBLIC + $ +diff --git a/libs/widgets/CMakeLists.txt b/libs/widgets/CMakeLists.txt +index aeae382..6f26f93 100644 +--- a/libs/widgets/CMakeLists.txt ++++ b/libs/widgets/CMakeLists.txt +@@ -113,10 +113,6 @@ target_link_libraries(kritawidgets + KF5::Completion + ) + +-if(X11_FOUND) +- target_link_libraries(kritawidgets Qt5::X11Extras ${X11_LIBRARIES}) +-endif() +- + set_target_properties(kritawidgets PROPERTIES + VERSION ${GENERIC_KRITA_LIB_VERSION} SOVERSION ${GENERIC_KRITA_LIB_SOVERSION} + ) +diff --git a/plugins/extensions/pykrita/plugin/version_checker.h b/plugins/extensions/pykrita/plugin/version_checker.h +index ac092b2..6676d67 100644 +--- a/plugins/extensions/pykrita/plugin/version_checker.h ++++ b/plugins/extensions/pykrita/plugin/version_checker.h +@@ -13,6 +13,14 @@ + # include + # include + ++#ifdef major ++#undef major ++#endif ++ ++#ifdef minor ++#undef minor ++#endif ++ + namespace PyKrita + { + diff --git a/media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch b/media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch deleted file mode 100644 index 6a79f60..0000000 --- a/media-sound/drumgizmo/0001-fix_build_with_gcc13+.patch +++ /dev/null @@ -1,10 +0,0 @@ -*** ./plugin/plugingizmo/plugin.h 2023-04-30 16:29:49.409277767 +0200 -+++ ./plugin/plugingizmo/plugin.h 2023-04-30 16:29:40.469204589 +0200 -@@ -30,6 +30,7 @@ - #include - - #include -+#include - - #if defined(WIN32) - #define PG_EXPORT extern "C" __declspec(dllexport) diff --git a/net-misc/networkmanager/1966.patch b/net-misc/networkmanager/1966.patch deleted file mode 100644 index b48b21d..0000000 --- a/net-misc/networkmanager/1966.patch +++ /dev/null @@ -1,67 +0,0 @@ -From 70d1c34b94baadc3305745cf159ea55f312beacc Mon Sep 17 00:00:00 2001 -From: Khem Raj -Date: Fri, 7 Jun 2024 14:03:15 -0700 -Subject: [PATCH] libnm-systemd-core: Disable sd_dhcp6_client_set_duid_uuid - function - -When building on musl systems ( with out systemd ), and using LLD linker -from LLVM project we fail to link with undefined symbols. - -This symbol is in sd_id128.c but its disabled, so let disable the functions -which need this function. 
- -| x86_64-yoe-linux-musl-ld.lld: error: undefined symbol: sd_id128_get_machine_app_specific -| >>> referenced by sd-dhcp-duid.c:202 (/usr/src/debug/networkmanager/1.48.0/../NetworkManager-1.48.0/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c:202) -| >>> libnm-systemd-core.a.p/src_libsystemd-network_sd-dhcp-duid.c.o:(sd_dhcp_duid_set_uuid) in archive src/libnm-systemd-core/libnm-systemd-core.a -| x86_64-yoe-linux-musl-clang: error: linker command failed with exit code 1 (use -v to see invocation) - -Signed-off-by: Khem Raj ---- - src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c | 2 ++ - .../src/libsystemd-network/sd-dhcp6-client.c | 3 ++- - 2 files changed, 4 insertions(+), 1 deletion(-) - -diff --git a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c -index e664a4a720..7ba502086f 100644 ---- a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c -+++ b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp-duid.c -@@ -193,6 +193,7 @@ int sd_dhcp_duid_set_en(sd_dhcp_duid *duid) { - return 0; - } - -+#if 0 - int sd_dhcp_duid_set_uuid(sd_dhcp_duid *duid) { - sd_id128_t machine_id; - int r; -@@ -209,6 +210,7 @@ int sd_dhcp_duid_set_uuid(sd_dhcp_duid *duid) { - duid->size = offsetof(struct duid, uuid.uuid) + sizeof(machine_id); - return 0; - } -+#endif - - int dhcp_duid_to_string_internal(uint16_t type, const void *data, size_t data_size, char **ret) { - _cleanup_free_ char *p = NULL, *x = NULL; -diff --git a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c -index 7c20116409..08c1e96b3c 100644 ---- a/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c -+++ b/src/libnm-systemd-core/src/libsystemd-network/sd-dhcp6-client.c -@@ -244,6 +244,7 @@ int sd_dhcp6_client_set_duid_en(sd_dhcp6_client *client) { - return 0; - } - -+#if 0 - int sd_dhcp6_client_set_duid_uuid(sd_dhcp6_client *client) { - int r; - -@@ -256,7 +257,7 @@ int sd_dhcp6_client_set_duid_uuid(sd_dhcp6_client *client) { - - return 0; - } -- -+#endif - int sd_dhcp6_client_set_duid_raw(sd_dhcp6_client *client, uint16_t duid_type, const uint8_t *duid, size_t duid_len) { - int r; - --- -GitLab - diff --git a/sys-kernel/gentoo-sources b/sys-kernel/gentoo-sources index 5f1b707..d942c51 120000 --- a/sys-kernel/gentoo-sources +++ b/sys-kernel/gentoo-sources @@ -1 +1 @@ -gentoo-sources-6.10.3 \ No newline at end of file +gentoo-sources-7.0/ \ No newline at end of file diff --git a/sys-kernel/gentoo-sources-6.10.3/0000-bore.patch.skip b/sys-kernel/gentoo-sources-6.10.3/0001-bore.patch similarity index 100% rename from sys-kernel/gentoo-sources-6.10.3/0000-bore.patch.skip rename to sys-kernel/gentoo-sources-6.10.3/0001-bore.patch diff --git a/sys-kernel/gentoo-sources-6.10.3/0003-block.patch b/sys-kernel/gentoo-sources-6.10.3/0003-block.patch deleted file mode 100644 index e71c1d0..0000000 --- a/sys-kernel/gentoo-sources-6.10.3/0003-block.patch +++ /dev/null @@ -1,485 +0,0 @@ -From bbee24678a3eef2513debdb999d302394638f90b Mon Sep 17 00:00:00 2001 -From: Peter Jung -Date: Sat, 3 Aug 2024 09:33:05 +0200 -Subject: [PATCH 03/12] block - -Signed-off-by: Peter Jung ---- - block/bfq-iosched.c | 120 ++++++++++++++++++++++++++++++++++++-------- - block/bfq-iosched.h | 16 +++++- - block/mq-deadline.c | 110 +++++++++++++++++++++++++++++++++------- - 3 files changed, 203 insertions(+), 43 deletions(-) - -diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c 
-index 4b88a54a9b76..88df08a246fa 100644 ---- a/block/bfq-iosched.c -+++ b/block/bfq-iosched.c -@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) - return icq; - } - -+static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) -+{ -+ if (!current->io_context) -+ return NULL; -+ if (spin_trylock_irq(&q->queue_lock)) { -+ struct bfq_io_cq *icq; -+ -+ icq = icq_to_bic(ioc_lookup_icq(q)); -+ spin_unlock_irq(&q->queue_lock); -+ return icq; -+ } -+ -+ return NULL; -+} -+ - /* - * Scheduler run of queue, if there are requests pending and no one in the - * driver that will restart queueing. -@@ -2454,10 +2469,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, - * returned by bfq_bic_lookup does not go away before - * bfqd->lock is taken. - */ -- struct bfq_io_cq *bic = bfq_bic_lookup(q); -+ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); - bool ret; - -- spin_lock_irq(&bfqd->lock); -+ /* -+ * bio merging is called for every bio queued, and it's very easy -+ * to run into contention because of that. If we fail getting -+ * the dd lock, just skip this merge attempt. For related IO, the -+ * plug will be the successful merging point. If we get here, we -+ * already failed doing the obvious merge. Chances of actually -+ * getting a merge off this path is a lot slimmer, so skipping an -+ * occassional lookup that will most likely not succeed anyway should -+ * not be a problem. -+ */ -+ if (!spin_trylock_irq(&bfqd->lock)) -+ return false; - - if (bic) { - /* -@@ -5148,6 +5174,10 @@ static bool bfq_has_work(struct blk_mq_hw_ctx *hctx) - { - struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; - -+ if (!list_empty_careful(&bfqd->at_head) || -+ !list_empty_careful(&bfqd->at_tail)) -+ return true; -+ - /* - * Avoiding lock: a race on bfqd->queued should cause at - * most a call to dispatch for nothing -@@ -5297,15 +5327,61 @@ static inline void bfq_update_dispatch_stats(struct request_queue *q, - bool idle_timer_disabled) {} - #endif /* CONFIG_BFQ_CGROUP_DEBUG */ - -+static void bfq_insert_request(struct request_queue *q, struct request *rq, -+ blk_insert_t flags, struct list_head *free); -+ -+static void __bfq_do_insert(struct request_queue *q, blk_insert_t flags, -+ struct list_head *list, struct list_head *free) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ bfq_insert_request(q, rq, flags, free); -+ } -+} -+ -+static void bfq_do_insert(struct request_queue *q, struct list_head *free) -+{ -+ struct bfq_data *bfqd = q->elevator->elevator_data; -+ LIST_HEAD(at_head); -+ LIST_HEAD(at_tail); -+ -+ spin_lock(&bfqd->insert_lock); -+ list_splice_init(&bfqd->at_head, &at_head); -+ list_splice_init(&bfqd->at_tail, &at_tail); -+ spin_unlock(&bfqd->insert_lock); -+ -+ __bfq_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); -+ __bfq_do_insert(q, 0, &at_tail, free); -+} -+ - static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct bfq_data *bfqd = hctx->queue->elevator->elevator_data; -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; - struct request *rq; - struct bfq_queue *in_serv_queue; - bool waiting_rq, idle_timer_disabled = false; -+ LIST_HEAD(free); -+ -+ /* -+ * If someone else is already dispatching, skip this one. 
This will -+ * defer the next dispatch event to when something completes, and could -+ * potentially lower the queue depth for contended cases. -+ * -+ * See the logic in blk_mq_do_dispatch_sched(), which loops and -+ * retries if nothing is dispatched. -+ */ -+ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || -+ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) -+ return NULL; - - spin_lock_irq(&bfqd->lock); - -+ bfq_do_insert(hctx->queue, &free); -+ - in_serv_queue = bfqd->in_service_queue; - waiting_rq = in_serv_queue && bfq_bfqq_wait_request(in_serv_queue); - -@@ -5315,7 +5391,9 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) - waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); - } - -+ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); - spin_unlock_irq(&bfqd->lock); -+ blk_mq_free_requests(&free); - bfq_update_dispatch_stats(hctx->queue, rq, - idle_timer_disabled ? in_serv_queue : NULL, - idle_timer_disabled); -@@ -6236,27 +6314,21 @@ static inline void bfq_update_insert_stats(struct request_queue *q, - - static struct bfq_queue *bfq_init_rq(struct request *rq); - --static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -- blk_insert_t flags) -+static void bfq_insert_request(struct request_queue *q, struct request *rq, -+ blk_insert_t flags, struct list_head *free) - { -- struct request_queue *q = hctx->queue; - struct bfq_data *bfqd = q->elevator->elevator_data; - struct bfq_queue *bfqq; - bool idle_timer_disabled = false; - blk_opf_t cmd_flags; -- LIST_HEAD(free); - - #ifdef CONFIG_BFQ_GROUP_IOSCHED - if (!cgroup_subsys_on_dfl(io_cgrp_subsys) && rq->bio) - bfqg_stats_update_legacy_io(q, rq); - #endif -- spin_lock_irq(&bfqd->lock); - bfqq = bfq_init_rq(rq); -- if (blk_mq_sched_try_insert_merge(q, rq, &free)) { -- spin_unlock_irq(&bfqd->lock); -- blk_mq_free_requests(&free); -+ if (blk_mq_sched_try_insert_merge(q, rq, free)) - return; -- } - - trace_block_rq_insert(rq); - -@@ -6286,8 +6358,6 @@ static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, - * merge). - */ - cmd_flags = rq->cmd_flags; -- spin_unlock_irq(&bfqd->lock); -- - bfq_update_insert_stats(q, bfqq, idle_timer_disabled, - cmd_flags); - } -@@ -6296,13 +6366,15 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, - struct list_head *list, - blk_insert_t flags) - { -- while (!list_empty(list)) { -- struct request *rq; -+ struct request_queue *q = hctx->queue; -+ struct bfq_data *bfqd = q->elevator->elevator_data; - -- rq = list_first_entry(list, struct request, queuelist); -- list_del_init(&rq->queuelist); -- bfq_insert_request(hctx, rq, flags); -- } -+ spin_lock_irq(&bfqd->insert_lock); -+ if (flags & BLK_MQ_INSERT_AT_HEAD) -+ list_splice_init(list, &bfqd->at_head); -+ else -+ list_splice_init(list, &bfqd->at_tail); -+ spin_unlock_irq(&bfqd->insert_lock); - } - - static void bfq_update_hw_tag(struct bfq_data *bfqd) -@@ -7211,6 +7283,12 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - q->elevator = eq; - spin_unlock_irq(&q->queue_lock); - -+ spin_lock_init(&bfqd->lock); -+ spin_lock_init(&bfqd->insert_lock); -+ -+ INIT_LIST_HEAD(&bfqd->at_head); -+ INIT_LIST_HEAD(&bfqd->at_tail); -+ - /* - * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
- * Grab a permanent reference to it, so that the normal code flow -@@ -7329,8 +7407,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) - /* see comments on the definition of next field inside bfq_data */ - bfqd->actuator_load_threshold = 4; - -- spin_lock_init(&bfqd->lock); -- - /* - * The invocation of the next bfq_create_group_hierarchy - * function is the head of a chain of function calls -diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h -index 467e8cfc41a2..f44f5d4ec2f4 100644 ---- a/block/bfq-iosched.h -+++ b/block/bfq-iosched.h -@@ -504,12 +504,26 @@ struct bfq_io_cq { - unsigned int requests; /* Number of requests this process has in flight */ - }; - -+enum { -+ BFQ_DISPATCHING = 0, -+}; -+ - /** - * struct bfq_data - per-device data structure. - * - * All the fields are protected by @lock. - */ - struct bfq_data { -+ struct { -+ spinlock_t lock; -+ spinlock_t insert_lock; -+ } ____cacheline_aligned_in_smp; -+ -+ unsigned long run_state; -+ -+ struct list_head at_head; -+ struct list_head at_tail; -+ - /* device request queue */ - struct request_queue *queue; - /* dispatch queue */ -@@ -795,8 +809,6 @@ struct bfq_data { - /* fallback dummy bfqq for extreme OOM conditions */ - struct bfq_queue oom_bfqq; - -- spinlock_t lock; -- - /* - * bic associated with the task issuing current bio for - * merging. This and the next field are used as a support to -diff --git a/block/mq-deadline.c b/block/mq-deadline.c -index acdc28756d9d..8b214233a061 100644 ---- a/block/mq-deadline.c -+++ b/block/mq-deadline.c -@@ -79,10 +79,23 @@ struct dd_per_prio { - struct io_stats_per_prio stats; - }; - -+enum { -+ DD_DISPATCHING = 0, -+}; -+ - struct deadline_data { - /* - * run time data - */ -+ struct { -+ spinlock_t lock; -+ spinlock_t insert_lock; -+ } ____cacheline_aligned_in_smp; -+ -+ unsigned long run_state; -+ -+ struct list_head at_head; -+ struct list_head at_tail; - - struct dd_per_prio per_prio[DD_PRIO_COUNT]; - -@@ -100,8 +113,6 @@ struct deadline_data { - int front_merges; - u32 async_depth; - int prio_aging_expire; -- -- spinlock_t lock; - }; - - /* Maps an I/O priority class to a deadline scheduler priority. */ -@@ -112,6 +123,9 @@ static const enum dd_prio ioprio_class_to_prio[] = { - [IOPRIO_CLASS_IDLE] = DD_IDLE_PRIO, - }; - -+static void dd_insert_request(struct request_queue *q, struct request *rq, -+ blk_insert_t flags, struct list_head *free); -+ - static inline struct rb_root * - deadline_rb_root(struct dd_per_prio *per_prio, struct request *rq) - { -@@ -451,6 +465,33 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, - return NULL; - } - -+static void __dd_do_insert(struct request_queue *q, blk_insert_t flags, -+ struct list_head *list, struct list_head *free) -+{ -+ while (!list_empty(list)) { -+ struct request *rq; -+ -+ rq = list_first_entry(list, struct request, queuelist); -+ list_del_init(&rq->queuelist); -+ dd_insert_request(q, rq, flags, free); -+ } -+} -+ -+static void dd_do_insert(struct request_queue *q, struct list_head *free) -+{ -+ struct deadline_data *dd = q->elevator->elevator_data; -+ LIST_HEAD(at_head); -+ LIST_HEAD(at_tail); -+ -+ spin_lock(&dd->insert_lock); -+ list_splice_init(&dd->at_head, &at_head); -+ list_splice_init(&dd->at_tail, &at_tail); -+ spin_unlock(&dd->insert_lock); -+ -+ __dd_do_insert(q, BLK_MQ_INSERT_AT_HEAD, &at_head, free); -+ __dd_do_insert(q, 0, &at_tail, free); -+} -+ - /* - * Called from blk_mq_run_hw_queue() -> __blk_mq_sched_dispatch_requests(). 
- * -@@ -461,12 +502,27 @@ static struct request *dd_dispatch_prio_aged_requests(struct deadline_data *dd, - */ - static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) - { -- struct deadline_data *dd = hctx->queue->elevator->elevator_data; -+ struct request_queue *q = hctx->queue; -+ struct deadline_data *dd = q->elevator->elevator_data; - const unsigned long now = jiffies; - struct request *rq; - enum dd_prio prio; -+ LIST_HEAD(free); -+ -+ /* -+ * If someone else is already dispatching, skip this one. This will -+ * defer the next dispatch event to when something completes, and could -+ * potentially lower the queue depth for contended cases. -+ * -+ * See the logic in blk_mq_do_dispatch_sched(), which loops and -+ * retries if nothing is dispatched. -+ */ -+ if (test_bit(DD_DISPATCHING, &dd->run_state) || -+ test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) -+ return NULL; - - spin_lock(&dd->lock); -+ dd_do_insert(q, &free); - rq = dd_dispatch_prio_aged_requests(dd, now); - if (rq) - goto unlock; -@@ -482,8 +538,10 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) - } - - unlock: -+ clear_bit_unlock(DD_DISPATCHING, &dd->run_state); - spin_unlock(&dd->lock); - -+ blk_mq_free_requests(&free); - return rq; - } - -@@ -585,6 +643,12 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) - - eq->elevator_data = dd; - -+ spin_lock_init(&dd->lock); -+ spin_lock_init(&dd->insert_lock); -+ -+ INIT_LIST_HEAD(&dd->at_head); -+ INIT_LIST_HEAD(&dd->at_tail); -+ - for (prio = 0; prio <= DD_PRIO_MAX; prio++) { - struct dd_per_prio *per_prio = &dd->per_prio[prio]; - -@@ -601,7 +665,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) - dd->last_dir = DD_WRITE; - dd->fifo_batch = fifo_batch; - dd->prio_aging_expire = prio_aging_expire; -- spin_lock_init(&dd->lock); - - /* We dispatch from request queue wide instead of hw queue */ - blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); -@@ -657,7 +720,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, - struct request *free = NULL; - bool ret; - -- spin_lock(&dd->lock); -+ /* -+ * bio merging is called for every bio queued, and it's very easy -+ * to run into contention because of that. If we fail getting -+ * the dd lock, just skip this merge attempt. For related IO, the -+ * plug will be the successful merging point. If we get here, we -+ * already failed doing the obvious merge. Chances of actually -+ * getting a merge off this path is a lot slimmer, so skipping an -+ * occassional lookup that will most likely not succeed anyway should -+ * not be a problem. 
-+ */ -+ if (!spin_trylock(&dd->lock)) -+ return false; -+ - ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); - spin_unlock(&dd->lock); - -@@ -670,10 +745,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, - /* - * add rq to rbtree and fifo - */ --static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, -+static void dd_insert_request(struct request_queue *q, struct request *rq, - blk_insert_t flags, struct list_head *free) - { -- struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; - const enum dd_data_dir data_dir = rq_data_dir(rq); - u16 ioprio = req_get_ioprio(rq); -@@ -727,19 +801,13 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, - { - struct request_queue *q = hctx->queue; - struct deadline_data *dd = q->elevator->elevator_data; -- LIST_HEAD(free); -- -- spin_lock(&dd->lock); -- while (!list_empty(list)) { -- struct request *rq; -- -- rq = list_first_entry(list, struct request, queuelist); -- list_del_init(&rq->queuelist); -- dd_insert_request(hctx, rq, flags, &free); -- } -- spin_unlock(&dd->lock); - -- blk_mq_free_requests(&free); -+ spin_lock(&dd->insert_lock); -+ if (flags & BLK_MQ_INSERT_AT_HEAD) -+ list_splice_init(list, &dd->at_head); -+ else -+ list_splice_init(list, &dd->at_tail); -+ spin_unlock(&dd->insert_lock); - } - - /* Callback from inside blk_mq_rq_ctx_init(). */ -@@ -780,6 +848,10 @@ static bool dd_has_work(struct blk_mq_hw_ctx *hctx) - struct deadline_data *dd = hctx->queue->elevator->elevator_data; - enum dd_prio prio; - -+ if (!list_empty_careful(&dd->at_head) || -+ !list_empty_careful(&dd->at_tail)) -+ return true; -+ - for (prio = 0; prio <= DD_PRIO_MAX; prio++) - if (dd_has_work_for_prio(&dd->per_prio[prio])) - return true; --- -2.46.0.rc1 - diff --git a/sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch b/sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch new file mode 100644 index 0000000..8a554c3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0001-eevdf-next.patch @@ -0,0 +1,4444 @@ +From 5a335b2c05a76e727dad94990bd8d78b220829c3 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:45:33 +0200 +Subject: [PATCH] eevdf-next + +Signed-off-by: Peter Jung +--- + Documentation/scheduler/sched-deadline.rst | 14 +- + drivers/cpufreq/cppc_cpufreq.c | 6 +- + fs/bcachefs/six.c | 2 +- + fs/select.c | 2 +- + include/linux/ioprio.h | 2 +- + include/linux/sched.h | 26 +- + include/linux/sched/deadline.h | 14 +- + include/linux/sched/prio.h | 1 + + include/linux/sched/rt.h | 33 +- + include/uapi/linux/sched/types.h | 6 +- + kernel/freezer.c | 2 +- + kernel/locking/rtmutex.c | 4 +- + kernel/locking/rwsem.c | 4 +- + kernel/locking/ww_mutex.h | 2 +- + kernel/sched/core.c | 273 +++++--- + kernel/sched/cpufreq_schedutil.c | 6 +- + kernel/sched/deadline.c | 465 ++++++++++--- + kernel/sched/debug.c | 198 +++++- + kernel/sched/fair.c | 750 ++++++++++++++++----- + kernel/sched/features.h | 30 +- + kernel/sched/idle.c | 23 +- + kernel/sched/rt.c | 261 +++---- + kernel/sched/sched.h | 102 ++- + kernel/sched/stats.h | 10 + + kernel/sched/stop_task.c | 18 +- + kernel/sched/syscalls.c | 132 +--- + kernel/sched/topology.c | 8 + + kernel/time/hrtimer.c | 6 +- + kernel/trace/trace_sched_wakeup.c | 2 +- + mm/page-writeback.c | 4 +- + mm/page_alloc.c | 2 +- + 31 files changed, 1688 insertions(+), 720 deletions(-) + +diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst +index 9fe4846079bb..22838ed8e13a 100644 
+--- a/Documentation/scheduler/sched-deadline.rst ++++ b/Documentation/scheduler/sched-deadline.rst +@@ -749,21 +749,19 @@ Appendix A. Test suite + of the command line options. Please refer to rt-app documentation for more + details (`/doc/*.json`). + +- The second testing application is a modification of schedtool, called +- schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a +- certain pid/application. schedtool-dl is available at: +- https://github.com/scheduler-tools/schedtool-dl.git. ++ The second testing application is done using chrt which has support ++ for SCHED_DEADLINE. + + The usage is straightforward:: + +- # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app ++ # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app + + With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation +- of 10ms every 100ms (note that parameters are expressed in microseconds). +- You can also use schedtool to create a reservation for an already running ++ of 10ms every 100ms (note that parameters are expressed in nanoseconds). ++ You can also use chrt to create a reservation for an already running + application, given that you know its pid:: + +- # schedtool -E -t 10000000:100000000 my_app_pid ++ # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid + + Appendix B. Minimal main() + ========================== +diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c +index bafa32dd375d..1a5ad184d28f 100644 +--- a/drivers/cpufreq/cppc_cpufreq.c ++++ b/drivers/cpufreq/cppc_cpufreq.c +@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + int ret; + +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +index 3a494c5d1247..9cbd3c14c94f 100644 +--- a/fs/bcachefs/six.c ++++ b/fs/bcachefs/six.c +@@ -335,7 +335,7 @@ static inline bool six_owner_running(struct six_lock *lock) + */ + rcu_read_lock(); + struct task_struct *owner = READ_ONCE(lock->owner); +- bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); ++ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); + rcu_read_unlock(); + + return ret; +diff --git a/fs/select.c b/fs/select.c +index bc185d111436..bc5762b03945 100644 +--- a/fs/select.c ++++ b/fs/select.c +@@ -82,7 +82,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv) + * Realtime tasks get a slack of 0 for obvious reasons. + */ + +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + return 0; + + ktime_get_ts64(&now); +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index db1249cd9692..b25377b6ea98 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task) + { + if (task->policy == SCHED_IDLE) + return IOPRIO_CLASS_IDLE; +- else if (task_is_realtime(task)) ++ else if (rt_or_dl_task_policy(task)) + return IOPRIO_CLASS_RT; + else + return IOPRIO_CLASS_BE; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1c771ea4481d..57cf27a3045c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -149,8 +149,9 @@ struct user_event_mm; + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). 
+ */ +-#define is_special_task_state(state) \ +- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) ++#define is_special_task_state(state) \ ++ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ ++ TASK_DEAD | TASK_FROZEN)) + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + # define debug_normal_state_change(state_value) \ +@@ -541,9 +542,14 @@ struct sched_entity { + struct rb_node run_node; + u64 deadline; + u64 min_vruntime; ++ u64 min_slice; + + struct list_head group_node; +- unsigned int on_rq; ++ unsigned char on_rq; ++ unsigned char sched_delayed; ++ unsigned char rel_deadline; ++ unsigned char custom_slice; ++ /* hole */ + + u64 exec_start; + u64 sum_exec_runtime; +@@ -641,12 +647,24 @@ struct sched_dl_entity { + * overruns. + * + * @dl_server tells if this is a server entity. ++ * ++ * @dl_defer tells if this is a deferred or regular server. For ++ * now only defer server exists. ++ * ++ * @dl_defer_armed tells if the deferrable server is waiting ++ * for the replenishment timer to activate it. ++ * ++ * @dl_defer_running tells if the deferrable server is actually ++ * running, skipping the defer phase. + */ + unsigned int dl_throttled : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; + unsigned int dl_overrun : 1; + unsigned int dl_server : 1; ++ unsigned int dl_defer : 1; ++ unsigned int dl_defer_armed : 1; ++ unsigned int dl_defer_running : 1; + + /* + * Bandwidth enforcement timer. Each -deadline task has its +@@ -674,7 +692,7 @@ struct sched_dl_entity { + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; +- dl_server_pick_f server_pick; ++ dl_server_pick_f server_pick_task; + + #ifdef CONFIG_RT_MUTEXES + /* +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index df3aca89d4f5..3a912ab42bb5 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -10,16 +10,16 @@ + + #include + +-#define MAX_DL_PRIO 0 +- +-static inline int dl_prio(int prio) ++static inline bool dl_prio(int prio) + { +- if (unlikely(prio < MAX_DL_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_DL_PRIO); + } + +-static inline int dl_task(struct task_struct *p) ++/* ++ * Returns true if a task has a priority that belongs to DL class. PI-boosted ++ * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. ++ */ ++static inline bool dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..6ab43b4f72f9 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -14,6 +14,7 @@ + */ + + #define MAX_RT_PRIO 100 ++#define MAX_DL_PRIO 0 + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index b2b9e6eb9683..4e3338103654 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -6,19 +6,40 @@ + + struct task_struct; + +-static inline int rt_prio(int prio) ++static inline bool rt_prio(int prio) + { +- if (unlikely(prio < MAX_RT_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); + } + +-static inline int rt_task(struct task_struct *p) ++static inline bool rt_or_dl_prio(int prio) ++{ ++ return unlikely(prio < MAX_RT_PRIO); ++} ++ ++/* ++ * Returns true if a task has a priority that belongs to RT class. PI-boosted ++ * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. 
++ */ ++static inline bool rt_task(struct task_struct *p) + { + return rt_prio(p->prio); + } + +-static inline bool task_is_realtime(struct task_struct *tsk) ++/* ++ * Returns true if a task has a priority that belongs to RT or DL classes. ++ * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore ++ * PI-boosted tasks. ++ */ ++static inline bool rt_or_dl_task(struct task_struct *p) ++{ ++ return rt_or_dl_prio(p->prio); ++} ++ ++/* ++ * Returns true if a task has a policy that belongs to RT or DL classes. ++ * PI-boosted tasks will return false. ++ */ ++static inline bool rt_or_dl_task_policy(struct task_struct *tsk) + { + int policy = tsk->policy; + +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index 90662385689b..bf6e9ae031c1 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -58,9 +58,9 @@ + * + * This is reflected by the following fields of the sched_attr structure: + * +- * @sched_deadline representative of the task's deadline +- * @sched_runtime representative of the task's runtime +- * @sched_period representative of the task's period ++ * @sched_deadline representative of the task's deadline in nanoseconds ++ * @sched_runtime representative of the task's runtime in nanoseconds ++ * @sched_period representative of the task's period in nanoseconds + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their +diff --git a/kernel/freezer.c b/kernel/freezer.c +index f57aaf96b829..44bbd7dbd2c8 100644 +--- a/kernel/freezer.c ++++ b/kernel/freezer.c +@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop) + bool freeze; + + raw_spin_lock_irq(¤t->pi_lock); +- set_current_state(TASK_FROZEN); ++ WRITE_ONCE(current->__state, TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(¤t->pi_lock); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index fba1229f1de6..ebebd0eec7f6 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task) + { + int prio = task->prio; + +- if (!rt_prio(prio)) ++ if (!rt_or_dl_prio(prio)) + return DEFAULT_PRIO; + + return prio; +@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ +- if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) ++ if (rt_or_dl_prio(waiter->tree.prio)) + return false; + + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 3277df47ab3c..299b793d55e1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + * if it is an RT task or wait in the wait queue + * for too long. 
+ */ +- if (has_handoff || (!rt_task(waiter->task) && ++ if (has_handoff || (!rt_or_dl_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) + return false; + +@@ -916,7 +916,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; +- if (rt_task(current) && ++ if (rt_or_dl_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } +diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h +index 3ad2cc4823e5..76d204b7d29c 100644 +--- a/kernel/locking/ww_mutex.h ++++ b/kernel/locking/ww_mutex.h +@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) + int a_prio = a->task->prio; + int b_prio = b->task->prio; + +- if (rt_prio(a_prio) || rt_prio(b_prio)) { ++ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { + + if (a_prio > b_prio) + return true; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 1af59cf714cd..6ea3c49788a4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -163,7 +163,10 @@ static inline int __task_prio(const struct task_struct *p) + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + +- if (rt_prio(p->prio)) /* includes deadline */ ++ if (p->dl_server) ++ return -1; /* deadline */ ++ ++ if (rt_or_dl_prio(p->prio)) + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) +@@ -192,8 +195,24 @@ static inline bool prio_less(const struct task_struct *a, + if (-pb < -pa) + return false; + +- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ +- return !dl_time_before(a->dl.deadline, b->dl.deadline); ++ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ ++ const struct sched_dl_entity *a_dl, *b_dl; ++ ++ a_dl = &a->dl; ++ /* ++ * Since,'a' and 'b' can be CFS tasks served by DL server, ++ * __task_prio() can return -1 (for DL) even for those. In that ++ * case, get to the dl_server's DL entity. ++ */ ++ if (a->dl_server) ++ a_dl = a->dl_server; ++ ++ b_dl = &b->dl; ++ if (b->dl_server) ++ b_dl = b->dl_server; ++ ++ return !dl_time_before(a_dl->deadline, b_dl->deadline); ++ } + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); +@@ -240,6 +259,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) + + void sched_core_enqueue(struct rq *rq, struct task_struct *p) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (!p->core_cookie) +@@ -250,6 +272,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) + + void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (sched_core_enqueued(p)) { +@@ -1269,7 +1294,7 @@ bool sched_can_stop_tick(struct rq *rq) + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). 
+ */ +- if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { ++ if (__need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } +@@ -1672,6 +1697,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); + +@@ -1696,6 +1724,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); + } +@@ -1967,22 +1998,38 @@ unsigned long get_wchan(struct task_struct *p) + + void enqueue_task(struct rq *rq, struct task_struct *p, int flags) + { ++ bool wakee_not_migrated = (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED); ++ + if (!(flags & ENQUEUE_NOCLOCK)) + update_rq_clock(rq); + + if (!(flags & ENQUEUE_RESTORE)) { + sched_info_enqueue(rq, p); +- psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); ++ ++ /* Notify PSI that the task was migrated in a delayed state before wakeup. */ ++ if ((p->migration_flags & DELAYED_MIGRATED) && !task_on_rq_migrating(p)) { ++ wakee_not_migrated = false; ++ p->migration_flags &= ~DELAYED_MIGRATED; ++ } + } + +- uclamp_rq_inc(rq, p); + p->sched_class->enqueue_task(rq, p, flags); ++ /* ++ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear ++ * ->sched_delayed. ++ */ ++ uclamp_rq_inc(rq, p); ++ if (!(flags & ENQUEUE_RESTORE)) ++ psi_enqueue(p, wakee_not_migrated); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); + } + +-void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++/* ++ * Must only return false when DEQUEUE_SLEEP. ++ */ ++inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) + { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p, flags); +@@ -1993,10 +2040,17 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags) + if (!(flags & DEQUEUE_SAVE)) { + sched_info_dequeue(rq, p); + psi_dequeue(p, flags & DEQUEUE_SLEEP); ++ ++ if (p->se.sched_delayed && task_on_rq_migrating(p)) ++ p->migration_flags |= DELAYED_MIGRATED; + } + ++ /* ++ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' ++ * and mark the task ->sched_delayed. ++ */ + uclamp_rq_dec(rq, p); +- p->sched_class->dequeue_task(rq, p, flags); ++ return p->sched_class->dequeue_task(rq, p, flags); + } + + void activate_task(struct rq *rq, struct task_struct *p, int flags) +@@ -2014,12 +2068,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) + + void deactivate_task(struct rq *rq, struct task_struct *p, int flags) + { +- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); ++ SCHED_WARN_ON(flags & DEQUEUE_SLEEP); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + ++ /* ++ * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* ++ * dequeue_task() and cleared *after* enqueue_task(). ++ */ ++ + dequeue_task(rq, p, flags); + } + ++static void block_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) ++ __block_task(rq, p); ++} ++ + /** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. 
+@@ -2233,6 +2300,12 @@ void migrate_disable(void) + struct task_struct *p = current; + + if (p->migration_disabled) { ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ *Warn about overflow half-way through the range. ++ */ ++ WARN_ON_ONCE((s16)p->migration_disabled < 0); ++#endif + p->migration_disabled++; + return; + } +@@ -2251,14 +2324,20 @@ void migrate_enable(void) + .flags = SCA_MIGRATE_ENABLE, + }; + ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Check both overflow from migrate_disable() and superfluous ++ * migrate_enable(). ++ */ ++ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) ++ return; ++#endif ++ + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + +- if (WARN_ON_ONCE(!p->migration_disabled)) +- return; +- + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). +@@ -3607,8 +3686,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + rq->idle_stamp = 0; + } + #endif +- +- p->dl_server = NULL; + } + + /* +@@ -3644,12 +3721,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ if (p->se.sched_delayed) ++ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_WAKEUP | ENQUEUE_DELAYED); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ +- update_rq_clock(rq); + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); +@@ -4029,11 +4108,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + * case the whole 'p->on_rq && ttwu_runnable()' case below + * without taking any locks. + * ++ * Specifically, given current runs ttwu() we must be before ++ * schedule()'s block_task(), as such this must not observe ++ * sched_delayed. ++ * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ ++ SCHED_WARN_ON(p->se.sched_delayed); + if (!ttwu_state_match(p, state, &success)) + goto out; + +@@ -4322,9 +4406,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.nr_migrations = 0; + p->se.vruntime = 0; + p->se.vlag = 0; +- p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); + ++ /* A delayed task cannot be in clone(). */ ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4572,6 +4658,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; + + /* + * We don't need the reset flag anymore after the fork. 
It has +@@ -4686,7 +4774,7 @@ void wake_up_new_task(struct task_struct *p) + update_rq_clock(rq); + post_init_entity_util_avg(p); + +- activate_task(rq, p, ENQUEUE_NOCLOCK); ++ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); + trace_sched_wakeup_new(p); + wakeup_preempt(rq, p, WF_FORK); + #ifdef CONFIG_SMP +@@ -5769,8 +5857,8 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) + schedstat_inc(this_rq()->sched_count); + } + +-static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +- struct rq_flags *rf) ++static void prev_balance(struct rq *rq, struct task_struct *prev, ++ struct rq_flags *rf) + { + #ifdef CONFIG_SMP + const struct sched_class *class; +@@ -5787,16 +5875,6 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + break; + } + #endif +- +- put_prev_task(rq, prev); +- +- /* +- * We've updated @prev and no longer need the server link, clear it. +- * Must be done before ->pick_next_task() because that can (re)set +- * ->dl_server. +- */ +- if (prev->dl_server) +- prev->dl_server = NULL; + } + + /* +@@ -5808,6 +5886,8 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + /* + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a +@@ -5823,34 +5903,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + /* Assume the next prioritized class is idle_sched_class */ + if (!p) { +- put_prev_task(rq, prev); +- p = pick_next_task_idle(rq); ++ p = pick_task_idle(rq); ++ put_prev_set_next_task(rq, prev, p); + } + +- /* +- * This is a normal CFS pick, but the previous could be a DL pick. +- * Clear it as previous is no longer picked. +- */ +- if (prev->dl_server) +- prev->dl_server = NULL; +- +- /* +- * This is the fast path; it cannot be a DL server pick; +- * therefore even if @p == @prev, ->dl_server must be NULL. +- */ +- if (p->dl_server) +- p->dl_server = NULL; +- + return p; + } + + restart: +- put_prev_task_balance(rq, prev, rf); ++ prev_balance(rq, prev, rf); + + for_each_class(class) { +- p = class->pick_next_task(rq); +- if (p) +- return p; ++ if (class->pick_next_task) { ++ p = class->pick_next_task(rq, prev); ++ if (p) ++ return p; ++ } else { ++ p = class->pick_task(rq); ++ if (p) { ++ put_prev_set_next_task(rq, prev, p); ++ return p; ++ } ++ } + } + + BUG(); /* The idle class should always have a runnable task. */ +@@ -5880,6 +5954,8 @@ static inline struct task_struct *pick_task(struct rq *rq) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + for_each_class(class) { + p = class->pick_task(rq); + if (p) +@@ -5918,6 +5994,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * another cpu during offline. 
+ */ + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + return __pick_next_task(rq, prev, rf); + } + +@@ -5936,16 +6013,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; +- if (next != prev) { +- put_prev_task(rq, prev); +- set_next_task(rq, next); +- } +- ++ rq->dl_server = rq->core_dl_server; + rq->core_pick = NULL; +- goto out; ++ rq->core_dl_server = NULL; ++ goto out_set_next; + } + +- put_prev_task_balance(rq, prev, rf); ++ prev_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; +@@ -5986,6 +6060,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + next = pick_task(rq); + if (!next->core_cookie) { + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. +@@ -6013,7 +6088,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) + update_rq_clock(rq_i); + +- p = rq_i->core_pick = pick_task(rq_i); ++ rq_i->core_pick = p = pick_task(rq_i); ++ rq_i->core_dl_server = rq_i->dl_server; ++ + if (!max || prio_less(max, p, fi_before)) + max = p; + } +@@ -6037,6 +6114,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + rq_i->core_pick = p; ++ rq_i->core_dl_server = NULL; + + if (p == rq_i->idle) { + if (rq_i->nr_running) { +@@ -6097,6 +6175,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (i == cpu) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6105,6 +6184,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6112,8 +6192,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + out_set_next: +- set_next_task(rq, next); +-out: ++ put_prev_set_next_task(rq, prev, next); + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + +@@ -6349,19 +6428,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a +- * preemption from blocking on an 'sleeping' spin/rwlock. Note that +- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to +- * optimize the AND operation out and just check for zero. ++ * preemption from blocking on an 'sleeping' spin/rwlock. + */ +-#define SM_NONE 0x0 +-#define SM_PREEMPT 0x1 +-#define SM_RTLOCK_WAIT 0x2 +- +-#ifndef CONFIG_PREEMPT_RT +-# define SM_MASK_PREEMPT (~0U) +-#else +-# define SM_MASK_PREEMPT SM_PREEMPT +-#endif ++#define SM_IDLE (-1) ++#define SM_NONE 0 ++#define SM_PREEMPT 1 ++#define SM_RTLOCK_WAIT 2 + + /* + * __schedule() is the main scheduler function. +@@ -6402,9 +6474,15 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * + * WARNING: must be called with preemption disabled! + */ +-static void __sched notrace __schedule(unsigned int sched_mode) ++static void __sched notrace __schedule(int sched_mode) + { + struct task_struct *prev, *next; ++ /* ++ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted ++ * as a preemption by schedule_debug() and RCU. 
++ */ ++ bool preempt = sched_mode > SM_NONE; ++ bool block = false; + unsigned long *switch_count; + unsigned long prev_state; + struct rq_flags rf; +@@ -6415,13 +6493,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) + rq = cpu_rq(cpu); + prev = rq->curr; + +- schedule_debug(prev, !!sched_mode); ++ schedule_debug(prev, preempt); + + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) + hrtick_clear(rq); + + local_irq_disable(); +- rcu_note_context_switch(!!sched_mode); ++ rcu_note_context_switch(preempt); + + /* + * Make sure that signal_pending_state()->signal_pending() below +@@ -6450,22 +6528,32 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + switch_count = &prev->nivcsw; + ++ /* Task state changes only considers SM_PREEMPT as preemption */ ++ preempt = sched_mode == SM_PREEMPT; ++ + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. + */ + prev_state = READ_ONCE(prev->__state); +- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { ++ if (sched_mode == SM_IDLE) { ++ if (!rq->nr_running) { ++ next = prev; ++ goto picked; ++ } ++ } else if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); + } else { ++ int flags = DEQUEUE_NOCLOCK; ++ + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev_state & TASK_FROZEN); + +- if (prev->sched_contributes_to_load) +- rq->nr_uninterruptible++; ++ if (unlikely(is_special_task_state(prev_state))) ++ flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() +@@ -6478,17 +6566,14 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. 
+ */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); +- +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); +- } ++ block_task(rq, prev, flags); ++ block = true; + } + switch_count = &prev->nvcsw; + } + + next = pick_next_task(rq, prev, &rf); ++picked: + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +@@ -6528,9 +6613,9 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + migrate_disable_switch(rq, prev); + psi_account_irqtime(rq, prev, next); +- psi_sched_switch(prev, next, !task_on_rq_queued(prev)); ++ psi_sched_switch(prev, next, block); + +- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); ++ trace_sched_switch(preempt, prev, next, prev_state); + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); +@@ -6606,7 +6691,7 @@ static void sched_update_worker(struct task_struct *tsk) + } + } + +-static __always_inline void __schedule_loop(unsigned int sched_mode) ++static __always_inline void __schedule_loop(int sched_mode) + { + do { + preempt_disable(); +@@ -6651,7 +6736,7 @@ void __sched schedule_idle(void) + */ + WARN_ON_ONCE(current->__state); + do { +- __schedule(SM_NONE); ++ __schedule(SM_IDLE); + } while (need_resched()); + } + +@@ -8235,8 +8320,6 @@ void __init sched_init(void) + #endif /* CONFIG_RT_GROUP_SCHED */ + } + +- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); +- + #ifdef CONFIG_SMP + init_defrootdomain(); + #endif +@@ -8291,8 +8374,13 @@ void __init sched_init(void) + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); + #endif /* CONFIG_FAIR_GROUP_SCHED */ + +- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; + #ifdef CONFIG_RT_GROUP_SCHED ++ /* ++ * This is required for init cpu because rt.c:__enable_runtime() ++ * starts working after scheduler_running, which is not the case ++ * yet. ++ */ ++ rq->rt.rt_runtime = global_rt_runtime(); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); + #endif + #ifdef CONFIG_SMP +@@ -8324,10 +8412,12 @@ void __init sched_init(void) + #endif /* CONFIG_SMP */ + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); ++ fair_server_init(rq); + + #ifdef CONFIG_SCHED_CORE + rq->core = rq; + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle_count = 0; +@@ -8340,6 +8430,7 @@ void __init sched_init(void) + } + + set_load_weight(&init_task, false); ++ init_task.se.slice = sysctl_sched_base_slice, + + /* + * The boot idle thread does lazy MMU switching as well: +@@ -8555,7 +8646,7 @@ void normalize_rt_tasks(void) + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); + +- if (!dl_task(p) && !rt_task(p)) { ++ if (!rt_or_dl_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index eece6244f9d2..43111a515a28 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. 
+ */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index c5a3691ba6cc..9ce93d0bf452 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) + __sub_running_bw(dl_se->dl_bw, dl_rq); + } + +-static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) + { +- struct rq *rq; +- +- WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); +- +- if (task_on_rq_queued(p)) +- return; ++ if (dl_se->dl_non_contending) { ++ sub_running_bw(dl_se, &rq->dl); ++ dl_se->dl_non_contending = 0; + +- rq = task_rq(p); +- if (p->dl.dl_non_contending) { +- sub_running_bw(&p->dl, &rq->dl); +- p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be canceled, inactive_task_timer() +@@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) + * will not touch the rq's active utilization, + * so we are still safe. + */ +- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) +- put_task_struct(p); ++ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { ++ if (!dl_server(dl_se)) ++ put_task_struct(dl_task_of(dl_se)); ++ } + } +- __sub_rq_bw(p->dl.dl_bw, &rq->dl); ++ __sub_rq_bw(dl_se->dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); + } + ++static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++{ ++ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); ++ ++ if (task_on_rq_queued(p)) ++ return; ++ ++ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); ++} ++ + static void __dl_clear_params(struct sched_dl_entity *dl_se); + + /* +@@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ ++ /* ++ * If it is a deferred reservation, and the server ++ * is not handling an starvation case, defer it. ++ */ ++ if (dl_se->dl_defer & !dl_se->dl_defer_running) { ++ dl_se->dl_throttled = 1; ++ dl_se->dl_defer_armed = 1; ++ } + } + + /* +@@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) + replenish_dl_new_period(dl_se, rq); + } + ++static int start_dl_timer(struct sched_dl_entity *dl_se); ++static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); ++ + /* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus +@@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. ++ * ++ * Or, it could be the case of a deferred reservation that ++ * was not able to consume its runtime in background and ++ * reached this point with current u > U. ++ * ++ * In both cases, set a new period. 
+ */ +- if (dl_se->dl_deadline == 0) +- replenish_dl_new_period(dl_se, rq); ++ if (dl_se->dl_deadline == 0 || ++ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { ++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; ++ dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ } + + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; +@@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; ++ ++ /* ++ * If this is the replenishment of a deferred reservation, ++ * clear the flag and return. ++ */ ++ if (dl_se->dl_defer_armed) { ++ dl_se->dl_defer_armed = 0; ++ return; ++ } ++ ++ /* ++ * A this point, if the deferred server is not armed, and the deadline ++ * is in the future, if it is not running already, throttle the server ++ * and arm the defer timer. ++ */ ++ if (dl_se->dl_defer && !dl_se->dl_defer_running && ++ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { ++ if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { ++ ++ /* ++ * Set dl_se->dl_defer_armed and dl_throttled variables to ++ * inform the start_dl_timer() that this is a deferred ++ * activation. ++ */ ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ if (!start_dl_timer(dl_se)) { ++ /* ++ * If for whatever reason (delays), a previous timer was ++ * queued but not serviced, cancel it and clean the ++ * deferrable server variables intended for start_dl_timer(). ++ */ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ } ++ } + } + + /* +@@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) + } + + replenish_dl_new_period(dl_se, rq); ++ } else if (dl_server(dl_se) && dl_se->dl_defer) { ++ /* ++ * The server can still use its previous deadline, so check if ++ * it left the dl_defer_running state. ++ */ ++ if (!dl_se->dl_defer_running) { ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ } + } + } + +@@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. ++ * ++ * The deferred reservation will have its timer set to ++ * (deadline - runtime). At that point, the CBS rule will decide ++ * if the current deadline can be used, or if a replenishment is ++ * required to avoid add too much pressure on the system ++ * (current u > U). 
+ */ +- act = ns_to_ktime(dl_next_period(dl_se)); ++ if (dl_se->dl_defer_armed) { ++ WARN_ON_ONCE(!dl_se->dl_throttled); ++ act = ns_to_ktime(dl_se->deadline - dl_se->runtime); ++ } else { ++ /* act = deadline - rel-deadline + period */ ++ act = ns_to_ktime(dl_next_period(dl_se)); ++ } ++ + now = hrtimer_cb_get_time(timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); +@@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) + #endif + } + ++/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ ++static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; ++ ++static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) ++{ ++ struct rq *rq = rq_of_dl_se(dl_se); ++ u64 fw; ++ ++ scoped_guard (rq_lock, rq) { ++ struct rq_flags *rf = &scope.rf; ++ ++ if (!dl_se->dl_throttled || !dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ sched_clock_tick(); ++ update_rq_clock(rq); ++ ++ if (!dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ if (!dl_se->server_has_tasks(dl_se)) { ++ replenish_dl_entity(dl_se); ++ return HRTIMER_NORESTART; ++ } ++ ++ if (dl_se->dl_defer_armed) { ++ /* ++ * First check if the server could consume runtime in background. ++ * If so, it is possible to push the defer timer for this amount ++ * of time. The dl_server_min_res serves as a limit to avoid ++ * forwarding the timer for a too small amount of time. ++ */ ++ if (dl_time_before(rq_clock(dl_se->rq), ++ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { ++ ++ /* reset the defer timer */ ++ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; ++ ++ hrtimer_forward_now(timer, ns_to_ktime(fw)); ++ return HRTIMER_RESTART; ++ } ++ ++ dl_se->dl_defer_running = 1; ++ } ++ ++ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); ++ ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) ++ resched_curr(rq); ++ ++ __push_dl_task(rq, rf); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ + /* + * This is the bandwidth enforcement timer callback. 
If here, we know + * a task is not on its dl_rq, since the fact that the timer was running +@@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) + struct rq_flags rf; + struct rq *rq; + +- if (dl_server(dl_se)) { +- struct rq *rq = rq_of_dl_se(dl_se); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- if (dl_se->dl_throttled) { +- sched_clock_tick(); +- update_rq_clock(rq); +- +- if (dl_se->server_has_tasks(dl_se)) { +- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); +- resched_curr(rq); +- __push_dl_task(rq, &rf); +- } else { +- replenish_dl_entity(dl_se); +- } +- +- } +- rq_unlock(rq, &rf); +- +- return HRTIMER_NORESTART; +- } ++ if (dl_server(dl_se)) ++ return dl_server_timer(timer, dl_se); + + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); +@@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) + return (delta * u_act) >> BW_SHIFT; + } + +-static inline void +-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, +- int flags); +-static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) + { + s64 scaled_delta_exec; + +- if (unlikely(delta_exec <= 0)) { +- if (unlikely(dl_se->dl_yielded)) +- goto throttle; +- return; +- } +- +- if (dl_entity_is_special(dl_se)) +- return; +- + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. +@@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + ++ return scaled_delta_exec; ++} ++ ++static inline void ++update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, ++ int flags); ++static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++{ ++ s64 scaled_delta_exec; ++ ++ if (unlikely(delta_exec <= 0)) { ++ if (unlikely(dl_se->dl_yielded)) ++ goto throttle; ++ return; ++ } ++ ++ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) ++ return; ++ ++ if (dl_entity_is_special(dl_se)) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); ++ + dl_se->runtime -= scaled_delta_exec; + ++ /* ++ * The fair server can consume its runtime while throttled (not queued/ ++ * running as regular CFS). ++ * ++ * If the server consumes its entire runtime in this state. The server ++ * is not required for the current period. Thus, reset the server by ++ * starting a new period, pushing the activation. ++ */ ++ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { ++ /* ++ * If the server was previously activated - the starving condition ++ * took place, it this point it went away because the fair scheduler ++ * was able to get runtime in background. So return to the initial ++ * state. ++ */ ++ dl_se->dl_defer_running = 0; ++ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ ++ replenish_dl_new_period(dl_se, dl_se->rq); ++ ++ /* ++ * Not being able to start the timer seems problematic. If it could not ++ * be started for whatever reason, we need to "unthrottle" the DL server ++ * and queue right away. Otherwise nothing might queue it. That's similar ++ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 
++ */ ++ WARN_ON_ONCE(!start_dl_timer(dl_se)); ++ ++ return; ++ } ++ + throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { + dl_se->dl_throttled = 1; +@@ -1381,6 +1547,14 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + resched_curr(rq); + } + ++ /* ++ * The fair server (sole dl_server) does not account for real-time ++ * workload because it is running fair work. ++ */ ++ if (dl_se == &rq->fair_server) ++ return; ++ ++#ifdef CONFIG_RT_GROUP_SCHED + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks +@@ -1405,34 +1579,155 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + rt_rq->rt_time += delta_exec; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } ++#endif ++} ++ ++/* ++ * In the non-defer mode, the idle time is not accounted, as the ++ * server provides a guarantee. ++ * ++ * If the dl_server is in defer mode, the idle time is also considered ++ * as time available for the fair server, avoiding a penalty for the ++ * rt scheduler that did not consumed that time. ++ */ ++void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) ++{ ++ s64 delta_exec, scaled_delta_exec; ++ ++ if (!rq->fair_server.dl_defer) ++ return; ++ ++ /* no need to discount more */ ++ if (rq->fair_server.runtime < 0) ++ return; ++ ++ delta_exec = rq_clock_task(rq) - p->se.exec_start; ++ if (delta_exec < 0) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); ++ ++ rq->fair_server.runtime -= scaled_delta_exec; ++ ++ if (rq->fair_server.runtime < 0) { ++ rq->fair_server.dl_defer_running = 0; ++ rq->fair_server.runtime = 0; ++ } ++ ++ p->se.exec_start = rq_clock_task(rq); + } + + void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) + { +- update_curr_dl_se(dl_se->rq, dl_se, delta_exec); ++ /* 0 runtime = fair server disabled */ ++ if (dl_se->dl_runtime) ++ update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } + + void dl_server_start(struct sched_dl_entity *dl_se) + { ++ struct rq *rq = dl_se->rq; ++ ++ /* ++ * XXX: the apply do not work fine at the init phase for the ++ * fair server because things are not yet set. We need to improve ++ * this before getting generic. 
++ */ + if (!dl_server(dl_se)) { ++ u64 runtime = 50 * NSEC_PER_MSEC; ++ u64 period = 1000 * NSEC_PER_MSEC; ++ ++ dl_server_apply_params(dl_se, runtime, period, 1); ++ + dl_se->dl_server = 1; ++ dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } ++ ++ if (!dl_se->dl_runtime) ++ return; ++ + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) ++ resched_curr(dl_se->rq); + } + + void dl_server_stop(struct sched_dl_entity *dl_se) + { ++ if (!dl_se->dl_runtime) ++ return; ++ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; + } + + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick) ++ dl_server_pick_f pick_task) + { + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; +- dl_se->server_pick = pick; ++ dl_se->server_pick_task = pick_task; ++} ++ ++void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) ++{ ++ u64 new_bw = dl_se->dl_bw; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ ++ dl_b = dl_bw_of(cpu_of(rq)); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ if (!dl_bw_cpus(cpu)) ++ return; ++ ++ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); ++} ++ ++int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) ++{ ++ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ u64 new_bw = to_ratio(period, runtime); ++ struct rq *rq = dl_se->rq; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ unsigned long cap; ++ int retval = 0; ++ int cpus; ++ ++ dl_b = dl_bw_of(cpu); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ cpus = dl_bw_cpus(cpu); ++ cap = dl_bw_capacity(cpu); ++ ++ if (__dl_overflow(dl_b, cap, old_bw, new_bw)) ++ return -EBUSY; ++ ++ if (init) { ++ __add_rq_bw(new_bw, &rq->dl); ++ __dl_add(dl_b, new_bw, cpus); ++ } else { ++ __dl_sub(dl_b, dl_se->dl_bw, cpus); ++ __dl_add(dl_b, new_bw, cpus); ++ ++ dl_rq_change_utilization(rq, dl_se, new_bw); ++ } ++ ++ dl_se->dl_runtime = runtime; ++ dl_se->dl_deadline = period; ++ dl_se->dl_period = period; ++ ++ dl_se->runtime = 0; ++ dl_se->deadline = 0; ++ ++ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); ++ ++ return retval; + } + + /* +@@ -1729,7 +2024,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ +- if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { ++ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + +@@ -1751,6 +2046,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + setup_new_dl_entity(dl_se); + } + ++ /* ++ * If the reservation is still throttled, e.g., it got replenished but is a ++ * deferred task and still got to wait, don't enqueue. ++ */ ++ if (dl_se->dl_throttled && start_dl_timer(dl_se)) ++ return; ++ ++ /* ++ * We're about to enqueue, make sure we're not ->dl_throttled! ++ * In case the timer was not started, say because the defer time ++ * has passed, mark as not throttled and mark unarmed. ++ * Also cancel earlier timers, since letting those run is pointless. 
++ */ ++ if (dl_se->dl_throttled) { ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ + __enqueue_dl_entity(dl_se); + } + +@@ -1840,7 +2154,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_dl_task(rq, p); + } + +-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + { + update_curr_dl(rq); + +@@ -1850,6 +2164,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); ++ ++ return true; + } + + /* +@@ -2068,6 +2384,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); ++ ++ if (hrtick_enabled(rq)) ++ start_hrtick_dl(rq, &p->dl); + } + + static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) +@@ -2080,7 +2399,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) + return __node_2_dle(left); + } + +-static struct task_struct *pick_task_dl(struct rq *rq) ++/* ++ * __pick_next_task_dl - Helper to pick the next -deadline task to run. ++ * @rq: The runqueue to pick the next task from. ++ */ ++static struct task_struct *__pick_task_dl(struct rq *rq) + { + struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; +@@ -2094,14 +2417,13 @@ static struct task_struct *pick_task_dl(struct rq *rq) + WARN_ON_ONCE(!dl_se); + + if (dl_server(dl_se)) { +- p = dl_se->server_pick(dl_se); ++ p = dl_se->server_pick_task(dl_se); + if (!p) { +- WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } +- p->dl_server = dl_se; ++ rq->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } +@@ -2109,24 +2431,12 @@ static struct task_struct *pick_task_dl(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_dl(struct rq *rq) ++static struct task_struct *pick_task_dl(struct rq *rq) + { +- struct task_struct *p; +- +- p = pick_task_dl(rq); +- if (!p) +- return p; +- +- if (!p->dl_server) +- set_next_task_dl(rq, p, true); +- +- if (hrtick_enabled(rq)) +- start_hrtick_dl(rq, &p->dl); +- +- return p; ++ return __pick_task_dl(rq); + } + +-static void put_prev_task_dl(struct rq *rq, struct task_struct *p) ++static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; +@@ -2818,13 +3128,12 @@ DEFINE_SCHED_CLASS(dl) = { + + .wakeup_preempt = wakeup_preempt_dl, + +- .pick_next_task = pick_next_task_dl, ++ .pick_task = pick_task_dl, + .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, + + #ifdef CONFIG_SMP + .balance = balance_dl, +- .pick_task = pick_task_dl, + .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index c1eb9a1afd13..de1dc5264b3f 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = { + .release = seq_release, + }; + ++enum dl_param { ++ DL_RUNTIME = 0, ++ DL_PERIOD, ++}; ++ ++static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ ++static unsigned long 
fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ ++ ++static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos, enum dl_param param) ++{ ++ long cpu = (long) ((struct seq_file *) filp->private_data)->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 runtime, period; ++ size_t err; ++ int retval; ++ u64 value; ++ ++ err = kstrtoull_from_user(ubuf, cnt, 10, &value); ++ if (err) ++ return err; ++ ++ scoped_guard (rq_lock_irqsave, rq) { ++ runtime = rq->fair_server.dl_runtime; ++ period = rq->fair_server.dl_period; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ if (runtime == value) ++ break; ++ runtime = value; ++ break; ++ case DL_PERIOD: ++ if (value == period) ++ break; ++ period = value; ++ break; ++ } ++ ++ if (runtime > period || ++ period > fair_server_period_max || ++ period < fair_server_period_min) { ++ return -EINVAL; ++ } ++ ++ if (rq->cfs.h_nr_running) { ++ update_rq_clock(rq); ++ dl_server_stop(&rq->fair_server); ++ } ++ ++ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); ++ if (retval) ++ cnt = retval; ++ ++ if (!runtime) ++ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", ++ cpu_of(rq)); ++ ++ if (rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ } ++ ++ *ppos += cnt; ++ return cnt; ++} ++ ++static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) ++{ ++ unsigned long cpu = (unsigned long) m->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 value; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ value = rq->fair_server.dl_runtime; ++ break; ++ case DL_PERIOD: ++ value = rq->fair_server.dl_period; ++ break; ++ } ++ ++ seq_printf(m, "%llu\n", value); ++ return 0; ++ ++} ++ ++static ssize_t ++sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_runtime_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_runtime_fops = { ++ .open = sched_fair_server_runtime_open, ++ .write = sched_fair_server_runtime_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static ssize_t ++sched_fair_server_period_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_period_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_period_fops = { ++ .open = sched_fair_server_period_open, ++ .write = sched_fair_server_period_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ + static struct dentry *debugfs_sched; + ++static void debugfs_fair_server_init(void) ++{ ++ struct dentry *d_fair; ++ unsigned long cpu; ++ ++ d_fair = debugfs_create_dir("fair_server", debugfs_sched); ++ if (!d_fair) ++ return; ++ ++ for_each_possible_cpu(cpu) { ++ struct dentry 
*d_cpu; ++ char buf[32]; ++ ++ snprintf(buf, sizeof(buf), "cpu%lu", cpu); ++ d_cpu = debugfs_create_dir(buf, d_fair); ++ ++ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); ++ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); ++ } ++} ++ + static __init int sched_init_debug(void) + { + struct dentry __maybe_unused *numa; +@@ -374,6 +531,8 @@ static __init int sched_init_debug(void) + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + ++ debugfs_fair_server_init(); ++ + return 0; + } + late_initcall(sched_init_debug); +@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), ++ p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +- SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", ++ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), +- SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + + #ifdef CONFIG_NUMA_BALANCING +- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); ++ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif + #ifdef CONFIG_CGROUP_SCHED +- SEQ_printf_task_group_path(m, task_group(p), " %s") ++ SEQ_printf_task_group_path(m, task_group(p), " %s") + #endif + + SEQ_printf(m, "\n"); +@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); +- SEQ_printf(m, " S task PID tree-key switches prio" +- " wait-time sum-exec sum-sleep\n"); ++ SEQ_printf(m, " S task PID vruntime eligible " ++ "deadline slice sum-exec switches " ++ "prio wait-time sum-sleep sum-block" ++#ifdef CONFIG_NUMA_BALANCING ++ " node group-id" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ " group-path" ++#endif ++ "\n"); + SEQ_printf(m, "-------------------------------------------------------" +- "------------------------------------------------------\n"); ++ "------------------------------------------------------" ++ "------------------------------------------------------" ++#ifdef CONFIG_NUMA_BALANCING ++ "--------------" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ "--------------" ++#endif ++ "\n"); + + rcu_read_lock(); + for_each_process_thread(g, p) { +@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); + #endif +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", +- SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); +@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", +- cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", 
cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", +@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + PU(rt_nr_running); ++ ++#ifdef CONFIG_RT_GROUP_SCHED + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); ++#endif + + #undef PN + #undef PU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 91b242e47db7..c89e7f1693d4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -792,8 +792,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + } + + /* ensure we never gain time by being placed backwards. */ +- u64_u32_store(cfs_rq->min_vruntime, +- __update_min_vruntime(cfs_rq, vruntime)); ++ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); ++} ++ ++static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) ++{ ++ struct sched_entity *root = __pick_root_entity(cfs_rq); ++ struct sched_entity *curr = cfs_rq->curr; ++ u64 min_slice = ~0ULL; ++ ++ if (curr && curr->on_rq) ++ min_slice = curr->slice; ++ ++ if (root) ++ min_slice = min(min_slice, root->min_slice); ++ ++ return min_slice; + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -812,19 +826,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node + } + } + ++static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (rse->min_slice < se->min_slice) ++ se->min_slice = rse->min_slice; ++ } ++} ++ + /* + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) + */ + static inline bool min_vruntime_update(struct sched_entity *se, bool exit) + { + u64 old_min_vruntime = se->min_vruntime; ++ u64 old_min_slice = se->min_slice; + struct rb_node *node = &se->run_node; + + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); + +- return se->min_vruntime == old_min_vruntime; ++ se->min_slice = se->slice; ++ __min_slice_update(se, node->rb_right); ++ __min_slice_update(se, node->rb_left); ++ ++ return se->min_vruntime == old_min_vruntime && ++ se->min_slice == old_min_slice; + } + + RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, +@@ -837,6 +866,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + avg_vruntime_add(cfs_rq, se); + se->min_vruntime = se->vruntime; ++ se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); + } +@@ -987,17 +1017,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. + */ +-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + if ((s64)(se->vruntime - se->deadline) < 0) +- return; ++ return false; + + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. 
+ */ +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + + /* + * EEVDF: vd_i = ve_i + r_i / w_i +@@ -1007,10 +1038,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * The task has consumed its request, reschedule. + */ +- if (cfs_rq->nr_running > 1) { +- resched_curr(rq_of(cfs_rq)); +- clear_buddies(cfs_rq, se); +- } ++ return true; + } + + #include "pelt.h" +@@ -1148,6 +1176,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) + dl_server_update(p->dl_server, delta_exec); + } + ++static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (curr->vlag == curr->deadline) ++ return false; ++ ++ return !entity_eligible(cfs_rq, curr); ++} ++ ++static inline bool do_preempt_short(struct cfs_rq *cfs_rq, ++ struct sched_entity *pse, struct sched_entity *se) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (pse->slice >= se->slice) ++ return false; ++ ++ if (!entity_eligible(cfs_rq, pse)) ++ return false; ++ ++ if (entity_before(pse, se)) ++ return true; ++ ++ if (!entity_eligible(cfs_rq, se)) ++ return true; ++ ++ return false; ++} ++ + /* + * Used by other classes to account runtime. + */ +@@ -1169,23 +1229,44 @@ s64 update_curr_common(struct rq *rq) + static void update_curr(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; ++ struct rq *rq = rq_of(cfs_rq); + s64 delta_exec; ++ bool resched; + + if (unlikely(!curr)) + return; + +- delta_exec = update_curr_se(rq_of(cfs_rq), curr); ++ delta_exec = update_curr_se(rq, curr); + if (unlikely(delta_exec <= 0)) + return; + + curr->vruntime += calc_delta_fair(delta_exec, curr); +- update_deadline(cfs_rq, curr); ++ resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + +- if (entity_is_task(curr)) +- update_curr_task(task_of(curr), delta_exec); ++ if (entity_is_task(curr)) { ++ struct task_struct *p = task_of(curr); ++ ++ update_curr_task(p, delta_exec); ++ ++ /* ++ * Any fair task that runs outside of fair_server should ++ * account against fair_server such that it can account for ++ * this time and possibly avoid running this period. 
++ */ ++ if (p->dl_server != &rq->fair_server) ++ dl_server_update(&rq->fair_server, delta_exec); ++ } + + account_cfs_rq_runtime(cfs_rq, delta_exec); ++ ++ if (cfs_rq->nr_running == 1) ++ return; ++ ++ if (resched || did_preempt_short(cfs_rq, curr)) { ++ resched_curr(rq); ++ clear_buddies(cfs_rq, curr); ++ } + } + + static void update_curr_fair(struct rq *rq) +@@ -5200,7 +5281,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5281,6 +5363,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + se->vruntime = vruntime - lag; + ++ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { ++ se->deadline += se->vruntime; ++ se->rel_deadline = 0; ++ return; ++ } ++ + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -5300,6 +5388,9 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static inline bool cfs_bandwidth_used(void); + ++static void ++requeue_delayed_entity(struct sched_entity *se); ++ + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +@@ -5387,19 +5478,47 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +-static void ++static inline void finish_delayed_dequeue_entity(struct sched_entity *se) ++{ ++ se->sched_delayed = 0; ++ if (sched_feat(DELAY_ZERO) && se->vlag > 0) ++ se->vlag = 0; ++} ++ ++static bool + dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- int action = UPDATE_TG; ++ bool sleep = flags & DEQUEUE_SLEEP; ++ ++ update_curr(cfs_rq); ++ ++ if (flags & DEQUEUE_DELAYED) { ++ SCHED_WARN_ON(!se->sched_delayed); ++ } else { ++ bool delay = sleep; ++ /* ++ * DELAY_DEQUEUE relies on spurious wakeups, special task ++ * states must not suffer spurious wakeups, excempt them. ++ */ ++ if (flags & DEQUEUE_SPECIAL) ++ delay = false; ++ ++ SCHED_WARN_ON(delay && se->sched_delayed); + ++ if (sched_feat(DELAY_DEQUEUE) && delay && ++ !entity_eligible(cfs_rq, se)) { ++ if (cfs_rq->next == se) ++ cfs_rq->next = NULL; ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 1; ++ return false; ++ } ++ } ++ ++ int action = UPDATE_TG; + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + +- /* +- * Update run-time statistics of the 'current'. +- */ +- update_curr(cfs_rq); +- + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. 
+@@ -5417,6 +5536,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + clear_buddies(cfs_rq, se); + + update_entity_lag(cfs_rq, se); ++ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { ++ se->deadline -= se->vruntime; ++ se->rel_deadline = 1; ++ } ++ + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; +@@ -5436,8 +5560,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); + ++ if (flags & DEQUEUE_DELAYED) ++ finish_delayed_dequeue_entity(se); ++ + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); ++ ++ return true; + } + + static void +@@ -5463,6 +5592,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + } + + update_stats_curr_start(cfs_rq, se); ++ SCHED_WARN_ON(cfs_rq->curr); + cfs_rq->curr = se; + + /* +@@ -5483,6 +5613,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); ++ + /* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups +@@ -5491,16 +5623,26 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + * 4) do not run the "skip" process, if something else is available + */ + static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq) ++pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) + { + /* + * Enabling NEXT_BUDDY will affect latency but not fairness. + */ + if (sched_feat(NEXT_BUDDY) && +- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { ++ /* ->next will never be delayed */ ++ SCHED_WARN_ON(cfs_rq->next->sched_delayed); + return cfs_rq->next; ++ } + +- return pick_eevdf(cfs_rq); ++ struct sched_entity *se = pick_eevdf(cfs_rq); ++ if (se->sched_delayed) { ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ SCHED_WARN_ON(se->sched_delayed); ++ SCHED_WARN_ON(se->on_rq); ++ return NULL; ++ } ++ return se; + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5524,6 +5666,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } ++ SCHED_WARN_ON(cfs_rq->curr != prev); + cfs_rq->curr = NULL; + } + +@@ -5787,6 +5930,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ +@@ -5820,11 +5964,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); ++ int flags; ++ + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + +- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); ++ /* ++ * Abuse SPECIAL to avoid delayed dequeue in this instance. ++ * This avoids teaching dequeue_entities() about throttled ++ * entities and keeps things relatively simple. 
++ */ ++ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; ++ if (se->sched_delayed) ++ flags |= DEQUEUE_DELAYED; ++ dequeue_entity(qcfs_rq, se, flags); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; +@@ -5858,6 +6012,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + ++ /* Stop the fair server if throttling resulted in no runnable tasks */ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + done: + /* + * Note: distribution will already see us throttled via the +@@ -5876,6 +6033,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + se = cfs_rq->tg->se[cpu_of(rq)]; + +@@ -5913,7 +6071,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + +- if (se->on_rq) ++ /* Handle any unfinished DELAY_DEQUEUE business first. */ ++ if (se->sched_delayed) { ++ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; ++ ++ dequeue_entity(qcfs_rq, se, flags); ++ } else if (se->on_rq) + break; + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + +@@ -5945,6 +6108,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + goto unthrottle_throttle; + } + ++ /* Start the fair server if un-throttling resulted in new runnable tasks */ ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +@@ -6577,7 +6744,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) + { + int cpu = cpu_of(rq); + +- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) ++ if (!cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) +@@ -6760,6 +6927,37 @@ static int sched_idle_cpu(int cpu) + } + #endif + ++static void ++requeue_delayed_entity(struct sched_entity *se) ++{ ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * se->sched_delayed should imply: se->on_rq == 1. ++ * Because a delayed entity is one that is still on ++ * the runqueue competing until elegibility. ++ */ ++ SCHED_WARN_ON(!se->sched_delayed); ++ SCHED_WARN_ON(!se->on_rq); ++ ++ if (sched_feat(DELAY_ZERO)) { ++ update_entity_lag(cfs_rq, se); ++ if (se->vlag > 0) { ++ cfs_rq->nr_running--; ++ if (se != cfs_rq->curr) ++ __dequeue_entity(cfs_rq, se); ++ se->vlag = 0; ++ place_entity(cfs_rq, se, 0); ++ if (se != cfs_rq->curr) ++ __enqueue_entity(cfs_rq, se); ++ cfs_rq->nr_running++; ++ } ++ } ++ ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 0; ++} ++ + /* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and +@@ -6772,6 +6970,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ u64 slice = 0; + + /* + * The code below (indirectly) updates schedutil which looks at +@@ -6779,7 +6979,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. 
+ */ +- util_est_enqueue(&rq->cfs, p); ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) ++ util_est_enqueue(&rq->cfs, p); ++ ++ if (flags & ENQUEUE_DELAYED) { ++ requeue_delayed_entity(se); ++ return; ++ } + + /* + * If in_iowait is set, the code below may not trigger any cpufreq +@@ -6790,10 +6996,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + + for_each_sched_entity(se) { +- if (se->on_rq) ++ if (se->on_rq) { ++ if (se->sched_delayed) ++ requeue_delayed_entity(se); + break; ++ } + cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * Basically set the slice of group entries to the min_slice of ++ * their respective cfs_rq. This ensures the group can service ++ * its entities in the desired time-frame. ++ */ ++ if (slice) { ++ se->slice = slice; ++ se->custom_slice = 1; ++ } + enqueue_entity(cfs_rq, se, flags); ++ slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; +@@ -6815,6 +7035,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + +@@ -6826,6 +7049,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + goto enqueue_throttle; + } + ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) { ++ /* Account for idle runtime */ ++ if (!rq->nr_running) ++ dl_server_update_idle_time(rq, rq->curr); ++ dl_server_start(&rq->fair_server); ++ } ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); + +@@ -6855,36 +7085,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + static void set_next_buddy(struct sched_entity *se); + + /* +- * The dequeue_task method is called before nr_running is +- * decreased. We remove the task from the rbtree and +- * update the fair scheduling stats: ++ * Basically dequeue_task_fair(), except it can deal with dequeue_entity() ++ * failing half-way through and resume the dequeue later. 
++ * ++ * Returns: ++ * -1 - dequeue delayed ++ * 0 - dequeue throttled ++ * 1 - dequeue complete + */ +-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) + { +- struct cfs_rq *cfs_rq; +- struct sched_entity *se = &p->se; +- int task_sleep = flags & DEQUEUE_SLEEP; +- int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ bool task_sleep = flags & DEQUEUE_SLEEP; ++ bool task_delayed = flags & DEQUEUE_DELAYED; ++ struct task_struct *p = NULL; ++ int idle_h_nr_running = 0; ++ int h_nr_running = 0; ++ struct cfs_rq *cfs_rq; ++ u64 slice = 0; + +- util_est_dequeue(&rq->cfs, p); ++ if (entity_is_task(se)) { ++ p = task_of(se); ++ h_nr_running = 1; ++ idle_h_nr_running = task_has_idle_policy(p); ++ } else { ++ cfs_rq = group_cfs_rq(se); ++ slice = cfs_rq_min_slice(cfs_rq); ++ } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +- dequeue_entity(cfs_rq, se, flags); + +- cfs_rq->h_nr_running--; ++ if (!dequeue_entity(cfs_rq, se, flags)) { ++ if (p && &p->se == se) ++ return -1; ++ ++ break; ++ } ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; ++ return 0; + + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) { ++ slice = cfs_rq_min_slice(cfs_rq); ++ + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + /* +@@ -6896,6 +7149,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + break; + } + flags |= DEQUEUE_SLEEP; ++ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); + } + + for_each_sched_entity(se) { +@@ -6905,28 +7159,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + +- cfs_rq->h_nr_running--; ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; +- ++ return 0; + } + +- /* At this point se is NULL and we are at root level*/ +- sub_nr_running(rq, 1); ++ sub_nr_running(rq, h_nr_running); ++ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + +-dequeue_throttle: +- util_est_update(&rq->cfs, p, task_sleep); ++ if (p && task_delayed) { ++ SCHED_WARN_ON(!task_sleep); ++ SCHED_WARN_ON(p->on_rq != 1); ++ ++ /* Fix-up what dequeue_task_fair() skipped */ ++ hrtick_update(rq); ++ ++ /* Fix-up what block_task() skipped. */ ++ __block_task(rq, p); ++ } ++ ++ return 1; ++} ++ ++/* ++ * The dequeue_task method is called before nr_running is ++ * decreased. 
We remove the task from the rbtree and ++ * update the fair scheduling stats: ++ */ ++static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) ++ util_est_dequeue(&rq->cfs, p); ++ ++ if (dequeue_entities(rq, &p->se, flags) < 0) { ++ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); ++ return false; ++ } ++ ++ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + hrtick_update(rq); ++ return true; + } + + #ifdef CONFIG_SMP +@@ -7824,6 +8111,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) + return cpu_util(cpu, p, -1, 0); + } + ++/* ++ * This function computes an effective utilization for the given CPU, to be ++ * used for frequency selection given the linear relation: f = u * f_max. ++ * ++ * The scheduler tracks the following metrics: ++ * ++ * cpu_util_{cfs,rt,dl,irq}() ++ * cpu_bw_dl() ++ * ++ * Where the cfs,rt and dl util numbers are tracked with the same metric and ++ * synchronized windows and are thus directly comparable. ++ * ++ * The cfs,rt,dl utilization are the running times measured with rq->clock_task ++ * which excludes things like IRQ and steal-time. These latter are then accrued ++ * in the IRQ utilization. ++ * ++ * The DL bandwidth number OTOH is not a measured metric but a value computed ++ * based on the task model parameters and gives the minimal utilization ++ * required to meet deadlines. ++ */ ++unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long *min, ++ unsigned long *max) ++{ ++ unsigned long util, irq, scale; ++ struct rq *rq = cpu_rq(cpu); ++ ++ scale = arch_scale_cpu_capacity(cpu); ++ ++ /* ++ * Early check to see if IRQ/steal time saturates the CPU, can be ++ * because of inaccuracies in how we track these -- see ++ * update_irq_load_avg(). ++ */ ++ irq = cpu_util_irq(rq); ++ if (unlikely(irq >= scale)) { ++ if (min) ++ *min = scale; ++ if (max) ++ *max = scale; ++ return scale; ++ } ++ ++ if (min) { ++ /* ++ * The minimum utilization returns the highest level between: ++ * - the computed DL bandwidth needed with the IRQ pressure which ++ * steals time to the deadline task. ++ * - The minimum performance requirement for CFS and/or RT. ++ */ ++ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); ++ ++ /* ++ * When an RT task is runnable and uclamp is not used, we must ++ * ensure that the task will run at maximum compute capacity. ++ */ ++ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) ++ *min = max(*min, scale); ++ } ++ ++ /* ++ * Because the time spend on RT/DL tasks is visible as 'lost' time to ++ * CFS tasks and we use the same metric to track the effective ++ * utilization (PELT windows are synchronized) we can directly add them ++ * to obtain the CPU's actual utilization. ++ */ ++ util = util_cfs + cpu_util_rt(rq); ++ util += cpu_util_dl(rq); ++ ++ /* ++ * The maximum hint is a soft bandwidth requirement, which can be lower ++ * than the actual utilization because of uclamp_max requirements. ++ */ ++ if (max) ++ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); ++ ++ if (util >= scale) ++ return scale; ++ ++ /* ++ * There is still idle time; further improve the number by using the ++ * IRQ metric. 
Because IRQ/steal time is hidden from the task clock we ++ * need to scale the task numbers: ++ * ++ * max - irq ++ * U' = irq + --------- * U ++ * max ++ */ ++ util = scale_irq_capacity(util, irq, scale); ++ util += irq; ++ ++ return min(scale, util); ++} ++ ++unsigned long sched_cpu_util(int cpu) ++{ ++ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); ++} ++ + /* + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the +@@ -8308,7 +8694,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + + static void task_dead_fair(struct task_struct *p) + { +- remove_entity_load_avg(&p->se); ++ struct sched_entity *se = &p->se; ++ ++ if (se->sched_delayed) { ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &rf); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ task_rq_unlock(rq, p, &rf); ++ } ++ ++ remove_entity_load_avg(se); + } + + /* +@@ -8344,7 +8744,7 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context + static int + balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- if (rq->nr_running) ++ if (sched_fair_runnable(rq)) + return 1; + + return sched_balance_newidle(rq, rf) != 0; +@@ -8430,7 +8830,17 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + /* +- * XXX pick_eevdf(cfs_rq) != se ? ++ * If @p has a shorter slice than current and @p is eligible, override ++ * current's slice protection in order to allow preemption. ++ * ++ * Note that even if @p does not turn out to be the most eligible ++ * task at this moment, current's slice protection will be lost. ++ */ ++ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) ++ se->vlag = se->deadline + 1; ++ ++ /* ++ * If @p has become the most eligible task, force preemption. 
+ */ + if (pick_eevdf(cfs_rq) == pse) + goto preempt; +@@ -8441,7 +8851,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + resched_curr(rq); + } + +-#ifdef CONFIG_SMP + static struct task_struct *pick_task_fair(struct rq *rq) + { + struct sched_entity *se; +@@ -8453,95 +8862,58 @@ static struct task_struct *pick_task_fair(struct rq *rq) + return NULL; + + do { +- struct sched_entity *curr = cfs_rq->curr; ++ /* Might not have done put_prev_entity() */ ++ if (cfs_rq->curr && cfs_rq->curr->on_rq) ++ update_curr(cfs_rq); + +- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; ++ if (unlikely(check_cfs_rq_runtime(cfs_rq))) ++ goto again; + +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) +- goto again; +- } +- +- se = pick_next_entity(cfs_rq); ++ se = pick_next_entity(rq, cfs_rq); ++ if (!se) ++ goto again; + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); + } +-#endif ++ ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + + struct task_struct * + pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + struct task_struct *p; + int new_tasks; + + again: +- if (!sched_fair_runnable(rq)) ++ p = pick_task_fair(rq); ++ if (!p) + goto idle; ++ se = &p->se; + + #ifdef CONFIG_FAIR_GROUP_SCHED +- if (!prev || prev->sched_class != &fair_sched_class) ++ if (prev->sched_class != &fair_sched_class) + goto simple; + ++ __put_prev_set_next_dl_server(rq, prev, p); ++ + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. +- */ +- +- do { +- struct sched_entity *curr = cfs_rq->curr; +- +- /* +- * Since we got here without doing put_prev_entity() we also +- * have to consider cfs_rq->curr. If it is still a runnable +- * entity, update_curr() will update its vruntime, otherwise +- * forget we've ever seen it. +- */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; +- +- /* +- * This call to check_cfs_rq_runtime() will do the +- * throttle and dequeue its entity in the parent(s). +- * Therefore the nr_running test will indeed +- * be correct. +- */ +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) { +- cfs_rq = &rq->cfs; +- +- if (!cfs_rq->nr_running) +- goto idle; +- +- goto simple; +- } +- } +- +- se = pick_next_entity(cfs_rq); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); +- +- p = task_of(se); +- +- /* ++ * + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. 
+ */ + if (prev != p) { + struct sched_entity *pse = &prev->se; ++ struct cfs_rq *cfs_rq; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; +@@ -8559,38 +8931,15 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); +- } +- +- goto done; +-simple: +-#endif +- if (prev) +- put_prev_task(rq, prev); + +- do { +- se = pick_next_entity(cfs_rq); +- set_next_entity(cfs_rq, se); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); ++ __set_next_task_fair(rq, p, true); ++ } + +- p = task_of(se); ++ return p; + +-done: __maybe_unused; +-#ifdef CONFIG_SMP +- /* +- * Move the next running task to the front of +- * the list, so our cfs_tasks list becomes MRU +- * one. +- */ +- list_move(&p->se.group_node, &rq->cfs_tasks); ++simple: + #endif +- +- if (hrtick_enabled_fair(rq)) +- hrtick_start_fair(rq, p); +- +- update_misfit_status(p, rq); +- sched_fair_update_stop_tick(rq, p); +- ++ put_prev_set_next_task(rq, prev, p); + return p; + + idle: +@@ -8619,15 +8968,34 @@ done: __maybe_unused; + return NULL; + } + +-static struct task_struct *__pick_next_task_fair(struct rq *rq) ++static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) ++{ ++ return pick_next_task_fair(rq, prev, NULL); ++} ++ ++static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) ++{ ++ return !!dl_se->rq->cfs.nr_running; ++} ++ ++static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) + { +- return pick_next_task_fair(rq, NULL, NULL); ++ return pick_task_fair(dl_se->rq); ++} ++ ++void fair_server_init(struct rq *rq) ++{ ++ struct sched_dl_entity *dl_se = &rq->fair_server; ++ ++ init_dl_entity(dl_se); ++ ++ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + } + + /* + * Account for a descheduled task: + */ +-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; +@@ -12721,22 +13089,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct sched_entity *se = &p->se, *curr; +- struct cfs_rq *cfs_rq; +- struct rq *rq = this_rq(); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- update_rq_clock(rq); +- + set_task_max_allowed_capacity(p); +- +- cfs_rq = task_cfs_rq(current); +- curr = cfs_rq->curr; +- if (curr) +- update_curr(cfs_rq); +- place_entity(cfs_rq, se, ENQUEUE_INITIAL); +- rq_unlock(rq, &rf); + } + + /* +@@ -12848,10 +13201,28 @@ static void attach_task_cfs_rq(struct task_struct *p) + static void switched_from_fair(struct rq *rq, struct task_struct *p) + { + detach_task_cfs_rq(p); ++ /* ++ * Since this is called after changing class, this is a little weird ++ * and we cannot use DEQUEUE_DELAYED. ++ */ ++ if (p->se.sched_delayed) { ++ /* First, dequeue it from its new class' structures */ ++ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); ++ /* ++ * Now, clean up the fair_sched_class side of things ++ * related to sched_delayed being true and that wasn't done ++ * due to the generic dequeue not using DEQUEUE_DELAYED. 
++ */ ++ finish_delayed_dequeue_entity(&p->se); ++ p->se.rel_deadline = 0; ++ __block_task(rq, p); ++ } + } + + static void switched_to_fair(struct rq *rq, struct task_struct *p) + { ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + attach_task_cfs_rq(p); + + set_task_max_allowed_capacity(p); +@@ -12869,12 +13240,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + } + } + +-/* Account for a task changing its policy or group. +- * +- * This routine is mostly called to set cfs_rq->curr field when a task +- * migrates between groups/classes. +- */ +-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + { + struct sched_entity *se = &p->se; + +@@ -12887,6 +13253,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + list_move(&se->group_node, &rq->cfs_tasks); + } + #endif ++ if (!first) ++ return; ++ ++ SCHED_WARN_ON(se->sched_delayed); ++ ++ if (hrtick_enabled_fair(rq)) ++ hrtick_start_fair(rq, p); ++ ++ update_misfit_status(p, rq); ++ sched_fair_update_stop_tick(rq, p); ++} ++ ++/* ++ * Account for a task changing its policy or group. ++ * ++ * This routine is mostly called to set cfs_rq->curr field when a task ++ * migrates between groups/classes. ++ */ ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++{ ++ struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); +@@ -12895,12 +13282,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } ++ ++ __set_next_task_fair(rq, p, first); + } + + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); ++ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif +@@ -13002,28 +13391,35 @@ void online_fair_sched_group(struct task_group *tg) + + void unregister_fair_sched_group(struct task_group *tg) + { +- unsigned long flags; +- struct rq *rq; + int cpu; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(cpu) { +- if (tg->se[cpu]) +- remove_entity_load_avg(tg->se[cpu]); ++ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; ++ struct sched_entity *se = tg->se[cpu]; ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (se) { ++ if (se->sched_delayed) { ++ guard(rq_lock_irqsave)(rq); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ list_del_leaf_cfs_rq(cfs_rq); ++ } ++ remove_entity_load_avg(se); ++ } + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ +- if (!tg->cfs_rq[cpu]->on_list) +- continue; +- +- rq = cpu_rq(cpu); +- +- raw_spin_rq_lock_irqsave(rq, flags); +- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +- raw_spin_rq_unlock_irqrestore(rq, flags); ++ if (cfs_rq->on_list) { ++ guard(rq_lock_irqsave)(rq); ++ list_del_leaf_cfs_rq(cfs_rq); ++ } + } + } + +@@ -13213,13 +13609,13 @@ DEFINE_SCHED_CLASS(fair) = { + + .wakeup_preempt = check_preempt_wakeup_fair, + ++ .pick_task = pick_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, + + #ifdef CONFIG_SMP + .balance = balance_fair, +- .pick_task = pick_task_fair, + .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 143f55df890b..290874079f60 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,8 +5,24 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++/* ++ * Preserve relative virtual deadline on 'migration'. ++ */ ++SCHED_FEAT(PLACE_REL_DEADLINE, true) ++/* ++ * Inhibit (wakeup) preemption until the current task has either matched the ++ * 0-lag point or until is has exhausted it's slice. ++ */ + SCHED_FEAT(RUN_TO_PARITY, true) ++/* ++ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for ++ * current. ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -21,6 +37,18 @@ SCHED_FEAT(NEXT_BUDDY, false) + */ + SCHED_FEAT(CACHE_HOT_BUDDY, true) + ++/* ++ * Delay dequeueing tasks until they get selected or woken. ++ * ++ * By delaying the dequeue for non-eligible tasks, they remain in the ++ * competition and can burn off their negative lag. When they get selected ++ * they'll have positive lag by definition. ++ * ++ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. 
++ */ ++SCHED_FEAT(DELAY_DEQUEUE, true) ++SCHED_FEAT(DELAY_ZERO, true) ++ + /* + * Allow wakeup-time preemption of the current task: + */ +@@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true) + SCHED_FEAT(UTIL_EST, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(HZ_BW, true) +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 6e78d071beb5..7a105a0123aa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -450,43 +450,35 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + resched_curr(rq); + } + +-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { ++ dl_server_update_idle_time(rq, prev); + } + + static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) + { + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); ++ next->se.exec_start = rq_clock_task(rq); + } + +-#ifdef CONFIG_SMP +-static struct task_struct *pick_task_idle(struct rq *rq) ++struct task_struct *pick_task_idle(struct rq *rq) + { + return rq->idle; + } +-#endif +- +-struct task_struct *pick_next_task_idle(struct rq *rq) +-{ +- struct task_struct *next = rq->idle; +- +- set_next_task_idle(rq, next, true); +- +- return next; +-} + + /* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +-static void ++static bool + dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) + { + raw_spin_rq_unlock_irq(rq); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_rq_lock_irq(rq); ++ return true; + } + + /* +@@ -528,13 +520,12 @@ DEFINE_SCHED_CLASS(idle) = { + + .wakeup_preempt = wakeup_preempt_idle, + +- .pick_next_task = pick_next_task_idle, ++ .pick_task = pick_task_idle, + .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, + + #ifdef CONFIG_SMP + .balance = balance_idle, +- .pick_task = pick_task_idle, + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 310523c1b9e3..172c588de542 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; + /* More than 4 hours if BW_SHIFT equals 20. */ + static const u64 max_rt_runtime = MAX_BW; + +-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); +- +-struct rt_bandwidth def_rt_bandwidth; +- + /* + * period over which we measure -rt task CPU usage in us. 
+ * default: 1s +@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void) + late_initcall(sched_rt_sysctl_init); + #endif + ++void init_rt_rq(struct rt_rq *rt_rq) ++{ ++ struct rt_prio_array *array; ++ int i; ++ ++ array = &rt_rq->active; ++ for (i = 0; i < MAX_RT_PRIO; i++) { ++ INIT_LIST_HEAD(array->queue + i); ++ __clear_bit(i, array->bitmap); ++ } ++ /* delimiter for bitsearch: */ ++ __set_bit(MAX_RT_PRIO, array->bitmap); ++ ++#if defined CONFIG_SMP ++ rt_rq->highest_prio.curr = MAX_RT_PRIO-1; ++ rt_rq->highest_prio.next = MAX_RT_PRIO-1; ++ rt_rq->overloaded = 0; ++ plist_head_init(&rt_rq->pushable_tasks); ++#endif /* CONFIG_SMP */ ++ /* We start is dequeued state, because no RT tasks are queued */ ++ rt_rq->rt_queued = 0; ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ rt_rq->rt_time = 0; ++ rt_rq->rt_throttled = 0; ++ rt_rq->rt_runtime = 0; ++ raw_spin_lock_init(&rt_rq->rt_runtime_lock); ++#endif ++} ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ ++static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); ++ + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) + { + struct rt_bandwidth *rt_b = +@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) + do_start_rt_bandwidth(rt_b); + } + +-void init_rt_rq(struct rt_rq *rt_rq) +-{ +- struct rt_prio_array *array; +- int i; +- +- array = &rt_rq->active; +- for (i = 0; i < MAX_RT_PRIO; i++) { +- INIT_LIST_HEAD(array->queue + i); +- __clear_bit(i, array->bitmap); +- } +- /* delimiter for bit-search: */ +- __set_bit(MAX_RT_PRIO, array->bitmap); +- +-#if defined CONFIG_SMP +- rt_rq->highest_prio.curr = MAX_RT_PRIO-1; +- rt_rq->highest_prio.next = MAX_RT_PRIO-1; +- rt_rq->overloaded = 0; +- plist_head_init(&rt_rq->pushable_tasks); +-#endif /* CONFIG_SMP */ +- /* We start is dequeued state, because no RT tasks are queued */ +- rt_rq->rt_queued = 0; +- +- rt_rq->rt_time = 0; +- rt_rq->rt_throttled = 0; +- rt_rq->rt_runtime = 0; +- raw_spin_lock_init(&rt_rq->rt_runtime_lock); +-} +- +-#ifdef CONFIG_RT_GROUP_SCHED + static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) + { + hrtimer_cancel(&rt_b->rt_period_timer); +@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg) + { + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); +- + } + + void free_rt_sched_group(struct task_group *tg) +@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) + if (!tg->rt_se) + goto err; + +- init_rt_bandwidth(&tg->rt_bandwidth, +- ktime_to_ns(def_rt_bandwidth.rt_period), 0); ++ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), +@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) + return &rt_rq->tg->rt_bandwidth; + } + +-#else /* !CONFIG_RT_GROUP_SCHED */ +- +-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_runtime; +-} +- +-static inline u64 sched_rt_period(struct rt_rq *rt_rq) +-{ +- return ktime_to_ns(def_rt_bandwidth.rt_period); +-} +- +-typedef struct rt_rq *rt_rq_iter_t; +- +-#define for_each_rt_rq(rt_rq, iter, rq) \ +- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) +- +-#define for_each_sched_rt_entity(rt_se) \ +- for (; rt_se; rt_se = NULL) +- +-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +-{ +- return NULL; +-} +- +-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +-{ +- struct rq *rq = rq_of_rt_rq(rt_rq); +- +- if 
(!rt_rq->rt_nr_running) +- return; +- +- enqueue_top_rt_rq(rt_rq); +- resched_curr(rq); +-} +- +-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +-{ +- dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +-} +- +-static inline int rt_rq_throttled(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_throttled; +-} +- +-static inline const struct cpumask *sched_rt_period_mask(void) +-{ +- return cpu_online_mask; +-} +- +-static inline +-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +-{ +- return &cpu_rq(cpu)->rt; +-} +- +-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +-{ +- return &def_rt_bandwidth; +-} +- +-#endif /* CONFIG_RT_GROUP_SCHED */ +- + bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) + { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); +@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + const struct cpumask *span; + + span = sched_rt_period_mask(); +-#ifdef CONFIG_RT_GROUP_SCHED ++ + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest +@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +-#endif ++ + for_each_cpu(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); +@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + return idle; + } + +-static inline int rt_se_prio(struct sched_rt_entity *rt_se) +-{ +-#ifdef CONFIG_RT_GROUP_SCHED +- struct rt_rq *rt_rq = group_rt_rq(rt_se); +- +- if (rt_rq) +- return rt_rq->highest_prio.curr; +-#endif +- +- return rt_task_of(rt_se)->prio; +-} +- + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + { + u64 runtime = sched_rt_runtime(rt_rq); +@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + return 0; + } + ++#else /* !CONFIG_RT_GROUP_SCHED */ ++ ++typedef struct rt_rq *rt_rq_iter_t; ++ ++#define for_each_rt_rq(rt_rq, iter, rq) \ ++ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) ++ ++#define for_each_sched_rt_entity(rt_se) \ ++ for (; rt_se; rt_se = NULL) ++ ++static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) ++{ ++ return NULL; ++} ++ ++static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) ++{ ++ struct rq *rq = rq_of_rt_rq(rt_rq); ++ ++ if (!rt_rq->rt_nr_running) ++ return; ++ ++ enqueue_top_rt_rq(rt_rq); ++ resched_curr(rq); ++} ++ ++static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) ++{ ++ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); ++} ++ ++static inline int rt_rq_throttled(struct rt_rq *rt_rq) ++{ ++ return false; ++} ++ ++static inline const struct cpumask *sched_rt_period_mask(void) ++{ ++ return cpu_online_mask; ++} ++ ++static inline ++struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) ++{ ++ return &cpu_rq(cpu)->rt; ++} ++ ++#ifdef CONFIG_SMP ++static void __enable_runtime(struct rq *rq) { } ++static void __disable_runtime(struct rq *rq) { } ++#endif ++ ++#endif /* CONFIG_RT_GROUP_SCHED */ ++ ++static inline int rt_se_prio(struct sched_rt_entity *rt_se) ++{ ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct rt_rq *rt_rq = group_rt_rq(rt_se); ++ ++ if (rt_rq) ++ return rt_rq->highest_prio.curr; ++#endif ++ ++ return rt_task_of(rt_se)->prio; ++} ++ + /* + * Update the current task's runtime statistics. 
Skip current tasks that + * are not in our scheduling class. +@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + static void update_curr_rt(struct rq *rq) + { + struct task_struct *curr = rq->curr; +- struct sched_rt_entity *rt_se = &curr->rt; + s64 delta_exec; + + if (curr->sched_class != &rt_sched_class) +@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct sched_rt_entity *rt_se = &curr->rt; ++ + if (!rt_bandwidth_enabled()) + return; + +@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); + } + } ++#endif + } + + static void +@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + static void + inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + { +- start_rt_bandwidth(&def_rt_bandwidth); + } + + static inline +@@ -1492,7 +1483,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_task(rq, p); + } + +-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + { + struct sched_rt_entity *rt_se = &p->rt; + +@@ -1500,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + dequeue_rt_entity(rt_se, flags); + + dequeue_pushable_task(rq, p); ++ ++ return true; + } + + /* +@@ -1755,17 +1748,7 @@ static struct task_struct *pick_task_rt(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_rt(struct rq *rq) +-{ +- struct task_struct *p = pick_task_rt(rq); +- +- if (p) +- set_next_task_rt(rq, p, true); +- +- return p; +-} +- +-static void put_prev_task_rt(struct rq *rq, struct task_struct *p) ++static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; +@@ -2652,13 +2635,12 @@ DEFINE_SCHED_CLASS(rt) = { + + .wakeup_preempt = wakeup_preempt_rt, + +- .pick_next_task = pick_next_task_rt, ++ .pick_task = pick_task_rt, + .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, + + #ifdef CONFIG_SMP + .balance = balance_rt, +- .pick_task = pick_task_rt, + .select_task_rq = select_task_rq_rt, + .set_cpus_allowed = set_cpus_allowed_common, + .rq_online = rq_online_rt, +@@ -2912,19 +2894,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) + #ifdef CONFIG_SYSCTL + static int sched_rt_global_constraints(void) + { +- unsigned long flags; +- int i; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- for_each_possible_cpu(i) { +- struct rt_rq *rt_rq = &cpu_rq(i)->rt; +- +- raw_spin_lock(&rt_rq->rt_runtime_lock); +- rt_rq->rt_runtime = global_rt_runtime(); +- raw_spin_unlock(&rt_rq->rt_runtime_lock); +- } +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); +- + return 0; + } + #endif /* CONFIG_SYSCTL */ +@@ -2944,12 +2913,6 @@ static int sched_rt_global_validate(void) + + static void sched_rt_do_global(void) + { +- unsigned long flags; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- def_rt_bandwidth.rt_runtime = global_rt_runtime(); +- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + } + + static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, +diff --git 
a/kernel/sched/sched.h b/kernel/sched/sched.h +index 432b43aa091c..8b84608f2531 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int dl_bw_check_overflow(int cpu); +- ++extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); + /* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: +@@ -361,7 +362,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); + extern void dl_server_stop(struct sched_dl_entity *dl_se); + extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick); ++ dl_server_pick_f pick_task); ++ ++extern void dl_server_update_idle_time(struct rq *rq, ++ struct task_struct *p); ++extern void fair_server_init(struct rq *rq); ++extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); ++extern int dl_server_apply_params(struct sched_dl_entity *dl_se, ++ u64 runtime, u64 period, bool init); + + #ifdef CONFIG_CGROUP_SCHED + +@@ -599,17 +607,12 @@ struct cfs_rq { + s64 avg_vruntime; + u64 avg_load; + +- u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; + #endif + +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +-#endif +- + struct rb_root_cached tasks_timeline; + + /* +@@ -619,10 +622,6 @@ struct cfs_rq { + struct sched_entity *curr; + struct sched_entity *next; + +-#ifdef CONFIG_SCHED_DEBUG +- unsigned int nr_spread_over; +-#endif +- + #ifdef CONFIG_SMP + /* + * CFS load tracking +@@ -726,13 +725,13 @@ struct rt_rq { + #endif /* CONFIG_SMP */ + int rt_queued; + ++#ifdef CONFIG_RT_GROUP_SCHED + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +-#ifdef CONFIG_RT_GROUP_SCHED + unsigned int rt_nr_boosted; + + struct rq *rq; +@@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se) + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + if (entity_is_task(se)) + return !!se->on_rq; + else +@@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { } + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + return !!se->on_rq; + } + +@@ -1044,6 +1049,8 @@ struct rq { + struct rt_rq rt; + struct dl_rq dl; + ++ struct sched_dl_entity fair_server; ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; +@@ -1059,6 +1066,7 @@ struct rq { + unsigned int nr_uninterruptible; + + struct task_struct __rcu *curr; ++ struct sched_dl_entity *dl_server; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; +@@ -1158,7 +1166,6 @@ struct rq { + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; +- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; +@@ -1187,6 +1194,7 @@ struct rq { + /* per rq */ + struct rq *core; + struct task_struct *core_pick; ++ struct sched_dl_entity *core_dl_server; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; +@@ -1236,6 +1244,7 @@ static inline int cpu_of(struct rq *rq) + } + + #define MDF_PUSH 0x01 ++#define DELAYED_MIGRATED 0x02 /* Task was migrated when in DELAYED_DEQUEUE state */ + + static inline bool is_migration_disabled(struct task_struct *p) + { +@@ -2247,11 +2256,13 @@ extern const u32 sched_prio_to_wmult[40]; + * + */ + +-#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ + #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ + #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ + #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ ++#define DEQUEUE_SPECIAL 0x10 + #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ ++#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ + + #define ENQUEUE_WAKEUP 0x01 + #define ENQUEUE_RESTORE 0x02 +@@ -2267,6 +2278,7 @@ extern const u32 sched_prio_to_wmult[40]; + #endif + #define ENQUEUE_INITIAL 0x80 + #define ENQUEUE_MIGRATING 0x100 ++#define ENQUEUE_DELAYED 0x200 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2285,23 +2297,31 @@ struct sched_class { + #endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); +- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); ++ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + +- struct task_struct *(*pick_next_task)(struct rq *rq); ++ struct task_struct *(*pick_task)(struct rq *rq); ++ /* ++ * Optional! 
When implemented pick_next_task() should be equivalent to: ++ * ++ * next = pick_task(); ++ * if (next) { ++ * put_prev_task(prev); ++ * set_next_task_first(next); ++ * } ++ */ ++ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + +- void (*put_prev_task)(struct rq *rq, struct task_struct *p); ++ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + + #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + +- struct task_struct * (*pick_task)(struct rq *rq); +- + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); +@@ -2345,7 +2365,7 @@ struct sched_class { + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) + { + WARN_ON_ONCE(rq->curr != prev); +- prev->sched_class->put_prev_task(rq, prev); ++ prev->sched_class->put_prev_task(rq, prev, NULL); + } + + static inline void set_next_task(struct rq *rq, struct task_struct *next) +@@ -2353,6 +2373,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) + next->sched_class->set_next_task(rq, next, false); + } + ++static inline void ++__put_prev_set_next_dl_server(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prev->dl_server = NULL; ++ next->dl_server = rq->dl_server; ++ rq->dl_server = NULL; ++} ++ ++static inline void put_prev_set_next_task(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ WARN_ON_ONCE(rq->curr != prev); ++ ++ __put_prev_set_next_dl_server(rq, prev, next); ++ ++ if (next == prev) ++ return; ++ ++ prev->sched_class->put_prev_task(rq, prev, next); ++ next->sched_class->set_next_task(rq, next, true); ++} + + /* + * Helper to define a sched_class instance; each one is placed in a separate +@@ -2408,7 +2452,7 @@ static inline bool sched_fair_runnable(struct rq *rq) + } + + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +-extern struct task_struct *pick_next_task_idle(struct rq *rq); ++extern struct task_struct *pick_task_idle(struct rq *rq); + + #define SCA_CHECK 0x01 + #define SCA_MIGRATE_DISABLE 0x02 +@@ -2515,7 +2559,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + +-extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +@@ -2586,6 +2629,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) + sched_update_tick_dependency(rq); + } + ++static inline void __block_task(struct rq *rq, struct task_struct *p) ++{ ++ WRITE_ONCE(p->on_rq, 0); ++ ASSERT_EXCLUSIVE_WRITER(p->on_rq); ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +@@ -3607,7 +3663,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c + extern void __setscheduler_prio(struct task_struct *p, int prio); + 
extern void set_load_weight(struct task_struct *p, bool update_load); + extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +-extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); ++extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + + extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, +diff --git a/kernel/sched/stats.h b/kernel/sched/stats.h +index 237780aa3c53..06a2c6d3ec1e 100644 +--- a/kernel/sched/stats.h ++++ b/kernel/sched/stats.h +@@ -129,6 +129,13 @@ static inline void psi_enqueue(struct task_struct *p, bool wakeup) + if (static_branch_likely(&psi_disabled)) + return; + ++ /* ++ * Delayed task is not ready to run yet! ++ * Wait for a requeue before accounting. ++ */ ++ if (p->se.sched_delayed) ++ return; ++ + if (p->in_memstall) + set |= TSK_MEMSTALL_RUNNING; + +@@ -148,6 +155,9 @@ static inline void psi_dequeue(struct task_struct *p, bool sleep) + if (static_branch_likely(&psi_disabled)) + return; + ++ /* Delayed task can only be dequeued for migration. */ ++ WARN_ON_ONCE(p->se.sched_delayed && sleep); ++ + /* + * A voluntary sleep is a dequeue followed by a task switch. To + * avoid walking all ancestors twice, psi_task_switch() handles +diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c +index b1b8fe61c532..058dd42e3d9b 100644 +--- a/kernel/sched/stop_task.c ++++ b/kernel/sched/stop_task.c +@@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq) + return rq->stop; + } + +-static struct task_struct *pick_next_task_stop(struct rq *rq) +-{ +- struct task_struct *p = pick_task_stop(rq); +- +- if (p) +- set_next_task_stop(rq, p, true); +- +- return p; +-} +- + static void + enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + add_nr_running(rq, 1); + } + +-static void ++static bool + dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + sub_nr_running(rq, 1); ++ return true; + } + + static void yield_task_stop(struct rq *rq) +@@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq) + BUG(); /* the stop task should never yield, its pointless. */ + } + +-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + update_curr_common(rq); + } +@@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = { + + .wakeup_preempt = wakeup_preempt_stop, + +- .pick_next_task = pick_next_task_stop, ++ .pick_task = pick_task_stop, + .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, + + #ifdef CONFIG_SMP + .balance = balance_stop, +- .pick_task = pick_task_stop, + .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c +index ae1b42775ef9..c62acf509b74 100644 +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p) + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ +- if (!rt_prio(p->prio)) ++ if (!rt_or_dl_prio(p->prio)) + return p->normal_prio; + return p->prio; + } +@@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu) + + #endif + +-#ifdef CONFIG_SMP +-/* +- * This function computes an effective utilization for the given CPU, to be +- * used for frequency selection given the linear relation: f = u * f_max. 
+- * +- * The scheduler tracks the following metrics: +- * +- * cpu_util_{cfs,rt,dl,irq}() +- * cpu_bw_dl() +- * +- * Where the cfs,rt and dl util numbers are tracked with the same metric and +- * synchronized windows and are thus directly comparable. +- * +- * The cfs,rt,dl utilization are the running times measured with rq->clock_task +- * which excludes things like IRQ and steal-time. These latter are then accrued +- * in the IRQ utilization. +- * +- * The DL bandwidth number OTOH is not a measured metric but a value computed +- * based on the task model parameters and gives the minimal utilization +- * required to meet deadlines. +- */ +-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, +- unsigned long *min, +- unsigned long *max) +-{ +- unsigned long util, irq, scale; +- struct rq *rq = cpu_rq(cpu); +- +- scale = arch_scale_cpu_capacity(cpu); +- +- /* +- * Early check to see if IRQ/steal time saturates the CPU, can be +- * because of inaccuracies in how we track these -- see +- * update_irq_load_avg(). +- */ +- irq = cpu_util_irq(rq); +- if (unlikely(irq >= scale)) { +- if (min) +- *min = scale; +- if (max) +- *max = scale; +- return scale; +- } +- +- if (min) { +- /* +- * The minimum utilization returns the highest level between: +- * - the computed DL bandwidth needed with the IRQ pressure which +- * steals time to the deadline task. +- * - The minimum performance requirement for CFS and/or RT. +- */ +- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); +- +- /* +- * When an RT task is runnable and uclamp is not used, we must +- * ensure that the task will run at maximum compute capacity. +- */ +- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) +- *min = max(*min, scale); +- } +- +- /* +- * Because the time spend on RT/DL tasks is visible as 'lost' time to +- * CFS tasks and we use the same metric to track the effective +- * utilization (PELT windows are synchronized) we can directly add them +- * to obtain the CPU's actual utilization. +- */ +- util = util_cfs + cpu_util_rt(rq); +- util += cpu_util_dl(rq); +- +- /* +- * The maximum hint is a soft bandwidth requirement, which can be lower +- * than the actual utilization because of uclamp_max requirements. +- */ +- if (max) +- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); +- +- if (util >= scale) +- return scale; +- +- /* +- * There is still idle time; further improve the number by using the +- * IRQ metric. Because IRQ/steal time is hidden from the task clock we +- * need to scale the task numbers: +- * +- * max - irq +- * U' = irq + --------- * U +- * max +- */ +- util = scale_irq_capacity(util, irq, scale); +- util += irq; +- +- return min(scale, util); +-} +- +-unsigned long sched_cpu_util(int cpu) +-{ +- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +-} +-#endif /* CONFIG_SMP */ +- + /** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. 
+@@ -401,10 +300,20 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.custom_slice = 1; ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -700,7 +609,9 @@ int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -846,6 +757,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + ++ if (p->se.custom_slice) ++ attr.sched_runtime = p->se.slice; ++ + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +@@ -1012,12 +926,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 76504b776d03..9748a4c8d668 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++ /* ++ * Because the rq is not a task, dl_add_task_root_domain() did not ++ * move the fair server bw to the rd if it already started. ++ * Add it now. ++ */ ++ if (rq->fair_server.dl_server) ++ __dl_server_attach_root(&rq->fair_server, rq); ++ + rq_unlock_irqrestore(rq, &rf); + + if (old_rd) +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index b8ee320208d4..f4be3abbb47b 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1975,7 +1975,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { +- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) ++ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + +@@ -2075,7 +2075,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + u64 slack; + + slack = current->timer_slack_ns; +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +@@ -2280,7 +2280,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + * Override any slack passed by the user if under + * rt contraints. 
+ */ +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + delta = 0; + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); +diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c +index 130ca7e7787e..ae2ace5e515a 100644 +--- a/kernel/trace/trace_sched_wakeup.c ++++ b/kernel/trace/trace_sched_wakeup.c +@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p) + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || +- (wakeup_rt && !dl_task(p) && !rt_task(p)) || ++ (wakeup_rt && !rt_or_dl_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 3bd08b60a9b3..9bd709077621 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -426,7 +426,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; + + tsk = current; +- if (rt_task(tsk)) { ++ if (rt_or_dl_task(tsk)) { + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; + } +@@ -485,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) + else + dirty = vm_dirty_ratio * node_memory / 100; + +- if (rt_task(tsk)) ++ if (rt_or_dl_task(tsk)) + dirty += dirty / 4; + + /* +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index 6040ed48da3e..ba29c5f5ef64 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4009,7 +4009,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; +- } else if (unlikely(rt_task(current)) && in_task()) ++ } else if (unlikely(rt_or_dl_task(current)) && in_task()) + alloc_flags |= ALLOC_MIN_RESERVE; + + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch b/sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch new file mode 100644 index 0000000..9abc2a6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0003-bbr3.patch @@ -0,0 +1,3386 @@ +From 9dff1ca88508fbe0bf6044ecc4423640382a4d57 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:36:51 +0200 +Subject: [PATCH 03/12] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 9 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1940 insertions(+), 553 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 6a5e08b937b3..27aab715490e 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -369,7 +369,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c0deaafebfdc..d53f042d936e 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 196c148fce8a..f37256b8abfd 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -973,9 +989,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1087,6 +1108,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1109,7 +1131,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1129,10 +1155,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1143,7 +1172,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1167,8 +1198,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1234,6 +1268,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1253,6 +1295,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1265,6 +1308,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2416,7 +2474,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 50655de04c9b..82f8bd8f0d16 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 3b687d20c9ed..a7c30c243b54 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..4702cd2f1ffc 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 8e94ed7c56a0..50dc9970cad2 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 3f88d0961e5b..4273cac333f6 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -340,7 +344,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 831a18dc7aa6..d9faa8fef55e 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -3849,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..a180fa648d5e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. 
++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. 
This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. 
If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; 
++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 0306d257fa64..28f581c0dab7 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index e37488d3453f..62eef7d067c2 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3799,7 +3815,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3816,6 +3833,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3826,6 +3844,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3934,6 +3957,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4008,7 +4032,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4032,6 +4056,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4051,7 +4076,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5718,13 +5743,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index a19a9dbd3409..e0ef8406a326 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 16c48df8df4c..6c3a1895238e 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1601,7 +1604,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1676,6 +1679,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2033,13 +2060,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2981,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 4d40615dc8fc..f27941201ef2 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch b/sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch new file mode 100644 index 0000000..cfe58be --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0007-ksm.patch @@ -0,0 +1,433 @@ +From 92797c0423c1c2ffc1276ca82f17d01852adbe34 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:37:57 +0200 +Subject: [PATCH 07/12] ksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 138 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 206 
insertions(+), 1 deletion(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index 74720667fe09..e6a11f3c0a2e 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -502,3 +502,6 @@ + 570 common lsm_set_self_attr sys_lsm_set_self_attr + 571 common lsm_list_modules sys_lsm_list_modules + 572 common mseal sys_mseal ++573 common process_ksm_enable sys_process_ksm_enable ++574 common process_ksm_disable sys_process_ksm_disable ++575 common process_ksm_status sys_process_ksm_status +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 23c98203c40f..10a3099decbe 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -477,3 +477,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index 22a3cbd4c602..12d2c7594bf0 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -462,3 +462,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 2b81a6bd78b2..e2a93c856eed 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -468,3 +468,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 953f5b7dc723..b921fbf56fa6 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -401,3 +401,6 @@ + 460 n32 lsm_set_self_attr sys_lsm_set_self_attr + 461 n32 lsm_list_modules sys_lsm_list_modules + 462 n32 mseal sys_mseal ++463 n32 process_ksm_enable sys_process_ksm_enable ++464 n32 process_ksm_disable sys_process_ksm_disable ++465 n32 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index 1464c6be6eb3..8d7f9ddd66f4 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -377,3 +377,6 @@ + 460 n64 lsm_set_self_attr sys_lsm_set_self_attr + 461 n64 lsm_list_modules sys_lsm_list_modules + 462 n64 mseal sys_mseal ++463 n64 process_ksm_enable sys_process_ksm_enable ++464 n64 process_ksm_disable sys_process_ksm_disable ++465 n64 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 2439a2491cff..9d6142739954 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -450,3 +450,6 @@ + 460 o32 
lsm_set_self_attr sys_lsm_set_self_attr + 461 o32 lsm_list_modules sys_lsm_list_modules + 462 o32 mseal sys_mseal ++463 o32 process_ksm_enable sys_process_ksm_enable ++464 o32 process_ksm_disable sys_process_ksm_disable ++465 o32 process_ksm_status sys_process_ksm_status +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl +index 66dc406b12e4..9d46476fd908 100644 +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -461,3 +461,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index c55fd7696d40..b9fc31221b87 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index cfdfb3707c16..0d79fd772854 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -508,3 +508,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 534c74b14fab..c546a30575f1 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -468,3 +468,6 @@ + 460 i386 lsm_set_self_attr sys_lsm_set_self_attr + 461 i386 lsm_list_modules sys_lsm_list_modules + 462 i386 mseal sys_mseal ++463 i386 process_ksm_enable 
sys_process_ksm_enable ++464 i386 process_ksm_disable sys_process_ksm_disable ++465 i386 process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 7093ee21c0d1..0fcd10ba8dfe 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -386,6 +386,9 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 67083fc1b2f5..c1aecee4ad9b 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -433,3 +433,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 4bcf6754738d..b3ea08e920f7 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 5bf6148cac2b..613e559ad6e0 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) + #define __NR_mseal 462 + __SYSCALL(__NR_mseal, sys_mseal) + ++#define __NR_process_ksm_enable 463 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 464 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 465 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 463 ++#define __NR_syscalls 466 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys.c b/kernel/sys.c +index 3a2df1bd9f64..bc77dc784527 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2789,6 +2789,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto 
out; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. */ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index c00a86931f8c..d82213d68522 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); 
+ COND_SYSCALL(get_mempolicy); +diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl +index 845e24eb372e..227d9cc12365 100644 +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -403,3 +403,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch b/sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch new file mode 100644 index 0000000..552ebb3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11.3+/0012-zstd.patch @@ -0,0 +1,18652 @@ +From 89792579fbd7314abdd8a19d0ee9b510e9bec911 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 10 Oct 2024 12:39:34 +0200 +Subject: [PATCH 12/12] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 850 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 127 +- + lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 34 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 84 +- + lib/zstd/common/fse.h | 94 +- + lib/zstd/common/fse_decompress.c | 130 +- + lib/zstd/common/huf.h | 237 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 28 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 441 ++-- + lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 359 ++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- 
+ lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 169 +- + lib/zstd/compress/zstd_double_fast.c | 143 +- + lib/zstd/compress/zstd_double_fast.h | 17 +- + lib/zstd/compress/zstd_fast.c | 596 +++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 732 +++--- + lib/zstd/compress/zstd_lazy.h | 138 +- + lib/zstd/compress/zstd_ldm.c | 21 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 497 ++-- + lib/zstd/compress/zstd_opt.h | 41 +- + lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 358 ++- + lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 9 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 6577 insertions(+), 3531 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index 113408eef6ec..f109d49f43f8 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..6320fedcf8a4 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 6 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). 
++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + /*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer * + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. 
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +369,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,7 +461,6 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer +@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +481,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +490,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +557,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". 
+- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +609,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +770,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. 
++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be reused multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. 
++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. 
+ * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
+ */ +@@ -1071,24 +1180,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1303,7 +1395,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. 
++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! 
++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsSize The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. 
+ */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. ++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). 
++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. 
+ * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. 
++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. 
++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
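The deprecation notes above all point to the same replacement pattern. A minimal sketch of that pattern for ZSTD_initCStream_srcSize(), built only from stable advanced-API calls (the helper name is illustrative):

#include <zstd.h>

/* Sketch: advanced-API equivalent of the deprecated
 * ZSTD_initCStream_srcSize(zcs, level, pledgedSrcSize).
 * ZSTD_CStream is the same object as ZSTD_CCtx. */
static size_t init_cstream_modern(ZSTD_CCtx* zcs, int level,
                                  unsigned long long pledgedSrcSize)
{
    size_t ret = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
    if (ZSTD_isError(ret)) return ret;
    ret = ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, level);
    if (ZSTD_isError(ret)) return ret;
    return ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
}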
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! 
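Likewise on the decompression side, a sketch of the replacement spelled out in the deprecation message for ZSTD_initDStream_usingDDict() (the helper name is illustrative):

#include <zstd.h>

/* Sketch: advanced-API equivalent of ZSTD_initDStream_usingDDict(zds, ddict).
 * ZSTD_DStream is the same object as ZSTD_DCtx; the ddict is only referenced,
 * so it must outlive the decompression session. */
static size_t init_dstream_modern(ZSTD_DCtx* zds, const ZSTD_DDict* ddict)
{
    size_t ret = ZSTD_DCtx_reset(zds, ZSTD_reset_session_only);
    if (ZSTD_isError(ret)) return ret;
    return ZSTD_DCtx_refDDict(zds, ddict);
}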
+@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. 
This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. 
++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. 
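A minimal sketch of the registration flow described above, assuming a userspace libzstd that ships the block-level sequence producer API and ZSTD_STATIC_LINKING_ONLY; the producer below simply declines every block, so with fallback enabled the internal parser takes over (a complete producer lives in contrib/externalSequenceProducer upstream):

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Sketch: an external sequence producer that always returns an error,
 * forcing block-by-block fallback to zstd's internal parser. */
static size_t declineAllBlocks(void* state,
                               ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                               const void* src, size_t srcSize,
                               const void* dict, size_t dictSize,
                               int compressionLevel, size_t windowSize)
{
    (void)state; (void)outSeqs; (void)outSeqsCapacity;
    (void)src; (void)srcSize; (void)dict; (void)dictSize;
    (void)compressionLevel; (void)windowSize;
    return ZSTD_SEQUENCE_PRODUCER_ERROR;
}

static size_t attach_producer(ZSTD_CCtx* cctx)
{
    ZSTD_registerSequenceProducer(cctx, NULL /* producer state */, declineAllBlocks);
    /* Without this, every block would fail once the producer errors out. */
    return ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
}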
+ * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
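For reference, the "normal streaming API" these deprecation messages point to reduces to a short loop; a minimal userspace sketch of one-shot streaming compression with ZSTD_compressStream2(), assuming dstCapacity >= ZSTD_compressBound(srcSize) so the frame always fits:

#include <zstd.h>

/* Sketch: compress one in-memory buffer with the recommended streaming API.
 * Returns the number of bytes written to dst, or a zstd error code. */
static size_t stream_compress_once(ZSTD_CCtx* cctx,
                                   void* dst, size_t dstCapacity,
                                   const void* src, size_t srcSize)
{
    ZSTD_inBuffer  input  = { src, srcSize, 0 };
    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
    size_t remaining;
    do {
        remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
        if (ZSTD_isError(remaining)) return remaining;
    } while (remaining != 0);   /* 0 means the frame is fully written to output */
    return output.pos;
}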
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. 
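A sketch of the "2-byte header" setup that the block-API deprecation note recommends instead of raw block functions, assuming ZSTD_STATIC_LINKING_ONLY so ZSTD_c_format is visible:

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Sketch: strip the frame down to what the deprecation note describes:
 * magicless format, no content size, no checksum, no dictionary ID. */
static size_t configure_minimal_frame(ZSTD_CCtx* cctx)
{
    size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless);
    if (ZSTD_isError(ret)) return ret;
    ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);
    if (ZSTD_isError(ret)) return ret;
    ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);
    if (ZSTD_isError(ret)) return ret;
    return ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, 0);
}

The matching decoder then needs ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless), which is the parameter form of the deprecated ZSTD_DCtx_setFormat() mentioned earlier.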
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..aa3487ec4b6a +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,149 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
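An illustration only of how the allocation shims above behave (these helpers are internal to lib/zstd; the example assumes a translation unit that already includes common/allocations.h, and my_alloc/my_free are placeholder callbacks):

/* Placeholder callbacks matching the ZSTD_customMem function pointer types. */
static void* my_alloc(void* opaque, size_t size) { (void)opaque; return ZSTD_malloc(size); }
static void  my_free (void* opaque, void* ptr)   { (void)opaque; ZSTD_free(ptr); }

static void custom_mem_example(void)
{
    ZSTD_customMem const cmem = { my_alloc, my_free, NULL /* opaque */ };
    void* const buf = ZSTD_customCalloc(64, cmem);  /* zeroed via customAlloc + ZSTD_memset */
    ZSTD_customFree(buf, cmem);
}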
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
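For orientation, a few concrete values for the bit-scan helpers above (illustration only, assuming a translation unit that includes common/bits.h; the asserts restate the documented behaviour: trailing/leading zero counts, with highbit32 as the index of the highest set bit):

static void bits_helpers_examples(void)
{
    assert(ZSTD_countTrailingZeros32(0x00000008u) == 3);   /* lowest set bit is bit 3 */
    assert(ZSTD_countLeadingZeros32 (0x00000008u) == 28);  /* 32-bit value, top set bit is bit 3 */
    assert(ZSTD_highbit32(0x00000008u) == 3);               /* equals 31 - countLeadingZeros32 */
    assert(ZSTD_countTrailingZeros64(1ULL << 40) == 40);
}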
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..6a13f1f0f1e8 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + /*-******************************************** + * bitStream decoding API (read backward) + **********************************************/ ++typedef size_t BitContainerType; + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); + MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. 
+ * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
+ * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); +@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . +- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..508ee25537bb 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. 
++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,9 +143,9 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ +@@ -179,6 +196,85 @@ + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without trigging ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
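The macro changes above (PREFETCH_L1/L2, PREFETCH_AREA, ZSTD_UNREACHABLE, and later RAWLOG/DEBUGLOG) all follow the same pattern: statement-like macros gain a do { ... } while (0) body so each expansion behaves as exactly one statement. A self-contained illustration of the pitfall this avoids, with made-up logging macros:

#include <stdio.h>

/* Hypothetical macros, only to show the idiom the patch applies. */
#define LOG_BRACES(msg)   { printf("%s", msg); fflush(stdout); }
#define LOG_DOWHILE(msg)  do { printf("%s", msg); fflush(stdout); } while (0)

static void example(int err)
{
    /* "if (err) LOG_BRACES(\"boom\"); else ..." fails to compile: the ';'
     * after the closing brace ends the if, leaving the else unattached. */
    if (err)
        LOG_DOWHILE("boom");   /* expands to a single statement */
    else
        printf("ok\n");
}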
++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..226ba3c57ec3 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) 
{ \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ + #endif + + +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..0410ca415b54 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) \ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); ++#define FORWARD_IF_ERROR(err, ...) 
\ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..2185a578617d 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. 
+-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY + +@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. 
++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..3a17e84f27bf 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..57462466e188 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). 
+- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +- ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + +-/* *** Advanced function *** */ + +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif +- +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index 1d9cc03924ca..2e91e7780c1f 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..f08638cced6c 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,6 +46,8 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif +@@ -65,7 +68,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +93,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..11da1233e890 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +@@ -420,13 +357,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream 
flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index f620cafca633..0d139727cd39 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame 
header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return 
(size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); 
++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } +@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) + static ZSTD_compressionParameters + ZSTD_clampCParams(ZSTD_compressionParameters cParams) + { +-# define CLAMP_TYPE(cParam, val, type) { \ +- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ +- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ +- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ +- } ++# define CLAMP_TYPE(cParam, val, type) \ ++ do { \ ++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ ++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ ++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ ++ } while (0) + # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); +@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + 
assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. 
So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + } + +@@ -1637,6 +1879,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + +- /* opt parser space */ +- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { +- DEBUGLOG(4, "reserving optimal parser space"); +- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); +- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); +- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); +- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); +- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); +- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); +- } +- + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + } + ++ /* opt parser space */ ++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { ++ DEBUGLOG(4, "reserving optimal parser space"); ++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); ++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws,
(MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm 
== ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? 
*/ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ 
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2647,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2658,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- 
const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* 
seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. 
++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, +@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); +@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } +@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const seqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; + +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. ++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + +@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); 
++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? 
don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, 
sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3481,45 +4027,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). 
++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. 
+@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. 
*/ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + } + ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } ++ } ++ ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ ++#endif + break; + + default: +@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + /* We only set the loaded table as valid if it contains all non-zero + * weights. Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + +@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, 
cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4811,7 +5450,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4906,6 +5545,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5625,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. 
+ */ +@@ -5000,7 +5645,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5842,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5885,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5245,7 +5903,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5920,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5931,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5952,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5291,9 +5961,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5976,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +6055,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +6077,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ +@@ -5437,9 +6106,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +6120,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +6149,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +6165,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +6203,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6239,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6250,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6313,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6370,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6390,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6400,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6408,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6422,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5742,21 +6461,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6500,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6514,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. 
++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. + * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6577,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6596,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. 
Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6627,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6636,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6651,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6668,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6682,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6700,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6728,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6877,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6912,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..53cb582a8d2b 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. 
++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. + This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,26 +145,33 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. */ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +490,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +723,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
+ */ +@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) + { +@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
*/ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..41f6521b27cd 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) 
return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe + return 0; + } + ++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; 
n<nbSeqs; n++) { ++ budget += sp[n].litLength * avgLitCost + avgSeqCost; ++ inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH); ++ if ( (budget > targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, +@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + { + const seqDef* const sstart = seqStorePtr->sequencesStart; + const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; +- +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); +- +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); ++ ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; ++ const seqDef* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..86bc3c2c23c7 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. ++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + + /* + * Aligned on 64 bytes. These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). 
+ * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. +- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..5ff54f17d92f 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. ++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + 
/* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..b7ddc714f13e 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -15,8 +16,12 @@ + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..b7a63ba4ce56 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
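The CDict table fills above write entries through ZSTD_writeTaggedIndex, and the dictMatchState search loops compare them with ZSTD_comparePackedTags, so most dictionary probes are rejected before any cold dictionary memory is read. Roughly, each slot packs the match index together with a short fingerprint of the hash; the sketch below assumes an 8-bit tag and uses its own simplified names, not the kernel definitions:

/* Illustrative packing/probing of tagged hash-table entries; the 8-bit tag
 * width and helper names here are assumptions, not the patched code. */
#include <stddef.h>
#include <stdint.h>
#include <assert.h>

#define TAG_BITS 8u
#define TAG_MASK ((1u << TAG_BITS) - 1u)

/* hashAndTag comes from hashing with (hashLog + TAG_BITS) bits: the high
 * bits select the bucket, the low TAG_BITS bits become the fingerprint. */
void writeTaggedIndex(uint32_t *table, size_t hashAndTag, uint32_t index)
{
    size_t const bucket = hashAndTag >> TAG_BITS;
    uint32_t const tag = (uint32_t)(hashAndTag & TAG_MASK);
    assert((index >> (32 - TAG_BITS)) == 0); /* index must leave room for the tag */
    table[bucket] = (index << TAG_BITS) | tag;
}

/* A probe compares fingerprints first; only on a tag hit does the caller
 * unpack the index (entry >> TAG_BITS) and touch the dictionary bytes,
 * which is what the dictTagsMatchL/S checks above gate on. */
int tagsMatch(uint32_t entry, size_t hashAndTag)
{
    return (entry & TAG_MASK) == (uint32_t)(hashAndTag & TAG_MASK);
}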
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. 
+ */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. 
++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { +@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd 
= 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..3e88d8a1a136 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
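The dictMatchState variants above also consult ms->prefetchCDictTables and, when set, walk the attached dictionary's hash and chain tables with PREFETCH_AREA before entering the search loop, so those tables are likely cache-resident by the time they are probed. A rough equivalent using compiler builtins (the helper below is an assumption for illustration, not the kernel's PREFETCH_AREA):

/* Sketch of prefetching a table region ahead of a search loop; assumes a
 * GCC/Clang-style __builtin_prefetch and a 64-byte cache line. */
#include <stddef.h>

#define CACHE_LINE 64

void prefetchArea(const void *p, size_t len)
{
    const char *cur = (const char *)p;
    const char *const end = cur + len;
    for (; cur < end; cur += CACHE_LINE)
        __builtin_prefetch(cur, 0 /* read */, 3 /* keep in all cache levels */);
}

/* e.g. before the loop, mirroring the hunks above:
 *   prefetchArea(dictHashTable, ((size_t)1 << dictCParams->hashLog) * sizeof(uint32_t));
 */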
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 
ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. 
++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. 
++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every contex reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. 
+- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. + */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, +@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, 
mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict( + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..22c9201f4e63 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,98 +23,175 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + + void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* 
ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..07f3bc6437ce 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { +@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..a87b66ac8d24 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, +@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, +@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* 
nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t + ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip)); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. 
*/ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { ++ ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", ++ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + +@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", ++ inr-istart, cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ++ seqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. 
+ * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState( + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict( + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..ac1b743d27cd 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,30 +15,40 @@ + + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ + void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( ++size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict( + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++ ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + + #endif /* ZSTD_OPT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE 
const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. 
*/ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). +- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
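As an illustrative aside on the X1 table-building code touched above (the HUF_DEltX1_set4 hunk): the 2-byte entry is replicated across a 64-bit word so four adjacent DTable cells can be written with a single store. The following is a minimal standalone C sketch of that trick, not part of the vendored patch; the helper name replicate_entry_le is hypothetical.

#include <assert.h>
#include <stdint.h>
#include <string.h>

/* Mirror of the HUF_DEltX1_set4() idea: build one 16-bit (nbBits, byte) cell,
 * then multiply by 0x0001000100010001 to copy it into all four 16-bit lanes. */
static uint64_t replicate_entry_le(uint8_t symbol, uint8_t nbBits)
{
    uint64_t d4 = (uint64_t)(((unsigned)symbol << 8) + nbBits); /* little-endian cell layout */
    assert(d4 < (1u << 16));                                    /* the value must fit one cell */
    return d4 * 0x0001000100010001ULL;                          /* replicate into four cells */
}

int main(void)
{
    uint16_t cells[4];
    uint64_t const d4 = replicate_entry_le(/*symbol=*/0x41, /*nbBits=*/5);
    memcpy(cells, &d4, sizeof(cells));          /* one 8-byte store fills 4 table cells */
    assert(cells[0] == cells[3] && cells[0] == ((0x41u << 8) | 5u));
    return 0;
}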
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void 
HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. 
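The reload macro above relies on a sentinel 1 bit instead of a separate bits-consumed counter: the container is loaded with `| 1`, symbols shift it left, and counting trailing zeros later recovers exactly how much was consumed. A minimal standalone sketch of that invariant follows (not part of the vendored patch; it assumes a GCC/Clang __builtin_ctzll for the trailing-zero count).

#include <assert.h>
#include <stdint.h>

static int ctz64(uint64_t x) { return __builtin_ctzll(x); }  /* assumes GCC/Clang */

int main(void)
{
    uint64_t bits = 0x0123456789ABCDEFULL | 1;  /* freshly loaded container, sentinel bit set */
    int consumed = 0;

    /* pretend to decode three symbols of 11, 7 and 3 bits */
    int const widths[3] = { 11, 7, 3 };
    for (int i = 0; i < 3; i++) { bits <<= widths[i]; consumed += widths[i]; }

    /* refill step: trailing zeros equal the total bits shifted out so far */
    {   int const ctz     = ctz64(bits);
        int const nbBits  = ctz & 7;    /* leftover bits to re-apply after the byte rewind */
        int const nbBytes = ctz >> 3;   /* whole bytes to step the input pointer back by */
        assert(ctz == consumed);
        assert(nbBytes * 8 + nbBits == consumed);
    }
    return 0;
}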
++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
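For orientation, the 4-stream framing that the finishing loop above walks (segmentSize = (dstSize+3)/4, jump table of three LE16 lengths with the fourth length implied) can be summarised in a small standalone sketch. It is illustrative only, with hypothetical helper names, and applies the same strict minima the patch enforces (srcSize >= 10, dstSize >= 6).

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint16_t read_le16(const uint8_t *p) { return (uint16_t)(p[0] | (p[1] << 8)); }

/* Split a 4X-compressed block into its four input streams and four output segments. */
static int split_4x(const uint8_t *src, size_t srcSize, size_t dstSize,
                    size_t streamSize[4], size_t segmentSize[4])
{
    size_t const seg = (dstSize + 3) / 4;        /* first three output segments */
    size_t l1, l2, l3, sum;
    if (srcSize < 10 || dstSize < 6) return -1;  /* jump table + 1 byte per stream; 4-way split */
    l1 = read_le16(src); l2 = read_le16(src + 2); l3 = read_le16(src + 4);
    sum = 6 + l1 + l2 + l3;
    if (sum > srcSize) return -1;                /* corrupt jump table */
    streamSize[0] = l1; streamSize[1] = l2; streamSize[2] = l3;
    streamSize[3] = srcSize - sum;               /* last stream size is implied */
    segmentSize[0] = segmentSize[1] = segmentSize[2] = seg;
    segmentSize[3] = dstSize - 3 * seg;          /* last segment takes the remainder */
    return 0;
}

int main(void)
{
    uint8_t const hdr[16] = { 3, 0, 3, 0, 3, 0 }; /* three LE16 stream lengths of 3 bytes each */
    size_t ss[4], gs[4];
    assert(split_4x(hdr, sizeof(hdr), 14, ss, gs) == 0);
    assert(ss[3] == sizeof(hdr) - 6 - 9 && gs[3] == 14 - 3 * 4);
    return 0;
}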
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
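The olimit computation above hoists all bounds checks out of the unrolled body: per iteration each stream performs 5 lookups, emitting at most 10 output bytes (X2 entries are up to 2 bytes) and consuming fewer than 7 input bytes, so one comparison of op[3] against a precomputed limit covers every per-symbol bound. A minimal standalone sketch of that accounting follows; it is illustrative only and the helper name is hypothetical.

#include <assert.h>
#include <stddef.h>

#define MIN(a, b) ((a) < (b) ? (a) : (b))

/* How many unrolled iterations are safe before bounds must be rechecked:
 * input side uses the distance from ip[0] to the input floor (every stream
 * sits above ip[0]); output side uses each stream's remaining segment. */
static size_t safe_iterations(size_t inAvail, const size_t outAvail[4])
{
    size_t iters = inAvail / 7;                   /* < 7 bytes consumed per stream per iteration */
    for (int s = 0; s < 4; s++)
        iters = MIN(iters, outAvail[s] / 10);     /* <= 10 bytes produced per stream per iteration */
    return iters;
}

int main(void)
{
    size_t const out[4] = { 1000, 1000, 50, 1000 };
    assert(safe_iterations(700, out) == 5);       /* limited by the third output segment */
    return 0;
}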
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..c9cbc45f6ed9 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* 
note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
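The skippable-frame handling above (readSkippableFrameSize and ZSTD_readSkippableFrame) follows the frame layout from the zstd format: a 4-byte LE magic in the 0x184D2A50..0x184D2A5F range, a 4-byte LE content size, then that many payload bytes, for a total of content size plus an 8-byte header. A minimal standalone sketch of that parse follows; it is illustrative only and is not the zstd API.

#include <assert.h>
#include <stddef.h>
#include <stdint.h>

static uint32_t rd_le32(const uint8_t *p)
{ return (uint32_t)p[0] | ((uint32_t)p[1] << 8) | ((uint32_t)p[2] << 16) | ((uint32_t)p[3] << 24); }

/* Return the total size of a skippable frame at src, or 0 if src does not
 * start with a valid skippable frame that fits inside srcSize. */
static size_t skippable_frame_size(const uint8_t *src, size_t srcSize)
{
    uint32_t magic, contentSize;
    if (srcSize < 8) return 0;                              /* need the full 8-byte header */
    magic = rd_le32(src);
    if ((magic & 0xFFFFFFF0u) != 0x184D2A50u) return 0;     /* not a skippable-frame magic */
    contentSize = rd_le32(src + 4);
    if ((size_t)contentSize + 8 < contentSize) return 0;    /* 32-bit size_t overflow guard */
    if ((size_t)contentSize + 8 > srcSize) return 0;        /* frame overruns the buffer */
    return (size_t)contentSize + 8;
}

int main(void)
{
    uint8_t const frame[12] = { 0x53, 0x2A, 0x4D, 0x18, 4, 0, 0, 0, 'd', 'a', 't', 'a' };
    assert(skippable_frame_size(frame, sizeof(frame)) == 12);
    return 0;
}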
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ 
totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : +@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. 
+- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1871,7 @@ size_t 
ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. ++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize 
= ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. 
*/ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..9fe9a12c8a2c 100644 +--- 
a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. 
We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. ++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; +@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +-
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); ++ const ZSTD_longOffset_e isLongOffset); + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- 
const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1982,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..0f02526be774 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; ++ int isFrameDecompression; + #if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index 04e1b5c01d9b..8ecf43226af2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index f4ed952ed485..7d31518e9d5a 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.47.0.rc0 + diff --git a/sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch b/sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch new file mode 100644 index 0000000..392f8fc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11/0001-eevdf-next.patch @@ -0,0 +1,4374 @@ +From 7c2f0545fa986157158c76300a43ab48802d25d3 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 4 Oct 2024 18:04:35 +0200 +Subject: [PATCH] eevdf-next + +Signed-off-by: Peter Jung +--- + Documentation/scheduler/sched-deadline.rst | 14 +- + drivers/cpufreq/cppc_cpufreq.c | 6 +- + fs/bcachefs/six.c | 2 +- + fs/select.c | 2 +- + include/linux/ioprio.h | 2 +- + include/linux/sched.h | 28 +- + include/linux/sched/deadline.h | 14 +- + include/linux/sched/prio.h | 1 + + include/linux/sched/rt.h | 33 +- + include/uapi/linux/sched/types.h | 6 +- + kernel/freezer.c | 2 +- + kernel/locking/rtmutex.c | 4 +- + kernel/locking/rwsem.c | 4 +- + kernel/locking/ww_mutex.h | 2 +- + kernel/sched/core.c | 248 ++++--- + kernel/sched/cpufreq_schedutil.c | 6 +- + kernel/sched/deadline.c | 465 ++++++++++--- + kernel/sched/debug.c | 198 +++++- + kernel/sched/fair.c | 750 ++++++++++++++++----- + kernel/sched/features.h | 30 +- + kernel/sched/idle.c | 23 +- + kernel/sched/rt.c | 261 +++---- + kernel/sched/sched.h | 101 ++- + kernel/sched/stop_task.c | 18 +- + kernel/sched/syscalls.c | 132 +--- + kernel/sched/topology.c | 8 + + kernel/time/hrtimer.c | 6 +- + kernel/trace/trace_sched_wakeup.c | 2 +- + mm/page-writeback.c | 4 +- + mm/page_alloc.c | 2 +- + 30 files changed, 1663 insertions(+), 711 deletions(-) + +diff --git a/Documentation/scheduler/sched-deadline.rst b/Documentation/scheduler/sched-deadline.rst +index 9fe4846079bb..22838ed8e13a 100644 +--- a/Documentation/scheduler/sched-deadline.rst ++++ b/Documentation/scheduler/sched-deadline.rst +@@ -749,21 +749,19 @@ Appendix A. Test suite + of the command line options. Please refer to rt-app documentation for more + details (`/doc/*.json`). + +- The second testing application is a modification of schedtool, called +- schedtool-dl, which can be used to setup SCHED_DEADLINE parameters for a +- certain pid/application. schedtool-dl is available at: +- https://github.com/scheduler-tools/schedtool-dl.git. ++ The second testing application is done using chrt which has support ++ for SCHED_DEADLINE. + + The usage is straightforward:: + +- # schedtool -E -t 10000000:100000000 -e ./my_cpuhog_app ++ # chrt -d -T 10000000 -D 100000000 0 ./my_cpuhog_app + + With this, my_cpuhog_app is put to run inside a SCHED_DEADLINE reservation +- of 10ms every 100ms (note that parameters are expressed in microseconds). +- You can also use schedtool to create a reservation for an already running ++ of 10ms every 100ms (note that parameters are expressed in nanoseconds). ++ You can also use chrt to create a reservation for an already running + application, given that you know its pid:: + +- # schedtool -E -t 10000000:100000000 my_app_pid ++ # chrt -d -T 10000000 -D 100000000 -p 0 my_app_pid + + Appendix B. 
Minimal main() + ========================== +diff --git a/drivers/cpufreq/cppc_cpufreq.c b/drivers/cpufreq/cppc_cpufreq.c +index bafa32dd375d..1a5ad184d28f 100644 +--- a/drivers/cpufreq/cppc_cpufreq.c ++++ b/drivers/cpufreq/cppc_cpufreq.c +@@ -224,9 +224,9 @@ static void __init cppc_freq_invariance_init(void) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. + */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + int ret; + +diff --git a/fs/bcachefs/six.c b/fs/bcachefs/six.c +index 3a494c5d1247..9cbd3c14c94f 100644 +--- a/fs/bcachefs/six.c ++++ b/fs/bcachefs/six.c +@@ -335,7 +335,7 @@ static inline bool six_owner_running(struct six_lock *lock) + */ + rcu_read_lock(); + struct task_struct *owner = READ_ONCE(lock->owner); +- bool ret = owner ? owner_on_cpu(owner) : !rt_task(current); ++ bool ret = owner ? owner_on_cpu(owner) : !rt_or_dl_task(current); + rcu_read_unlock(); + + return ret; +diff --git a/fs/select.c b/fs/select.c +index bc185d111436..bc5762b03945 100644 +--- a/fs/select.c ++++ b/fs/select.c +@@ -82,7 +82,7 @@ u64 select_estimate_accuracy(struct timespec64 *tv) + * Realtime tasks get a slack of 0 for obvious reasons. + */ + +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + return 0; + + ktime_get_ts64(&now); +diff --git a/include/linux/ioprio.h b/include/linux/ioprio.h +index db1249cd9692..b25377b6ea98 100644 +--- a/include/linux/ioprio.h ++++ b/include/linux/ioprio.h +@@ -40,7 +40,7 @@ static inline int task_nice_ioclass(struct task_struct *task) + { + if (task->policy == SCHED_IDLE) + return IOPRIO_CLASS_IDLE; +- else if (task_is_realtime(task)) ++ else if (rt_or_dl_task_policy(task)) + return IOPRIO_CLASS_RT; + else + return IOPRIO_CLASS_BE; +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f8d150343d42..57cf27a3045c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -149,8 +149,9 @@ struct user_event_mm; + * Special states are those that do not use the normal wait-loop pattern. See + * the comment with set_special_state(). + */ +-#define is_special_task_state(state) \ +- ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | TASK_DEAD)) ++#define is_special_task_state(state) \ ++ ((state) & (__TASK_STOPPED | __TASK_TRACED | TASK_PARKED | \ ++ TASK_DEAD | TASK_FROZEN)) + + #ifdef CONFIG_DEBUG_ATOMIC_SLEEP + # define debug_normal_state_change(state_value) \ +@@ -541,9 +542,14 @@ struct sched_entity { + struct rb_node run_node; + u64 deadline; + u64 min_vruntime; ++ u64 min_slice; + + struct list_head group_node; +- unsigned int on_rq; ++ unsigned char on_rq; ++ unsigned char sched_delayed; ++ unsigned char rel_deadline; ++ unsigned char custom_slice; ++ /* hole */ + + u64 exec_start; + u64 sum_exec_runtime; +@@ -639,12 +645,26 @@ struct sched_dl_entity { + * + * @dl_overrun tells if the task asked to be informed about runtime + * overruns. ++ * ++ * @dl_server tells if this is a server entity. ++ * ++ * @dl_defer tells if this is a deferred or regular server. For ++ * now only defer server exists. ++ * ++ * @dl_defer_armed tells if the deferrable server is waiting ++ * for the replenishment timer to activate it. ++ * ++ * @dl_defer_running tells if the deferrable server is actually ++ * running, skipping the defer phase. 
+ */ + unsigned int dl_throttled : 1; + unsigned int dl_yielded : 1; + unsigned int dl_non_contending : 1; + unsigned int dl_overrun : 1; + unsigned int dl_server : 1; ++ unsigned int dl_defer : 1; ++ unsigned int dl_defer_armed : 1; ++ unsigned int dl_defer_running : 1; + + /* + * Bandwidth enforcement timer. Each -deadline task has its +@@ -672,7 +692,7 @@ struct sched_dl_entity { + */ + struct rq *rq; + dl_server_has_tasks_f server_has_tasks; +- dl_server_pick_f server_pick; ++ dl_server_pick_f server_pick_task; + + #ifdef CONFIG_RT_MUTEXES + /* +diff --git a/include/linux/sched/deadline.h b/include/linux/sched/deadline.h +index df3aca89d4f5..3a912ab42bb5 100644 +--- a/include/linux/sched/deadline.h ++++ b/include/linux/sched/deadline.h +@@ -10,16 +10,16 @@ + + #include + +-#define MAX_DL_PRIO 0 +- +-static inline int dl_prio(int prio) ++static inline bool dl_prio(int prio) + { +- if (unlikely(prio < MAX_DL_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_DL_PRIO); + } + +-static inline int dl_task(struct task_struct *p) ++/* ++ * Returns true if a task has a priority that belongs to DL class. PI-boosted ++ * tasks will return true. Use dl_policy() to ignore PI-boosted tasks. ++ */ ++static inline bool dl_task(struct task_struct *p) + { + return dl_prio(p->prio); + } +diff --git a/include/linux/sched/prio.h b/include/linux/sched/prio.h +index ab83d85e1183..6ab43b4f72f9 100644 +--- a/include/linux/sched/prio.h ++++ b/include/linux/sched/prio.h +@@ -14,6 +14,7 @@ + */ + + #define MAX_RT_PRIO 100 ++#define MAX_DL_PRIO 0 + + #define MAX_PRIO (MAX_RT_PRIO + NICE_WIDTH) + #define DEFAULT_PRIO (MAX_RT_PRIO + NICE_WIDTH / 2) +diff --git a/include/linux/sched/rt.h b/include/linux/sched/rt.h +index b2b9e6eb9683..4e3338103654 100644 +--- a/include/linux/sched/rt.h ++++ b/include/linux/sched/rt.h +@@ -6,19 +6,40 @@ + + struct task_struct; + +-static inline int rt_prio(int prio) ++static inline bool rt_prio(int prio) + { +- if (unlikely(prio < MAX_RT_PRIO)) +- return 1; +- return 0; ++ return unlikely(prio < MAX_RT_PRIO && prio >= MAX_DL_PRIO); + } + +-static inline int rt_task(struct task_struct *p) ++static inline bool rt_or_dl_prio(int prio) ++{ ++ return unlikely(prio < MAX_RT_PRIO); ++} ++ ++/* ++ * Returns true if a task has a priority that belongs to RT class. PI-boosted ++ * tasks will return true. Use rt_policy() to ignore PI-boosted tasks. ++ */ ++static inline bool rt_task(struct task_struct *p) + { + return rt_prio(p->prio); + } + +-static inline bool task_is_realtime(struct task_struct *tsk) ++/* ++ * Returns true if a task has a priority that belongs to RT or DL classes. ++ * PI-boosted tasks will return true. Use rt_or_dl_task_policy() to ignore ++ * PI-boosted tasks. ++ */ ++static inline bool rt_or_dl_task(struct task_struct *p) ++{ ++ return rt_or_dl_prio(p->prio); ++} ++ ++/* ++ * Returns true if a task has a policy that belongs to RT or DL classes. ++ * PI-boosted tasks will return false. 
++ */ ++static inline bool rt_or_dl_task_policy(struct task_struct *tsk) + { + int policy = tsk->policy; + +diff --git a/include/uapi/linux/sched/types.h b/include/uapi/linux/sched/types.h +index 90662385689b..bf6e9ae031c1 100644 +--- a/include/uapi/linux/sched/types.h ++++ b/include/uapi/linux/sched/types.h +@@ -58,9 +58,9 @@ + * + * This is reflected by the following fields of the sched_attr structure: + * +- * @sched_deadline representative of the task's deadline +- * @sched_runtime representative of the task's runtime +- * @sched_period representative of the task's period ++ * @sched_deadline representative of the task's deadline in nanoseconds ++ * @sched_runtime representative of the task's runtime in nanoseconds ++ * @sched_period representative of the task's period in nanoseconds + * + * Given this task model, there are a multiplicity of scheduling algorithms + * and policies, that can be used to ensure all the tasks will make their +diff --git a/kernel/freezer.c b/kernel/freezer.c +index f57aaf96b829..44bbd7dbd2c8 100644 +--- a/kernel/freezer.c ++++ b/kernel/freezer.c +@@ -72,7 +72,7 @@ bool __refrigerator(bool check_kthr_stop) + bool freeze; + + raw_spin_lock_irq(¤t->pi_lock); +- set_current_state(TASK_FROZEN); ++ WRITE_ONCE(current->__state, TASK_FROZEN); + /* unstale saved_state so that __thaw_task() will wake us up */ + current->saved_state = TASK_RUNNING; + raw_spin_unlock_irq(¤t->pi_lock); +diff --git a/kernel/locking/rtmutex.c b/kernel/locking/rtmutex.c +index fba1229f1de6..ebebd0eec7f6 100644 +--- a/kernel/locking/rtmutex.c ++++ b/kernel/locking/rtmutex.c +@@ -347,7 +347,7 @@ static __always_inline int __waiter_prio(struct task_struct *task) + { + int prio = task->prio; + +- if (!rt_prio(prio)) ++ if (!rt_or_dl_prio(prio)) + return DEFAULT_PRIO; + + return prio; +@@ -435,7 +435,7 @@ static inline bool rt_mutex_steal(struct rt_mutex_waiter *waiter, + * Note that RT tasks are excluded from same priority (lateral) + * steals to prevent the introduction of an unbounded latency. + */ +- if (rt_prio(waiter->tree.prio) || dl_prio(waiter->tree.prio)) ++ if (rt_or_dl_prio(waiter->tree.prio)) + return false; + + return rt_waiter_node_equal(&waiter->tree, &top_waiter->tree); +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 3277df47ab3c..299b793d55e1 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -631,7 +631,7 @@ static inline bool rwsem_try_write_lock(struct rw_semaphore *sem, + * if it is an RT task or wait in the wait queue + * for too long. 
+ */ +- if (has_handoff || (!rt_task(waiter->task) && ++ if (has_handoff || (!rt_or_dl_task(waiter->task) && + !time_after(jiffies, waiter->timeout))) + return false; + +@@ -916,7 +916,7 @@ static bool rwsem_optimistic_spin(struct rw_semaphore *sem) + if (owner_state != OWNER_WRITER) { + if (need_resched()) + break; +- if (rt_task(current) && ++ if (rt_or_dl_task(current) && + (prev_owner_state != OWNER_WRITER)) + break; + } +diff --git a/kernel/locking/ww_mutex.h b/kernel/locking/ww_mutex.h +index 3ad2cc4823e5..76d204b7d29c 100644 +--- a/kernel/locking/ww_mutex.h ++++ b/kernel/locking/ww_mutex.h +@@ -237,7 +237,7 @@ __ww_ctx_less(struct ww_acquire_ctx *a, struct ww_acquire_ctx *b) + int a_prio = a->task->prio; + int b_prio = b->task->prio; + +- if (rt_prio(a_prio) || rt_prio(b_prio)) { ++ if (rt_or_dl_prio(a_prio) || rt_or_dl_prio(b_prio)) { + + if (a_prio > b_prio) + return true; +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f3951e4a55e5..b4c5d83e54d4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -163,7 +163,10 @@ static inline int __task_prio(const struct task_struct *p) + if (p->sched_class == &stop_sched_class) /* trumps deadline */ + return -2; + +- if (rt_prio(p->prio)) /* includes deadline */ ++ if (p->dl_server) ++ return -1; /* deadline */ ++ ++ if (rt_or_dl_prio(p->prio)) + return p->prio; /* [-1, 99] */ + + if (p->sched_class == &idle_sched_class) +@@ -192,8 +195,24 @@ static inline bool prio_less(const struct task_struct *a, + if (-pb < -pa) + return false; + +- if (pa == -1) /* dl_prio() doesn't work because of stop_class above */ +- return !dl_time_before(a->dl.deadline, b->dl.deadline); ++ if (pa == -1) { /* dl_prio() doesn't work because of stop_class above */ ++ const struct sched_dl_entity *a_dl, *b_dl; ++ ++ a_dl = &a->dl; ++ /* ++ * Since,'a' and 'b' can be CFS tasks served by DL server, ++ * __task_prio() can return -1 (for DL) even for those. In that ++ * case, get to the dl_server's DL entity. ++ */ ++ if (a->dl_server) ++ a_dl = a->dl_server; ++ ++ b_dl = &b->dl; ++ if (b->dl_server) ++ b_dl = b->dl_server; ++ ++ return !dl_time_before(a_dl->deadline, b_dl->deadline); ++ } + + if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ + return cfs_prio_less(a, b, in_fi); +@@ -240,6 +259,9 @@ static inline int rb_sched_core_cmp(const void *key, const struct rb_node *node) + + void sched_core_enqueue(struct rq *rq, struct task_struct *p) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (!p->core_cookie) +@@ -250,6 +272,9 @@ void sched_core_enqueue(struct rq *rq, struct task_struct *p) + + void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags) + { ++ if (p->se.sched_delayed) ++ return; ++ + rq->core->core_task_seq++; + + if (sched_core_enqueued(p)) { +@@ -1269,7 +1294,7 @@ bool sched_can_stop_tick(struct rq *rq) + * dequeued by migrating while the constrained task continues to run. + * E.g. going from 2->1 without going through pick_next_task(). 
+ */ +- if (sched_feat(HZ_BW) && __need_bw_check(rq, rq->curr)) { ++ if (__need_bw_check(rq, rq->curr)) { + if (cfs_task_bw_constrained(rq->curr)) + return false; + } +@@ -1672,6 +1697,9 @@ static inline void uclamp_rq_inc(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_inc_id(rq, p, clamp_id); + +@@ -1696,6 +1724,9 @@ static inline void uclamp_rq_dec(struct rq *rq, struct task_struct *p) + if (unlikely(!p->sched_class->uclamp_enabled)) + return; + ++ if (p->se.sched_delayed) ++ return; ++ + for_each_clamp_id(clamp_id) + uclamp_rq_dec_id(rq, p, clamp_id); + } +@@ -1975,14 +2006,21 @@ void enqueue_task(struct rq *rq, struct task_struct *p, int flags) + psi_enqueue(p, (flags & ENQUEUE_WAKEUP) && !(flags & ENQUEUE_MIGRATED)); + } + +- uclamp_rq_inc(rq, p); + p->sched_class->enqueue_task(rq, p, flags); ++ /* ++ * Must be after ->enqueue_task() because ENQUEUE_DELAYED can clear ++ * ->sched_delayed. ++ */ ++ uclamp_rq_inc(rq, p); + + if (sched_core_enabled(rq)) + sched_core_enqueue(rq, p); + } + +-void dequeue_task(struct rq *rq, struct task_struct *p, int flags) ++/* ++ * Must only return false when DEQUEUE_SLEEP. ++ */ ++inline bool dequeue_task(struct rq *rq, struct task_struct *p, int flags) + { + if (sched_core_enabled(rq)) + sched_core_dequeue(rq, p, flags); +@@ -1995,8 +2033,12 @@ void dequeue_task(struct rq *rq, struct task_struct *p, int flags) + psi_dequeue(p, flags & DEQUEUE_SLEEP); + } + ++ /* ++ * Must be before ->dequeue_task() because ->dequeue_task() can 'fail' ++ * and mark the task ->sched_delayed. ++ */ + uclamp_rq_dec(rq, p); +- p->sched_class->dequeue_task(rq, p, flags); ++ return p->sched_class->dequeue_task(rq, p, flags); + } + + void activate_task(struct rq *rq, struct task_struct *p, int flags) +@@ -2014,12 +2056,25 @@ void activate_task(struct rq *rq, struct task_struct *p, int flags) + + void deactivate_task(struct rq *rq, struct task_struct *p, int flags) + { +- WRITE_ONCE(p->on_rq, (flags & DEQUEUE_SLEEP) ? 0 : TASK_ON_RQ_MIGRATING); ++ SCHED_WARN_ON(flags & DEQUEUE_SLEEP); ++ ++ WRITE_ONCE(p->on_rq, TASK_ON_RQ_MIGRATING); + ASSERT_EXCLUSIVE_WRITER(p->on_rq); + ++ /* ++ * Code explicitly relies on TASK_ON_RQ_MIGRATING begin set *before* ++ * dequeue_task() and cleared *after* enqueue_task(). ++ */ ++ + dequeue_task(rq, p, flags); + } + ++static void block_task(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (dequeue_task(rq, p, DEQUEUE_SLEEP | flags)) ++ __block_task(rq, p); ++} ++ + /** + * task_curr - is this task currently executing on a CPU? + * @p: the task in question. +@@ -2233,6 +2288,12 @@ void migrate_disable(void) + struct task_struct *p = current; + + if (p->migration_disabled) { ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ *Warn about overflow half-way through the range. ++ */ ++ WARN_ON_ONCE((s16)p->migration_disabled < 0); ++#endif + p->migration_disabled++; + return; + } +@@ -2251,14 +2312,20 @@ void migrate_enable(void) + .flags = SCA_MIGRATE_ENABLE, + }; + ++#ifdef CONFIG_DEBUG_PREEMPT ++ /* ++ * Check both overflow from migrate_disable() and superfluous ++ * migrate_enable(). 
++ */ ++ if (WARN_ON_ONCE((s16)p->migration_disabled <= 0)) ++ return; ++#endif ++ + if (p->migration_disabled > 1) { + p->migration_disabled--; + return; + } + +- if (WARN_ON_ONCE(!p->migration_disabled)) +- return; +- + /* + * Ensure stop_task runs either before or after this, and that + * __set_cpus_allowed_ptr(SCA_MIGRATE_ENABLE) doesn't schedule(). +@@ -3607,8 +3674,6 @@ ttwu_do_activate(struct rq *rq, struct task_struct *p, int wake_flags, + rq->idle_stamp = 0; + } + #endif +- +- p->dl_server = NULL; + } + + /* +@@ -3644,12 +3709,14 @@ static int ttwu_runnable(struct task_struct *p, int wake_flags) + + rq = __task_rq_lock(p, &rf); + if (task_on_rq_queued(p)) { ++ update_rq_clock(rq); ++ if (p->se.sched_delayed) ++ enqueue_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_DELAYED); + if (!task_on_cpu(rq, p)) { + /* + * When on_rq && !on_cpu the task is preempted, see if + * it should preempt the task that is current now. + */ +- update_rq_clock(rq); + wakeup_preempt(rq, p, wake_flags); + } + ttwu_do_wakeup(p); +@@ -4029,11 +4096,16 @@ int try_to_wake_up(struct task_struct *p, unsigned int state, int wake_flags) + * case the whole 'p->on_rq && ttwu_runnable()' case below + * without taking any locks. + * ++ * Specifically, given current runs ttwu() we must be before ++ * schedule()'s block_task(), as such this must not observe ++ * sched_delayed. ++ * + * In particular: + * - we rely on Program-Order guarantees for all the ordering, + * - we're serialized against set_special_state() by virtue of + * it disabling IRQs (this allows not taking ->pi_lock). + */ ++ SCHED_WARN_ON(p->se.sched_delayed); + if (!ttwu_state_match(p, state, &success)) + goto out; + +@@ -4322,9 +4394,11 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.nr_migrations = 0; + p->se.vruntime = 0; + p->se.vlag = 0; +- p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); + ++ /* A delayed task cannot be in clone(). */ ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + p->se.cfs_rq = NULL; + #endif +@@ -4572,6 +4646,8 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) + + p->prio = p->normal_prio = p->static_prio; + set_load_weight(p, false); ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; + + /* + * We don't need the reset flag anymore after the fork. 
It has +@@ -4686,7 +4762,7 @@ void wake_up_new_task(struct task_struct *p) + update_rq_clock(rq); + post_init_entity_util_avg(p); + +- activate_task(rq, p, ENQUEUE_NOCLOCK); ++ activate_task(rq, p, ENQUEUE_NOCLOCK | ENQUEUE_INITIAL); + trace_sched_wakeup_new(p); + wakeup_preempt(rq, p, WF_FORK); + #ifdef CONFIG_SMP +@@ -5769,8 +5845,8 @@ static inline void schedule_debug(struct task_struct *prev, bool preempt) + schedstat_inc(this_rq()->sched_count); + } + +-static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, +- struct rq_flags *rf) ++static void prev_balance(struct rq *rq, struct task_struct *prev, ++ struct rq_flags *rf) + { + #ifdef CONFIG_SMP + const struct sched_class *class; +@@ -5787,8 +5863,6 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, + break; + } + #endif +- +- put_prev_task(rq, prev); + } + + /* +@@ -5800,6 +5874,8 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + /* + * Optimization: we know that if all tasks are in the fair class we can + * call that function directly, but only if the @prev task wasn't of a +@@ -5815,35 +5891,28 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + /* Assume the next prioritized class is idle_sched_class */ + if (!p) { +- put_prev_task(rq, prev); +- p = pick_next_task_idle(rq); ++ p = pick_task_idle(rq); ++ put_prev_set_next_task(rq, prev, p); + } + +- /* +- * This is the fast path; it cannot be a DL server pick; +- * therefore even if @p == @prev, ->dl_server must be NULL. +- */ +- if (p->dl_server) +- p->dl_server = NULL; +- + return p; + } + + restart: +- put_prev_task_balance(rq, prev, rf); +- +- /* +- * We've updated @prev and no longer need the server link, clear it. +- * Must be done before ->pick_next_task() because that can (re)set +- * ->dl_server. +- */ +- if (prev->dl_server) +- prev->dl_server = NULL; ++ prev_balance(rq, prev, rf); + + for_each_class(class) { +- p = class->pick_next_task(rq); +- if (p) +- return p; ++ if (class->pick_next_task) { ++ p = class->pick_next_task(rq, prev); ++ if (p) ++ return p; ++ } else { ++ p = class->pick_task(rq); ++ if (p) { ++ put_prev_set_next_task(rq, prev, p); ++ return p; ++ } ++ } + } + + BUG(); /* The idle class should always have a runnable task. */ +@@ -5873,6 +5942,8 @@ static inline struct task_struct *pick_task(struct rq *rq) + const struct sched_class *class; + struct task_struct *p; + ++ rq->dl_server = NULL; ++ + for_each_class(class) { + p = class->pick_task(rq); + if (p) +@@ -5911,6 +5982,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * another cpu during offline. 
+ */ + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + return __pick_next_task(rq, prev, rf); + } + +@@ -5929,16 +6001,13 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + WRITE_ONCE(rq->core_sched_seq, rq->core->core_pick_seq); + + next = rq->core_pick; +- if (next != prev) { +- put_prev_task(rq, prev); +- set_next_task(rq, next); +- } +- ++ rq->dl_server = rq->core_dl_server; + rq->core_pick = NULL; +- goto out; ++ rq->core_dl_server = NULL; ++ goto out_set_next; + } + +- put_prev_task_balance(rq, prev, rf); ++ prev_balance(rq, prev, rf); + + smt_mask = cpu_smt_mask(cpu); + need_sync = !!rq->core->core_cookie; +@@ -5979,6 +6048,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + next = pick_task(rq); + if (!next->core_cookie) { + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + /* + * For robustness, update the min_vruntime_fi for + * unconstrained picks as well. +@@ -6006,7 +6076,9 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + if (i != cpu && (rq_i != rq->core || !core_clock_updated)) + update_rq_clock(rq_i); + +- p = rq_i->core_pick = pick_task(rq_i); ++ rq_i->core_pick = p = pick_task(rq_i); ++ rq_i->core_dl_server = rq_i->dl_server; ++ + if (!max || prio_less(max, p, fi_before)) + max = p; + } +@@ -6030,6 +6102,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + rq_i->core_pick = p; ++ rq_i->core_dl_server = NULL; + + if (p == rq_i->idle) { + if (rq_i->nr_running) { +@@ -6090,6 +6163,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (i == cpu) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6098,6 +6172,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + + if (rq_i->curr == rq_i->core_pick) { + rq_i->core_pick = NULL; ++ rq_i->core_dl_server = NULL; + continue; + } + +@@ -6105,8 +6180,7 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + } + + out_set_next: +- set_next_task(rq, next); +-out: ++ put_prev_set_next_task(rq, prev, next); + if (rq->core->core_forceidle_count && next == rq->idle) + queue_core_balance(rq); + +@@ -6342,19 +6416,12 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * Constants for the sched_mode argument of __schedule(). + * + * The mode argument allows RT enabled kernels to differentiate a +- * preemption from blocking on an 'sleeping' spin/rwlock. Note that +- * SM_MASK_PREEMPT for !RT has all bits set, which allows the compiler to +- * optimize the AND operation out and just check for zero. ++ * preemption from blocking on an 'sleeping' spin/rwlock. + */ +-#define SM_NONE 0x0 +-#define SM_PREEMPT 0x1 +-#define SM_RTLOCK_WAIT 0x2 +- +-#ifndef CONFIG_PREEMPT_RT +-# define SM_MASK_PREEMPT (~0U) +-#else +-# define SM_MASK_PREEMPT SM_PREEMPT +-#endif ++#define SM_IDLE (-1) ++#define SM_NONE 0 ++#define SM_PREEMPT 1 ++#define SM_RTLOCK_WAIT 2 + + /* + * __schedule() is the main scheduler function. +@@ -6395,9 +6462,14 @@ pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + * + * WARNING: must be called with preemption disabled! + */ +-static void __sched notrace __schedule(unsigned int sched_mode) ++static void __sched notrace __schedule(int sched_mode) + { + struct task_struct *prev, *next; ++ /* ++ * On PREEMPT_RT kernel, SM_RTLOCK_WAIT is noted ++ * as a preemption by schedule_debug() and RCU. 
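
With SM_MASK_PREEMPT gone, the mode constants above become plain integers and two separate questions replace the old bitmask test: RCU and schedule_debug() treat anything above SM_NONE as a preemption (so SM_RTLOCK_WAIT counts on PREEMPT_RT), while the task-state handling further down only counts SM_PREEMPT. A standalone table of that mapping, as a sketch:

    #include <stdio.h>

    #define SM_IDLE        (-1)
    #define SM_NONE        0
    #define SM_PREEMPT     1
    #define SM_RTLOCK_WAIT 2

    int main(void)
    {
        const int modes[] = { SM_IDLE, SM_NONE, SM_PREEMPT, SM_RTLOCK_WAIT };
        const char *names[] = { "SM_IDLE", "SM_NONE", "SM_PREEMPT", "SM_RTLOCK_WAIT" };

        for (int i = 0; i < 4; i++) {
            int mode = modes[i];
            int rcu_preempt   = mode > SM_NONE;       /* schedule_debug()/RCU view */
            int state_preempt = mode == SM_PREEMPT;   /* task-state handling view  */
            printf("%-14s rcu=%d state=%d\n", names[i], rcu_preempt, state_preempt);
        }
        return 0;
    }
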
++ */ ++ bool preempt = sched_mode > SM_NONE; + unsigned long *switch_count; + unsigned long prev_state; + struct rq_flags rf; +@@ -6408,13 +6480,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) + rq = cpu_rq(cpu); + prev = rq->curr; + +- schedule_debug(prev, !!sched_mode); ++ schedule_debug(prev, preempt); + + if (sched_feat(HRTICK) || sched_feat(HRTICK_DL)) + hrtick_clear(rq); + + local_irq_disable(); +- rcu_note_context_switch(!!sched_mode); ++ rcu_note_context_switch(preempt); + + /* + * Make sure that signal_pending_state()->signal_pending() below +@@ -6443,22 +6515,32 @@ static void __sched notrace __schedule(unsigned int sched_mode) + + switch_count = &prev->nivcsw; + ++ /* Task state changes only considers SM_PREEMPT as preemption */ ++ preempt = sched_mode == SM_PREEMPT; ++ + /* + * We must load prev->state once (task_struct::state is volatile), such + * that we form a control dependency vs deactivate_task() below. + */ + prev_state = READ_ONCE(prev->__state); +- if (!(sched_mode & SM_MASK_PREEMPT) && prev_state) { ++ if (sched_mode == SM_IDLE) { ++ if (!rq->nr_running) { ++ next = prev; ++ goto picked; ++ } ++ } else if (!preempt && prev_state) { + if (signal_pending_state(prev_state, prev)) { + WRITE_ONCE(prev->__state, TASK_RUNNING); + } else { ++ int flags = DEQUEUE_NOCLOCK; ++ + prev->sched_contributes_to_load = + (prev_state & TASK_UNINTERRUPTIBLE) && + !(prev_state & TASK_NOLOAD) && + !(prev_state & TASK_FROZEN); + +- if (prev->sched_contributes_to_load) +- rq->nr_uninterruptible++; ++ if (unlikely(is_special_task_state(prev_state))) ++ flags |= DEQUEUE_SPECIAL; + + /* + * __schedule() ttwu() +@@ -6471,17 +6553,13 @@ static void __sched notrace __schedule(unsigned int sched_mode) + * + * After this, schedule() must not care about p->state any more. 
+ */ +- deactivate_task(rq, prev, DEQUEUE_SLEEP | DEQUEUE_NOCLOCK); +- +- if (prev->in_iowait) { +- atomic_inc(&rq->nr_iowait); +- delayacct_blkio_start(); +- } ++ block_task(rq, prev, flags); + } + switch_count = &prev->nvcsw; + } + + next = pick_next_task(rq, prev, &rf); ++picked: + clear_tsk_need_resched(prev); + clear_preempt_need_resched(); + #ifdef CONFIG_SCHED_DEBUG +@@ -6523,7 +6601,7 @@ static void __sched notrace __schedule(unsigned int sched_mode) + psi_account_irqtime(rq, prev, next); + psi_sched_switch(prev, next, !task_on_rq_queued(prev)); + +- trace_sched_switch(sched_mode & SM_MASK_PREEMPT, prev, next, prev_state); ++ trace_sched_switch(preempt, prev, next, prev_state); + + /* Also unlocks the rq: */ + rq = context_switch(rq, prev, next, &rf); +@@ -6599,7 +6677,7 @@ static void sched_update_worker(struct task_struct *tsk) + } + } + +-static __always_inline void __schedule_loop(unsigned int sched_mode) ++static __always_inline void __schedule_loop(int sched_mode) + { + do { + preempt_disable(); +@@ -6644,7 +6722,7 @@ void __sched schedule_idle(void) + */ + WARN_ON_ONCE(current->__state); + do { +- __schedule(SM_NONE); ++ __schedule(SM_IDLE); + } while (need_resched()); + } + +@@ -8228,8 +8306,6 @@ void __init sched_init(void) + #endif /* CONFIG_RT_GROUP_SCHED */ + } + +- init_rt_bandwidth(&def_rt_bandwidth, global_rt_period(), global_rt_runtime()); +- + #ifdef CONFIG_SMP + init_defrootdomain(); + #endif +@@ -8284,8 +8360,13 @@ void __init sched_init(void) + init_tg_cfs_entry(&root_task_group, &rq->cfs, NULL, i, NULL); + #endif /* CONFIG_FAIR_GROUP_SCHED */ + +- rq->rt.rt_runtime = def_rt_bandwidth.rt_runtime; + #ifdef CONFIG_RT_GROUP_SCHED ++ /* ++ * This is required for init cpu because rt.c:__enable_runtime() ++ * starts working after scheduler_running, which is not the case ++ * yet. ++ */ ++ rq->rt.rt_runtime = global_rt_runtime(); + init_tg_rt_entry(&root_task_group, &rq->rt, NULL, i, NULL); + #endif + #ifdef CONFIG_SMP +@@ -8317,10 +8398,12 @@ void __init sched_init(void) + #endif /* CONFIG_SMP */ + hrtick_rq_init(rq); + atomic_set(&rq->nr_iowait, 0); ++ fair_server_init(rq); + + #ifdef CONFIG_SCHED_CORE + rq->core = rq; + rq->core_pick = NULL; ++ rq->core_dl_server = NULL; + rq->core_enabled = 0; + rq->core_tree = RB_ROOT; + rq->core_forceidle_count = 0; +@@ -8333,6 +8416,7 @@ void __init sched_init(void) + } + + set_load_weight(&init_task, false); ++ init_task.se.slice = sysctl_sched_base_slice, + + /* + * The boot idle thread does lazy MMU switching as well: +@@ -8548,7 +8632,7 @@ void normalize_rt_tasks(void) + schedstat_set(p->stats.sleep_start, 0); + schedstat_set(p->stats.block_start, 0); + +- if (!dl_task(p) && !rt_task(p)) { ++ if (!rt_or_dl_task(p)) { + /* + * Renice negative nice level userspace + * tasks back to 0: +diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c +index eece6244f9d2..43111a515a28 100644 +--- a/kernel/sched/cpufreq_schedutil.c ++++ b/kernel/sched/cpufreq_schedutil.c +@@ -654,9 +654,9 @@ static int sugov_kthread_create(struct sugov_policy *sg_policy) + * Fake (unused) bandwidth; workaround to "fix" + * priority inheritance. 
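
The schedutil change here only respells the sugov kthread's fake reservation as 1 ms of runtime every 10 ms using NSEC_PER_MSEC; the numbers are unchanged. For comparison, this is roughly how the same three parameters look from userspace through sched_setattr(2). A hedged sketch: it needs root (or CAP_SYS_NICE) and SCHED_DEADLINE support, and the struct layout follows the man page rather than anything in this patch:

    /* gcc -O2 -o dl_demo dl_demo.c */
    #define _GNU_SOURCE
    #include <stdint.h>
    #include <stdio.h>
    #include <string.h>
    #include <unistd.h>
    #include <sys/syscall.h>

    #define NSEC_PER_MSEC 1000000ULL
    #define SCHED_DEADLINE 6

    struct sched_attr {
        uint32_t size;
        uint32_t sched_policy;
        uint64_t sched_flags;
        int32_t  sched_nice;
        uint32_t sched_priority;
        uint64_t sched_runtime;
        uint64_t sched_deadline;
        uint64_t sched_period;
    };

    int main(void)
    {
        struct sched_attr attr;

        memset(&attr, 0, sizeof(attr));
        attr.size           = sizeof(attr);
        attr.sched_policy   = SCHED_DEADLINE;
        attr.sched_runtime  = 1 * NSEC_PER_MSEC;    /* same values as the hunk */
        attr.sched_deadline = 10 * NSEC_PER_MSEC;
        attr.sched_period   = 10 * NSEC_PER_MSEC;

        if (syscall(SYS_sched_setattr, 0, &attr, 0) != 0) {
            perror("sched_setattr");
            return 1;
        }
        puts("running with a 1ms/10ms deadline reservation");
        return 0;
    }
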
+ */ +- .sched_runtime = 1000000, +- .sched_deadline = 10000000, +- .sched_period = 10000000, ++ .sched_runtime = NSEC_PER_MSEC, ++ .sched_deadline = 10 * NSEC_PER_MSEC, ++ .sched_period = 10 * NSEC_PER_MSEC, + }; + struct cpufreq_policy *policy = sg_policy->policy; + int ret; +diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c +index c5a3691ba6cc..9ce93d0bf452 100644 +--- a/kernel/sched/deadline.c ++++ b/kernel/sched/deadline.c +@@ -320,19 +320,12 @@ void sub_running_bw(struct sched_dl_entity *dl_se, struct dl_rq *dl_rq) + __sub_running_bw(dl_se->dl_bw, dl_rq); + } + +-static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++static void dl_rq_change_utilization(struct rq *rq, struct sched_dl_entity *dl_se, u64 new_bw) + { +- struct rq *rq; +- +- WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); +- +- if (task_on_rq_queued(p)) +- return; ++ if (dl_se->dl_non_contending) { ++ sub_running_bw(dl_se, &rq->dl); ++ dl_se->dl_non_contending = 0; + +- rq = task_rq(p); +- if (p->dl.dl_non_contending) { +- sub_running_bw(&p->dl, &rq->dl); +- p->dl.dl_non_contending = 0; + /* + * If the timer handler is currently running and the + * timer cannot be canceled, inactive_task_timer() +@@ -340,13 +333,25 @@ static void dl_change_utilization(struct task_struct *p, u64 new_bw) + * will not touch the rq's active utilization, + * so we are still safe. + */ +- if (hrtimer_try_to_cancel(&p->dl.inactive_timer) == 1) +- put_task_struct(p); ++ if (hrtimer_try_to_cancel(&dl_se->inactive_timer) == 1) { ++ if (!dl_server(dl_se)) ++ put_task_struct(dl_task_of(dl_se)); ++ } + } +- __sub_rq_bw(p->dl.dl_bw, &rq->dl); ++ __sub_rq_bw(dl_se->dl_bw, &rq->dl); + __add_rq_bw(new_bw, &rq->dl); + } + ++static void dl_change_utilization(struct task_struct *p, u64 new_bw) ++{ ++ WARN_ON_ONCE(p->dl.flags & SCHED_FLAG_SUGOV); ++ ++ if (task_on_rq_queued(p)) ++ return; ++ ++ dl_rq_change_utilization(task_rq(p), &p->dl, new_bw); ++} ++ + static void __dl_clear_params(struct sched_dl_entity *dl_se); + + /* +@@ -771,6 +776,15 @@ static inline void replenish_dl_new_period(struct sched_dl_entity *dl_se, + /* for non-boosted task, pi_of(dl_se) == dl_se */ + dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; + dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ ++ /* ++ * If it is a deferred reservation, and the server ++ * is not handling an starvation case, defer it. ++ */ ++ if (dl_se->dl_defer & !dl_se->dl_defer_running) { ++ dl_se->dl_throttled = 1; ++ dl_se->dl_defer_armed = 1; ++ } + } + + /* +@@ -809,6 +823,9 @@ static inline void setup_new_dl_entity(struct sched_dl_entity *dl_se) + replenish_dl_new_period(dl_se, rq); + } + ++static int start_dl_timer(struct sched_dl_entity *dl_se); ++static bool dl_entity_overflow(struct sched_dl_entity *dl_se, u64 t); ++ + /* + * Pure Earliest Deadline First (EDF) scheduling does not deal with the + * possibility of a entity lasting more than what it declared, and thus +@@ -837,9 +854,18 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + /* + * This could be the case for a !-dl task that is boosted. + * Just go with full inherited parameters. ++ * ++ * Or, it could be the case of a deferred reservation that ++ * was not able to consume its runtime in background and ++ * reached this point with current u > U. ++ * ++ * In both cases, set a new period. 
+ */ +- if (dl_se->dl_deadline == 0) +- replenish_dl_new_period(dl_se, rq); ++ if (dl_se->dl_deadline == 0 || ++ (dl_se->dl_defer_armed && dl_entity_overflow(dl_se, rq_clock(rq)))) { ++ dl_se->deadline = rq_clock(rq) + pi_of(dl_se)->dl_deadline; ++ dl_se->runtime = pi_of(dl_se)->dl_runtime; ++ } + + if (dl_se->dl_yielded && dl_se->runtime > 0) + dl_se->runtime = 0; +@@ -873,6 +899,44 @@ static void replenish_dl_entity(struct sched_dl_entity *dl_se) + dl_se->dl_yielded = 0; + if (dl_se->dl_throttled) + dl_se->dl_throttled = 0; ++ ++ /* ++ * If this is the replenishment of a deferred reservation, ++ * clear the flag and return. ++ */ ++ if (dl_se->dl_defer_armed) { ++ dl_se->dl_defer_armed = 0; ++ return; ++ } ++ ++ /* ++ * A this point, if the deferred server is not armed, and the deadline ++ * is in the future, if it is not running already, throttle the server ++ * and arm the defer timer. ++ */ ++ if (dl_se->dl_defer && !dl_se->dl_defer_running && ++ dl_time_before(rq_clock(dl_se->rq), dl_se->deadline - dl_se->runtime)) { ++ if (!is_dl_boosted(dl_se) && dl_se->server_has_tasks(dl_se)) { ++ ++ /* ++ * Set dl_se->dl_defer_armed and dl_throttled variables to ++ * inform the start_dl_timer() that this is a deferred ++ * activation. ++ */ ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ if (!start_dl_timer(dl_se)) { ++ /* ++ * If for whatever reason (delays), a previous timer was ++ * queued but not serviced, cancel it and clean the ++ * deferrable server variables intended for start_dl_timer(). ++ */ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ } ++ } + } + + /* +@@ -1023,6 +1087,15 @@ static void update_dl_entity(struct sched_dl_entity *dl_se) + } + + replenish_dl_new_period(dl_se, rq); ++ } else if (dl_server(dl_se) && dl_se->dl_defer) { ++ /* ++ * The server can still use its previous deadline, so check if ++ * it left the dl_defer_running state. ++ */ ++ if (!dl_se->dl_defer_running) { ++ dl_se->dl_defer_armed = 1; ++ dl_se->dl_throttled = 1; ++ } + } + } + +@@ -1055,8 +1128,21 @@ static int start_dl_timer(struct sched_dl_entity *dl_se) + * We want the timer to fire at the deadline, but considering + * that it is actually coming from rq->clock and not from + * hrtimer's time base reading. ++ * ++ * The deferred reservation will have its timer set to ++ * (deadline - runtime). At that point, the CBS rule will decide ++ * if the current deadline can be used, or if a replenishment is ++ * required to avoid add too much pressure on the system ++ * (current u > U). 
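
The "(current u > U)" condition referenced in these comments is the CBS overflow test: would the remaining runtime, spread over the time left until the current deadline, exceed the reserved bandwidth dl_runtime/dl_deadline? A standalone sketch of that inequality using plain cross-multiplication (the kernel's dl_entity_overflow() additionally scales its operands down to avoid 64-bit overflow):

    #include <stdint.h>
    #include <stdio.h>
    #include <stdbool.h>

    static bool cbs_overflow(uint64_t runtime_left, uint64_t deadline,
                             uint64_t now, uint64_t dl_runtime, uint64_t dl_deadline)
    {
        if (deadline <= now)
            return true;    /* deadline already passed */
        /* runtime_left / (deadline - now)  >  dl_runtime / dl_deadline ? */
        return runtime_left * dl_deadline > dl_runtime * (deadline - now);
    }

    int main(void)
    {
        /* 5 ms left to run, 20 ms to the deadline, 10 ms / 100 ms reservation */
        printf("%d\n", cbs_overflow(5000000, 120000000, 100000000,
                                    10000000, 100000000));   /* 1: 25 > 10 percent */
        /* 1 ms left over the same window fits inside the 10 percent budget */
        printf("%d\n", cbs_overflow(1000000, 120000000, 100000000,
                                    10000000, 100000000));   /* 0 */
        return 0;
    }
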
+ */ +- act = ns_to_ktime(dl_next_period(dl_se)); ++ if (dl_se->dl_defer_armed) { ++ WARN_ON_ONCE(!dl_se->dl_throttled); ++ act = ns_to_ktime(dl_se->deadline - dl_se->runtime); ++ } else { ++ /* act = deadline - rel-deadline + period */ ++ act = ns_to_ktime(dl_next_period(dl_se)); ++ } ++ + now = hrtimer_cb_get_time(timer); + delta = ktime_to_ns(now) - rq_clock(rq); + act = ktime_add_ns(act, delta); +@@ -1106,6 +1192,62 @@ static void __push_dl_task(struct rq *rq, struct rq_flags *rf) + #endif + } + ++/* a defer timer will not be reset if the runtime consumed was < dl_server_min_res */ ++static const u64 dl_server_min_res = 1 * NSEC_PER_MSEC; ++ ++static enum hrtimer_restart dl_server_timer(struct hrtimer *timer, struct sched_dl_entity *dl_se) ++{ ++ struct rq *rq = rq_of_dl_se(dl_se); ++ u64 fw; ++ ++ scoped_guard (rq_lock, rq) { ++ struct rq_flags *rf = &scope.rf; ++ ++ if (!dl_se->dl_throttled || !dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ sched_clock_tick(); ++ update_rq_clock(rq); ++ ++ if (!dl_se->dl_runtime) ++ return HRTIMER_NORESTART; ++ ++ if (!dl_se->server_has_tasks(dl_se)) { ++ replenish_dl_entity(dl_se); ++ return HRTIMER_NORESTART; ++ } ++ ++ if (dl_se->dl_defer_armed) { ++ /* ++ * First check if the server could consume runtime in background. ++ * If so, it is possible to push the defer timer for this amount ++ * of time. The dl_server_min_res serves as a limit to avoid ++ * forwarding the timer for a too small amount of time. ++ */ ++ if (dl_time_before(rq_clock(dl_se->rq), ++ (dl_se->deadline - dl_se->runtime - dl_server_min_res))) { ++ ++ /* reset the defer timer */ ++ fw = dl_se->deadline - rq_clock(dl_se->rq) - dl_se->runtime; ++ ++ hrtimer_forward_now(timer, ns_to_ktime(fw)); ++ return HRTIMER_RESTART; ++ } ++ ++ dl_se->dl_defer_running = 1; ++ } ++ ++ enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); ++ ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &dl_se->rq->curr->dl)) ++ resched_curr(rq); ++ ++ __push_dl_task(rq, rf); ++ } ++ ++ return HRTIMER_NORESTART; ++} ++ + /* + * This is the bandwidth enforcement timer callback. 
If here, we know + * a task is not on its dl_rq, since the fact that the timer was running +@@ -1128,28 +1270,8 @@ static enum hrtimer_restart dl_task_timer(struct hrtimer *timer) + struct rq_flags rf; + struct rq *rq; + +- if (dl_server(dl_se)) { +- struct rq *rq = rq_of_dl_se(dl_se); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- if (dl_se->dl_throttled) { +- sched_clock_tick(); +- update_rq_clock(rq); +- +- if (dl_se->server_has_tasks(dl_se)) { +- enqueue_dl_entity(dl_se, ENQUEUE_REPLENISH); +- resched_curr(rq); +- __push_dl_task(rq, &rf); +- } else { +- replenish_dl_entity(dl_se); +- } +- +- } +- rq_unlock(rq, &rf); +- +- return HRTIMER_NORESTART; +- } ++ if (dl_server(dl_se)) ++ return dl_server_timer(timer, dl_se); + + p = dl_task_of(dl_se); + rq = task_rq_lock(p, &rf); +@@ -1319,22 +1441,10 @@ static u64 grub_reclaim(u64 delta, struct rq *rq, struct sched_dl_entity *dl_se) + return (delta * u_act) >> BW_SHIFT; + } + +-static inline void +-update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, +- int flags); +-static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) + { + s64 scaled_delta_exec; + +- if (unlikely(delta_exec <= 0)) { +- if (unlikely(dl_se->dl_yielded)) +- goto throttle; +- return; +- } +- +- if (dl_entity_is_special(dl_se)) +- return; +- + /* + * For tasks that participate in GRUB, we implement GRUB-PA: the + * spare reclaimed bandwidth is used to clock down frequency. +@@ -1353,8 +1463,64 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + scaled_delta_exec = cap_scale(scaled_delta_exec, scale_cpu); + } + ++ return scaled_delta_exec; ++} ++ ++static inline void ++update_stats_dequeue_dl(struct dl_rq *dl_rq, struct sched_dl_entity *dl_se, ++ int flags); ++static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec) ++{ ++ s64 scaled_delta_exec; ++ ++ if (unlikely(delta_exec <= 0)) { ++ if (unlikely(dl_se->dl_yielded)) ++ goto throttle; ++ return; ++ } ++ ++ if (dl_server(dl_se) && dl_se->dl_throttled && !dl_se->dl_defer) ++ return; ++ ++ if (dl_entity_is_special(dl_se)) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, dl_se, delta_exec); ++ + dl_se->runtime -= scaled_delta_exec; + ++ /* ++ * The fair server can consume its runtime while throttled (not queued/ ++ * running as regular CFS). ++ * ++ * If the server consumes its entire runtime in this state. The server ++ * is not required for the current period. Thus, reset the server by ++ * starting a new period, pushing the activation. ++ */ ++ if (dl_se->dl_defer && dl_se->dl_throttled && dl_runtime_exceeded(dl_se)) { ++ /* ++ * If the server was previously activated - the starving condition ++ * took place, it this point it went away because the fair scheduler ++ * was able to get runtime in background. So return to the initial ++ * state. ++ */ ++ dl_se->dl_defer_running = 0; ++ ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ ++ replenish_dl_new_period(dl_se, dl_se->rq); ++ ++ /* ++ * Not being able to start the timer seems problematic. If it could not ++ * be started for whatever reason, we need to "unthrottle" the DL server ++ * and queue right away. Otherwise nothing might queue it. That's similar ++ * to what enqueue_dl_entity() does on start_dl_timer==0. For now, just warn. 
++ */ ++ WARN_ON_ONCE(!start_dl_timer(dl_se)); ++ ++ return; ++ } ++ + throttle: + if (dl_runtime_exceeded(dl_se) || dl_se->dl_yielded) { + dl_se->dl_throttled = 1; +@@ -1381,6 +1547,14 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + resched_curr(rq); + } + ++ /* ++ * The fair server (sole dl_server) does not account for real-time ++ * workload because it is running fair work. ++ */ ++ if (dl_se == &rq->fair_server) ++ return; ++ ++#ifdef CONFIG_RT_GROUP_SCHED + /* + * Because -- for now -- we share the rt bandwidth, we need to + * account our runtime there too, otherwise actual rt tasks +@@ -1405,34 +1579,155 @@ static void update_curr_dl_se(struct rq *rq, struct sched_dl_entity *dl_se, s64 + rt_rq->rt_time += delta_exec; + raw_spin_unlock(&rt_rq->rt_runtime_lock); + } ++#endif ++} ++ ++/* ++ * In the non-defer mode, the idle time is not accounted, as the ++ * server provides a guarantee. ++ * ++ * If the dl_server is in defer mode, the idle time is also considered ++ * as time available for the fair server, avoiding a penalty for the ++ * rt scheduler that did not consumed that time. ++ */ ++void dl_server_update_idle_time(struct rq *rq, struct task_struct *p) ++{ ++ s64 delta_exec, scaled_delta_exec; ++ ++ if (!rq->fair_server.dl_defer) ++ return; ++ ++ /* no need to discount more */ ++ if (rq->fair_server.runtime < 0) ++ return; ++ ++ delta_exec = rq_clock_task(rq) - p->se.exec_start; ++ if (delta_exec < 0) ++ return; ++ ++ scaled_delta_exec = dl_scaled_delta_exec(rq, &rq->fair_server, delta_exec); ++ ++ rq->fair_server.runtime -= scaled_delta_exec; ++ ++ if (rq->fair_server.runtime < 0) { ++ rq->fair_server.dl_defer_running = 0; ++ rq->fair_server.runtime = 0; ++ } ++ ++ p->se.exec_start = rq_clock_task(rq); + } + + void dl_server_update(struct sched_dl_entity *dl_se, s64 delta_exec) + { +- update_curr_dl_se(dl_se->rq, dl_se, delta_exec); ++ /* 0 runtime = fair server disabled */ ++ if (dl_se->dl_runtime) ++ update_curr_dl_se(dl_se->rq, dl_se, delta_exec); + } + + void dl_server_start(struct sched_dl_entity *dl_se) + { ++ struct rq *rq = dl_se->rq; ++ ++ /* ++ * XXX: the apply do not work fine at the init phase for the ++ * fair server because things are not yet set. We need to improve ++ * this before getting generic. 
++ */ + if (!dl_server(dl_se)) { ++ u64 runtime = 50 * NSEC_PER_MSEC; ++ u64 period = 1000 * NSEC_PER_MSEC; ++ ++ dl_server_apply_params(dl_se, runtime, period, 1); ++ + dl_se->dl_server = 1; ++ dl_se->dl_defer = 1; + setup_new_dl_entity(dl_se); + } ++ ++ if (!dl_se->dl_runtime) ++ return; ++ + enqueue_dl_entity(dl_se, ENQUEUE_WAKEUP); ++ if (!dl_task(dl_se->rq->curr) || dl_entity_preempt(dl_se, &rq->curr->dl)) ++ resched_curr(dl_se->rq); + } + + void dl_server_stop(struct sched_dl_entity *dl_se) + { ++ if (!dl_se->dl_runtime) ++ return; ++ + dequeue_dl_entity(dl_se, DEQUEUE_SLEEP); ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; + } + + void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick) ++ dl_server_pick_f pick_task) + { + dl_se->rq = rq; + dl_se->server_has_tasks = has_tasks; +- dl_se->server_pick = pick; ++ dl_se->server_pick_task = pick_task; ++} ++ ++void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq) ++{ ++ u64 new_bw = dl_se->dl_bw; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ ++ dl_b = dl_bw_of(cpu_of(rq)); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ if (!dl_bw_cpus(cpu)) ++ return; ++ ++ __dl_add(dl_b, new_bw, dl_bw_cpus(cpu)); ++} ++ ++int dl_server_apply_params(struct sched_dl_entity *dl_se, u64 runtime, u64 period, bool init) ++{ ++ u64 old_bw = init ? 0 : to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ u64 new_bw = to_ratio(period, runtime); ++ struct rq *rq = dl_se->rq; ++ int cpu = cpu_of(rq); ++ struct dl_bw *dl_b; ++ unsigned long cap; ++ int retval = 0; ++ int cpus; ++ ++ dl_b = dl_bw_of(cpu); ++ guard(raw_spinlock)(&dl_b->lock); ++ ++ cpus = dl_bw_cpus(cpu); ++ cap = dl_bw_capacity(cpu); ++ ++ if (__dl_overflow(dl_b, cap, old_bw, new_bw)) ++ return -EBUSY; ++ ++ if (init) { ++ __add_rq_bw(new_bw, &rq->dl); ++ __dl_add(dl_b, new_bw, cpus); ++ } else { ++ __dl_sub(dl_b, dl_se->dl_bw, cpus); ++ __dl_add(dl_b, new_bw, cpus); ++ ++ dl_rq_change_utilization(rq, dl_se, new_bw); ++ } ++ ++ dl_se->dl_runtime = runtime; ++ dl_se->dl_deadline = period; ++ dl_se->dl_period = period; ++ ++ dl_se->runtime = 0; ++ dl_se->deadline = 0; ++ ++ dl_se->dl_bw = to_ratio(dl_se->dl_period, dl_se->dl_runtime); ++ dl_se->dl_density = to_ratio(dl_se->dl_deadline, dl_se->dl_runtime); ++ ++ return retval; + } + + /* +@@ -1729,7 +2024,7 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + * be counted in the active utilization; hence, we need to call + * add_running_bw(). + */ +- if (dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { ++ if (!dl_se->dl_defer && dl_se->dl_throttled && !(flags & ENQUEUE_REPLENISH)) { + if (flags & ENQUEUE_WAKEUP) + task_contending(dl_se, flags); + +@@ -1751,6 +2046,25 @@ enqueue_dl_entity(struct sched_dl_entity *dl_se, int flags) + setup_new_dl_entity(dl_se); + } + ++ /* ++ * If the reservation is still throttled, e.g., it got replenished but is a ++ * deferred task and still got to wait, don't enqueue. ++ */ ++ if (dl_se->dl_throttled && start_dl_timer(dl_se)) ++ return; ++ ++ /* ++ * We're about to enqueue, make sure we're not ->dl_throttled! ++ * In case the timer was not started, say because the defer time ++ * has passed, mark as not throttled and mark unarmed. ++ * Also cancel earlier timers, since letting those run is pointless. 
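
dl_server_start() above seeds the fair server with 50 ms of runtime per 1000 ms period before dl_server_apply_params() admits that bandwidth against the DL accounting, and the dl_bw/dl_density values it stores are fixed-point ratios. A small sketch of the arithmetic, assuming the 20-bit BW_SHIFT fixed point mainline's to_ratio() uses:

    #include <stdint.h>
    #include <stdio.h>

    #define BW_SHIFT 20                 /* assumption: matches the kernel fixed point */
    #define BW_UNIT  (1ULL << BW_SHIFT)
    #define NSEC_PER_MSEC 1000000ULL

    /* to_ratio()-style bandwidth: runtime / period in units of 1/2^20 */
    static uint64_t to_ratio(uint64_t period, uint64_t runtime)
    {
        return (runtime << BW_SHIFT) / period;
    }

    int main(void)
    {
        uint64_t runtime = 50 * NSEC_PER_MSEC;     /* dl_server_start() default */
        uint64_t period  = 1000 * NSEC_PER_MSEC;
        uint64_t bw = to_ratio(period, runtime);

        printf("fair server bandwidth: %llu/%llu (~%.1f%% of a CPU)\n",
               (unsigned long long)bw, (unsigned long long)BW_UNIT,
               100.0 * bw / BW_UNIT);
        return 0;
    }
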
++ */ ++ if (dl_se->dl_throttled) { ++ hrtimer_try_to_cancel(&dl_se->dl_timer); ++ dl_se->dl_defer_armed = 0; ++ dl_se->dl_throttled = 0; ++ } ++ + __enqueue_dl_entity(dl_se); + } + +@@ -1840,7 +2154,7 @@ static void enqueue_task_dl(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_dl_task(rq, p); + } + +-static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + { + update_curr_dl(rq); + +@@ -1850,6 +2164,8 @@ static void dequeue_task_dl(struct rq *rq, struct task_struct *p, int flags) + dequeue_dl_entity(&p->dl, flags); + if (!p->dl.dl_throttled && !dl_server(&p->dl)) + dequeue_pushable_dl_task(rq, p); ++ ++ return true; + } + + /* +@@ -2068,6 +2384,9 @@ static void set_next_task_dl(struct rq *rq, struct task_struct *p, bool first) + update_dl_rq_load_avg(rq_clock_pelt(rq), rq, 0); + + deadline_queue_push_tasks(rq); ++ ++ if (hrtick_enabled(rq)) ++ start_hrtick_dl(rq, &p->dl); + } + + static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) +@@ -2080,7 +2399,11 @@ static struct sched_dl_entity *pick_next_dl_entity(struct dl_rq *dl_rq) + return __node_2_dle(left); + } + +-static struct task_struct *pick_task_dl(struct rq *rq) ++/* ++ * __pick_next_task_dl - Helper to pick the next -deadline task to run. ++ * @rq: The runqueue to pick the next task from. ++ */ ++static struct task_struct *__pick_task_dl(struct rq *rq) + { + struct sched_dl_entity *dl_se; + struct dl_rq *dl_rq = &rq->dl; +@@ -2094,14 +2417,13 @@ static struct task_struct *pick_task_dl(struct rq *rq) + WARN_ON_ONCE(!dl_se); + + if (dl_server(dl_se)) { +- p = dl_se->server_pick(dl_se); ++ p = dl_se->server_pick_task(dl_se); + if (!p) { +- WARN_ON_ONCE(1); + dl_se->dl_yielded = 1; + update_curr_dl_se(rq, dl_se, 0); + goto again; + } +- p->dl_server = dl_se; ++ rq->dl_server = dl_se; + } else { + p = dl_task_of(dl_se); + } +@@ -2109,24 +2431,12 @@ static struct task_struct *pick_task_dl(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_dl(struct rq *rq) ++static struct task_struct *pick_task_dl(struct rq *rq) + { +- struct task_struct *p; +- +- p = pick_task_dl(rq); +- if (!p) +- return p; +- +- if (!p->dl_server) +- set_next_task_dl(rq, p, true); +- +- if (hrtick_enabled(rq)) +- start_hrtick_dl(rq, &p->dl); +- +- return p; ++ return __pick_task_dl(rq); + } + +-static void put_prev_task_dl(struct rq *rq, struct task_struct *p) ++static void put_prev_task_dl(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_dl_entity *dl_se = &p->dl; + struct dl_rq *dl_rq = &rq->dl; +@@ -2818,13 +3128,12 @@ DEFINE_SCHED_CLASS(dl) = { + + .wakeup_preempt = wakeup_preempt_dl, + +- .pick_next_task = pick_next_task_dl, ++ .pick_task = pick_task_dl, + .put_prev_task = put_prev_task_dl, + .set_next_task = set_next_task_dl, + + #ifdef CONFIG_SMP + .balance = balance_dl, +- .pick_task = pick_task_dl, + .select_task_rq = select_task_rq_dl, + .migrate_task_rq = migrate_task_rq_dl, + .set_cpus_allowed = set_cpus_allowed_dl, +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index c1eb9a1afd13..de1dc5264b3f 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -333,8 +333,165 @@ static const struct file_operations sched_debug_fops = { + .release = seq_release, + }; + ++enum dl_param { ++ DL_RUNTIME = 0, ++ DL_PERIOD, ++}; ++ ++static unsigned long fair_server_period_max = (1UL << 22) * NSEC_PER_USEC; /* ~4 seconds */ ++static unsigned long 
fair_server_period_min = (100) * NSEC_PER_USEC; /* 100 us */ ++ ++static ssize_t sched_fair_server_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos, enum dl_param param) ++{ ++ long cpu = (long) ((struct seq_file *) filp->private_data)->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 runtime, period; ++ size_t err; ++ int retval; ++ u64 value; ++ ++ err = kstrtoull_from_user(ubuf, cnt, 10, &value); ++ if (err) ++ return err; ++ ++ scoped_guard (rq_lock_irqsave, rq) { ++ runtime = rq->fair_server.dl_runtime; ++ period = rq->fair_server.dl_period; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ if (runtime == value) ++ break; ++ runtime = value; ++ break; ++ case DL_PERIOD: ++ if (value == period) ++ break; ++ period = value; ++ break; ++ } ++ ++ if (runtime > period || ++ period > fair_server_period_max || ++ period < fair_server_period_min) { ++ return -EINVAL; ++ } ++ ++ if (rq->cfs.h_nr_running) { ++ update_rq_clock(rq); ++ dl_server_stop(&rq->fair_server); ++ } ++ ++ retval = dl_server_apply_params(&rq->fair_server, runtime, period, 0); ++ if (retval) ++ cnt = retval; ++ ++ if (!runtime) ++ printk_deferred("Fair server disabled in CPU %d, system may crash due to starvation.\n", ++ cpu_of(rq)); ++ ++ if (rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ } ++ ++ *ppos += cnt; ++ return cnt; ++} ++ ++static size_t sched_fair_server_show(struct seq_file *m, void *v, enum dl_param param) ++{ ++ unsigned long cpu = (unsigned long) m->private; ++ struct rq *rq = cpu_rq(cpu); ++ u64 value; ++ ++ switch (param) { ++ case DL_RUNTIME: ++ value = rq->fair_server.dl_runtime; ++ break; ++ case DL_PERIOD: ++ value = rq->fair_server.dl_period; ++ break; ++ } ++ ++ seq_printf(m, "%llu\n", value); ++ return 0; ++ ++} ++ ++static ssize_t ++sched_fair_server_runtime_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_RUNTIME); ++} ++ ++static int sched_fair_server_runtime_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_runtime_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_runtime_fops = { ++ .open = sched_fair_server_runtime_open, ++ .write = sched_fair_server_runtime_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ ++static ssize_t ++sched_fair_server_period_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ return sched_fair_server_write(filp, ubuf, cnt, ppos, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_show(struct seq_file *m, void *v) ++{ ++ return sched_fair_server_show(m, v, DL_PERIOD); ++} ++ ++static int sched_fair_server_period_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_fair_server_period_show, inode->i_private); ++} ++ ++static const struct file_operations fair_server_period_fops = { ++ .open = sched_fair_server_period_open, ++ .write = sched_fair_server_period_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++ + static struct dentry *debugfs_sched; + ++static void debugfs_fair_server_init(void) ++{ ++ struct dentry *d_fair; ++ unsigned long cpu; ++ ++ d_fair = debugfs_create_dir("fair_server", debugfs_sched); ++ if (!d_fair) ++ return; ++ ++ for_each_possible_cpu(cpu) { ++ struct dentry 
*d_cpu; ++ char buf[32]; ++ ++ snprintf(buf, sizeof(buf), "cpu%lu", cpu); ++ d_cpu = debugfs_create_dir(buf, d_fair); ++ ++ debugfs_create_file("runtime", 0644, d_cpu, (void *) cpu, &fair_server_runtime_fops); ++ debugfs_create_file("period", 0644, d_cpu, (void *) cpu, &fair_server_period_fops); ++ } ++} ++ + static __init int sched_init_debug(void) + { + struct dentry __maybe_unused *numa; +@@ -374,6 +531,8 @@ static __init int sched_init_debug(void) + + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + ++ debugfs_fair_server_init(); ++ + return 0; + } + late_initcall(sched_init_debug); +@@ -580,27 +739,27 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + else + SEQ_printf(m, " %c", task_state_to_char(p)); + +- SEQ_printf(m, "%15s %5d %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", ++ SEQ_printf(m, " %15s %5d %9Ld.%06ld %c %9Ld.%06ld %c %9Ld.%06ld %9Ld.%06ld %9Ld %5d ", + p->comm, task_pid_nr(p), + SPLIT_NS(p->se.vruntime), + entity_eligible(cfs_rq_of(&p->se), &p->se) ? 'E' : 'N', + SPLIT_NS(p->se.deadline), ++ p->se.custom_slice ? 'S' : ' ', + SPLIT_NS(p->se.slice), + SPLIT_NS(p->se.sum_exec_runtime), + (long long)(p->nvcsw + p->nivcsw), + p->prio); + +- SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld %9lld.%06ld", ++ SEQ_printf(m, "%9lld.%06ld %9lld.%06ld %9lld.%06ld", + SPLIT_NS(schedstat_val_or_zero(p->stats.wait_sum)), +- SPLIT_NS(p->se.sum_exec_runtime), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + + #ifdef CONFIG_NUMA_BALANCING +- SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); ++ SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif + #ifdef CONFIG_CGROUP_SCHED +- SEQ_printf_task_group_path(m, task_group(p), " %s") ++ SEQ_printf_task_group_path(m, task_group(p), " %s") + #endif + + SEQ_printf(m, "\n"); +@@ -612,10 +771,26 @@ static void print_rq(struct seq_file *m, struct rq *rq, int rq_cpu) + + SEQ_printf(m, "\n"); + SEQ_printf(m, "runnable tasks:\n"); +- SEQ_printf(m, " S task PID tree-key switches prio" +- " wait-time sum-exec sum-sleep\n"); ++ SEQ_printf(m, " S task PID vruntime eligible " ++ "deadline slice sum-exec switches " ++ "prio wait-time sum-sleep sum-block" ++#ifdef CONFIG_NUMA_BALANCING ++ " node group-id" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ " group-path" ++#endif ++ "\n"); + SEQ_printf(m, "-------------------------------------------------------" +- "------------------------------------------------------\n"); ++ "------------------------------------------------------" ++ "------------------------------------------------------" ++#ifdef CONFIG_NUMA_BALANCING ++ "--------------" ++#endif ++#ifdef CONFIG_CGROUP_SCHED ++ "--------------" ++#endif ++ "\n"); + + rcu_read_lock(); + for_each_process_thread(g, p) { +@@ -641,8 +816,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SEQ_printf(m, "\n"); + SEQ_printf(m, "cfs_rq[%d]:\n", cpu); + #endif +- SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "exec_clock", +- SPLIT_NS(cfs_rq->exec_clock)); + + raw_spin_rq_lock_irqsave(rq, flags); + root = __pick_root_entity(cfs_rq); +@@ -669,8 +842,6 @@ void print_cfs_rq(struct seq_file *m, int cpu, struct cfs_rq *cfs_rq) + SPLIT_NS(right_vruntime)); + spread = right_vruntime - left_vruntime; + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", "spread", SPLIT_NS(spread)); +- SEQ_printf(m, " .%-30s: %d\n", "nr_spread_over", +- cfs_rq->nr_spread_over); + SEQ_printf(m, " .%-30s: %d\n", "nr_running", 
cfs_rq->nr_running); + SEQ_printf(m, " .%-30s: %d\n", "h_nr_running", cfs_rq->h_nr_running); + SEQ_printf(m, " .%-30s: %d\n", "idle_nr_running", +@@ -730,9 +901,12 @@ void print_rt_rq(struct seq_file *m, int cpu, struct rt_rq *rt_rq) + SEQ_printf(m, " .%-30s: %Ld.%06ld\n", #x, SPLIT_NS(rt_rq->x)) + + PU(rt_nr_running); ++ ++#ifdef CONFIG_RT_GROUP_SCHED + P(rt_throttled); + PN(rt_time); + PN(rt_runtime); ++#endif + + #undef PN + #undef PU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 91b242e47db7..c89e7f1693d4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -792,8 +792,22 @@ static void update_min_vruntime(struct cfs_rq *cfs_rq) + } + + /* ensure we never gain time by being placed backwards. */ +- u64_u32_store(cfs_rq->min_vruntime, +- __update_min_vruntime(cfs_rq, vruntime)); ++ cfs_rq->min_vruntime = __update_min_vruntime(cfs_rq, vruntime); ++} ++ ++static inline u64 cfs_rq_min_slice(struct cfs_rq *cfs_rq) ++{ ++ struct sched_entity *root = __pick_root_entity(cfs_rq); ++ struct sched_entity *curr = cfs_rq->curr; ++ u64 min_slice = ~0ULL; ++ ++ if (curr && curr->on_rq) ++ min_slice = curr->slice; ++ ++ if (root) ++ min_slice = min(min_slice, root->min_slice); ++ ++ return min_slice; + } + + static inline bool __entity_less(struct rb_node *a, const struct rb_node *b) +@@ -812,19 +826,34 @@ static inline void __min_vruntime_update(struct sched_entity *se, struct rb_node + } + } + ++static inline void __min_slice_update(struct sched_entity *se, struct rb_node *node) ++{ ++ if (node) { ++ struct sched_entity *rse = __node_2_se(node); ++ if (rse->min_slice < se->min_slice) ++ se->min_slice = rse->min_slice; ++ } ++} ++ + /* + * se->min_vruntime = min(se->vruntime, {left,right}->min_vruntime) + */ + static inline bool min_vruntime_update(struct sched_entity *se, bool exit) + { + u64 old_min_vruntime = se->min_vruntime; ++ u64 old_min_slice = se->min_slice; + struct rb_node *node = &se->run_node; + + se->min_vruntime = se->vruntime; + __min_vruntime_update(se, node->rb_right); + __min_vruntime_update(se, node->rb_left); + +- return se->min_vruntime == old_min_vruntime; ++ se->min_slice = se->slice; ++ __min_slice_update(se, node->rb_right); ++ __min_slice_update(se, node->rb_left); ++ ++ return se->min_vruntime == old_min_vruntime && ++ se->min_slice == old_min_slice; + } + + RB_DECLARE_CALLBACKS(static, min_vruntime_cb, struct sched_entity, +@@ -837,6 +866,7 @@ static void __enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + avg_vruntime_add(cfs_rq, se); + se->min_vruntime = se->vruntime; ++ se->min_slice = se->slice; + rb_add_augmented_cached(&se->run_node, &cfs_rq->tasks_timeline, + __entity_less, &min_vruntime_cb); + } +@@ -987,17 +1017,18 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + * XXX: strictly: vd_i += N*r_i/w_i such that: vd_i > ve_i + * this is probably good enough. + */ +-static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) ++static bool update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + if ((s64)(se->vruntime - se->deadline) < 0) +- return; ++ return false; + + /* + * For EEVDF the virtual time slope is determined by w_i (iow. + * nice) while the request time r_i is determined by + * sysctl_sched_base_slice. 
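
The per-CPU fair_server debugfs files introduced a bit earlier in this patch (runtime and period, in nanoseconds) are plain text files. A hypothetical userspace reader, assuming debugfs is mounted at /sys/kernel/debug and the scheduler debug directory is named sched as in current mainline; adjust the path if your setup differs:

    #include <stdio.h>

    /* Print the fair-server runtime/period (ns) for one CPU; the path layout
     * is taken from the debugfs_fair_server_init() hunk.  Needs root. */
    static int read_param(int cpu, const char *name, unsigned long long *val)
    {
        char path[128];
        FILE *f;

        snprintf(path, sizeof(path),
                 "/sys/kernel/debug/sched/fair_server/cpu%d/%s", cpu, name);
        f = fopen(path, "r");
        if (!f)
            return -1;
        if (fscanf(f, "%llu", val) != 1) {
            fclose(f);
            return -1;
        }
        fclose(f);
        return 0;
    }

    int main(void)
    {
        unsigned long long runtime, period;

        if (read_param(0, "runtime", &runtime) || read_param(0, "period", &period)) {
            perror("fair_server");
            return 1;
        }
        printf("cpu0 fair server: %llu ns every %llu ns\n", runtime, period);
        return 0;
    }
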
+ */ +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + + /* + * EEVDF: vd_i = ve_i + r_i / w_i +@@ -1007,10 +1038,7 @@ static void update_deadline(struct cfs_rq *cfs_rq, struct sched_entity *se) + /* + * The task has consumed its request, reschedule. + */ +- if (cfs_rq->nr_running > 1) { +- resched_curr(rq_of(cfs_rq)); +- clear_buddies(cfs_rq, se); +- } ++ return true; + } + + #include "pelt.h" +@@ -1148,6 +1176,38 @@ static inline void update_curr_task(struct task_struct *p, s64 delta_exec) + dl_server_update(p->dl_server, delta_exec); + } + ++static inline bool did_preempt_short(struct cfs_rq *cfs_rq, struct sched_entity *curr) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (curr->vlag == curr->deadline) ++ return false; ++ ++ return !entity_eligible(cfs_rq, curr); ++} ++ ++static inline bool do_preempt_short(struct cfs_rq *cfs_rq, ++ struct sched_entity *pse, struct sched_entity *se) ++{ ++ if (!sched_feat(PREEMPT_SHORT)) ++ return false; ++ ++ if (pse->slice >= se->slice) ++ return false; ++ ++ if (!entity_eligible(cfs_rq, pse)) ++ return false; ++ ++ if (entity_before(pse, se)) ++ return true; ++ ++ if (!entity_eligible(cfs_rq, se)) ++ return true; ++ ++ return false; ++} ++ + /* + * Used by other classes to account runtime. + */ +@@ -1169,23 +1229,44 @@ s64 update_curr_common(struct rq *rq) + static void update_curr(struct cfs_rq *cfs_rq) + { + struct sched_entity *curr = cfs_rq->curr; ++ struct rq *rq = rq_of(cfs_rq); + s64 delta_exec; ++ bool resched; + + if (unlikely(!curr)) + return; + +- delta_exec = update_curr_se(rq_of(cfs_rq), curr); ++ delta_exec = update_curr_se(rq, curr); + if (unlikely(delta_exec <= 0)) + return; + + curr->vruntime += calc_delta_fair(delta_exec, curr); +- update_deadline(cfs_rq, curr); ++ resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + +- if (entity_is_task(curr)) +- update_curr_task(task_of(curr), delta_exec); ++ if (entity_is_task(curr)) { ++ struct task_struct *p = task_of(curr); ++ ++ update_curr_task(p, delta_exec); ++ ++ /* ++ * Any fair task that runs outside of fair_server should ++ * account against fair_server such that it can account for ++ * this time and possibly avoid running this period. 
++ */ ++ if (p->dl_server != &rq->fair_server) ++ dl_server_update(&rq->fair_server, delta_exec); ++ } + + account_cfs_rq_runtime(cfs_rq, delta_exec); ++ ++ if (cfs_rq->nr_running == 1) ++ return; ++ ++ if (resched || did_preempt_short(cfs_rq, curr)) { ++ resched_curr(rq); ++ clear_buddies(cfs_rq, curr); ++ } + } + + static void update_curr_fair(struct rq *rq) +@@ -5200,7 +5281,8 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + u64 vslice, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + +- se->slice = sysctl_sched_base_slice; ++ if (!se->custom_slice) ++ se->slice = sysctl_sched_base_slice; + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5281,6 +5363,12 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + + se->vruntime = vruntime - lag; + ++ if (sched_feat(PLACE_REL_DEADLINE) && se->rel_deadline) { ++ se->deadline += se->vruntime; ++ se->rel_deadline = 0; ++ return; ++ } ++ + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -5300,6 +5388,9 @@ static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static inline bool cfs_bandwidth_used(void); + ++static void ++requeue_delayed_entity(struct sched_entity *se); ++ + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +@@ -5387,19 +5478,47 @@ static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se) + + static __always_inline void return_cfs_rq_runtime(struct cfs_rq *cfs_rq); + +-static void ++static inline void finish_delayed_dequeue_entity(struct sched_entity *se) ++{ ++ se->sched_delayed = 0; ++ if (sched_feat(DELAY_ZERO) && se->vlag > 0) ++ se->vlag = 0; ++} ++ ++static bool + dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- int action = UPDATE_TG; ++ bool sleep = flags & DEQUEUE_SLEEP; ++ ++ update_curr(cfs_rq); ++ ++ if (flags & DEQUEUE_DELAYED) { ++ SCHED_WARN_ON(!se->sched_delayed); ++ } else { ++ bool delay = sleep; ++ /* ++ * DELAY_DEQUEUE relies on spurious wakeups, special task ++ * states must not suffer spurious wakeups, excempt them. ++ */ ++ if (flags & DEQUEUE_SPECIAL) ++ delay = false; ++ ++ SCHED_WARN_ON(delay && se->sched_delayed); + ++ if (sched_feat(DELAY_DEQUEUE) && delay && ++ !entity_eligible(cfs_rq, se)) { ++ if (cfs_rq->next == se) ++ cfs_rq->next = NULL; ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 1; ++ return false; ++ } ++ } ++ ++ int action = UPDATE_TG; + if (entity_is_task(se) && task_on_rq_migrating(task_of(se))) + action |= DO_DETACH; + +- /* +- * Update run-time statistics of the 'current'. +- */ +- update_curr(cfs_rq); +- + /* + * When dequeuing a sched_entity, we must: + * - Update loads to have both entity and cfs_rq synced with now. 
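
The PLACE_REL_DEADLINE handling split across dequeue_entity() and place_entity() below preserves a task's remaining request across a non-sleep dequeue: the deadline is stored relative to vruntime and re-anchored at the next placement. A toy save/restore with plain integers, not the kernel structures:

    #include <stdint.h>
    #include <stdio.h>

    struct toy_se {
        uint64_t vruntime;
        uint64_t deadline;
        int      rel_deadline;
    };

    /* non-sleep dequeue: remember the deadline relative to vruntime */
    static void toy_dequeue(struct toy_se *se)
    {
        se->deadline -= se->vruntime;
        se->rel_deadline = 1;
    }

    /* placement on (possibly another) cfs_rq: re-anchor to the new vruntime */
    static void toy_place(struct toy_se *se, uint64_t new_vruntime)
    {
        se->vruntime = new_vruntime;
        if (se->rel_deadline) {
            se->deadline += se->vruntime;
            se->rel_deadline = 0;
        }
    }

    int main(void)
    {
        struct toy_se se = { .vruntime = 1000, .deadline = 1300 };

        toy_dequeue(&se);       /* deadline becomes a relative +300 */
        toy_place(&se, 5000);   /* re-anchored: deadline is now 5300 */
        printf("vruntime=%llu deadline=%llu\n",
               (unsigned long long)se.vruntime, (unsigned long long)se.deadline);
        return 0;
    }
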
+@@ -5417,6 +5536,11 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + clear_buddies(cfs_rq, se); + + update_entity_lag(cfs_rq, se); ++ if (sched_feat(PLACE_REL_DEADLINE) && !sleep) { ++ se->deadline -= se->vruntime; ++ se->rel_deadline = 1; ++ } ++ + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->on_rq = 0; +@@ -5436,8 +5560,13 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if ((flags & (DEQUEUE_SAVE | DEQUEUE_MOVE)) != DEQUEUE_SAVE) + update_min_vruntime(cfs_rq); + ++ if (flags & DEQUEUE_DELAYED) ++ finish_delayed_dequeue_entity(se); ++ + if (cfs_rq->nr_running == 0) + update_idle_cfs_rq_clock_pelt(cfs_rq); ++ ++ return true; + } + + static void +@@ -5463,6 +5592,7 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + } + + update_stats_curr_start(cfs_rq, se); ++ SCHED_WARN_ON(cfs_rq->curr); + cfs_rq->curr = se; + + /* +@@ -5483,6 +5613,8 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + se->prev_sum_exec_runtime = se->sum_exec_runtime; + } + ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags); ++ + /* + * Pick the next process, keeping these things in mind, in this order: + * 1) keep things fair between processes/task groups +@@ -5491,16 +5623,26 @@ set_next_entity(struct cfs_rq *cfs_rq, struct sched_entity *se) + * 4) do not run the "skip" process, if something else is available + */ + static struct sched_entity * +-pick_next_entity(struct cfs_rq *cfs_rq) ++pick_next_entity(struct rq *rq, struct cfs_rq *cfs_rq) + { + /* + * Enabling NEXT_BUDDY will affect latency but not fairness. + */ + if (sched_feat(NEXT_BUDDY) && +- cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) ++ cfs_rq->next && entity_eligible(cfs_rq, cfs_rq->next)) { ++ /* ->next will never be delayed */ ++ SCHED_WARN_ON(cfs_rq->next->sched_delayed); + return cfs_rq->next; ++ } + +- return pick_eevdf(cfs_rq); ++ struct sched_entity *se = pick_eevdf(cfs_rq); ++ if (se->sched_delayed) { ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ SCHED_WARN_ON(se->sched_delayed); ++ SCHED_WARN_ON(se->on_rq); ++ return NULL; ++ } ++ return se; + } + + static bool check_cfs_rq_runtime(struct cfs_rq *cfs_rq); +@@ -5524,6 +5666,7 @@ static void put_prev_entity(struct cfs_rq *cfs_rq, struct sched_entity *prev) + /* in !on_rq case, update occurred at dequeue */ + update_load_avg(cfs_rq, prev, 0); + } ++ SCHED_WARN_ON(cfs_rq->curr != prev); + cfs_rq->curr = NULL; + } + +@@ -5787,6 +5930,7 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta, dequeue = 1; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + raw_spin_lock(&cfs_b->lock); + /* This will start the period timer if necessary */ +@@ -5820,11 +5964,21 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + idle_task_delta = cfs_rq->idle_h_nr_running; + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); ++ int flags; ++ + /* throttled entity or throttle-on-deactivate */ + if (!se->on_rq) + goto done; + +- dequeue_entity(qcfs_rq, se, DEQUEUE_SLEEP); ++ /* ++ * Abuse SPECIAL to avoid delayed dequeue in this instance. ++ * This avoids teaching dequeue_entities() about throttled ++ * entities and keeps things relatively simple. 
++ */ ++ flags = DEQUEUE_SLEEP | DEQUEUE_SPECIAL; ++ if (se->sched_delayed) ++ flags |= DEQUEUE_DELAYED; ++ dequeue_entity(qcfs_rq, se, flags); + + if (cfs_rq_is_idle(group_cfs_rq(se))) + idle_task_delta = cfs_rq->h_nr_running; +@@ -5858,6 +6012,9 @@ static bool throttle_cfs_rq(struct cfs_rq *cfs_rq) + /* At this point se is NULL and we are at root level*/ + sub_nr_running(rq, task_delta); + ++ /* Stop the fair server if throttling resulted in no runnable tasks */ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + done: + /* + * Note: distribution will already see us throttled via the +@@ -5876,6 +6033,7 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + struct cfs_bandwidth *cfs_b = tg_cfs_bandwidth(cfs_rq->tg); + struct sched_entity *se; + long task_delta, idle_task_delta; ++ long rq_h_nr_running = rq->cfs.h_nr_running; + + se = cfs_rq->tg->se[cpu_of(rq)]; + +@@ -5913,7 +6071,12 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + for_each_sched_entity(se) { + struct cfs_rq *qcfs_rq = cfs_rq_of(se); + +- if (se->on_rq) ++ /* Handle any unfinished DELAY_DEQUEUE business first. */ ++ if (se->sched_delayed) { ++ int flags = DEQUEUE_SLEEP | DEQUEUE_DELAYED; ++ ++ dequeue_entity(qcfs_rq, se, flags); ++ } else if (se->on_rq) + break; + enqueue_entity(qcfs_rq, se, ENQUEUE_WAKEUP); + +@@ -5945,6 +6108,10 @@ void unthrottle_cfs_rq(struct cfs_rq *cfs_rq) + goto unthrottle_throttle; + } + ++ /* Start the fair server if un-throttling resulted in new runnable tasks */ ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) ++ dl_server_start(&rq->fair_server); ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, task_delta); + +@@ -6577,7 +6744,7 @@ static void sched_fair_update_stop_tick(struct rq *rq, struct task_struct *p) + { + int cpu = cpu_of(rq); + +- if (!sched_feat(HZ_BW) || !cfs_bandwidth_used()) ++ if (!cfs_bandwidth_used()) + return; + + if (!tick_nohz_full_cpu(cpu)) +@@ -6760,6 +6927,37 @@ static int sched_idle_cpu(int cpu) + } + #endif + ++static void ++requeue_delayed_entity(struct sched_entity *se) ++{ ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * se->sched_delayed should imply: se->on_rq == 1. ++ * Because a delayed entity is one that is still on ++ * the runqueue competing until elegibility. ++ */ ++ SCHED_WARN_ON(!se->sched_delayed); ++ SCHED_WARN_ON(!se->on_rq); ++ ++ if (sched_feat(DELAY_ZERO)) { ++ update_entity_lag(cfs_rq, se); ++ if (se->vlag > 0) { ++ cfs_rq->nr_running--; ++ if (se != cfs_rq->curr) ++ __dequeue_entity(cfs_rq, se); ++ se->vlag = 0; ++ place_entity(cfs_rq, se, 0); ++ if (se != cfs_rq->curr) ++ __enqueue_entity(cfs_rq, se); ++ cfs_rq->nr_running++; ++ } ++ } ++ ++ update_load_avg(cfs_rq, se, 0); ++ se->sched_delayed = 0; ++} ++ + /* + * The enqueue_task method is called before nr_running is + * increased. Here we update the fair scheduling stats and +@@ -6772,6 +6970,8 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + struct sched_entity *se = &p->se; + int idle_h_nr_running = task_has_idle_policy(p); + int task_new = !(flags & ENQUEUE_WAKEUP); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ u64 slice = 0; + + /* + * The code below (indirectly) updates schedutil which looks at +@@ -6779,7 +6979,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + * Let's add the task's estimated utilization to the cfs_rq's + * estimated utilization, before we update schedutil. 
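
requeue_delayed_entity() above handles a delayed entity that gets woken again: under the DELAY_ZERO feature, if its lag has turned positive it is briefly taken out of the tree, the lag is dropped, and it is re-placed; in every case the sched_delayed mark is cleared. A toy mirror of that decision:

    #include <stdio.h>

    struct toy_se { long vlag; int sched_delayed; };

    /* Returns 1 when the entity would have been re-placed (positive lag). */
    static int requeue_delayed(struct toy_se *se)
    {
        int replaced = 0;

        if (se->vlag > 0) {   /* would: __dequeue, place_entity, __enqueue */
            se->vlag = 0;
            replaced = 1;
        }
        se->sched_delayed = 0;
        return replaced;
    }

    int main(void)
    {
        struct toy_se a = { .vlag = 300, .sched_delayed = 1 };
        struct toy_se b = { .vlag = -50, .sched_delayed = 1 };
        int ra = requeue_delayed(&a);
        int rb = requeue_delayed(&b);

        printf("a: replaced=%d vlag=%ld\n", ra, a.vlag);   /* 1, 0   */
        printf("b: replaced=%d vlag=%ld\n", rb, b.vlag);   /* 0, -50 */
        return 0;
    }
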
+ */ +- util_est_enqueue(&rq->cfs, p); ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & ENQUEUE_RESTORE)))) ++ util_est_enqueue(&rq->cfs, p); ++ ++ if (flags & ENQUEUE_DELAYED) { ++ requeue_delayed_entity(se); ++ return; ++ } + + /* + * If in_iowait is set, the code below may not trigger any cpufreq +@@ -6790,10 +6996,24 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + cpufreq_update_util(rq, SCHED_CPUFREQ_IOWAIT); + + for_each_sched_entity(se) { +- if (se->on_rq) ++ if (se->on_rq) { ++ if (se->sched_delayed) ++ requeue_delayed_entity(se); + break; ++ } + cfs_rq = cfs_rq_of(se); ++ ++ /* ++ * Basically set the slice of group entries to the min_slice of ++ * their respective cfs_rq. This ensures the group can service ++ * its entities in the desired time-frame. ++ */ ++ if (slice) { ++ se->slice = slice; ++ se->custom_slice = 1; ++ } + enqueue_entity(cfs_rq, se, flags); ++ slice = cfs_rq_min_slice(cfs_rq); + + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; +@@ -6815,6 +7035,9 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ + cfs_rq->h_nr_running++; + cfs_rq->idle_h_nr_running += idle_h_nr_running; + +@@ -6826,6 +7049,13 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + goto enqueue_throttle; + } + ++ if (!rq_h_nr_running && rq->cfs.h_nr_running) { ++ /* Account for idle runtime */ ++ if (!rq->nr_running) ++ dl_server_update_idle_time(rq, rq->curr); ++ dl_server_start(&rq->fair_server); ++ } ++ + /* At this point se is NULL and we are at root level*/ + add_nr_running(rq, 1); + +@@ -6855,36 +7085,59 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + static void set_next_buddy(struct sched_entity *se); + + /* +- * The dequeue_task method is called before nr_running is +- * decreased. We remove the task from the rbtree and +- * update the fair scheduling stats: ++ * Basically dequeue_task_fair(), except it can deal with dequeue_entity() ++ * failing half-way through and resume the dequeue later. 
++ * ++ * Returns: ++ * -1 - dequeue delayed ++ * 0 - dequeue throttled ++ * 1 - dequeue complete + */ +-static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++static int dequeue_entities(struct rq *rq, struct sched_entity *se, int flags) + { +- struct cfs_rq *cfs_rq; +- struct sched_entity *se = &p->se; +- int task_sleep = flags & DEQUEUE_SLEEP; +- int idle_h_nr_running = task_has_idle_policy(p); + bool was_sched_idle = sched_idle_rq(rq); ++ int rq_h_nr_running = rq->cfs.h_nr_running; ++ bool task_sleep = flags & DEQUEUE_SLEEP; ++ bool task_delayed = flags & DEQUEUE_DELAYED; ++ struct task_struct *p = NULL; ++ int idle_h_nr_running = 0; ++ int h_nr_running = 0; ++ struct cfs_rq *cfs_rq; ++ u64 slice = 0; + +- util_est_dequeue(&rq->cfs, p); ++ if (entity_is_task(se)) { ++ p = task_of(se); ++ h_nr_running = 1; ++ idle_h_nr_running = task_has_idle_policy(p); ++ } else { ++ cfs_rq = group_cfs_rq(se); ++ slice = cfs_rq_min_slice(cfs_rq); ++ } + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +- dequeue_entity(cfs_rq, se, flags); + +- cfs_rq->h_nr_running--; ++ if (!dequeue_entity(cfs_rq, se, flags)) { ++ if (p && &p->se == se) ++ return -1; ++ ++ break; ++ } ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; ++ return 0; + + /* Don't dequeue parent if it has other entities besides us */ + if (cfs_rq->load.weight) { ++ slice = cfs_rq_min_slice(cfs_rq); ++ + /* Avoid re-evaluating load for this entity: */ + se = parent_entity(se); + /* +@@ -6896,6 +7149,7 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + break; + } + flags |= DEQUEUE_SLEEP; ++ flags &= ~(DEQUEUE_DELAYED | DEQUEUE_SPECIAL); + } + + for_each_sched_entity(se) { +@@ -6905,28 +7159,61 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + se_update_runnable(se); + update_cfs_group(se); + +- cfs_rq->h_nr_running--; ++ se->slice = slice; ++ slice = cfs_rq_min_slice(cfs_rq); ++ ++ cfs_rq->h_nr_running -= h_nr_running; + cfs_rq->idle_h_nr_running -= idle_h_nr_running; + + if (cfs_rq_is_idle(cfs_rq)) +- idle_h_nr_running = 1; ++ idle_h_nr_running = h_nr_running; + + /* end evaluation on encountering a throttled cfs_rq */ + if (cfs_rq_throttled(cfs_rq)) +- goto dequeue_throttle; +- ++ return 0; + } + +- /* At this point se is NULL and we are at root level*/ +- sub_nr_running(rq, 1); ++ sub_nr_running(rq, h_nr_running); ++ ++ if (rq_h_nr_running && !rq->cfs.h_nr_running) ++ dl_server_stop(&rq->fair_server); + + /* balance early to pull high priority tasks */ + if (unlikely(!was_sched_idle && sched_idle_rq(rq))) + rq->next_balance = jiffies; + +-dequeue_throttle: +- util_est_update(&rq->cfs, p, task_sleep); ++ if (p && task_delayed) { ++ SCHED_WARN_ON(!task_sleep); ++ SCHED_WARN_ON(p->on_rq != 1); ++ ++ /* Fix-up what dequeue_task_fair() skipped */ ++ hrtick_update(rq); ++ ++ /* Fix-up what block_task() skipped. */ ++ __block_task(rq, p); ++ } ++ ++ return 1; ++} ++ ++/* ++ * The dequeue_task method is called before nr_running is ++ * decreased. 
We remove the task from the rbtree and ++ * update the fair scheduling stats: ++ */ ++static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) ++{ ++ if (!(p->se.sched_delayed && (task_on_rq_migrating(p) || (flags & DEQUEUE_SAVE)))) ++ util_est_dequeue(&rq->cfs, p); ++ ++ if (dequeue_entities(rq, &p->se, flags) < 0) { ++ util_est_update(&rq->cfs, p, DEQUEUE_SLEEP); ++ return false; ++ } ++ ++ util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); + hrtick_update(rq); ++ return true; + } + + #ifdef CONFIG_SMP +@@ -7824,6 +8111,105 @@ static unsigned long cpu_util_without(int cpu, struct task_struct *p) + return cpu_util(cpu, p, -1, 0); + } + ++/* ++ * This function computes an effective utilization for the given CPU, to be ++ * used for frequency selection given the linear relation: f = u * f_max. ++ * ++ * The scheduler tracks the following metrics: ++ * ++ * cpu_util_{cfs,rt,dl,irq}() ++ * cpu_bw_dl() ++ * ++ * Where the cfs,rt and dl util numbers are tracked with the same metric and ++ * synchronized windows and are thus directly comparable. ++ * ++ * The cfs,rt,dl utilization are the running times measured with rq->clock_task ++ * which excludes things like IRQ and steal-time. These latter are then accrued ++ * in the IRQ utilization. ++ * ++ * The DL bandwidth number OTOH is not a measured metric but a value computed ++ * based on the task model parameters and gives the minimal utilization ++ * required to meet deadlines. ++ */ ++unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, ++ unsigned long *min, ++ unsigned long *max) ++{ ++ unsigned long util, irq, scale; ++ struct rq *rq = cpu_rq(cpu); ++ ++ scale = arch_scale_cpu_capacity(cpu); ++ ++ /* ++ * Early check to see if IRQ/steal time saturates the CPU, can be ++ * because of inaccuracies in how we track these -- see ++ * update_irq_load_avg(). ++ */ ++ irq = cpu_util_irq(rq); ++ if (unlikely(irq >= scale)) { ++ if (min) ++ *min = scale; ++ if (max) ++ *max = scale; ++ return scale; ++ } ++ ++ if (min) { ++ /* ++ * The minimum utilization returns the highest level between: ++ * - the computed DL bandwidth needed with the IRQ pressure which ++ * steals time to the deadline task. ++ * - The minimum performance requirement for CFS and/or RT. ++ */ ++ *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); ++ ++ /* ++ * When an RT task is runnable and uclamp is not used, we must ++ * ensure that the task will run at maximum compute capacity. ++ */ ++ if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) ++ *min = max(*min, scale); ++ } ++ ++ /* ++ * Because the time spend on RT/DL tasks is visible as 'lost' time to ++ * CFS tasks and we use the same metric to track the effective ++ * utilization (PELT windows are synchronized) we can directly add them ++ * to obtain the CPU's actual utilization. ++ */ ++ util = util_cfs + cpu_util_rt(rq); ++ util += cpu_util_dl(rq); ++ ++ /* ++ * The maximum hint is a soft bandwidth requirement, which can be lower ++ * than the actual utilization because of uclamp_max requirements. ++ */ ++ if (max) ++ *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); ++ ++ if (util >= scale) ++ return scale; ++ ++ /* ++ * There is still idle time; further improve the number by using the ++ * IRQ metric. 
Because IRQ/steal time is hidden from the task clock we ++ * need to scale the task numbers: ++ * ++ * max - irq ++ * U' = irq + --------- * U ++ * max ++ */ ++ util = scale_irq_capacity(util, irq, scale); ++ util += irq; ++ ++ return min(scale, util); ++} ++ ++unsigned long sched_cpu_util(int cpu) ++{ ++ return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); ++} ++ + /* + * energy_env - Utilization landscape for energy estimation. + * @task_busy_time: Utilization contribution by the task for which we test the +@@ -8308,7 +8694,21 @@ static void migrate_task_rq_fair(struct task_struct *p, int new_cpu) + + static void task_dead_fair(struct task_struct *p) + { +- remove_entity_load_avg(&p->se); ++ struct sched_entity *se = &p->se; ++ ++ if (se->sched_delayed) { ++ struct rq_flags rf; ++ struct rq *rq; ++ ++ rq = task_rq_lock(p, &rf); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ task_rq_unlock(rq, p, &rf); ++ } ++ ++ remove_entity_load_avg(se); + } + + /* +@@ -8344,7 +8744,7 @@ static void set_cpus_allowed_fair(struct task_struct *p, struct affinity_context + static int + balance_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- if (rq->nr_running) ++ if (sched_fair_runnable(rq)) + return 1; + + return sched_balance_newidle(rq, rf) != 0; +@@ -8430,7 +8830,17 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + cfs_rq = cfs_rq_of(se); + update_curr(cfs_rq); + /* +- * XXX pick_eevdf(cfs_rq) != se ? ++ * If @p has a shorter slice than current and @p is eligible, override ++ * current's slice protection in order to allow preemption. ++ * ++ * Note that even if @p does not turn out to be the most eligible ++ * task at this moment, current's slice protection will be lost. ++ */ ++ if (do_preempt_short(cfs_rq, pse, se) && se->vlag == se->deadline) ++ se->vlag = se->deadline + 1; ++ ++ /* ++ * If @p has become the most eligible task, force preemption. 
+ */ + if (pick_eevdf(cfs_rq) == pse) + goto preempt; +@@ -8441,7 +8851,6 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + resched_curr(rq); + } + +-#ifdef CONFIG_SMP + static struct task_struct *pick_task_fair(struct rq *rq) + { + struct sched_entity *se; +@@ -8453,95 +8862,58 @@ static struct task_struct *pick_task_fair(struct rq *rq) + return NULL; + + do { +- struct sched_entity *curr = cfs_rq->curr; ++ /* Might not have done put_prev_entity() */ ++ if (cfs_rq->curr && cfs_rq->curr->on_rq) ++ update_curr(cfs_rq); + +- /* When we pick for a remote RQ, we'll not have done put_prev_entity() */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; ++ if (unlikely(check_cfs_rq_runtime(cfs_rq))) ++ goto again; + +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) +- goto again; +- } +- +- se = pick_next_entity(cfs_rq); ++ se = pick_next_entity(rq, cfs_rq); ++ if (!se) ++ goto again; + cfs_rq = group_cfs_rq(se); + } while (cfs_rq); + + return task_of(se); + } +-#endif ++ ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first); + + struct task_struct * + pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) + { +- struct cfs_rq *cfs_rq = &rq->cfs; + struct sched_entity *se; + struct task_struct *p; + int new_tasks; + + again: +- if (!sched_fair_runnable(rq)) ++ p = pick_task_fair(rq); ++ if (!p) + goto idle; ++ se = &p->se; + + #ifdef CONFIG_FAIR_GROUP_SCHED +- if (!prev || prev->sched_class != &fair_sched_class) ++ if (prev->sched_class != &fair_sched_class) + goto simple; + ++ __put_prev_set_next_dl_server(rq, prev, p); ++ + /* + * Because of the set_next_buddy() in dequeue_task_fair() it is rather + * likely that a next task is from the same cgroup as the current. + * + * Therefore attempt to avoid putting and setting the entire cgroup + * hierarchy, only change the part that actually changes. +- */ +- +- do { +- struct sched_entity *curr = cfs_rq->curr; +- +- /* +- * Since we got here without doing put_prev_entity() we also +- * have to consider cfs_rq->curr. If it is still a runnable +- * entity, update_curr() will update its vruntime, otherwise +- * forget we've ever seen it. +- */ +- if (curr) { +- if (curr->on_rq) +- update_curr(cfs_rq); +- else +- curr = NULL; +- +- /* +- * This call to check_cfs_rq_runtime() will do the +- * throttle and dequeue its entity in the parent(s). +- * Therefore the nr_running test will indeed +- * be correct. +- */ +- if (unlikely(check_cfs_rq_runtime(cfs_rq))) { +- cfs_rq = &rq->cfs; +- +- if (!cfs_rq->nr_running) +- goto idle; +- +- goto simple; +- } +- } +- +- se = pick_next_entity(cfs_rq); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); +- +- p = task_of(se); +- +- /* ++ * + * Since we haven't yet done put_prev_entity and if the selected task + * is a different task than we started out with, try and touch the + * least amount of cfs_rqs. 
+ */ + if (prev != p) { + struct sched_entity *pse = &prev->se; ++ struct cfs_rq *cfs_rq; + + while (!(cfs_rq = is_same_group(se, pse))) { + int se_depth = se->depth; +@@ -8559,38 +8931,15 @@ pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf + + put_prev_entity(cfs_rq, pse); + set_next_entity(cfs_rq, se); +- } +- +- goto done; +-simple: +-#endif +- if (prev) +- put_prev_task(rq, prev); + +- do { +- se = pick_next_entity(cfs_rq); +- set_next_entity(cfs_rq, se); +- cfs_rq = group_cfs_rq(se); +- } while (cfs_rq); ++ __set_next_task_fair(rq, p, true); ++ } + +- p = task_of(se); ++ return p; + +-done: __maybe_unused; +-#ifdef CONFIG_SMP +- /* +- * Move the next running task to the front of +- * the list, so our cfs_tasks list becomes MRU +- * one. +- */ +- list_move(&p->se.group_node, &rq->cfs_tasks); ++simple: + #endif +- +- if (hrtick_enabled_fair(rq)) +- hrtick_start_fair(rq, p); +- +- update_misfit_status(p, rq); +- sched_fair_update_stop_tick(rq, p); +- ++ put_prev_set_next_task(rq, prev, p); + return p; + + idle: +@@ -8619,15 +8968,34 @@ done: __maybe_unused; + return NULL; + } + +-static struct task_struct *__pick_next_task_fair(struct rq *rq) ++static struct task_struct *__pick_next_task_fair(struct rq *rq, struct task_struct *prev) ++{ ++ return pick_next_task_fair(rq, prev, NULL); ++} ++ ++static bool fair_server_has_tasks(struct sched_dl_entity *dl_se) ++{ ++ return !!dl_se->rq->cfs.nr_running; ++} ++ ++static struct task_struct *fair_server_pick_task(struct sched_dl_entity *dl_se) + { +- return pick_next_task_fair(rq, NULL, NULL); ++ return pick_task_fair(dl_se->rq); ++} ++ ++void fair_server_init(struct rq *rq) ++{ ++ struct sched_dl_entity *dl_se = &rq->fair_server; ++ ++ init_dl_entity(dl_se); ++ ++ dl_server_init(dl_se, rq, fair_server_has_tasks, fair_server_pick_task); + } + + /* + * Account for a descheduled task: + */ +-static void put_prev_task_fair(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_fair(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + struct sched_entity *se = &prev->se; + struct cfs_rq *cfs_rq; +@@ -12721,22 +13089,7 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + */ + static void task_fork_fair(struct task_struct *p) + { +- struct sched_entity *se = &p->se, *curr; +- struct cfs_rq *cfs_rq; +- struct rq *rq = this_rq(); +- struct rq_flags rf; +- +- rq_lock(rq, &rf); +- update_rq_clock(rq); +- + set_task_max_allowed_capacity(p); +- +- cfs_rq = task_cfs_rq(current); +- curr = cfs_rq->curr; +- if (curr) +- update_curr(cfs_rq); +- place_entity(cfs_rq, se, ENQUEUE_INITIAL); +- rq_unlock(rq, &rf); + } + + /* +@@ -12848,10 +13201,28 @@ static void attach_task_cfs_rq(struct task_struct *p) + static void switched_from_fair(struct rq *rq, struct task_struct *p) + { + detach_task_cfs_rq(p); ++ /* ++ * Since this is called after changing class, this is a little weird ++ * and we cannot use DEQUEUE_DELAYED. ++ */ ++ if (p->se.sched_delayed) { ++ /* First, dequeue it from its new class' structures */ ++ dequeue_task(rq, p, DEQUEUE_NOCLOCK | DEQUEUE_SLEEP); ++ /* ++ * Now, clean up the fair_sched_class side of things ++ * related to sched_delayed being true and that wasn't done ++ * due to the generic dequeue not using DEQUEUE_DELAYED. 
++ */ ++ finish_delayed_dequeue_entity(&p->se); ++ p->se.rel_deadline = 0; ++ __block_task(rq, p); ++ } + } + + static void switched_to_fair(struct rq *rq, struct task_struct *p) + { ++ SCHED_WARN_ON(p->se.sched_delayed); ++ + attach_task_cfs_rq(p); + + set_task_max_allowed_capacity(p); +@@ -12869,12 +13240,7 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + } + } + +-/* Account for a task changing its policy or group. +- * +- * This routine is mostly called to set cfs_rq->curr field when a task +- * migrates between groups/classes. +- */ +-static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++static void __set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + { + struct sched_entity *se = &p->se; + +@@ -12887,6 +13253,27 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + list_move(&se->group_node, &rq->cfs_tasks); + } + #endif ++ if (!first) ++ return; ++ ++ SCHED_WARN_ON(se->sched_delayed); ++ ++ if (hrtick_enabled_fair(rq)) ++ hrtick_start_fair(rq, p); ++ ++ update_misfit_status(p, rq); ++ sched_fair_update_stop_tick(rq, p); ++} ++ ++/* ++ * Account for a task changing its policy or group. ++ * ++ * This routine is mostly called to set cfs_rq->curr field when a task ++ * migrates between groups/classes. ++ */ ++static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) ++{ ++ struct sched_entity *se = &p->se; + + for_each_sched_entity(se) { + struct cfs_rq *cfs_rq = cfs_rq_of(se); +@@ -12895,12 +13282,14 @@ static void set_next_task_fair(struct rq *rq, struct task_struct *p, bool first) + /* ensure bandwidth has been allocated on our new cfs_rq */ + account_cfs_rq_runtime(cfs_rq, 0); + } ++ ++ __set_next_task_fair(rq, p, first); + } + + void init_cfs_rq(struct cfs_rq *cfs_rq) + { + cfs_rq->tasks_timeline = RB_ROOT_CACHED; +- u64_u32_store(cfs_rq->min_vruntime, (u64)(-(1LL << 20))); ++ cfs_rq->min_vruntime = (u64)(-(1LL << 20)); + #ifdef CONFIG_SMP + raw_spin_lock_init(&cfs_rq->removed.lock); + #endif +@@ -13002,28 +13391,35 @@ void online_fair_sched_group(struct task_group *tg) + + void unregister_fair_sched_group(struct task_group *tg) + { +- unsigned long flags; +- struct rq *rq; + int cpu; + + destroy_cfs_bandwidth(tg_cfs_bandwidth(tg)); + + for_each_possible_cpu(cpu) { +- if (tg->se[cpu]) +- remove_entity_load_avg(tg->se[cpu]); ++ struct cfs_rq *cfs_rq = tg->cfs_rq[cpu]; ++ struct sched_entity *se = tg->se[cpu]; ++ struct rq *rq = cpu_rq(cpu); ++ ++ if (se) { ++ if (se->sched_delayed) { ++ guard(rq_lock_irqsave)(rq); ++ if (se->sched_delayed) { ++ update_rq_clock(rq); ++ dequeue_entities(rq, se, DEQUEUE_SLEEP | DEQUEUE_DELAYED); ++ } ++ list_del_leaf_cfs_rq(cfs_rq); ++ } ++ remove_entity_load_avg(se); ++ } + + /* + * Only empty task groups can be destroyed; so we can speculatively + * check on_list without danger of it being re-added. 
+ */ +- if (!tg->cfs_rq[cpu]->on_list) +- continue; +- +- rq = cpu_rq(cpu); +- +- raw_spin_rq_lock_irqsave(rq, flags); +- list_del_leaf_cfs_rq(tg->cfs_rq[cpu]); +- raw_spin_rq_unlock_irqrestore(rq, flags); ++ if (cfs_rq->on_list) { ++ guard(rq_lock_irqsave)(rq); ++ list_del_leaf_cfs_rq(cfs_rq); ++ } + } + } + +@@ -13213,13 +13609,13 @@ DEFINE_SCHED_CLASS(fair) = { + + .wakeup_preempt = check_preempt_wakeup_fair, + ++ .pick_task = pick_task_fair, + .pick_next_task = __pick_next_task_fair, + .put_prev_task = put_prev_task_fair, + .set_next_task = set_next_task_fair, + + #ifdef CONFIG_SMP + .balance = balance_fair, +- .pick_task = pick_task_fair, + .select_task_rq = select_task_rq_fair, + .migrate_task_rq = migrate_task_rq_fair, + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 143f55df890b..290874079f60 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,8 +5,24 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++/* ++ * Give new tasks half a slice to ease into the competition. ++ */ + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) ++/* ++ * Preserve relative virtual deadline on 'migration'. ++ */ ++SCHED_FEAT(PLACE_REL_DEADLINE, true) ++/* ++ * Inhibit (wakeup) preemption until the current task has either matched the ++ * 0-lag point or until is has exhausted it's slice. ++ */ + SCHED_FEAT(RUN_TO_PARITY, true) ++/* ++ * Allow wakeup of tasks with a shorter slice to cancel RESPECT_SLICE for ++ * current. ++ */ ++SCHED_FEAT(PREEMPT_SHORT, true) + + /* + * Prefer to schedule the task we woke last (assuming it failed +@@ -21,6 +37,18 @@ SCHED_FEAT(NEXT_BUDDY, false) + */ + SCHED_FEAT(CACHE_HOT_BUDDY, true) + ++/* ++ * Delay dequeueing tasks until they get selected or woken. ++ * ++ * By delaying the dequeue for non-eligible tasks, they remain in the ++ * competition and can burn off their negative lag. When they get selected ++ * they'll have positive lag by definition. ++ * ++ * DELAY_ZERO clips the lag on dequeue (or wakeup) to 0. 
++ */ ++SCHED_FEAT(DELAY_DEQUEUE, true) ++SCHED_FEAT(DELAY_ZERO, true) ++ + /* + * Allow wakeup-time preemption of the current task: + */ +@@ -85,5 +113,3 @@ SCHED_FEAT(WA_BIAS, true) + SCHED_FEAT(UTIL_EST, true) + + SCHED_FEAT(LATENCY_WARN, false) +- +-SCHED_FEAT(HZ_BW, true) +diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c +index 6e78d071beb5..7a105a0123aa 100644 +--- a/kernel/sched/idle.c ++++ b/kernel/sched/idle.c +@@ -450,43 +450,35 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) + resched_curr(rq); + } + +-static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_idle(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { ++ dl_server_update_idle_time(rq, prev); + } + + static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) + { + update_idle_core(rq); + schedstat_inc(rq->sched_goidle); ++ next->se.exec_start = rq_clock_task(rq); + } + +-#ifdef CONFIG_SMP +-static struct task_struct *pick_task_idle(struct rq *rq) ++struct task_struct *pick_task_idle(struct rq *rq) + { + return rq->idle; + } +-#endif +- +-struct task_struct *pick_next_task_idle(struct rq *rq) +-{ +- struct task_struct *next = rq->idle; +- +- set_next_task_idle(rq, next, true); +- +- return next; +-} + + /* + * It is not legal to sleep in the idle task - print a warning + * message if some code attempts to do it: + */ +-static void ++static bool + dequeue_task_idle(struct rq *rq, struct task_struct *p, int flags) + { + raw_spin_rq_unlock_irq(rq); + printk(KERN_ERR "bad: scheduling from the idle thread!\n"); + dump_stack(); + raw_spin_rq_lock_irq(rq); ++ return true; + } + + /* +@@ -528,13 +520,12 @@ DEFINE_SCHED_CLASS(idle) = { + + .wakeup_preempt = wakeup_preempt_idle, + +- .pick_next_task = pick_next_task_idle, ++ .pick_task = pick_task_idle, + .put_prev_task = put_prev_task_idle, + .set_next_task = set_next_task_idle, + + #ifdef CONFIG_SMP + .balance = balance_idle, +- .pick_task = pick_task_idle, + .select_task_rq = select_task_rq_idle, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c +index 310523c1b9e3..172c588de542 100644 +--- a/kernel/sched/rt.c ++++ b/kernel/sched/rt.c +@@ -8,10 +8,6 @@ int sched_rr_timeslice = RR_TIMESLICE; + /* More than 4 hours if BW_SHIFT equals 20. */ + static const u64 max_rt_runtime = MAX_BW; + +-static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); +- +-struct rt_bandwidth def_rt_bandwidth; +- + /* + * period over which we measure -rt task CPU usage in us. 
+ * default: 1s +@@ -66,6 +62,40 @@ static int __init sched_rt_sysctl_init(void) + late_initcall(sched_rt_sysctl_init); + #endif + ++void init_rt_rq(struct rt_rq *rt_rq) ++{ ++ struct rt_prio_array *array; ++ int i; ++ ++ array = &rt_rq->active; ++ for (i = 0; i < MAX_RT_PRIO; i++) { ++ INIT_LIST_HEAD(array->queue + i); ++ __clear_bit(i, array->bitmap); ++ } ++ /* delimiter for bitsearch: */ ++ __set_bit(MAX_RT_PRIO, array->bitmap); ++ ++#if defined CONFIG_SMP ++ rt_rq->highest_prio.curr = MAX_RT_PRIO-1; ++ rt_rq->highest_prio.next = MAX_RT_PRIO-1; ++ rt_rq->overloaded = 0; ++ plist_head_init(&rt_rq->pushable_tasks); ++#endif /* CONFIG_SMP */ ++ /* We start is dequeued state, because no RT tasks are queued */ ++ rt_rq->rt_queued = 0; ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ rt_rq->rt_time = 0; ++ rt_rq->rt_throttled = 0; ++ rt_rq->rt_runtime = 0; ++ raw_spin_lock_init(&rt_rq->rt_runtime_lock); ++#endif ++} ++ ++#ifdef CONFIG_RT_GROUP_SCHED ++ ++static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun); ++ + static enum hrtimer_restart sched_rt_period_timer(struct hrtimer *timer) + { + struct rt_bandwidth *rt_b = +@@ -130,35 +160,6 @@ static void start_rt_bandwidth(struct rt_bandwidth *rt_b) + do_start_rt_bandwidth(rt_b); + } + +-void init_rt_rq(struct rt_rq *rt_rq) +-{ +- struct rt_prio_array *array; +- int i; +- +- array = &rt_rq->active; +- for (i = 0; i < MAX_RT_PRIO; i++) { +- INIT_LIST_HEAD(array->queue + i); +- __clear_bit(i, array->bitmap); +- } +- /* delimiter for bit-search: */ +- __set_bit(MAX_RT_PRIO, array->bitmap); +- +-#if defined CONFIG_SMP +- rt_rq->highest_prio.curr = MAX_RT_PRIO-1; +- rt_rq->highest_prio.next = MAX_RT_PRIO-1; +- rt_rq->overloaded = 0; +- plist_head_init(&rt_rq->pushable_tasks); +-#endif /* CONFIG_SMP */ +- /* We start is dequeued state, because no RT tasks are queued */ +- rt_rq->rt_queued = 0; +- +- rt_rq->rt_time = 0; +- rt_rq->rt_throttled = 0; +- rt_rq->rt_runtime = 0; +- raw_spin_lock_init(&rt_rq->rt_runtime_lock); +-} +- +-#ifdef CONFIG_RT_GROUP_SCHED + static void destroy_rt_bandwidth(struct rt_bandwidth *rt_b) + { + hrtimer_cancel(&rt_b->rt_period_timer); +@@ -195,7 +196,6 @@ void unregister_rt_sched_group(struct task_group *tg) + { + if (tg->rt_se) + destroy_rt_bandwidth(&tg->rt_bandwidth); +- + } + + void free_rt_sched_group(struct task_group *tg) +@@ -253,8 +253,7 @@ int alloc_rt_sched_group(struct task_group *tg, struct task_group *parent) + if (!tg->rt_se) + goto err; + +- init_rt_bandwidth(&tg->rt_bandwidth, +- ktime_to_ns(def_rt_bandwidth.rt_period), 0); ++ init_rt_bandwidth(&tg->rt_bandwidth, ktime_to_ns(global_rt_period()), 0); + + for_each_possible_cpu(i) { + rt_rq = kzalloc_node(sizeof(struct rt_rq), +@@ -604,70 +603,6 @@ static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) + return &rt_rq->tg->rt_bandwidth; + } + +-#else /* !CONFIG_RT_GROUP_SCHED */ +- +-static inline u64 sched_rt_runtime(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_runtime; +-} +- +-static inline u64 sched_rt_period(struct rt_rq *rt_rq) +-{ +- return ktime_to_ns(def_rt_bandwidth.rt_period); +-} +- +-typedef struct rt_rq *rt_rq_iter_t; +- +-#define for_each_rt_rq(rt_rq, iter, rq) \ +- for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) +- +-#define for_each_sched_rt_entity(rt_se) \ +- for (; rt_se; rt_se = NULL) +- +-static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) +-{ +- return NULL; +-} +- +-static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) +-{ +- struct rq *rq = rq_of_rt_rq(rt_rq); +- +- if 
(!rt_rq->rt_nr_running) +- return; +- +- enqueue_top_rt_rq(rt_rq); +- resched_curr(rq); +-} +- +-static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) +-{ +- dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); +-} +- +-static inline int rt_rq_throttled(struct rt_rq *rt_rq) +-{ +- return rt_rq->rt_throttled; +-} +- +-static inline const struct cpumask *sched_rt_period_mask(void) +-{ +- return cpu_online_mask; +-} +- +-static inline +-struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) +-{ +- return &cpu_rq(cpu)->rt; +-} +- +-static inline struct rt_bandwidth *sched_rt_bandwidth(struct rt_rq *rt_rq) +-{ +- return &def_rt_bandwidth; +-} +- +-#endif /* CONFIG_RT_GROUP_SCHED */ +- + bool sched_rt_bandwidth_account(struct rt_rq *rt_rq) + { + struct rt_bandwidth *rt_b = sched_rt_bandwidth(rt_rq); +@@ -859,7 +794,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + const struct cpumask *span; + + span = sched_rt_period_mask(); +-#ifdef CONFIG_RT_GROUP_SCHED ++ + /* + * FIXME: isolated CPUs should really leave the root task group, + * whether they are isolcpus or were isolated via cpusets, lest +@@ -871,7 +806,7 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + */ + if (rt_b == &root_task_group.rt_bandwidth) + span = cpu_online_mask; +-#endif ++ + for_each_cpu(i, span) { + int enqueue = 0; + struct rt_rq *rt_rq = sched_rt_period_rt_rq(rt_b, i); +@@ -938,18 +873,6 @@ static int do_sched_rt_period_timer(struct rt_bandwidth *rt_b, int overrun) + return idle; + } + +-static inline int rt_se_prio(struct sched_rt_entity *rt_se) +-{ +-#ifdef CONFIG_RT_GROUP_SCHED +- struct rt_rq *rt_rq = group_rt_rq(rt_se); +- +- if (rt_rq) +- return rt_rq->highest_prio.curr; +-#endif +- +- return rt_task_of(rt_se)->prio; +-} +- + static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + { + u64 runtime = sched_rt_runtime(rt_rq); +@@ -993,6 +916,72 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + return 0; + } + ++#else /* !CONFIG_RT_GROUP_SCHED */ ++ ++typedef struct rt_rq *rt_rq_iter_t; ++ ++#define for_each_rt_rq(rt_rq, iter, rq) \ ++ for ((void) iter, rt_rq = &rq->rt; rt_rq; rt_rq = NULL) ++ ++#define for_each_sched_rt_entity(rt_se) \ ++ for (; rt_se; rt_se = NULL) ++ ++static inline struct rt_rq *group_rt_rq(struct sched_rt_entity *rt_se) ++{ ++ return NULL; ++} ++ ++static inline void sched_rt_rq_enqueue(struct rt_rq *rt_rq) ++{ ++ struct rq *rq = rq_of_rt_rq(rt_rq); ++ ++ if (!rt_rq->rt_nr_running) ++ return; ++ ++ enqueue_top_rt_rq(rt_rq); ++ resched_curr(rq); ++} ++ ++static inline void sched_rt_rq_dequeue(struct rt_rq *rt_rq) ++{ ++ dequeue_top_rt_rq(rt_rq, rt_rq->rt_nr_running); ++} ++ ++static inline int rt_rq_throttled(struct rt_rq *rt_rq) ++{ ++ return false; ++} ++ ++static inline const struct cpumask *sched_rt_period_mask(void) ++{ ++ return cpu_online_mask; ++} ++ ++static inline ++struct rt_rq *sched_rt_period_rt_rq(struct rt_bandwidth *rt_b, int cpu) ++{ ++ return &cpu_rq(cpu)->rt; ++} ++ ++#ifdef CONFIG_SMP ++static void __enable_runtime(struct rq *rq) { } ++static void __disable_runtime(struct rq *rq) { } ++#endif ++ ++#endif /* CONFIG_RT_GROUP_SCHED */ ++ ++static inline int rt_se_prio(struct sched_rt_entity *rt_se) ++{ ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct rt_rq *rt_rq = group_rt_rq(rt_se); ++ ++ if (rt_rq) ++ return rt_rq->highest_prio.curr; ++#endif ++ ++ return rt_task_of(rt_se)->prio; ++} ++ + /* + * Update the current task's runtime statistics. 
Skip current tasks that + * are not in our scheduling class. +@@ -1000,7 +989,6 @@ static int sched_rt_runtime_exceeded(struct rt_rq *rt_rq) + static void update_curr_rt(struct rq *rq) + { + struct task_struct *curr = rq->curr; +- struct sched_rt_entity *rt_se = &curr->rt; + s64 delta_exec; + + if (curr->sched_class != &rt_sched_class) +@@ -1010,6 +998,9 @@ static void update_curr_rt(struct rq *rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_RT_GROUP_SCHED ++ struct sched_rt_entity *rt_se = &curr->rt; ++ + if (!rt_bandwidth_enabled()) + return; + +@@ -1028,6 +1019,7 @@ static void update_curr_rt(struct rq *rq) + do_start_rt_bandwidth(sched_rt_bandwidth(rt_rq)); + } + } ++#endif + } + + static void +@@ -1184,7 +1176,6 @@ dec_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + static void + inc_rt_group(struct sched_rt_entity *rt_se, struct rt_rq *rt_rq) + { +- start_rt_bandwidth(&def_rt_bandwidth); + } + + static inline +@@ -1492,7 +1483,7 @@ enqueue_task_rt(struct rq *rq, struct task_struct *p, int flags) + enqueue_pushable_task(rq, p); + } + +-static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) ++static bool dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + { + struct sched_rt_entity *rt_se = &p->rt; + +@@ -1500,6 +1491,8 @@ static void dequeue_task_rt(struct rq *rq, struct task_struct *p, int flags) + dequeue_rt_entity(rt_se, flags); + + dequeue_pushable_task(rq, p); ++ ++ return true; + } + + /* +@@ -1755,17 +1748,7 @@ static struct task_struct *pick_task_rt(struct rq *rq) + return p; + } + +-static struct task_struct *pick_next_task_rt(struct rq *rq) +-{ +- struct task_struct *p = pick_task_rt(rq); +- +- if (p) +- set_next_task_rt(rq, p, true); +- +- return p; +-} +- +-static void put_prev_task_rt(struct rq *rq, struct task_struct *p) ++static void put_prev_task_rt(struct rq *rq, struct task_struct *p, struct task_struct *next) + { + struct sched_rt_entity *rt_se = &p->rt; + struct rt_rq *rt_rq = &rq->rt; +@@ -2652,13 +2635,12 @@ DEFINE_SCHED_CLASS(rt) = { + + .wakeup_preempt = wakeup_preempt_rt, + +- .pick_next_task = pick_next_task_rt, ++ .pick_task = pick_task_rt, + .put_prev_task = put_prev_task_rt, + .set_next_task = set_next_task_rt, + + #ifdef CONFIG_SMP + .balance = balance_rt, +- .pick_task = pick_task_rt, + .select_task_rq = select_task_rq_rt, + .set_cpus_allowed = set_cpus_allowed_common, + .rq_online = rq_online_rt, +@@ -2912,19 +2894,6 @@ int sched_rt_can_attach(struct task_group *tg, struct task_struct *tsk) + #ifdef CONFIG_SYSCTL + static int sched_rt_global_constraints(void) + { +- unsigned long flags; +- int i; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- for_each_possible_cpu(i) { +- struct rt_rq *rt_rq = &cpu_rq(i)->rt; +- +- raw_spin_lock(&rt_rq->rt_runtime_lock); +- rt_rq->rt_runtime = global_rt_runtime(); +- raw_spin_unlock(&rt_rq->rt_runtime_lock); +- } +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); +- + return 0; + } + #endif /* CONFIG_SYSCTL */ +@@ -2944,12 +2913,6 @@ static int sched_rt_global_validate(void) + + static void sched_rt_do_global(void) + { +- unsigned long flags; +- +- raw_spin_lock_irqsave(&def_rt_bandwidth.rt_runtime_lock, flags); +- def_rt_bandwidth.rt_runtime = global_rt_runtime(); +- def_rt_bandwidth.rt_period = ns_to_ktime(global_rt_period()); +- raw_spin_unlock_irqrestore(&def_rt_bandwidth.rt_runtime_lock, flags); + } + + static int sched_rt_handler(const struct ctl_table *table, int write, void *buffer, +diff --git 
a/kernel/sched/sched.h b/kernel/sched/sched.h +index 432b43aa091c..10b72dcb57e4 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -68,6 +68,7 @@ + #include + #include + #include ++#include + + #include + #include +@@ -335,7 +336,7 @@ extern bool __checkparam_dl(const struct sched_attr *attr); + extern bool dl_param_changed(struct task_struct *p, const struct sched_attr *attr); + extern int dl_cpuset_cpumask_can_shrink(const struct cpumask *cur, const struct cpumask *trial); + extern int dl_bw_check_overflow(int cpu); +- ++extern s64 dl_scaled_delta_exec(struct rq *rq, struct sched_dl_entity *dl_se, s64 delta_exec); + /* + * SCHED_DEADLINE supports servers (nested scheduling) with the following + * interface: +@@ -361,7 +362,14 @@ extern void dl_server_start(struct sched_dl_entity *dl_se); + extern void dl_server_stop(struct sched_dl_entity *dl_se); + extern void dl_server_init(struct sched_dl_entity *dl_se, struct rq *rq, + dl_server_has_tasks_f has_tasks, +- dl_server_pick_f pick); ++ dl_server_pick_f pick_task); ++ ++extern void dl_server_update_idle_time(struct rq *rq, ++ struct task_struct *p); ++extern void fair_server_init(struct rq *rq); ++extern void __dl_server_attach_root(struct sched_dl_entity *dl_se, struct rq *rq); ++extern int dl_server_apply_params(struct sched_dl_entity *dl_se, ++ u64 runtime, u64 period, bool init); + + #ifdef CONFIG_CGROUP_SCHED + +@@ -599,17 +607,12 @@ struct cfs_rq { + s64 avg_vruntime; + u64 avg_load; + +- u64 exec_clock; + u64 min_vruntime; + #ifdef CONFIG_SCHED_CORE + unsigned int forceidle_seq; + u64 min_vruntime_fi; + #endif + +-#ifndef CONFIG_64BIT +- u64 min_vruntime_copy; +-#endif +- + struct rb_root_cached tasks_timeline; + + /* +@@ -619,10 +622,6 @@ struct cfs_rq { + struct sched_entity *curr; + struct sched_entity *next; + +-#ifdef CONFIG_SCHED_DEBUG +- unsigned int nr_spread_over; +-#endif +- + #ifdef CONFIG_SMP + /* + * CFS load tracking +@@ -726,13 +725,13 @@ struct rt_rq { + #endif /* CONFIG_SMP */ + int rt_queued; + ++#ifdef CONFIG_RT_GROUP_SCHED + int rt_throttled; + u64 rt_time; + u64 rt_runtime; + /* Nests inside the rq lock: */ + raw_spinlock_t rt_runtime_lock; + +-#ifdef CONFIG_RT_GROUP_SCHED + unsigned int rt_nr_boosted; + + struct rq *rq; +@@ -820,6 +819,9 @@ static inline void se_update_runnable(struct sched_entity *se) + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + if (entity_is_task(se)) + return !!se->on_rq; + else +@@ -834,6 +836,9 @@ static inline void se_update_runnable(struct sched_entity *se) { } + + static inline long se_runnable(struct sched_entity *se) + { ++ if (se->sched_delayed) ++ return false; ++ + return !!se->on_rq; + } + +@@ -1044,6 +1049,8 @@ struct rq { + struct rt_rq rt; + struct dl_rq dl; + ++ struct sched_dl_entity fair_server; ++ + #ifdef CONFIG_FAIR_GROUP_SCHED + /* list of leaf cfs_rq on this CPU: */ + struct list_head leaf_cfs_rq_list; +@@ -1059,6 +1066,7 @@ struct rq { + unsigned int nr_uninterruptible; + + struct task_struct __rcu *curr; ++ struct sched_dl_entity *dl_server; + struct task_struct *idle; + struct task_struct *stop; + unsigned long next_balance; +@@ -1158,7 +1166,6 @@ struct rq { + /* latency stats */ + struct sched_info rq_sched_info; + unsigned long long rq_cpu_time; +- /* could above be rq->cfs_rq.exec_clock + rq->rt_rq.rt_runtime ? 
*/ + + /* sys_sched_yield() stats */ + unsigned int yld_count; +@@ -1187,6 +1194,7 @@ struct rq { + /* per rq */ + struct rq *core; + struct task_struct *core_pick; ++ struct sched_dl_entity *core_dl_server; + unsigned int core_enabled; + unsigned int core_sched_seq; + struct rb_root core_tree; +@@ -2247,11 +2255,13 @@ extern const u32 sched_prio_to_wmult[40]; + * + */ + +-#define DEQUEUE_SLEEP 0x01 ++#define DEQUEUE_SLEEP 0x01 /* Matches ENQUEUE_WAKEUP */ + #define DEQUEUE_SAVE 0x02 /* Matches ENQUEUE_RESTORE */ + #define DEQUEUE_MOVE 0x04 /* Matches ENQUEUE_MOVE */ + #define DEQUEUE_NOCLOCK 0x08 /* Matches ENQUEUE_NOCLOCK */ ++#define DEQUEUE_SPECIAL 0x10 + #define DEQUEUE_MIGRATING 0x100 /* Matches ENQUEUE_MIGRATING */ ++#define DEQUEUE_DELAYED 0x200 /* Matches ENQUEUE_DELAYED */ + + #define ENQUEUE_WAKEUP 0x01 + #define ENQUEUE_RESTORE 0x02 +@@ -2267,6 +2277,7 @@ extern const u32 sched_prio_to_wmult[40]; + #endif + #define ENQUEUE_INITIAL 0x80 + #define ENQUEUE_MIGRATING 0x100 ++#define ENQUEUE_DELAYED 0x200 + + #define RETRY_TASK ((void *)-1UL) + +@@ -2285,23 +2296,31 @@ struct sched_class { + #endif + + void (*enqueue_task) (struct rq *rq, struct task_struct *p, int flags); +- void (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); ++ bool (*dequeue_task) (struct rq *rq, struct task_struct *p, int flags); + void (*yield_task) (struct rq *rq); + bool (*yield_to_task)(struct rq *rq, struct task_struct *p); + + void (*wakeup_preempt)(struct rq *rq, struct task_struct *p, int flags); + +- struct task_struct *(*pick_next_task)(struct rq *rq); ++ struct task_struct *(*pick_task)(struct rq *rq); ++ /* ++ * Optional! When implemented pick_next_task() should be equivalent to: ++ * ++ * next = pick_task(); ++ * if (next) { ++ * put_prev_task(prev); ++ * set_next_task_first(next); ++ * } ++ */ ++ struct task_struct *(*pick_next_task)(struct rq *rq, struct task_struct *prev); + +- void (*put_prev_task)(struct rq *rq, struct task_struct *p); ++ void (*put_prev_task)(struct rq *rq, struct task_struct *p, struct task_struct *next); + void (*set_next_task)(struct rq *rq, struct task_struct *p, bool first); + + #ifdef CONFIG_SMP + int (*balance)(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); + int (*select_task_rq)(struct task_struct *p, int task_cpu, int flags); + +- struct task_struct * (*pick_task)(struct rq *rq); +- + void (*migrate_task_rq)(struct task_struct *p, int new_cpu); + + void (*task_woken)(struct rq *this_rq, struct task_struct *task); +@@ -2345,7 +2364,7 @@ struct sched_class { + static inline void put_prev_task(struct rq *rq, struct task_struct *prev) + { + WARN_ON_ONCE(rq->curr != prev); +- prev->sched_class->put_prev_task(rq, prev); ++ prev->sched_class->put_prev_task(rq, prev, NULL); + } + + static inline void set_next_task(struct rq *rq, struct task_struct *next) +@@ -2353,6 +2372,30 @@ static inline void set_next_task(struct rq *rq, struct task_struct *next) + next->sched_class->set_next_task(rq, next, false); + } + ++static inline void ++__put_prev_set_next_dl_server(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ prev->dl_server = NULL; ++ next->dl_server = rq->dl_server; ++ rq->dl_server = NULL; ++} ++ ++static inline void put_prev_set_next_task(struct rq *rq, ++ struct task_struct *prev, ++ struct task_struct *next) ++{ ++ WARN_ON_ONCE(rq->curr != prev); ++ ++ __put_prev_set_next_dl_server(rq, prev, next); ++ ++ if (next == prev) ++ return; ++ ++ prev->sched_class->put_prev_task(rq, prev, next); ++ 
next->sched_class->set_next_task(rq, next, true); ++} + + /* + * Helper to define a sched_class instance; each one is placed in a separate +@@ -2408,7 +2451,7 @@ static inline bool sched_fair_runnable(struct rq *rq) + } + + extern struct task_struct *pick_next_task_fair(struct rq *rq, struct task_struct *prev, struct rq_flags *rf); +-extern struct task_struct *pick_next_task_idle(struct rq *rq); ++extern struct task_struct *pick_task_idle(struct rq *rq); + + #define SCA_CHECK 0x01 + #define SCA_MIGRATE_DISABLE 0x02 +@@ -2515,7 +2558,6 @@ extern void reweight_task(struct task_struct *p, const struct load_weight *lw); + extern void resched_curr(struct rq *rq); + extern void resched_cpu(int cpu); + +-extern struct rt_bandwidth def_rt_bandwidth; + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); + extern bool sched_rt_bandwidth_account(struct rt_rq *rt_rq); + +@@ -2586,6 +2628,19 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) + sched_update_tick_dependency(rq); + } + ++static inline void __block_task(struct rq *rq, struct task_struct *p) ++{ ++ WRITE_ONCE(p->on_rq, 0); ++ ASSERT_EXCLUSIVE_WRITER(p->on_rq); ++ if (p->sched_contributes_to_load) ++ rq->nr_uninterruptible++; ++ ++ if (p->in_iowait) { ++ atomic_inc(&rq->nr_iowait); ++ delayacct_blkio_start(); ++ } ++} ++ + extern void activate_task(struct rq *rq, struct task_struct *p, int flags); + extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + +@@ -3607,7 +3662,7 @@ extern int __sched_setaffinity(struct task_struct *p, struct affinity_context *c + extern void __setscheduler_prio(struct task_struct *p, int prio); + extern void set_load_weight(struct task_struct *p, bool update_load); + extern void enqueue_task(struct rq *rq, struct task_struct *p, int flags); +-extern void dequeue_task(struct rq *rq, struct task_struct *p, int flags); ++extern bool dequeue_task(struct rq *rq, struct task_struct *p, int flags); + + extern void check_class_changed(struct rq *rq, struct task_struct *p, + const struct sched_class *prev_class, +diff --git a/kernel/sched/stop_task.c b/kernel/sched/stop_task.c +index b1b8fe61c532..058dd42e3d9b 100644 +--- a/kernel/sched/stop_task.c ++++ b/kernel/sched/stop_task.c +@@ -41,26 +41,17 @@ static struct task_struct *pick_task_stop(struct rq *rq) + return rq->stop; + } + +-static struct task_struct *pick_next_task_stop(struct rq *rq) +-{ +- struct task_struct *p = pick_task_stop(rq); +- +- if (p) +- set_next_task_stop(rq, p, true); +- +- return p; +-} +- + static void + enqueue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + add_nr_running(rq, 1); + } + +-static void ++static bool + dequeue_task_stop(struct rq *rq, struct task_struct *p, int flags) + { + sub_nr_running(rq, 1); ++ return true; + } + + static void yield_task_stop(struct rq *rq) +@@ -68,7 +59,7 @@ static void yield_task_stop(struct rq *rq) + BUG(); /* the stop task should never yield, its pointless. 
*/ + } + +-static void put_prev_task_stop(struct rq *rq, struct task_struct *prev) ++static void put_prev_task_stop(struct rq *rq, struct task_struct *prev, struct task_struct *next) + { + update_curr_common(rq); + } +@@ -111,13 +102,12 @@ DEFINE_SCHED_CLASS(stop) = { + + .wakeup_preempt = wakeup_preempt_stop, + +- .pick_next_task = pick_next_task_stop, ++ .pick_task = pick_task_stop, + .put_prev_task = put_prev_task_stop, + .set_next_task = set_next_task_stop, + + #ifdef CONFIG_SMP + .balance = balance_stop, +- .pick_task = pick_task_stop, + .select_task_rq = select_task_rq_stop, + .set_cpus_allowed = set_cpus_allowed_common, + #endif +diff --git a/kernel/sched/syscalls.c b/kernel/sched/syscalls.c +index ae1b42775ef9..c62acf509b74 100644 +--- a/kernel/sched/syscalls.c ++++ b/kernel/sched/syscalls.c +@@ -57,7 +57,7 @@ static int effective_prio(struct task_struct *p) + * keep the priority unchanged. Otherwise, update priority + * to the normal priority: + */ +- if (!rt_prio(p->prio)) ++ if (!rt_or_dl_prio(p->prio)) + return p->normal_prio; + return p->prio; + } +@@ -258,107 +258,6 @@ int sched_core_idle_cpu(int cpu) + + #endif + +-#ifdef CONFIG_SMP +-/* +- * This function computes an effective utilization for the given CPU, to be +- * used for frequency selection given the linear relation: f = u * f_max. +- * +- * The scheduler tracks the following metrics: +- * +- * cpu_util_{cfs,rt,dl,irq}() +- * cpu_bw_dl() +- * +- * Where the cfs,rt and dl util numbers are tracked with the same metric and +- * synchronized windows and are thus directly comparable. +- * +- * The cfs,rt,dl utilization are the running times measured with rq->clock_task +- * which excludes things like IRQ and steal-time. These latter are then accrued +- * in the IRQ utilization. +- * +- * The DL bandwidth number OTOH is not a measured metric but a value computed +- * based on the task model parameters and gives the minimal utilization +- * required to meet deadlines. +- */ +-unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, +- unsigned long *min, +- unsigned long *max) +-{ +- unsigned long util, irq, scale; +- struct rq *rq = cpu_rq(cpu); +- +- scale = arch_scale_cpu_capacity(cpu); +- +- /* +- * Early check to see if IRQ/steal time saturates the CPU, can be +- * because of inaccuracies in how we track these -- see +- * update_irq_load_avg(). +- */ +- irq = cpu_util_irq(rq); +- if (unlikely(irq >= scale)) { +- if (min) +- *min = scale; +- if (max) +- *max = scale; +- return scale; +- } +- +- if (min) { +- /* +- * The minimum utilization returns the highest level between: +- * - the computed DL bandwidth needed with the IRQ pressure which +- * steals time to the deadline task. +- * - The minimum performance requirement for CFS and/or RT. +- */ +- *min = max(irq + cpu_bw_dl(rq), uclamp_rq_get(rq, UCLAMP_MIN)); +- +- /* +- * When an RT task is runnable and uclamp is not used, we must +- * ensure that the task will run at maximum compute capacity. +- */ +- if (!uclamp_is_used() && rt_rq_is_runnable(&rq->rt)) +- *min = max(*min, scale); +- } +- +- /* +- * Because the time spend on RT/DL tasks is visible as 'lost' time to +- * CFS tasks and we use the same metric to track the effective +- * utilization (PELT windows are synchronized) we can directly add them +- * to obtain the CPU's actual utilization. 
+- */ +- util = util_cfs + cpu_util_rt(rq); +- util += cpu_util_dl(rq); +- +- /* +- * The maximum hint is a soft bandwidth requirement, which can be lower +- * than the actual utilization because of uclamp_max requirements. +- */ +- if (max) +- *max = min(scale, uclamp_rq_get(rq, UCLAMP_MAX)); +- +- if (util >= scale) +- return scale; +- +- /* +- * There is still idle time; further improve the number by using the +- * IRQ metric. Because IRQ/steal time is hidden from the task clock we +- * need to scale the task numbers: +- * +- * max - irq +- * U' = irq + --------- * U +- * max +- */ +- util = scale_irq_capacity(util, irq, scale); +- util += irq; +- +- return min(scale, util); +-} +- +-unsigned long sched_cpu_util(int cpu) +-{ +- return effective_cpu_util(cpu, cpu_util_cfs(cpu), NULL, NULL); +-} +-#endif /* CONFIG_SMP */ +- + /** + * find_process_by_pid - find a process with a matching PID value. + * @pid: the pid in question. +@@ -401,10 +300,20 @@ static void __setscheduler_params(struct task_struct *p, + + p->policy = policy; + +- if (dl_policy(policy)) ++ if (dl_policy(policy)) { + __setparam_dl(p, attr); +- else if (fair_policy(policy)) ++ } else if (fair_policy(policy)) { + p->static_prio = NICE_TO_PRIO(attr->sched_nice); ++ if (attr->sched_runtime) { ++ p->se.custom_slice = 1; ++ p->se.slice = clamp_t(u64, attr->sched_runtime, ++ NSEC_PER_MSEC/10, /* HZ=1000 * 10 */ ++ NSEC_PER_MSEC*100); /* HZ=100 / 10 */ ++ } else { ++ p->se.custom_slice = 0; ++ p->se.slice = sysctl_sched_base_slice; ++ } ++ } + + /* + * __sched_setscheduler() ensures attr->sched_priority == 0 when +@@ -700,7 +609,9 @@ int __sched_setscheduler(struct task_struct *p, + * but store a possible modification of reset_on_fork. + */ + if (unlikely(policy == p->policy)) { +- if (fair_policy(policy) && attr->sched_nice != task_nice(p)) ++ if (fair_policy(policy) && ++ (attr->sched_nice != task_nice(p) || ++ (attr->sched_runtime != p->se.slice))) + goto change; + if (rt_policy(policy) && attr->sched_priority != p->rt_priority) + goto change; +@@ -846,6 +757,9 @@ static int _sched_setscheduler(struct task_struct *p, int policy, + .sched_nice = PRIO_TO_NICE(p->static_prio), + }; + ++ if (p->se.custom_slice) ++ attr.sched_runtime = p->se.slice; ++ + /* Fixup the legacy SCHED_RESET_ON_FORK hack. */ + if ((policy != SETPARAM_POLICY) && (policy & SCHED_RESET_ON_FORK)) { + attr.sched_flags |= SCHED_FLAG_RESET_ON_FORK; +@@ -1012,12 +926,14 @@ static int sched_copy_attr(struct sched_attr __user *uattr, struct sched_attr *a + + static void get_params(struct task_struct *p, struct sched_attr *attr) + { +- if (task_has_dl_policy(p)) ++ if (task_has_dl_policy(p)) { + __getparam_dl(p, attr); +- else if (task_has_rt_policy(p)) ++ } else if (task_has_rt_policy(p)) { + attr->sched_priority = p->rt_priority; +- else ++ } else { + attr->sched_nice = task_nice(p); ++ attr->sched_runtime = p->se.slice; ++ } + } + + /** +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 76504b776d03..9748a4c8d668 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -516,6 +516,14 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) + if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) + set_rq_online(rq); + ++ /* ++ * Because the rq is not a task, dl_add_task_root_domain() did not ++ * move the fair server bw to the rd if it already started. ++ * Add it now. 
++ */ ++ if (rq->fair_server.dl_server) ++ __dl_server_attach_root(&rq->fair_server, rq); ++ + rq_unlock_irqrestore(rq, &rf); + + if (old_rd) +diff --git a/kernel/time/hrtimer.c b/kernel/time/hrtimer.c +index b8ee320208d4..f4be3abbb47b 100644 +--- a/kernel/time/hrtimer.c ++++ b/kernel/time/hrtimer.c +@@ -1975,7 +1975,7 @@ static void __hrtimer_init_sleeper(struct hrtimer_sleeper *sl, + * expiry. + */ + if (IS_ENABLED(CONFIG_PREEMPT_RT)) { +- if (task_is_realtime(current) && !(mode & HRTIMER_MODE_SOFT)) ++ if (rt_or_dl_task_policy(current) && !(mode & HRTIMER_MODE_SOFT)) + mode |= HRTIMER_MODE_HARD; + } + +@@ -2075,7 +2075,7 @@ long hrtimer_nanosleep(ktime_t rqtp, const enum hrtimer_mode mode, + u64 slack; + + slack = current->timer_slack_ns; +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + slack = 0; + + hrtimer_init_sleeper_on_stack(&t, clockid, mode); +@@ -2280,7 +2280,7 @@ schedule_hrtimeout_range_clock(ktime_t *expires, u64 delta, + * Override any slack passed by the user if under + * rt contraints. + */ +- if (rt_task(current)) ++ if (rt_or_dl_task(current)) + delta = 0; + + hrtimer_init_sleeper_on_stack(&t, clock_id, mode); +diff --git a/kernel/trace/trace_sched_wakeup.c b/kernel/trace/trace_sched_wakeup.c +index 130ca7e7787e..ae2ace5e515a 100644 +--- a/kernel/trace/trace_sched_wakeup.c ++++ b/kernel/trace/trace_sched_wakeup.c +@@ -547,7 +547,7 @@ probe_wakeup(void *ignore, struct task_struct *p) + * - wakeup_dl handles tasks belonging to sched_dl class only. + */ + if (tracing_dl || (wakeup_dl && !dl_task(p)) || +- (wakeup_rt && !dl_task(p) && !rt_task(p)) || ++ (wakeup_rt && !rt_or_dl_task(p)) || + (!dl_task(p) && (p->prio >= wakeup_prio || p->prio >= current->prio))) + return; + +diff --git a/mm/page-writeback.c b/mm/page-writeback.c +index 3bd08b60a9b3..9bd709077621 100644 +--- a/mm/page-writeback.c ++++ b/mm/page-writeback.c +@@ -426,7 +426,7 @@ static void domain_dirty_limits(struct dirty_throttle_control *dtc) + bg_thresh = (bg_ratio * available_memory) / PAGE_SIZE; + + tsk = current; +- if (rt_task(tsk)) { ++ if (rt_or_dl_task(tsk)) { + bg_thresh += bg_thresh / 4 + global_wb_domain.dirty_limit / 32; + thresh += thresh / 4 + global_wb_domain.dirty_limit / 32; + } +@@ -485,7 +485,7 @@ static unsigned long node_dirty_limit(struct pglist_data *pgdat) + else + dirty = vm_dirty_ratio * node_memory / 100; + +- if (rt_task(tsk)) ++ if (rt_or_dl_task(tsk)) + dirty += dirty / 4; + + /* +diff --git a/mm/page_alloc.c b/mm/page_alloc.c +index f8b4dae35fc3..da29ddf87cd8 100644 +--- a/mm/page_alloc.c ++++ b/mm/page_alloc.c +@@ -4008,7 +4008,7 @@ gfp_to_alloc_flags(gfp_t gfp_mask, unsigned int order) + */ + if (alloc_flags & ALLOC_MIN_RESERVE) + alloc_flags &= ~ALLOC_CPUSET; +- } else if (unlikely(rt_task(current)) && in_task()) ++ } else if (unlikely(rt_or_dl_task(current)) && in_task()) + alloc_flags |= ALLOC_MIN_RESERVE; + + alloc_flags = gfp_to_alloc_flags_cma(gfp_mask, alloc_flags); +-- +2.47.0.rc0 + diff --git a/sys-kernel/git-sources/0003-bbr3.patch b/sys-kernel/gentoo-sources-6.11/0002-bbr3.patch similarity index 98% rename from sys-kernel/git-sources/0003-bbr3.patch rename to sys-kernel/gentoo-sources-6.11/0002-bbr3.patch index f59737d..b106e5e 100644 --- a/sys-kernel/git-sources/0003-bbr3.patch +++ b/sys-kernel/gentoo-sources-6.11/0002-bbr3.patch @@ -1,6 +1,6 @@ -From 76485d8c7c1cc6ab2f9d755ef5bf09ca98a9f81a Mon Sep 17 00:00:00 2001 +From 694e2eec893e51c71b3faa821f561b8c387b3bb7 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 17 Jun 2024 15:16:10 +0200 +Date: Fri, 4 
Oct 2024 17:06:44 +0200 Subject: [PATCH 02/10] bbr3 Signed-off-by: Peter Jung @@ -39,7 +39,7 @@ index 6a5e08b937b3..27aab715490e 100644 fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h -index 7d6b1254c92d..2ce55f444434 100644 +index c0deaafebfdc..d53f042d936e 100644 --- a/include/net/inet_connection_sock.h +++ b/include/net/inet_connection_sock.h @@ -137,8 +137,8 @@ struct inet_connection_sock { @@ -54,7 +54,7 @@ index 7d6b1254c92d..2ce55f444434 100644 #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ diff --git a/include/net/tcp.h b/include/net/tcp.h -index 060e95b331a2..953244eefe7d 100644 +index 196c148fce8a..f37256b8abfd 100644 --- a/include/net/tcp.h +++ b/include/net/tcp.h @@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) @@ -66,7 +66,7 @@ index 060e95b331a2..953244eefe7d 100644 enum tcp_tw_status { TCP_TW_SUCCESS = 0, -@@ -778,6 +780,15 @@ static inline void tcp_fast_path_check(struct sock *sk) +@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) u32 tcp_delack_max(const struct sock *sk); @@ -82,7 +82,7 @@ index 060e95b331a2..953244eefe7d 100644 /* Compute the actual rto_min value */ static inline u32 tcp_rto_min(const struct sock *sk) { -@@ -883,6 +894,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) +@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) return max_t(s64, t1 - t0, 0); } @@ -94,7 +94,7 @@ index 060e95b331a2..953244eefe7d 100644 /* provide the departure time in us unit */ static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) { -@@ -972,9 +988,14 @@ struct tcp_skb_cb { +@@ -973,9 +989,14 @@ struct tcp_skb_cb { /* pkts S/ACKed so far upon tx of skb, incl retrans: */ __u32 delivered; /* start of send pipeline phase */ @@ -111,7 +111,7 @@ index 060e95b331a2..953244eefe7d 100644 } tx; /* only used for outgoing skbs */ union { struct inet_skb_parm h4; -@@ -1078,6 +1099,7 @@ enum tcp_ca_event { +@@ -1087,6 +1108,7 @@ enum tcp_ca_event { CA_EVENT_LOSS, /* loss timeout */ CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ @@ -119,7 +119,7 @@ index 060e95b331a2..953244eefe7d 100644 }; /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ -@@ -1100,7 +1122,11 @@ enum tcp_ca_ack_event_flags { +@@ -1109,7 +1131,11 @@ enum tcp_ca_ack_event_flags { #define TCP_CONG_NON_RESTRICTED 0x1 /* Requires ECN/ECT set on all packets */ #define TCP_CONG_NEEDS_ECN 0x2 @@ -132,7 +132,7 @@ index 060e95b331a2..953244eefe7d 100644 union tcp_cc_info; -@@ -1120,10 +1146,13 @@ struct ack_sample { +@@ -1129,10 +1155,13 @@ struct ack_sample { */ struct rate_sample { u64 prior_mstamp; /* starting timestamp for interval */ @@ -147,7 +147,7 @@ index 060e95b331a2..953244eefe7d 100644 long interval_us; /* time for tp->delivered to incr "delivered" */ u32 snd_interval_us; /* snd interval for delivered packets */ u32 rcv_interval_us; /* rcv interval for delivered packets */ -@@ -1134,7 +1163,9 @@ struct rate_sample { +@@ -1143,7 +1172,9 @@ struct rate_sample { u32 last_end_seq; /* end_seq of most recently ACKed packet */ bool is_app_limited; /* is sample from packet with bubble in pipe? */ bool is_retrans; /* is sample from retransmission? 
*/ @@ -157,7 +157,7 @@ index 060e95b331a2..953244eefe7d 100644 }; struct tcp_congestion_ops { -@@ -1158,8 +1189,11 @@ struct tcp_congestion_ops { +@@ -1167,8 +1198,11 @@ struct tcp_congestion_ops { /* hook for packet ack accounting (optional) */ void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); @@ -171,7 +171,7 @@ index 060e95b331a2..953244eefe7d 100644 /* call when packets are delivered to update cwnd and pacing rate, * after all the ca_state processing. (optional) -@@ -1225,6 +1259,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) +@@ -1234,6 +1268,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) } #endif @@ -186,7 +186,7 @@ index 060e95b331a2..953244eefe7d 100644 static inline bool tcp_ca_needs_ecn(const struct sock *sk) { const struct inet_connection_sock *icsk = inet_csk(sk); -@@ -1244,6 +1286,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) +@@ -1253,6 +1295,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) void tcp_set_ca_state(struct sock *sk, const u8 ca_state); /* From tcp_rate.c */ @@ -194,7 +194,7 @@ index 060e95b331a2..953244eefe7d 100644 void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, struct rate_sample *rs); -@@ -1256,6 +1299,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) +@@ -1265,6 +1308,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) return t1 > t2 || (t1 == t2 && after(seq1, seq2)); } @@ -216,7 +216,7 @@ index 060e95b331a2..953244eefe7d 100644 /* These functions determine how the current flow behaves in respect of SACK * handling. SACK is negotiated with the peer, and therefore it can vary * between different flows. 
-@@ -2418,7 +2476,7 @@ struct tcp_plb_state { +@@ -2416,7 +2474,7 @@ struct tcp_plb_state { u8 consec_cong_rounds:5, /* consecutive congested rounds */ unused:3; u32 pause_until; /* jiffies32 when PLB can resume rerouting */ @@ -324,7 +324,7 @@ index 8e94ed7c56a0..50dc9970cad2 100644 choice prompt "Default TCP congestion control" diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index 18227757ec0c..f180befc28bd 100644 +index 3f88d0961e5b..4273cac333f6 100644 --- a/net/ipv4/bpf_tcp_ca.c +++ b/net/ipv4/bpf_tcp_ca.c @@ -305,11 +305,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp @@ -355,10 +355,10 @@ index 18227757ec0c..f180befc28bd 100644 .undo_cwnd = bpf_tcp_ca_undo_cwnd, .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c -index e6790ea74877..b63e27eba536 100644 +index 831a18dc7aa6..d9faa8fef55e 100644 --- a/net/ipv4/tcp.c +++ b/net/ipv4/tcp.c -@@ -3120,6 +3120,7 @@ int tcp_disconnect(struct sock *sk, int flags) +@@ -3123,6 +3123,7 @@ int tcp_disconnect(struct sock *sk, int flags) tp->rx_opt.dsack = 0; tp->rx_opt.num_sacks = 0; tp->rcv_ooopack = 0; @@ -366,7 +366,7 @@ index e6790ea74877..b63e27eba536 100644 /* Clean up fastopen related fields */ -@@ -3846,6 +3847,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) +@@ -3849,6 +3850,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) info->tcpi_options |= TCPI_OPT_ECN; if (tp->ecn_flags & TCP_ECN_SEEN) info->tcpi_options |= TCPI_OPT_ECN_SEEN; @@ -3020,7 +3020,7 @@ index 760941e55153..a180fa648d5e 100644 MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); +MODULE_VERSION(__stringify(BBR_VERSION)); diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c -index 28ffcfbeef14..7b13915ba288 100644 +index 0306d257fa64..28f581c0dab7 100644 --- a/net/ipv4/tcp_cong.c +++ b/net/ipv4/tcp_cong.c @@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) @@ -3032,10 +3032,10 @@ index 28ffcfbeef14..7b13915ba288 100644 icsk->icsk_ca_ops->init(sk); if (tcp_ca_needs_ecn(sk)) diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c -index 9c04a9c8be9d..2c89efbc8ddf 100644 +index e37488d3453f..62eef7d067c2 100644 --- a/net/ipv4/tcp_input.c +++ b/net/ipv4/tcp_input.c -@@ -365,7 +365,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tcp_enter_quickack_mode(sk, 2); break; case INET_ECN_CE: @@ -3044,7 +3044,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { -@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) tp->ecn_flags |= TCP_ECN_SEEN; break; default: @@ -3053,7 +3053,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); tp->ecn_flags |= TCP_ECN_SEEN; break; -@@ -1115,7 +1115,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) */ static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) { @@ -3066,7 +3066,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 } void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) -@@ -1496,6 +1501,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, +@@ 
-1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); tcp_skb_pcount_add(skb, -pcount); @@ -3084,7 +3084,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 /* When we're adding to gso_segs == 1, gso_size will be zero, * in theory this shouldn't be necessary but as long as DSACK * code can come after this skb later on it's better to keep -@@ -3764,7 +3780,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) +@@ -3799,7 +3815,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) /* This routine deals with acks during a TLP episode and ends an episode by * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack */ @@ -3094,7 +3094,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 { struct tcp_sock *tp = tcp_sk(sk); -@@ -3781,6 +3798,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3816,6 +3833,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) /* ACK advances: there was a loss, so reduce cwnd. Reset * tlp_high_seq in tcp_init_cwnd_reduction() */ @@ -3102,7 +3102,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_init_cwnd_reduction(sk); tcp_set_ca_state(sk, TCP_CA_CWR); tcp_end_cwnd_reduction(sk); -@@ -3791,6 +3809,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) +@@ -3826,6 +3844,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) FLAG_NOT_DUP | FLAG_DATA_SACKED))) { /* Pure dupack: original and TLP probe arrived; no loss */ tp->tlp_high_seq = 0; @@ -3114,7 +3114,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 } } -@@ -3899,6 +3922,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -3934,6 +3957,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; rs.prior_in_flight = tcp_packets_in_flight(tp); @@ -3122,7 +3122,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 /* ts_recent update must be made after we are sure that the packet * is in window. 
-@@ -3973,7 +3997,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4008,7 +4032,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_rack_update_reo_wnd(sk, &rs); if (tp->tlp_high_seq) @@ -3131,7 +3131,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 if (tcp_ack_is_dubious(sk, flag)) { if (!(flag & (FLAG_SND_UNA_ADVANCED | -@@ -3997,6 +4021,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4032,6 +4056,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) delivered = tcp_newly_delivered(sk, delivered, flag); lost = tp->lost - lost; /* freshly marked lost */ rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); @@ -3139,7 +3139,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); tcp_xmit_recovery(sk, rexmit); -@@ -4016,7 +4041,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) +@@ -4051,7 +4076,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) tcp_ack_probe(sk); if (tp->tlp_high_seq) @@ -3148,7 +3148,7 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 return 1; old_ack: -@@ -5671,13 +5696,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) +@@ -5718,13 +5743,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) /* More than one full frame received... */ if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && @@ -3166,10 +3166,10 @@ index 9c04a9c8be9d..2c89efbc8ddf 100644 tcp_in_quickack_mode(sk) || /* Protocol state mandates a one-time immediate ACK */ diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c -index 538c06f95918..e4c861c071ae 100644 +index a19a9dbd3409..e0ef8406a326 100644 --- a/net/ipv4/tcp_minisocks.c +++ b/net/ipv4/tcp_minisocks.c -@@ -460,6 +460,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) +@@ -459,6 +459,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); bool ca_got_dst = false; @@ -3179,7 +3179,7 @@ index 538c06f95918..e4c861c071ae 100644 const struct tcp_congestion_ops *ca; diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 95618d0e78e4..3f4bdd2b6476 100644 +index 16c48df8df4c..6c3a1895238e 100644 --- a/net/ipv4/tcp_output.c +++ b/net/ipv4/tcp_output.c @@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) @@ -3274,7 +3274,7 @@ index 95618d0e78e4..3f4bdd2b6476 100644 } @@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); tcp_init_tso_segs(skb, mss_now); + tcp_set_tx_in_flight(sk, skb); @@ -3370,10 +3370,10 @@ index a8f6d9d06f2e..8737f2134648 100644 rs->interval_us = max(snd_us, ack_us); diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c -index 5bfd76a31af6..0c63590c5fce 100644 +index 4d40615dc8fc..f27941201ef2 100644 --- a/net/ipv4/tcp_timer.c +++ b/net/ipv4/tcp_timer.c -@@ -684,6 +684,7 @@ void tcp_write_timer_handler(struct sock *sk) +@@ -689,6 +689,7 @@ void tcp_write_timer_handler(struct sock *sk) return; } @@ -3382,6 +3382,5 @@ index 5bfd76a31af6..0c63590c5fce 100644 event = icsk->icsk_pending; -- -2.45.2 - +2.47.0.rc0 diff --git 
a/sys-kernel/git-sources/0009-ntsync.patch b/sys-kernel/gentoo-sources-6.11/0007-ntsync.patch similarity index 99% rename from sys-kernel/git-sources/0009-ntsync.patch rename to sys-kernel/gentoo-sources-6.11/0007-ntsync.patch index 436dbf5..9092ec2 100644 --- a/sys-kernel/git-sources/0009-ntsync.patch +++ b/sys-kernel/gentoo-sources-6.11/0007-ntsync.patch @@ -1,7 +1,7 @@ -From de83b2d5a68b825a0741a17cac95dd3690a51162 Mon Sep 17 00:00:00 2001 +From 2087698c3f9af692a9e088307a8f25da094bc7a2 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 17 Jun 2024 15:29:00 +0200 -Subject: [PATCH 09/10] ntsync +Date: Fri, 4 Oct 2024 17:08:21 +0200 +Subject: [PATCH 07/10] ntsync Signed-off-by: Peter Jung --- @@ -24,7 +24,7 @@ Signed-off-by: Peter Jung create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst -index 5926115ec0ed..1e78586662fb 100644 +index 274cc7546efc..9c1b15cd89ab 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst @@ -63,6 +63,7 @@ Everything else @@ -440,10 +440,10 @@ index 000000000000..767844637a7d + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. diff --git a/MAINTAINERS b/MAINTAINERS -index cf9c9221c388..cf155b1f9480 100644 +index cc40a9d9b8cd..2cd7168dc401 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -15997,6 +15997,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git +@@ -16319,6 +16319,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ @@ -460,7 +460,7 @@ index cf9c9221c388..cf155b1f9480 100644 M: Finn Thain L: linux-m68k@lists.linux-m68k.org diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig -index faf983680040..2907b5c23368 100644 +index 41c54051347a..bde398e12696 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig @@ -507,7 +507,6 @@ config OPEN_DICE @@ -1633,11 +1633,11 @@ index dcfa38fdc93c..4a8095a3fc34 100644 #endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile -index 9039f3709aff..d5aeaa8fe3ca 100644 +index bc8fe9e8f7f2..b1296bd8eb3f 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile -@@ -16,6 +16,7 @@ TARGETS += damon - TARGETS += devices +@@ -17,6 +17,7 @@ TARGETS += devices/error_logs + TARGETS += devices/probe TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf +TARGETS += drivers/ntsync @@ -3085,6 +3085,5 @@ index 000000000000..5fa2c9a0768c + +TEST_HARNESS_MAIN -- -2.45.2 - +2.47.0.rc0 diff --git a/sys-kernel/gentoo-sources-6.10.3/0010-perf-per-core.patch b/sys-kernel/gentoo-sources-6.11/0008-perf-per-core.patch similarity index 99% rename from sys-kernel/gentoo-sources-6.10.3/0010-perf-per-core.patch rename to sys-kernel/gentoo-sources-6.11/0008-perf-per-core.patch index 99c2a35..50b57d7 100644 --- a/sys-kernel/gentoo-sources-6.10.3/0010-perf-per-core.patch +++ b/sys-kernel/gentoo-sources-6.11/0008-perf-per-core.patch @@ -1,7 +1,7 @@ -From a2c8bc637c7a2e45c1189f2e92f3712715d957ba Mon Sep 17 00:00:00 2001 +From f3788bc44e2875141e8cf16b36365cb2bac541a6 Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 3 Aug 2024 09:34:27 +0200 -Subject: [PATCH 10/12] perf-per-core +Date: Fri, 4 Oct 2024 17:08:44 +0200 +Subject: [PATCH 08/10] perf-per-core Signed-off-by: Peter Jung --- @@ -29,7 +29,7 @@ index 7352ab89a55a..c12837e61bda 100644 System topology examples diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c 
-index 0c5e7a7c43ac..cd808b699ccc 100644 +index b985ca79cf97..8206038a01ac 100644 --- a/arch/x86/events/rapl.c +++ b/arch/x86/events/rapl.c @@ -39,6 +39,10 @@ @@ -856,7 +856,7 @@ index 0c5e7a7c43ac..cd808b699ccc 100644 } module_exit(intel_rapl_exit); diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h -index cb4f6c513c48..1ffe4260bef6 100644 +index a75a07f4931f..5a59713ec62b 100644 --- a/arch/x86/include/asm/processor.h +++ b/arch/x86/include/asm/processor.h @@ -98,6 +98,7 @@ struct cpuinfo_topology { @@ -868,7 +868,7 @@ index cb4f6c513c48..1ffe4260bef6 100644 // AMD Node ID and Nodes per Package info u32 amd_node_id; diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h -index e5b203fe7956..8c2fea7dd065 100644 +index aef70336d624..672fccf9f845 100644 --- a/arch/x86/include/asm/topology.h +++ b/arch/x86/include/asm/topology.h @@ -137,6 +137,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); @@ -904,5 +904,5 @@ index 9a6069e7133c..23722aa21e2f 100644 /* Package relative core ID */ -- -2.46.0.rc1 +2.47.0.rc0 diff --git a/sys-kernel/git-sources/0010-zstd.patch b/sys-kernel/gentoo-sources-6.11/0010-zstd.patch similarity index 99% rename from sys-kernel/git-sources/0010-zstd.patch rename to sys-kernel/gentoo-sources-6.11/0010-zstd.patch index 5b692da..347041b 100644 --- a/sys-kernel/git-sources/0010-zstd.patch +++ b/sys-kernel/gentoo-sources-6.11/0010-zstd.patch @@ -1,6 +1,6 @@ -From db3817dff7110c38462a1f918adec6a422f75406 Mon Sep 17 00:00:00 2001 +From cf0e4ae5c086f49c71b2a5aad50a589d8aa1799e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Mon, 17 Jun 2024 15:29:10 +0200 +Date: Fri, 4 Oct 2024 17:09:19 +0200 Subject: [PATCH 10/10] zstd Signed-off-by: Peter Jung @@ -18648,6 +18648,5 @@ index f4ed952ed485..7d31518e9d5a 100644 EXPORT_SYMBOL(zstd_reset_dstream); -- -2.45.2 - +2.47.0.rc0 diff --git a/sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch new file mode 100644 index 0000000..3b8a030 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.11/0100-glitched-additional-timer-tick-frequencies.patch @@ -0,0 +1,55 @@ +From b27b06990e40226b04623ee1a863e807cebee48f Mon Sep 17 00:00:00 2001 +From: Andre Ramnitz +Date: Tue, 21 Mar 2023 00:12:08 +0100 +Subject: glitched: additional timer tick frequencies. + +--- + kernel/Kconfig.hz | 18 +++++++++++++++++- + 1 file changed, 17 insertions(+), 1 deletion(-) + +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d06888e..f648df15ef4c 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -5,7 +5,7 @@ + + choice + prompt "Timer frequency" +- default HZ_250 ++ default HZ_600 + help + Allows the configuration of the timer frequency. It is customary + to have the timer interrupt run at 1000 Hz but 100 Hz may be more +@@ -40,6 +40,20 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_600 ++ bool "600 HZ" ++ help ++ 600 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with great smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_900 ++ bool "900 HZ" ++ help ++ 900 Hz is a good timer frequency for desktops. Provides fast ++ interactivity with great smoothness. Like 300HZ on ++ steroids. 
++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +67,8 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 600 if HZ_600 ++ default 900 if HZ_900 + default 1000 if HZ_1000 + + config SCHED_HRTICK +-- +2.39.2 + diff --git a/sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch b/sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch new file mode 100644 index 0000000..7654052 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0001-preempt-lazy.patch @@ -0,0 +1,958 @@ +From 5ddf15cb65a8c14868cdc743474bd0a4fa9b586f Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Fri, 13 Dec 2024 23:03:09 +0800 +Subject: [PATCH] preempt-lazy + +Signed-off-by: Eric Naim +--- + arch/x86/Kconfig | 1 + + arch/x86/include/asm/thread_info.h | 6 +- + include/linux/entry-common.h | 3 +- + include/linux/entry-kvm.h | 5 +- + include/linux/preempt.h | 8 +- + include/linux/rcupdate.h | 2 +- + include/linux/rcutree.h | 2 +- + include/linux/sched.h | 3 +- + include/linux/srcutiny.h | 2 +- + include/linux/thread_info.h | 21 +++++- + include/linux/trace_events.h | 8 +- + kernel/Kconfig.preempt | 25 ++++++- + kernel/entry/common.c | 2 +- + kernel/entry/kvm.c | 4 +- + kernel/rcu/Kconfig | 4 +- + kernel/rcu/srcutiny.c | 14 ++-- + kernel/rcu/tree_plugin.h | 22 ++++-- + kernel/sched/core.c | 116 +++++++++++++++++++++++++---- + kernel/sched/debug.c | 7 +- + kernel/sched/fair.c | 6 +- + kernel/sched/sched.h | 1 + + kernel/trace/trace.c | 2 + + kernel/trace/trace_osnoise.c | 32 ++++---- + kernel/trace/trace_output.c | 16 +++- + 24 files changed, 232 insertions(+), 80 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index f127d0f1024e..4b28c191ae31 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -93,6 +93,7 @@ config X86 + select ARCH_HAS_NMI_SAFE_THIS_CPU_OPS + select ARCH_HAS_NON_OVERLAPPING_ADDRESS_SPACE + select ARCH_HAS_PMEM_API if X86_64 ++ select ARCH_HAS_PREEMPT_LAZY + select ARCH_HAS_PTE_DEVMAP if X86_64 + select ARCH_HAS_PTE_SPECIAL + select ARCH_HAS_HW_PTE_YOUNG +diff --git a/arch/x86/include/asm/thread_info.h b/arch/x86/include/asm/thread_info.h +index 12da7dfd5ef1..a55c214f3ba6 100644 +--- a/arch/x86/include/asm/thread_info.h ++++ b/arch/x86/include/asm/thread_info.h +@@ -87,8 +87,9 @@ struct thread_info { + #define TIF_NOTIFY_RESUME 1 /* callback before returning to user */ + #define TIF_SIGPENDING 2 /* signal pending */ + #define TIF_NEED_RESCHED 3 /* rescheduling necessary */ +-#define TIF_SINGLESTEP 4 /* reenable singlestep on user return*/ +-#define TIF_SSBD 5 /* Speculative store bypass disable */ ++#define TIF_NEED_RESCHED_LAZY 4 /* Lazy rescheduling needed */ ++#define TIF_SINGLESTEP 5 /* reenable singlestep on user return*/ ++#define TIF_SSBD 6 /* Speculative store bypass disable */ + #define TIF_SPEC_IB 9 /* Indirect branch speculation mitigation */ + #define TIF_SPEC_L1D_FLUSH 10 /* Flush L1D on mm switches (processes) */ + #define TIF_USER_RETURN_NOTIFY 11 /* notify kernel of userspace return */ +@@ -110,6 +111,7 @@ struct thread_info { + #define _TIF_NOTIFY_RESUME (1 << TIF_NOTIFY_RESUME) + #define _TIF_SIGPENDING (1 << TIF_SIGPENDING) + #define _TIF_NEED_RESCHED (1 << TIF_NEED_RESCHED) ++#define _TIF_NEED_RESCHED_LAZY (1 << TIF_NEED_RESCHED_LAZY) + #define _TIF_SINGLESTEP (1 << TIF_SINGLESTEP) + #define _TIF_SSBD (1 << TIF_SSBD) + #define _TIF_SPEC_IB (1 << TIF_SPEC_IB) +diff --git a/include/linux/entry-common.h b/include/linux/entry-common.h +index 1e50cdb83ae5..fc61d0205c97 100644 +--- a/include/linux/entry-common.h ++++ 
b/include/linux/entry-common.h +@@ -64,7 +64,8 @@ + + #define EXIT_TO_USER_MODE_WORK \ + (_TIF_SIGPENDING | _TIF_NOTIFY_RESUME | _TIF_UPROBE | \ +- _TIF_NEED_RESCHED | _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ ++ _TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | \ ++ _TIF_PATCH_PENDING | _TIF_NOTIFY_SIGNAL | \ + ARCH_EXIT_TO_USER_MODE_WORK) + + /** +diff --git a/include/linux/entry-kvm.h b/include/linux/entry-kvm.h +index 6813171afccb..16149f6625e4 100644 +--- a/include/linux/entry-kvm.h ++++ b/include/linux/entry-kvm.h +@@ -17,8 +17,9 @@ + #endif + + #define XFER_TO_GUEST_MODE_WORK \ +- (_TIF_NEED_RESCHED | _TIF_SIGPENDING | _TIF_NOTIFY_SIGNAL | \ +- _TIF_NOTIFY_RESUME | ARCH_XFER_TO_GUEST_MODE_WORK) ++ (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY | _TIF_SIGPENDING | \ ++ _TIF_NOTIFY_SIGNAL | _TIF_NOTIFY_RESUME | \ ++ ARCH_XFER_TO_GUEST_MODE_WORK) + + struct kvm_vcpu; + +diff --git a/include/linux/preempt.h b/include/linux/preempt.h +index ce76f1a45722..ca86235ac15c 100644 +--- a/include/linux/preempt.h ++++ b/include/linux/preempt.h +@@ -486,6 +486,7 @@ DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + extern bool preempt_model_none(void); + extern bool preempt_model_voluntary(void); + extern bool preempt_model_full(void); ++extern bool preempt_model_lazy(void); + + #else + +@@ -502,6 +503,11 @@ static inline bool preempt_model_full(void) + return IS_ENABLED(CONFIG_PREEMPT); + } + ++static inline bool preempt_model_lazy(void) ++{ ++ return IS_ENABLED(CONFIG_PREEMPT_LAZY); ++} ++ + #endif + + static inline bool preempt_model_rt(void) +@@ -519,7 +525,7 @@ static inline bool preempt_model_rt(void) + */ + static inline bool preempt_model_preemptible(void) + { +- return preempt_model_full() || preempt_model_rt(); ++ return preempt_model_full() || preempt_model_lazy() || preempt_model_rt(); + } + + #endif /* __LINUX_PREEMPT_H */ +diff --git a/include/linux/rcupdate.h b/include/linux/rcupdate.h +index 48e5c03df1dd..257e9ae34414 100644 +--- a/include/linux/rcupdate.h ++++ b/include/linux/rcupdate.h +@@ -95,9 +95,9 @@ static inline void __rcu_read_lock(void) + + static inline void __rcu_read_unlock(void) + { +- preempt_enable(); + if (IS_ENABLED(CONFIG_RCU_STRICT_GRACE_PERIOD)) + rcu_read_unlock_strict(); ++ preempt_enable(); + } + + static inline int rcu_preempt_depth(void) +diff --git a/include/linux/rcutree.h b/include/linux/rcutree.h +index 90a684f94776..ae8b5cb475a3 100644 +--- a/include/linux/rcutree.h ++++ b/include/linux/rcutree.h +@@ -104,7 +104,7 @@ extern int rcu_scheduler_active; + void rcu_end_inkernel_boot(void); + bool rcu_inkernel_boot_has_ended(void); + bool rcu_is_watching(void); +-#ifndef CONFIG_PREEMPTION ++#ifndef CONFIG_PREEMPT_RCU + void rcu_all_qs(void); + #endif + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index bb343136ddd0..ade641760900 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2002,7 +2002,8 @@ static inline void set_tsk_need_resched(struct task_struct *tsk) + + static inline void clear_tsk_need_resched(struct task_struct *tsk) + { +- clear_tsk_thread_flag(tsk,TIF_NEED_RESCHED); ++ atomic_long_andnot(_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY, ++ (atomic_long_t *)&task_thread_info(tsk)->flags); + } + + static inline int test_tsk_need_resched(struct task_struct *tsk) +diff --git a/include/linux/srcutiny.h b/include/linux/srcutiny.h +index 4d96bbdb45f0..1635c5e2662f 100644 +--- a/include/linux/srcutiny.h ++++ b/include/linux/srcutiny.h +@@ -64,7 +64,7 @@ static inline int __srcu_read_lock(struct srcu_struct 
*ssp) + { + int idx; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + idx = ((READ_ONCE(ssp->srcu_idx) + 1) & 0x2) >> 1; + WRITE_ONCE(ssp->srcu_lock_nesting[idx], READ_ONCE(ssp->srcu_lock_nesting[idx]) + 1); + preempt_enable(); +diff --git a/include/linux/thread_info.h b/include/linux/thread_info.h +index 9ea0b28068f4..cf2446c9c30d 100644 +--- a/include/linux/thread_info.h ++++ b/include/linux/thread_info.h +@@ -59,6 +59,14 @@ enum syscall_work_bit { + + #include + ++#ifndef TIF_NEED_RESCHED_LAZY ++#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY ++#error Inconsistent PREEMPT_LAZY ++#endif ++#define TIF_NEED_RESCHED_LAZY TIF_NEED_RESCHED ++#define _TIF_NEED_RESCHED_LAZY _TIF_NEED_RESCHED ++#endif ++ + #ifdef __KERNEL__ + + #ifndef arch_set_restart_data +@@ -179,22 +187,27 @@ static __always_inline unsigned long read_ti_thread_flags(struct thread_info *ti + + #ifdef _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H + +-static __always_inline bool tif_need_resched(void) ++static __always_inline bool tif_test_bit(int bit) + { +- return arch_test_bit(TIF_NEED_RESCHED, ++ return arch_test_bit(bit, + (unsigned long *)(¤t_thread_info()->flags)); + } + + #else + +-static __always_inline bool tif_need_resched(void) ++static __always_inline bool tif_test_bit(int bit) + { +- return test_bit(TIF_NEED_RESCHED, ++ return test_bit(bit, + (unsigned long *)(¤t_thread_info()->flags)); + } + + #endif /* _ASM_GENERIC_BITOPS_INSTRUMENTED_NON_ATOMIC_H */ + ++static __always_inline bool tif_need_resched(void) ++{ ++ return tif_test_bit(TIF_NEED_RESCHED); ++} ++ + #ifndef CONFIG_HAVE_ARCH_WITHIN_STACK_FRAMES + static inline int arch_within_stack_frames(const void * const stack, + const void * const stackend, +diff --git a/include/linux/trace_events.h b/include/linux/trace_events.h +index 42bedcddd511..4cae6f258137 100644 +--- a/include/linux/trace_events.h ++++ b/include/linux/trace_events.h +@@ -184,8 +184,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status); + + enum trace_flag_type { + TRACE_FLAG_IRQS_OFF = 0x01, +- TRACE_FLAG_IRQS_NOSUPPORT = 0x02, +- TRACE_FLAG_NEED_RESCHED = 0x04, ++ TRACE_FLAG_NEED_RESCHED = 0x02, ++ TRACE_FLAG_NEED_RESCHED_LAZY = 0x04, + TRACE_FLAG_HARDIRQ = 0x08, + TRACE_FLAG_SOFTIRQ = 0x10, + TRACE_FLAG_PREEMPT_RESCHED = 0x20, +@@ -211,11 +211,11 @@ static inline unsigned int tracing_gen_ctx(void) + + static inline unsigned int tracing_gen_ctx_flags(unsigned long irqflags) + { +- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++ return tracing_gen_ctx_irq_test(0); + } + static inline unsigned int tracing_gen_ctx(void) + { +- return tracing_gen_ctx_irq_test(TRACE_FLAG_IRQS_NOSUPPORT); ++ return tracing_gen_ctx_irq_test(0); + } + #endif + +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index fe782cd77388..7c1b29a3a491 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -11,12 +11,16 @@ config PREEMPT_BUILD + select PREEMPTION + select UNINLINE_SPIN_UNLOCK if !ARCH_INLINE_SPIN_UNLOCK + ++config ARCH_HAS_PREEMPT_LAZY ++ bool ++ + choice + prompt "Preemption Model" + default PREEMPT_NONE + + config PREEMPT_NONE + bool "No Forced Preemption (Server)" ++ depends on !PREEMPT_RT + select PREEMPT_NONE_BUILD if !PREEMPT_DYNAMIC + help + This is the traditional Linux preemption model, geared towards +@@ -32,6 +36,7 @@ config PREEMPT_NONE + config PREEMPT_VOLUNTARY + bool "Voluntary Kernel Preemption (Desktop)" + depends on !ARCH_NO_PREEMPT ++ depends on !PREEMPT_RT + select PREEMPT_VOLUNTARY_BUILD if 
!PREEMPT_DYNAMIC + help + This option reduces the latency of the kernel by adding more +@@ -51,7 +56,7 @@ config PREEMPT_VOLUNTARY + config PREEMPT + bool "Preemptible Kernel (Low-Latency Desktop)" + depends on !ARCH_NO_PREEMPT +- select PREEMPT_BUILD ++ select PREEMPT_BUILD if !PREEMPT_DYNAMIC + help + This option reduces the latency of the kernel by making + all kernel code (that is not executing in a critical section) +@@ -67,6 +72,20 @@ config PREEMPT + embedded system with latency requirements in the milliseconds + range. + ++config PREEMPT_LAZY ++ bool "Scheduler controlled preemption model" ++ depends on !ARCH_NO_PREEMPT ++ depends on ARCH_HAS_PREEMPT_LAZY ++ select PREEMPT_BUILD if !PREEMPT_DYNAMIC ++ help ++ This option provides a scheduler driven preemption model that ++ is fundamentally similar to full preemption, but is less ++ eager to preempt SCHED_NORMAL tasks in an attempt to ++ reduce lock holder preemption and recover some of the performance ++ gains seen from using Voluntary preemption. ++ ++endchoice ++ + config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" + depends on EXPERT && ARCH_SUPPORTS_RT +@@ -84,8 +103,6 @@ config PREEMPT_RT + Select this if you are building a kernel for systems which + require real-time guarantees. + +-endchoice +- + config PREEMPT_COUNT + bool + +@@ -95,7 +112,7 @@ config PREEMPTION + + config PREEMPT_DYNAMIC + bool "Preemption behaviour defined on boot" +- depends on HAVE_PREEMPT_DYNAMIC && !PREEMPT_RT ++ depends on HAVE_PREEMPT_DYNAMIC + select JUMP_LABEL if HAVE_PREEMPT_DYNAMIC_KEY + select PREEMPT_BUILD + default y if HAVE_PREEMPT_DYNAMIC_CALL +diff --git a/kernel/entry/common.c b/kernel/entry/common.c +index 5b6934e23c21..e33691d5adf7 100644 +--- a/kernel/entry/common.c ++++ b/kernel/entry/common.c +@@ -98,7 +98,7 @@ __always_inline unsigned long exit_to_user_mode_loop(struct pt_regs *regs, + + local_irq_enable_exit_to_user(ti_work); + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) + schedule(); + + if (ti_work & _TIF_UPROBE) +diff --git a/kernel/entry/kvm.c b/kernel/entry/kvm.c +index 2e0f75bcb7fd..8485f63863af 100644 +--- a/kernel/entry/kvm.c ++++ b/kernel/entry/kvm.c +@@ -13,7 +13,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) + return -EINTR; + } + +- if (ti_work & _TIF_NEED_RESCHED) ++ if (ti_work & (_TIF_NEED_RESCHED | _TIF_NEED_RESCHED_LAZY)) + schedule(); + + if (ti_work & _TIF_NOTIFY_RESUME) +@@ -24,7 +24,7 @@ static int xfer_to_guest_mode_work(struct kvm_vcpu *vcpu, unsigned long ti_work) + return ret; + + ti_work = read_thread_flags(); +- } while (ti_work & XFER_TO_GUEST_MODE_WORK || need_resched()); ++ } while (ti_work & XFER_TO_GUEST_MODE_WORK); + return 0; + } + +diff --git a/kernel/rcu/Kconfig b/kernel/rcu/Kconfig +index 3e079de0f5b4..9d52f87fac27 100644 +--- a/kernel/rcu/Kconfig ++++ b/kernel/rcu/Kconfig +@@ -18,7 +18,7 @@ config TREE_RCU + + config PREEMPT_RCU + bool +- default y if PREEMPTION ++ default y if (PREEMPT || PREEMPT_RT || PREEMPT_DYNAMIC) + select TREE_RCU + help + This option selects the RCU implementation that is +@@ -91,7 +91,7 @@ config NEED_TASKS_RCU + + config TASKS_RCU + bool +- default NEED_TASKS_RCU && (PREEMPTION || PREEMPT_AUTO) ++ default NEED_TASKS_RCU && PREEMPTION + select IRQ_WORK + + config FORCE_TASKS_RUDE_RCU +diff --git a/kernel/rcu/srcutiny.c b/kernel/rcu/srcutiny.c +index 4dcbf8aa80ff..f688bdad293e 100644 +--- a/kernel/rcu/srcutiny.c ++++ b/kernel/rcu/srcutiny.c +@@ -98,7 +98,7 @@ void 
__srcu_read_unlock(struct srcu_struct *ssp, int idx) + { + int newval; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + newval = READ_ONCE(ssp->srcu_lock_nesting[idx]) - 1; + WRITE_ONCE(ssp->srcu_lock_nesting[idx], newval); + preempt_enable(); +@@ -120,7 +120,7 @@ void srcu_drive_gp(struct work_struct *wp) + struct srcu_struct *ssp; + + ssp = container_of(wp, struct srcu_struct, srcu_work); +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + if (ssp->srcu_gp_running || ULONG_CMP_GE(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max))) { + preempt_enable(); + return; /* Already running or nothing to do. */ +@@ -138,7 +138,7 @@ void srcu_drive_gp(struct work_struct *wp) + WRITE_ONCE(ssp->srcu_gp_waiting, true); /* srcu_read_unlock() wakes! */ + preempt_enable(); + swait_event_exclusive(ssp->srcu_wq, !READ_ONCE(ssp->srcu_lock_nesting[idx])); +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + WRITE_ONCE(ssp->srcu_gp_waiting, false); /* srcu_read_unlock() cheap. */ + WRITE_ONCE(ssp->srcu_idx, ssp->srcu_idx + 1); + preempt_enable(); +@@ -159,7 +159,7 @@ void srcu_drive_gp(struct work_struct *wp) + * at interrupt level, but the ->srcu_gp_running checks will + * straighten that out. + */ +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + WRITE_ONCE(ssp->srcu_gp_running, false); + idx = ULONG_CMP_LT(ssp->srcu_idx, READ_ONCE(ssp->srcu_idx_max)); + preempt_enable(); +@@ -172,7 +172,7 @@ static void srcu_gp_start_if_needed(struct srcu_struct *ssp) + { + unsigned long cookie; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + cookie = get_state_synchronize_srcu(ssp); + if (ULONG_CMP_GE(READ_ONCE(ssp->srcu_idx_max), cookie)) { + preempt_enable(); +@@ -199,7 +199,7 @@ void call_srcu(struct srcu_struct *ssp, struct rcu_head *rhp, + + rhp->func = func; + rhp->next = NULL; +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + local_irq_save(flags); + *ssp->srcu_cb_tail = rhp; + ssp->srcu_cb_tail = &rhp->next; +@@ -261,7 +261,7 @@ unsigned long start_poll_synchronize_srcu(struct srcu_struct *ssp) + { + unsigned long ret; + +- preempt_disable(); // Needed for PREEMPT_AUTO ++ preempt_disable(); // Needed for PREEMPT_LAZY + ret = get_state_synchronize_srcu(ssp); + srcu_gp_start_if_needed(ssp); + preempt_enable(); +diff --git a/kernel/rcu/tree_plugin.h b/kernel/rcu/tree_plugin.h +index 1c7cbd145d5e..304e3405e6ec 100644 +--- a/kernel/rcu/tree_plugin.h ++++ b/kernel/rcu/tree_plugin.h +@@ -832,8 +832,17 @@ void rcu_read_unlock_strict(void) + { + struct rcu_data *rdp; + +- if (irqs_disabled() || preempt_count() || !rcu_state.gp_kthread) ++ if (irqs_disabled() || in_atomic_preempt_off() || !rcu_state.gp_kthread) + return; ++ ++ /* ++ * rcu_report_qs_rdp() can only be invoked with a stable rdp and ++ * from the local CPU. ++ * ++ * The in_atomic_preempt_off() check ensures that we come here holding ++ * the last preempt_count (which will get dropped once we return to ++ * __rcu_read_unlock(). 
++ */ + rdp = this_cpu_ptr(&rcu_data); + rdp->cpu_no_qs.b.norm = false; + rcu_report_qs_rdp(rdp); +@@ -974,13 +983,16 @@ static void rcu_preempt_check_blocked_tasks(struct rcu_node *rnp) + */ + static void rcu_flavor_sched_clock_irq(int user) + { +- if (user || rcu_is_cpu_rrupt_from_idle()) { ++ if (user || rcu_is_cpu_rrupt_from_idle() || ++ (IS_ENABLED(CONFIG_PREEMPT_COUNT) && ++ (preempt_count() == HARDIRQ_OFFSET))) { + + /* + * Get here if this CPU took its interrupt from user +- * mode or from the idle loop, and if this is not a +- * nested interrupt. In this case, the CPU is in +- * a quiescent state, so note it. ++ * mode, from the idle loop without this being a nested ++ * interrupt, or while not holding the task preempt count ++ * (with PREEMPT_COUNT=y). In this case, the CPU is in a ++ * quiescent state, so note it. + * + * No memory barrier is required here because rcu_qs() + * references only CPU-local variables that other CPUs +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 76b27b2a9c56..e82948e247c1 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -941,10 +941,9 @@ static inline void hrtick_rq_init(struct rq *rq) + * this avoids any races wrt polling state changes and thereby avoids + * spurious IPIs. + */ +-static inline bool set_nr_and_not_polling(struct task_struct *p) ++static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) + { +- struct thread_info *ti = task_thread_info(p); +- return !(fetch_or(&ti->flags, _TIF_NEED_RESCHED) & _TIF_POLLING_NRFLAG); ++ return !(fetch_or(&ti->flags, 1 << tif) & _TIF_POLLING_NRFLAG); + } + + /* +@@ -969,9 +968,9 @@ static bool set_nr_if_polling(struct task_struct *p) + } + + #else +-static inline bool set_nr_and_not_polling(struct task_struct *p) ++static inline bool set_nr_and_not_polling(struct thread_info *ti, int tif) + { +- set_tsk_need_resched(p); ++ atomic_long_or(1 << tif, (atomic_long_t *)&ti->flags); + return true; + } + +@@ -1076,28 +1075,66 @@ void wake_up_q(struct wake_q_head *head) + * might also involve a cross-CPU call to trigger the scheduler on + * the target CPU. 
+ */ +-void resched_curr(struct rq *rq) ++static void __resched_curr(struct rq *rq, int tif) + { + struct task_struct *curr = rq->curr; ++ struct thread_info *cti = task_thread_info(curr); + int cpu; + + lockdep_assert_rq_held(rq); + +- if (test_tsk_need_resched(curr)) ++ if (is_idle_task(curr) && tif == TIF_NEED_RESCHED_LAZY) ++ tif = TIF_NEED_RESCHED; ++ ++ if (cti->flags & ((1 << tif) | _TIF_NEED_RESCHED)) + return; + + cpu = cpu_of(rq); + + if (cpu == smp_processor_id()) { +- set_tsk_need_resched(curr); +- set_preempt_need_resched(); ++ set_ti_thread_flag(cti, tif); ++ if (tif == TIF_NEED_RESCHED) ++ set_preempt_need_resched(); + return; + } + +- if (set_nr_and_not_polling(curr)) +- smp_send_reschedule(cpu); +- else ++ if (set_nr_and_not_polling(cti, tif)) { ++ if (tif == TIF_NEED_RESCHED) ++ smp_send_reschedule(cpu); ++ } else { + trace_sched_wake_idle_without_ipi(cpu); ++ } ++} ++ ++void resched_curr(struct rq *rq) ++{ ++ __resched_curr(rq, TIF_NEED_RESCHED); ++} ++ ++#ifdef CONFIG_PREEMPT_DYNAMIC ++static DEFINE_STATIC_KEY_FALSE(sk_dynamic_preempt_lazy); ++static __always_inline bool dynamic_preempt_lazy(void) ++{ ++ return static_branch_unlikely(&sk_dynamic_preempt_lazy); ++} ++#else ++static __always_inline bool dynamic_preempt_lazy(void) ++{ ++ return IS_ENABLED(CONFIG_PREEMPT_LAZY); ++} ++#endif ++ ++static __always_inline int tif_need_resched_lazy(void) ++{ ++ if (dynamic_preempt_lazy()) ++ return TIF_NEED_RESCHED_LAZY; ++ ++ return TIF_NEED_RESCHED; ++} ++ ++void resched_curr_lazy(struct rq *rq) ++{ ++ __resched_curr(rq, tif_need_resched_lazy()); + } + + void resched_cpu(int cpu) +@@ -1192,7 +1229,7 @@ static void wake_up_idle_cpu(int cpu) + * and testing of the above solutions didn't appear to report + * much benefits. + */ +- if (set_nr_and_not_polling(rq->idle)) ++ if (set_nr_and_not_polling(task_thread_info(rq->idle), TIF_NEED_RESCHED)) + smp_send_reschedule(cpu); + else + trace_sched_wake_idle_without_ipi(cpu); +@@ -5604,6 +5641,10 @@ void sched_tick(void) + update_rq_clock(rq); + hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); + update_hw_load_avg(rq_clock_task(rq), rq, hw_pressure); ++ ++ if (dynamic_preempt_lazy() && tif_test_bit(TIF_NEED_RESCHED_LAZY)) ++ resched_curr(rq); ++ + curr->sched_class->task_tick(rq, curr, 0); + if (sched_feat(LATENCY_WARN)) + resched_latency = cpu_resched_latency(rq); +@@ -7219,7 +7260,7 @@ int __sched __cond_resched(void) + return 1; + } + /* +- * In preemptible kernels, ->rcu_read_lock_nesting tells the tick ++ * In PREEMPT_RCU kernels, ->rcu_read_lock_nesting tells the tick + * whether the current CPU is in an RCU read-side critical section, + * so the tick can report quiescent states even for CPUs looping + * in kernel context. In contrast, in non-preemptible kernels, +@@ -7228,6 +7269,8 @@ int __sched __cond_resched(void) + * RCU quiescent state. Therefore, the following code causes + * cond_resched() to report a quiescent state, but only when RCU + * is in urgent need of one. ++ * A third case, preemptible, but non-PREEMPT_RCU provides for ++ * urgently needed quiescent states via rcu_flavor_sched_clock_irq(). 
+ */ + #ifndef CONFIG_PREEMPT_RCU + rcu_all_qs(); +@@ -7352,6 +7395,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP ++ * dynamic_preempt_lazy <- false + * + * VOLUNTARY: + * cond_resched <- __cond_resched +@@ -7359,6 +7403,7 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); + * preempt_schedule <- NOP + * preempt_schedule_notrace <- NOP + * irqentry_exit_cond_resched <- NOP ++ * dynamic_preempt_lazy <- false + * + * FULL: + * cond_resched <- RET0 +@@ -7366,6 +7411,15 @@ EXPORT_SYMBOL(__cond_resched_rwlock_write); + * preempt_schedule <- preempt_schedule + * preempt_schedule_notrace <- preempt_schedule_notrace + * irqentry_exit_cond_resched <- irqentry_exit_cond_resched ++ * dynamic_preempt_lazy <- false ++ * ++ * LAZY: ++ * cond_resched <- RET0 ++ * might_resched <- RET0 ++ * preempt_schedule <- preempt_schedule ++ * preempt_schedule_notrace <- preempt_schedule_notrace ++ * irqentry_exit_cond_resched <- irqentry_exit_cond_resched ++ * dynamic_preempt_lazy <- true + */ + + enum { +@@ -7373,30 +7427,41 @@ enum { + preempt_dynamic_none, + preempt_dynamic_voluntary, + preempt_dynamic_full, ++ preempt_dynamic_lazy, + }; + + int preempt_dynamic_mode = preempt_dynamic_undefined; + + int sched_dynamic_mode(const char *str) + { ++#ifndef CONFIG_PREEMPT_RT + if (!strcmp(str, "none")) + return preempt_dynamic_none; + + if (!strcmp(str, "voluntary")) + return preempt_dynamic_voluntary; ++#endif + + if (!strcmp(str, "full")) + return preempt_dynamic_full; + ++#ifdef CONFIG_ARCH_HAS_PREEMPT_LAZY ++ if (!strcmp(str, "lazy")) ++ return preempt_dynamic_lazy; ++#endif ++ + return -EINVAL; + } + ++#define preempt_dynamic_key_enable(f) static_key_enable(&sk_dynamic_##f.key) ++#define preempt_dynamic_key_disable(f) static_key_disable(&sk_dynamic_##f.key) ++ + #if defined(CONFIG_HAVE_PREEMPT_DYNAMIC_CALL) + #define preempt_dynamic_enable(f) static_call_update(f, f##_dynamic_enabled) + #define preempt_dynamic_disable(f) static_call_update(f, f##_dynamic_disabled) + #elif defined(CONFIG_HAVE_PREEMPT_DYNAMIC_KEY) +-#define preempt_dynamic_enable(f) static_key_enable(&sk_dynamic_##f.key) +-#define preempt_dynamic_disable(f) static_key_disable(&sk_dynamic_##f.key) ++#define preempt_dynamic_enable(f) preempt_dynamic_key_enable(f) ++#define preempt_dynamic_disable(f) preempt_dynamic_key_disable(f) + #else + #error "Unsupported PREEMPT_DYNAMIC mechanism" + #endif +@@ -7416,6 +7481,7 @@ static void __sched_dynamic_update(int mode) + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + + switch (mode) { + case preempt_dynamic_none: +@@ -7425,6 +7491,7 @@ static void __sched_dynamic_update(int mode) + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: none\n"); + break; +@@ -7436,6 +7503,7 @@ static void __sched_dynamic_update(int mode) + preempt_dynamic_disable(preempt_schedule); + preempt_dynamic_disable(preempt_schedule_notrace); + preempt_dynamic_disable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: voluntary\n"); + break; +@@ -7447,9 +7515,22 @@ static void 
__sched_dynamic_update(int mode) + preempt_dynamic_enable(preempt_schedule); + preempt_dynamic_enable(preempt_schedule_notrace); + preempt_dynamic_enable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_disable(preempt_lazy); + if (mode != preempt_dynamic_mode) + pr_info("Dynamic Preempt: full\n"); + break; ++ ++ case preempt_dynamic_lazy: ++ if (!klp_override) ++ preempt_dynamic_disable(cond_resched); ++ preempt_dynamic_disable(might_resched); ++ preempt_dynamic_enable(preempt_schedule); ++ preempt_dynamic_enable(preempt_schedule_notrace); ++ preempt_dynamic_enable(irqentry_exit_cond_resched); ++ preempt_dynamic_key_enable(preempt_lazy); ++ if (mode != preempt_dynamic_mode) ++ pr_info("Dynamic Preempt: lazy\n"); ++ break; + } + + preempt_dynamic_mode = mode; +@@ -7512,6 +7593,8 @@ static void __init preempt_dynamic_init(void) + sched_dynamic_update(preempt_dynamic_none); + } else if (IS_ENABLED(CONFIG_PREEMPT_VOLUNTARY)) { + sched_dynamic_update(preempt_dynamic_voluntary); ++ } else if (IS_ENABLED(CONFIG_PREEMPT_LAZY)) { ++ sched_dynamic_update(preempt_dynamic_lazy); + } else { + /* Default static call setting, nothing to do */ + WARN_ON_ONCE(!IS_ENABLED(CONFIG_PREEMPT)); +@@ -7532,6 +7615,7 @@ static void __init preempt_dynamic_init(void) + PREEMPT_MODEL_ACCESSOR(none); + PREEMPT_MODEL_ACCESSOR(voluntary); + PREEMPT_MODEL_ACCESSOR(full); ++PREEMPT_MODEL_ACCESSOR(lazy); + + #else /* !CONFIG_PREEMPT_DYNAMIC: */ + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index f4035c7a0fa1..a48b2a701ec2 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -245,11 +245,12 @@ static ssize_t sched_dynamic_write(struct file *filp, const char __user *ubuf, + static int sched_dynamic_show(struct seq_file *m, void *v) + { + static const char * preempt_modes[] = { +- "none", "voluntary", "full" ++ "none", "voluntary", "full", "lazy", + }; +- int i; ++ int j = ARRAY_SIZE(preempt_modes) - !IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY); ++ int i = IS_ENABLED(CONFIG_PREEMPT_RT) * 2; + +- for (i = 0; i < ARRAY_SIZE(preempt_modes); i++) { ++ for (; i < j; i++) { + if (preempt_dynamic_mode == i) + seq_puts(m, "("); + seq_puts(m, preempt_modes[i]); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 54e7c4c3e2c5..10e9484d1d43 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1264,7 +1264,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + return; + + if (resched || did_preempt_short(cfs_rq, curr)) { +- resched_curr(rq); ++ resched_curr_lazy(rq); + clear_buddies(cfs_rq, curr); + } + } +@@ -5691,7 +5691,7 @@ entity_tick(struct cfs_rq *cfs_rq, struct sched_entity *curr, int queued) + * validating it and just reschedule. 
+ */ + if (queued) { +- resched_curr(rq_of(cfs_rq)); ++ resched_curr_lazy(rq_of(cfs_rq)); + return; + } + /* +@@ -8855,7 +8855,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int + return; + + preempt: +- resched_curr(rq); ++ resched_curr_lazy(rq); + } + + static struct task_struct *pick_task_fair(struct rq *rq) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c5d6012794de..b5f3890f3050 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2696,6 +2696,7 @@ extern void init_sched_rt_class(void); + extern void init_sched_fair_class(void); + + extern void resched_curr(struct rq *rq); ++extern void resched_curr_lazy(struct rq *rq); + extern void resched_cpu(int cpu); + + extern void init_rt_bandwidth(struct rt_bandwidth *rt_b, u64 period, u64 runtime); +diff --git a/kernel/trace/trace.c b/kernel/trace/trace.c +index 6a891e00aa7f..acbed0ffe083 100644 +--- a/kernel/trace/trace.c ++++ b/kernel/trace/trace.c +@@ -2563,6 +2563,8 @@ unsigned int tracing_gen_ctx_irq_test(unsigned int irqs_status) + trace_flags |= TRACE_FLAG_NEED_RESCHED; + if (test_preempt_need_resched()) + trace_flags |= TRACE_FLAG_PREEMPT_RESCHED; ++ if (IS_ENABLED(CONFIG_ARCH_HAS_PREEMPT_LAZY) && tif_test_bit(TIF_NEED_RESCHED_LAZY)) ++ trace_flags |= TRACE_FLAG_NEED_RESCHED_LAZY; + return (trace_flags << 16) | (min_t(unsigned int, pc & 0xff, 0xf)) | + (min_t(unsigned int, migration_disable_value(), 0xf)) << 4; + } +diff --git a/kernel/trace/trace_osnoise.c b/kernel/trace/trace_osnoise.c +index a50ed23bee77..4a9087112526 100644 +--- a/kernel/trace/trace_osnoise.c ++++ b/kernel/trace/trace_osnoise.c +@@ -1537,27 +1537,25 @@ static int run_osnoise(void) + + /* + * In some cases, notably when running on a nohz_full CPU with +- * a stopped tick PREEMPT_RCU has no way to account for QSs. +- * This will eventually cause unwarranted noise as PREEMPT_RCU +- * will force preemption as the means of ending the current +- * grace period. We avoid this problem by calling +- * rcu_momentary_eqs(), which performs a zero duration +- * EQS allowing PREEMPT_RCU to end the current grace period. +- * This call shouldn't be wrapped inside an RCU critical +- * section. ++ * a stopped tick PREEMPT_RCU or PREEMPT_LAZY have no way to ++ * account for QSs. This will eventually cause unwarranted ++ * noise as RCU forces preemption as the means of ending the ++ * current grace period. We avoid this by calling ++ * rcu_momentary_eqs(), which performs a zero duration EQS ++ * allowing RCU to end the current grace period. This call ++ * shouldn't be wrapped inside an RCU critical section. + * +- * Note that in non PREEMPT_RCU kernels QSs are handled through +- * cond_resched() ++ * Normally QSs for other cases are handled through cond_resched(). ++ * For simplicity, however, we call rcu_momentary_eqs() for all ++ * configurations here. 
+ */ +- if (IS_ENABLED(CONFIG_PREEMPT_RCU)) { +- if (!disable_irq) +- local_irq_disable(); ++ if (!disable_irq) ++ local_irq_disable(); + +- rcu_momentary_eqs(); ++ rcu_momentary_eqs(); + +- if (!disable_irq) +- local_irq_enable(); +- } ++ if (!disable_irq) ++ local_irq_enable(); + + /* + * For the non-preemptive kernel config: let threads runs, if +diff --git a/kernel/trace/trace_output.c b/kernel/trace/trace_output.c +index 868f2f912f28..23ca2155306b 100644 +--- a/kernel/trace/trace_output.c ++++ b/kernel/trace/trace_output.c +@@ -460,20 +460,32 @@ int trace_print_lat_fmt(struct trace_seq *s, struct trace_entry *entry) + (entry->flags & TRACE_FLAG_IRQS_OFF && bh_off) ? 'D' : + (entry->flags & TRACE_FLAG_IRQS_OFF) ? 'd' : + bh_off ? 'b' : +- (entry->flags & TRACE_FLAG_IRQS_NOSUPPORT) ? 'X' : ++ !IS_ENABLED(CONFIG_TRACE_IRQFLAGS_SUPPORT) ? 'X' : + '.'; + +- switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | ++ switch (entry->flags & (TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | + TRACE_FLAG_PREEMPT_RESCHED)) { ++ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: ++ need_resched = 'B'; ++ break; + case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'N'; + break; ++ case TRACE_FLAG_NEED_RESCHED_LAZY | TRACE_FLAG_PREEMPT_RESCHED: ++ need_resched = 'L'; ++ break; ++ case TRACE_FLAG_NEED_RESCHED | TRACE_FLAG_NEED_RESCHED_LAZY: ++ need_resched = 'b'; ++ break; + case TRACE_FLAG_NEED_RESCHED: + need_resched = 'n'; + break; + case TRACE_FLAG_PREEMPT_RESCHED: + need_resched = 'p'; + break; ++ case TRACE_FLAG_NEED_RESCHED_LAZY: ++ need_resched = 'l'; ++ break; + default: + need_resched = '.'; + break; +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch b/sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch new file mode 100644 index 0000000..20e109a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0002-amd-pstate.patch @@ -0,0 +1,902 @@ +From 5b24edbe81299a51cf1694d0e33c33d995e2c04d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:50:07 +0100 +Subject: [PATCH 02/12] amd-pstate + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/pm/amd-pstate.rst | 4 +- + arch/x86/include/asm/cpufeatures.h | 3 +- + arch/x86/include/asm/intel-family.h | 6 + + arch/x86/include/asm/processor.h | 18 ++ + arch/x86/include/asm/topology.h | 9 + + arch/x86/kernel/acpi/cppc.c | 23 ++ + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/scattered.c | 3 +- + arch/x86/kernel/cpu/topology_amd.c | 3 + + arch/x86/kernel/cpu/topology_common.c | 34 +++ + arch/x86/kernel/smpboot.c | 14 +- + arch/x86/mm/init.c | 23 +- + drivers/cpufreq/amd-pstate-ut.c | 6 +- + drivers/cpufreq/amd-pstate.c | 235 +++++++++----------- + tools/arch/x86/include/asm/cpufeatures.h | 2 +- + 15 files changed, 239 insertions(+), 145 deletions(-) + +diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst +index 210a808b74ec..412423c54f25 100644 +--- a/Documentation/admin-guide/pm/amd-pstate.rst ++++ b/Documentation/admin-guide/pm/amd-pstate.rst +@@ -251,9 +251,7 @@ performance supported in `AMD CPPC Performance Capability `_). + In some ASICs, the highest CPPC performance is not the one in the ``_CPC`` + table, so we need to expose it to sysfs. If boost is not active, but + still supported, this maximum frequency will be larger than the one in +-``cpuinfo``. 
On systems that support preferred core, the driver will have +-different values for some cores than others and this will reflect the values +-advertised by the platform at bootup. ++``cpuinfo``. + This attribute is read-only. + + ``amd_pstate_lowest_nonlinear_freq`` +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 913fd3a7bac6..a7c93191b7c6 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -473,7 +473,8 @@ + #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ + #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ + #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +-#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ ++#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* Fast CPPC */ ++#define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ + + /* + * BUG word(s) +diff --git a/arch/x86/include/asm/intel-family.h b/arch/x86/include/asm/intel-family.h +index 1a42f829667a..736764472048 100644 +--- a/arch/x86/include/asm/intel-family.h ++++ b/arch/x86/include/asm/intel-family.h +@@ -183,4 +183,10 @@ + /* Family 19 */ + #define INTEL_PANTHERCOVE_X IFM(19, 0x01) /* Diamond Rapids */ + ++/* CPU core types */ ++enum intel_cpu_type { ++ INTEL_CPU_TYPE_ATOM = 0x20, ++ INTEL_CPU_TYPE_CORE = 0x40, ++}; ++ + #endif /* _ASM_X86_INTEL_FAMILY_H */ +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 2d776635aa53..20e6009381ed 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -105,6 +105,24 @@ struct cpuinfo_topology { + // Cache level topology IDs + u32 llc_id; + u32 l2c_id; ++ ++ // Hardware defined CPU-type ++ union { ++ u32 cpu_type; ++ struct { ++ // CPUID.1A.EAX[23-0] ++ u32 intel_native_model_id :24; ++ // CPUID.1A.EAX[31-24] ++ u32 intel_type :8; ++ }; ++ struct { ++ // CPUID 0x80000026.EBX ++ u32 amd_num_processors :16, ++ amd_power_eff_ranking :8, ++ amd_native_model_id :4, ++ amd_type :4; ++ }; ++ }; + }; + + struct cpuinfo_x86 { +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 92f3664dd933..fd41103ad342 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -114,6 +114,12 @@ enum x86_topology_domains { + TOPO_MAX_DOMAIN, + }; + ++enum x86_topology_cpu_type { ++ TOPO_CPU_TYPE_PERFORMANCE, ++ TOPO_CPU_TYPE_EFFICIENCY, ++ TOPO_CPU_TYPE_UNKNOWN, ++}; ++ + struct x86_topology_system { + unsigned int dom_shifts[TOPO_MAX_DOMAIN]; + unsigned int dom_size[TOPO_MAX_DOMAIN]; +@@ -149,6 +155,9 @@ extern unsigned int __max_threads_per_core; + extern unsigned int __num_threads_per_package; + extern unsigned int __num_cores_per_package; + ++const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c); ++enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c); ++ + static inline unsigned int topology_max_packages(void) + { + return __max_logical_packages; +diff --git a/arch/x86/kernel/acpi/cppc.c b/arch/x86/kernel/acpi/cppc.c +index aab9d0570841..d745dd586303 100644 +--- a/arch/x86/kernel/acpi/cppc.c ++++ b/arch/x86/kernel/acpi/cppc.c +@@ -239,8 +239,10 @@ EXPORT_SYMBOL_GPL(amd_detect_prefcore); + */ + int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + { ++ enum x86_topology_cpu_type core_type = get_topology_cpu_type(&cpu_data(cpu)); + bool prefcore; + int ret; ++ u32 tmp; + + ret = 
amd_detect_prefcore(&prefcore); + if (ret) +@@ -266,6 +268,27 @@ int amd_get_boost_ratio_numerator(unsigned int cpu, u64 *numerator) + break; + } + } ++ ++ /* detect if running on heterogeneous design */ ++ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) { ++ switch (core_type) { ++ case TOPO_CPU_TYPE_UNKNOWN: ++ pr_warn("Undefined core type found for cpu %d\n", cpu); ++ break; ++ case TOPO_CPU_TYPE_PERFORMANCE: ++ /* use the max scale for performance cores */ ++ *numerator = CPPC_HIGHEST_PERF_PERFORMANCE; ++ return 0; ++ case TOPO_CPU_TYPE_EFFICIENCY: ++ /* use the highest perf value for efficiency cores */ ++ ret = amd_get_highest_perf(cpu, &tmp); ++ if (ret) ++ return ret; ++ *numerator = tmp; ++ return 0; ++ } ++ } ++ + *numerator = CPPC_HIGHEST_PERF_PREFCORE; + + return 0; +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 3baf3e435834..10719aba6276 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -22,6 +22,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "die_id: %u\n", c->topo.die_id); + seq_printf(m, "cu_id: %u\n", c->topo.cu_id); + seq_printf(m, "core_id: %u\n", c->topo.core_id); ++ seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); +diff --git a/arch/x86/kernel/cpu/scattered.c b/arch/x86/kernel/cpu/scattered.c +index c84c30188fdf..307a91741534 100644 +--- a/arch/x86/kernel/cpu/scattered.c ++++ b/arch/x86/kernel/cpu/scattered.c +@@ -45,13 +45,14 @@ static const struct cpuid_bit cpuid_bits[] = { + { X86_FEATURE_HW_PSTATE, CPUID_EDX, 7, 0x80000007, 0 }, + { X86_FEATURE_CPB, CPUID_EDX, 9, 0x80000007, 0 }, + { X86_FEATURE_PROC_FEEDBACK, CPUID_EDX, 11, 0x80000007, 0 }, +- { X86_FEATURE_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, ++ { X86_FEATURE_AMD_FAST_CPPC, CPUID_EDX, 15, 0x80000007, 0 }, + { X86_FEATURE_MBA, CPUID_EBX, 6, 0x80000008, 0 }, + { X86_FEATURE_SMBA, CPUID_EBX, 2, 0x80000020, 0 }, + { X86_FEATURE_BMEC, CPUID_EBX, 3, 0x80000020, 0 }, + { X86_FEATURE_PERFMON_V2, CPUID_EAX, 0, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_V2, CPUID_EAX, 1, 0x80000022, 0 }, + { X86_FEATURE_AMD_LBR_PMC_FREEZE, CPUID_EAX, 2, 0x80000022, 0 }, ++ { X86_FEATURE_AMD_HETEROGENEOUS_CORES, CPUID_EAX, 30, 0x80000026, 0 }, + { 0, 0, 0, 0, 0 } + }; + +diff --git a/arch/x86/kernel/cpu/topology_amd.c b/arch/x86/kernel/cpu/topology_amd.c +index 7d476fa697ca..03b3c9c3a45e 100644 +--- a/arch/x86/kernel/cpu/topology_amd.c ++++ b/arch/x86/kernel/cpu/topology_amd.c +@@ -182,6 +182,9 @@ static void parse_topology_amd(struct topo_scan *tscan) + if (cpu_feature_enabled(X86_FEATURE_TOPOEXT)) + has_topoext = cpu_parse_topology_ext(tscan); + ++ if (cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) ++ tscan->c->topo.cpu_type = cpuid_ebx(0x80000026); ++ + if (!has_topoext && !parse_8000_0008(tscan)) + return; + +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 9a6069e7133c..8277c64f88db 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -3,6 +3,7 @@ + + #include + ++#include + #include + #include + #include +@@ -27,6 +28,36 @@ void topology_set_dom(struct topo_scan *tscan, enum x86_topology_domains dom, + } + } + ++enum x86_topology_cpu_type get_topology_cpu_type(struct cpuinfo_x86 *c) ++{ ++ if (c->x86_vendor == 
X86_VENDOR_INTEL) { ++ switch (c->topo.intel_type) { ++ case INTEL_CPU_TYPE_ATOM: return TOPO_CPU_TYPE_EFFICIENCY; ++ case INTEL_CPU_TYPE_CORE: return TOPO_CPU_TYPE_PERFORMANCE; ++ } ++ } ++ if (c->x86_vendor == X86_VENDOR_AMD) { ++ switch (c->topo.amd_type) { ++ case 0: return TOPO_CPU_TYPE_PERFORMANCE; ++ case 1: return TOPO_CPU_TYPE_EFFICIENCY; ++ } ++ } ++ ++ return TOPO_CPU_TYPE_UNKNOWN; ++} ++ ++const char *get_topology_cpu_type_name(struct cpuinfo_x86 *c) ++{ ++ switch (get_topology_cpu_type(c)) { ++ case TOPO_CPU_TYPE_PERFORMANCE: ++ return "performance"; ++ case TOPO_CPU_TYPE_EFFICIENCY: ++ return "efficiency"; ++ default: ++ return "unknown"; ++ } ++} ++ + static unsigned int __maybe_unused parse_num_cores_legacy(struct cpuinfo_x86 *c) + { + struct { +@@ -87,6 +118,7 @@ static void parse_topology(struct topo_scan *tscan, bool early) + .cu_id = 0xff, + .llc_id = BAD_APICID, + .l2c_id = BAD_APICID, ++ .cpu_type = TOPO_CPU_TYPE_UNKNOWN, + }; + struct cpuinfo_x86 *c = tscan->c; + struct { +@@ -132,6 +164,8 @@ static void parse_topology(struct topo_scan *tscan, bool early) + case X86_VENDOR_INTEL: + if (!IS_ENABLED(CONFIG_CPU_SUP_INTEL) || !cpu_parse_topology_ext(tscan)) + parse_legacy(tscan); ++ if (c->cpuid_level >= 0x1a) ++ c->topo.cpu_type = cpuid_eax(0x1a); + break; + case X86_VENDOR_HYGON: + if (IS_ENABLED(CONFIG_CPU_SUP_HYGON)) +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c +index 766f092dab80..419e7ae09639 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -62,6 +62,8 @@ + #include + #include + ++#include ++ + #include + #include + #include +@@ -498,7 +500,17 @@ static int x86_cluster_flags(void) + static int x86_die_flags(void) + { + if (cpu_feature_enabled(X86_FEATURE_HYBRID_CPU)) +- return x86_sched_itmt_flags(); ++ return x86_sched_itmt_flags(); ++ ++ switch (boot_cpu_data.x86_vendor) { ++ case X86_VENDOR_AMD: ++ case X86_VENDOR_HYGON: ++ bool prefcore = false; ++ ++ amd_detect_prefcore(&prefcore); ++ if (prefcore || cpu_feature_enabled(X86_FEATURE_AMD_HETEROGENEOUS_CORES)) ++ return x86_sched_itmt_flags(); ++ }; + + return 0; + } +diff --git a/arch/x86/mm/init.c b/arch/x86/mm/init.c +index eb503f53c319..101725c149c4 100644 +--- a/arch/x86/mm/init.c ++++ b/arch/x86/mm/init.c +@@ -263,28 +263,33 @@ static void __init probe_page_size_mask(void) + } + + /* +- * INVLPG may not properly flush Global entries +- * on these CPUs when PCIDs are enabled. ++ * INVLPG may not properly flush Global entries on ++ * these CPUs. New microcode fixes the issue. 
+ */ + static const struct x86_cpu_id invlpg_miss_ids[] = { +- X86_MATCH_VFM(INTEL_ALDERLAKE, 0), +- X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0), +- X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0), +- X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0), ++ X86_MATCH_VFM(INTEL_ALDERLAKE, 0x2e), ++ X86_MATCH_VFM(INTEL_ALDERLAKE_L, 0x42c), ++ X86_MATCH_VFM(INTEL_ATOM_GRACEMONT, 0x11), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE, 0x118), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE_P, 0x4117), ++ X86_MATCH_VFM(INTEL_RAPTORLAKE_S, 0x2e), + {} + }; + + static void setup_pcid(void) + { ++ const struct x86_cpu_id *invlpg_miss_match; ++ + if (!IS_ENABLED(CONFIG_X86_64)) + return; + + if (!boot_cpu_has(X86_FEATURE_PCID)) + return; + +- if (x86_match_cpu(invlpg_miss_ids)) { ++ invlpg_miss_match = x86_match_cpu(invlpg_miss_ids); ++ ++ if (invlpg_miss_match && ++ boot_cpu_data.microcode < invlpg_miss_match->driver_data) { + pr_info("Incomplete global flushes, disabling PCID"); + setup_clear_cpu_cap(X86_FEATURE_PCID); + return; +diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c +index f66701514d90..a261d7300951 100644 +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -227,10 +227,10 @@ static void amd_pstate_ut_check_freq(u32 index) + goto skip_test; + } + +- if (cpudata->min_freq != policy->min) { ++ if (cpudata->lowest_nonlinear_freq != policy->min) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; +- pr_err("%s cpu%d cpudata_min_freq=%d policy_min=%d, they should be equal!\n", +- __func__, cpu, cpudata->min_freq, policy->min); ++ pr_err("%s cpu%d cpudata_lowest_nonlinear_freq=%d policy_min=%d, they should be equal!\n", ++ __func__, cpu, cpudata->lowest_nonlinear_freq, policy->min); + goto skip_test; + } + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 91d3c3b1c2d3..66e5dfc711c0 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -233,7 +233,7 @@ static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) + return index; + } + +-static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + u32 des_perf, u32 max_perf, bool fast_switch) + { + if (fast_switch) +@@ -243,7 +243,7 @@ static void pstate_update_perf(struct amd_cpudata *cpudata, u32 min_perf, + READ_ONCE(cpudata->cppc_req_cached)); + } + +-DEFINE_STATIC_CALL(amd_pstate_update_perf, pstate_update_perf); ++DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + + static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, +@@ -306,11 +306,17 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + return ret; + } + +-static inline int pstate_enable(bool enable) ++static inline int msr_cppc_enable(bool enable) + { + int ret, cpu; + unsigned long logical_proc_id_mask = 0; + ++ /* ++ * MSR_AMD_CPPC_ENABLE is write-once, once set it cannot be cleared. 
++ */ ++ if (!enable) ++ return 0; ++ + if (enable == cppc_enabled) + return 0; + +@@ -332,7 +338,7 @@ static inline int pstate_enable(bool enable) + return 0; + } + +-static int cppc_enable(bool enable) ++static int shmem_cppc_enable(bool enable) + { + int cpu, ret = 0; + struct cppc_perf_ctrls perf_ctrls; +@@ -359,24 +365,28 @@ static int cppc_enable(bool enable) + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_enable, pstate_enable); ++DEFINE_STATIC_CALL(amd_pstate_cppc_enable, msr_cppc_enable); + +-static inline int amd_pstate_enable(bool enable) ++static inline int amd_pstate_cppc_enable(bool enable) + { +- return static_call(amd_pstate_enable)(enable); ++ return static_call(amd_pstate_cppc_enable)(enable); + } + +-static int pstate_init_perf(struct amd_cpudata *cpudata) ++static int msr_init_perf(struct amd_cpudata *cpudata) + { +- u64 cap1; ++ u64 cap1, numerator; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); + if (ret) + return ret; + +- WRITE_ONCE(cpudata->highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); +- WRITE_ONCE(cpudata->max_limit_perf, AMD_CPPC_HIGHEST_PERF(cap1)); ++ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); ++ if (ret) ++ return ret; ++ ++ WRITE_ONCE(cpudata->highest_perf, numerator); ++ WRITE_ONCE(cpudata->max_limit_perf, numerator); + WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); + WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); +@@ -385,16 +395,21 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) + return 0; + } + +-static int cppc_init_perf(struct amd_cpudata *cpudata) ++static int shmem_init_perf(struct amd_cpudata *cpudata) + { + struct cppc_perf_caps cppc_perf; ++ u64 numerator; + + int ret = cppc_get_perf_caps(cpudata->cpu, &cppc_perf); + if (ret) + return ret; + +- WRITE_ONCE(cpudata->highest_perf, cppc_perf.highest_perf); +- WRITE_ONCE(cpudata->max_limit_perf, cppc_perf.highest_perf); ++ ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); ++ if (ret) ++ return ret; ++ ++ WRITE_ONCE(cpudata->highest_perf, numerator); ++ WRITE_ONCE(cpudata->max_limit_perf, numerator); + WRITE_ONCE(cpudata->nominal_perf, cppc_perf.nominal_perf); + WRITE_ONCE(cpudata->lowest_nonlinear_perf, + cppc_perf.lowest_nonlinear_perf); +@@ -420,14 +435,14 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) + return ret; + } + +-DEFINE_STATIC_CALL(amd_pstate_init_perf, pstate_init_perf); ++DEFINE_STATIC_CALL(amd_pstate_init_perf, msr_init_perf); + + static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) + { + return static_call(amd_pstate_init_perf)(cpudata); + } + +-static void cppc_update_perf(struct amd_cpudata *cpudata, ++static void shmem_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, + u32 max_perf, bool fast_switch) + { +@@ -527,25 +542,41 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + cpufreq_cpu_put(policy); + } + +-static int amd_pstate_verify(struct cpufreq_policy_data *policy) ++static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + { +- cpufreq_verify_within_cpu_limits(policy); ++ /* ++ * Initialize lower frequency limit (i.e.policy->min) with ++ * lowest_nonlinear_frequency which is the most energy efficient ++ * frequency. Override the initial value set by cpufreq core and ++ * amd-pstate qos_requests. 
++ */ ++ if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { ++ struct cpufreq_policy *policy = cpufreq_cpu_get(policy_data->cpu); ++ struct amd_cpudata *cpudata; ++ ++ if (!policy) ++ return -EINVAL; ++ ++ cpudata = policy->driver_data; ++ policy_data->min = cpudata->lowest_nonlinear_freq; ++ cpufreq_cpu_put(policy); ++ } ++ ++ cpufreq_verify_within_cpu_limits(policy_data); ++ pr_debug("policy_max =%d, policy_min=%d\n", policy_data->max, policy_data->min); + + return 0; + } + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf; ++ u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; + struct amd_cpudata *cpudata = policy->driver_data; + +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); +- +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); ++ max_perf = READ_ONCE(cpudata->highest_perf); ++ max_freq = READ_ONCE(cpudata->max_freq); ++ max_limit_perf = div_u64(policy->max * max_perf, max_freq); ++ min_limit_perf = div_u64(policy->min * max_perf, max_freq); + + lowest_perf = READ_ONCE(cpudata->lowest_perf); + if (min_limit_perf < lowest_perf) +@@ -825,7 +856,7 @@ static u32 amd_pstate_get_transition_delay_us(unsigned int cpu) + + transition_delay_ns = cppc_get_transition_latency(cpu); + if (transition_delay_ns == CPUFREQ_ETERNAL) { +- if (cpu_feature_enabled(X86_FEATURE_FAST_CPPC)) ++ if (cpu_feature_enabled(X86_FEATURE_AMD_FAST_CPPC)) + return AMD_PSTATE_FAST_CPPC_TRANSITION_DELAY; + else + return AMD_PSTATE_TRANSITION_DELAY; +@@ -864,7 +895,6 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + { + int ret; + u32 min_freq, max_freq; +- u64 numerator; + u32 nominal_perf, nominal_freq; + u32 lowest_nonlinear_perf, lowest_nonlinear_freq; + u32 boost_ratio, lowest_nonlinear_ratio; +@@ -886,10 +916,7 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + + nominal_perf = READ_ONCE(cpudata->nominal_perf); + +- ret = amd_get_boost_ratio_numerator(cpudata->cpu, &numerator); +- if (ret) +- return ret; +- boost_ratio = div_u64(numerator << SCHED_CAPACITY_SHIFT, nominal_perf); ++ boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); + max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); +@@ -979,7 +1006,7 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + policy->fast_switch_possible = true; + + ret = freq_qos_add_request(&policy->constraints, &cpudata->req[0], +- FREQ_QOS_MIN, policy->cpuinfo.min_freq); ++ FREQ_QOS_MIN, FREQ_QOS_MIN_DEFAULT_VALUE); + if (ret < 0) { + dev_err(dev, "Failed to add min-freq constraint (%d)\n", ret); + goto free_cpudata1; +@@ -1023,7 +1050,7 @@ static int amd_pstate_cpu_resume(struct cpufreq_policy *policy) + { + int ret; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd-pstate during resume, return %d\n", ret); + +@@ -1034,7 +1061,7 @@ static int amd_pstate_cpu_suspend(struct cpufreq_policy *policy) + { + int ret; + +- ret = amd_pstate_enable(false); ++ ret = amd_pstate_cppc_enable(false); + if (ret) + pr_err("failed to disable amd-pstate during suspend, return %d\n", ret); + +@@ -1167,25 +1194,41 @@ static ssize_t 
show_energy_performance_preference( + + static void amd_pstate_driver_cleanup(void) + { +- amd_pstate_enable(false); ++ amd_pstate_cppc_enable(false); + cppc_state = AMD_PSTATE_DISABLE; + current_pstate_driver = NULL; + } + ++static int amd_pstate_set_driver(int mode_idx) ++{ ++ if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { ++ cppc_state = mode_idx; ++ if (cppc_state == AMD_PSTATE_DISABLE) ++ pr_info("driver is explicitly disabled\n"); ++ ++ if (cppc_state == AMD_PSTATE_ACTIVE) ++ current_pstate_driver = &amd_pstate_epp_driver; ++ ++ if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) ++ current_pstate_driver = &amd_pstate_driver; ++ ++ return 0; ++ } ++ ++ return -EINVAL; ++} ++ + static int amd_pstate_register_driver(int mode) + { + int ret; + +- if (mode == AMD_PSTATE_PASSIVE || mode == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- else if (mode == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- else +- return -EINVAL; ++ ret = amd_pstate_set_driver(mode); ++ if (ret) ++ return ret; + + cppc_state = mode; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) { + pr_err("failed to enable cppc during amd-pstate driver registration, return %d\n", + ret); +@@ -1463,6 +1506,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } + ++ current_pstate_driver->adjust_perf = NULL; ++ + return 0; + + free_cpudata1: +@@ -1485,26 +1530,13 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 max_perf, min_perf, min_limit_perf, max_limit_perf; ++ u32 max_perf, min_perf; + u64 value; + s16 epp; + +- if (cpudata->boost_supported && !policy->boost_enabled) +- max_perf = READ_ONCE(cpudata->nominal_perf); +- else +- max_perf = READ_ONCE(cpudata->highest_perf); ++ max_perf = READ_ONCE(cpudata->highest_perf); + min_perf = READ_ONCE(cpudata->lowest_perf); +- max_limit_perf = div_u64(policy->max * max_perf, policy->cpuinfo.max_freq); +- min_limit_perf = div_u64(policy->min * max_perf, policy->cpuinfo.max_freq); +- +- if (min_limit_perf < min_perf) +- min_limit_perf = min_perf; +- +- if (max_limit_perf < min_limit_perf) +- max_limit_perf = min_limit_perf; +- +- WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); +- WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); ++ amd_pstate_update_min_max_limit(policy); + + max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, + cpudata->max_limit_perf); +@@ -1541,12 +1573,6 @@ static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + epp = 0; + +- /* Set initial EPP value */ +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- value &= ~GENMASK_ULL(31, 24); +- value |= (u64)epp << 24; +- } +- + WRITE_ONCE(cpudata->cppc_req_cached, value); + return amd_pstate_set_epp(cpudata, epp); + } +@@ -1583,7 +1609,7 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + u64 value, max_perf; + int ret; + +- ret = amd_pstate_enable(true); ++ ret = amd_pstate_cppc_enable(true); + if (ret) + pr_err("failed to enable amd pstate during resume, return %d\n", ret); + +@@ -1594,8 +1620,9 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.max_perf = max_perf; +- 
perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); + cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(cpudata->epp_cached); ++ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + } + } + +@@ -1635,9 +1662,11 @@ static void amd_pstate_epp_offline(struct cpufreq_policy *policy) + wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); + } else { + perf_ctrls.desired_perf = 0; ++ perf_ctrls.min_perf = min_perf; + perf_ctrls.max_perf = min_perf; +- perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); + cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ perf_ctrls.energy_perf = AMD_CPPC_ENERGY_PERF_PREF(HWP_EPP_BALANCE_POWERSAVE); ++ cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); + } + mutex_unlock(&amd_pstate_limits_lock); + } +@@ -1657,13 +1686,6 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + return 0; + } + +-static int amd_pstate_epp_verify_policy(struct cpufreq_policy_data *policy) +-{ +- cpufreq_verify_within_cpu_limits(policy); +- pr_debug("policy_max =%d, policy_min=%d\n", policy->max, policy->min); +- return 0; +-} +- + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -1677,7 +1699,7 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + cpudata->suspended = true; + + /* disable CPPC in lowlevel firmware */ +- ret = amd_pstate_enable(false); ++ ret = amd_pstate_cppc_enable(false); + if (ret) + pr_err("failed to suspend, return %d\n", ret); + +@@ -1719,7 +1741,7 @@ static struct cpufreq_driver amd_pstate_driver = { + + static struct cpufreq_driver amd_pstate_epp_driver = { + .flags = CPUFREQ_CONST_LOOPS, +- .verify = amd_pstate_epp_verify_policy, ++ .verify = amd_pstate_verify, + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, +@@ -1733,26 +1755,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .attr = amd_pstate_epp_attr, + }; + +-static int __init amd_pstate_set_driver(int mode_idx) +-{ +- if (mode_idx >= AMD_PSTATE_DISABLE && mode_idx < AMD_PSTATE_MAX) { +- cppc_state = mode_idx; +- if (cppc_state == AMD_PSTATE_DISABLE) +- pr_info("driver is explicitly disabled\n"); +- +- if (cppc_state == AMD_PSTATE_ACTIVE) +- current_pstate_driver = &amd_pstate_epp_driver; +- +- if (cppc_state == AMD_PSTATE_PASSIVE || cppc_state == AMD_PSTATE_GUIDED) +- current_pstate_driver = &amd_pstate_driver; +- +- return 0; +- } +- +- return -EINVAL; +-} +- +-/** ++/* + * CPPC function is not supported for family ID 17H with model_ID ranging from 0x10 to 0x2F. + * show the debug message that helps to check if the CPU has CPPC support for loading issue. + */ +@@ -1842,10 +1845,10 @@ static int __init amd_pstate_init(void) + if (cppc_state == AMD_PSTATE_UNDEFINED) { + /* Disable on the following configs by default: + * 1. Undefined platforms +- * 2. Server platforms ++ * 2. Server platforms with CPUs older than Family 0x1A. 
+ */ + if (amd_pstate_acpi_pm_profile_undefined() || +- amd_pstate_acpi_pm_profile_server()) { ++ (amd_pstate_acpi_pm_profile_server() && boot_cpu_data.x86 < 0x1A)) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; + } +@@ -1853,31 +1856,19 @@ static int __init amd_pstate_init(void) + cppc_state = CONFIG_X86_AMD_PSTATE_DEFAULT_MODE; + } + +- switch (cppc_state) { +- case AMD_PSTATE_DISABLE: ++ if (cppc_state == AMD_PSTATE_DISABLE) { + pr_info("driver load is disabled, boot with specific mode to enable this\n"); + return -ENODEV; +- case AMD_PSTATE_PASSIVE: +- case AMD_PSTATE_ACTIVE: +- case AMD_PSTATE_GUIDED: +- ret = amd_pstate_set_driver(cppc_state); +- if (ret) +- return ret; +- break; +- default: +- return -EINVAL; + } + + /* capability check */ + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + pr_debug("AMD CPPC MSR based functionality is supported\n"); +- if (cppc_state != AMD_PSTATE_ACTIVE) +- current_pstate_driver->adjust_perf = amd_pstate_adjust_perf; + } else { + pr_debug("AMD CPPC shared memory based functionality is supported\n"); +- static_call_update(amd_pstate_enable, cppc_enable); +- static_call_update(amd_pstate_init_perf, cppc_init_perf); +- static_call_update(amd_pstate_update_perf, cppc_update_perf); ++ static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable); ++ static_call_update(amd_pstate_init_perf, shmem_init_perf); ++ static_call_update(amd_pstate_update_perf, shmem_update_perf); + } + + if (amd_pstate_prefcore) { +@@ -1886,17 +1877,10 @@ static int __init amd_pstate_init(void) + return ret; + } + +- /* enable amd pstate feature */ +- ret = amd_pstate_enable(true); +- if (ret) { +- pr_err("failed to enable driver mode(%d)\n", cppc_state); +- return ret; +- } +- +- ret = cpufreq_register_driver(current_pstate_driver); ++ ret = amd_pstate_register_driver(cppc_state); + if (ret) { + pr_err("failed to register with return %d\n", ret); +- goto disable_driver; ++ return ret; + } + + dev_root = bus_get_dev_root(&cpu_subsys); +@@ -1913,8 +1897,7 @@ static int __init amd_pstate_init(void) + + global_attr_free: + cpufreq_unregister_driver(current_pstate_driver); +-disable_driver: +- amd_pstate_enable(false); ++ amd_pstate_cppc_enable(false); + return ret; + } + device_initcall(amd_pstate_init); +diff --git a/tools/arch/x86/include/asm/cpufeatures.h b/tools/arch/x86/include/asm/cpufeatures.h +index dd4682857c12..23698d0f4bb4 100644 +--- a/tools/arch/x86/include/asm/cpufeatures.h ++++ b/tools/arch/x86/include/asm/cpufeatures.h +@@ -472,7 +472,7 @@ + #define X86_FEATURE_BHI_CTRL (21*32+ 2) /* BHI_DIS_S HW control available */ + #define X86_FEATURE_CLEAR_BHB_HW (21*32+ 3) /* BHI_DIS_S HW control enabled */ + #define X86_FEATURE_CLEAR_BHB_LOOP_ON_VMEXIT (21*32+ 4) /* Clear branch history at vmexit using SW loop */ +-#define X86_FEATURE_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ ++#define X86_FEATURE_AMD_FAST_CPPC (21*32 + 5) /* AMD Fast CPPC */ + + /* + * BUG word(s) +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.12/0004-bbr3.patch new file mode 100644 index 0000000..5f6c27b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0004-bbr3.patch @@ -0,0 +1,3386 @@ +From d03dc7618d35c0c3e5ab7373cff2032a8c3ecf9f Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:50:32 +0100 +Subject: [PATCH 04/12] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + 
include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 9 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1940 insertions(+), 553 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 6a5e08b937b3..27aab715490e 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -369,7 +369,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? */ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c0deaafebfdc..d53f042d936e 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index d1948d357dad..7d99f0bec5f2 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -375,6 +375,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -779,6 +781,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -884,6 +895,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -973,9 +989,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1088,6 +1109,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* 
loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1110,7 +1132,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1130,10 +1156,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1144,7 +1173,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1168,8 +1199,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1235,6 +1269,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1254,6 +1296,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1266,6 +1309,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2417,7 +2475,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index db7254d52d93..38de18d921ea 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..4702cd2f1ffc 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 554804774628..2279e6e7bc9c 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 4f77bd862e95..fd3a5551eda7 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3384,6 +3384,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4110,6 +4111,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..a180fa648d5e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. 
++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. 
This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did loss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss.
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. 
If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
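/*
 * A standalone sketch (not the kernel code) of the plateau test described
 * above, with assumed fixed-point constants: BBR_SCALE = 8, a 25% growth
 * threshold (5/4 in BBR_UNIT units), and 3 non-app-limited rounds. All names
 * here are invented for the illustration.
 */
#include <stdbool.h>
#include <stdint.h>

#define SKETCH_SCALE		8
#define SKETCH_UNIT		(1u << SKETCH_SCALE)
#define SKETCH_FULL_BW_THRESH	(SKETCH_UNIT * 5 / 4)	/* "grew by >= 25%" */
#define SKETCH_FULL_BW_CNT	3			/* rounds w/o growth */

struct sketch_full_bw {
	uint32_t full_bw;	/* baseline bw from the last growth event */
	uint32_t full_bw_cnt;	/* consecutive rounds without ~25% growth */
	bool full_bw_now;
};

static void sketch_check_full_bw(struct sketch_full_bw *s, uint32_t sample_bw,
				 bool round_start, bool app_limited)
{
	uint64_t bw_thresh;

	if (s->full_bw_now || app_limited)
		return;
	bw_thresh = (uint64_t)s->full_bw * SKETCH_FULL_BW_THRESH >> SKETCH_SCALE;
	if (sample_bw >= bw_thresh) {	/* still growing: reset the baseline */
		s->full_bw = sample_bw;
		s->full_bw_cnt = 0;
		return;
	}
	if (!round_start)		/* count at most once per round trip */
		return;
	s->full_bw_now = ++s->full_bw_cnt >= SKETCH_FULL_BW_CNT;
}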
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; 
++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 0306d257fa64..28f581c0dab7 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 2d844e1f867f..efb92e47a632 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -370,7 +370,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -381,7 +381,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1120,7 +1120,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1501,6 +1506,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3826,7 +3842,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3843,6 +3860,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3853,6 +3871,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3961,6 +3984,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4035,7 +4059,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4059,6 +4083,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4078,7 +4103,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5752,13 +5777,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index bb1fe1ba867a..050a80769de6 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -462,6 +462,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 8efc58716ce9..5798ce3db487 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -336,10 +336,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -351,6 +350,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -388,7 +390,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1603,7 +1606,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1678,6 +1681,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2035,13 +2062,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2767,6 +2793,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2981,6 +3008,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 79064580c8c0..697270ce1ea6 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -690,6 +690,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0006-crypto.patch b/sys-kernel/gentoo-sources-6.12/0006-crypto.patch new file mode 100644 index 0000000..195db65 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0006-crypto.patch @@ -0,0 +1,1606 @@ +From 03450504df5c4fe2d2ba5981aff7a532ab1ebf17 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:51:01 +0100 +Subject: [PATCH 06/12] crypto + +Signed-off-by: Peter Jung +--- + arch/x86/crypto/Kconfig | 4 +- + arch/x86/crypto/aegis128-aesni-asm.S | 533 ++++++++-------------- + arch/x86/crypto/aegis128-aesni-glue.c | 145 +++--- + arch/x86/crypto/crc32c-intel_glue.c | 2 +- + arch/x86/crypto/crc32c-pcl-intel-asm_64.S | 354 +++++--------- + 5 files changed, 387 insertions(+), 651 deletions(-) + +diff --git a/arch/x86/crypto/Kconfig b/arch/x86/crypto/Kconfig +index 7b1bebed879d..3d2e38ba5240 100644 +--- a/arch/x86/crypto/Kconfig ++++ b/arch/x86/crypto/Kconfig +@@ -363,7 +363,7 @@ config CRYPTO_CHACHA20_X86_64 + - AVX-512VL (Advanced Vector Extensions-512VL) + + config CRYPTO_AEGIS128_AESNI_SSE2 +- tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE2)" ++ tristate "AEAD ciphers: AEGIS-128 (AES-NI/SSE4.1)" + depends on X86 && 64BIT + select CRYPTO_AEAD + select CRYPTO_SIMD +@@ -372,7 +372,7 @@ config CRYPTO_AEGIS128_AESNI_SSE2 + + Architecture: x86_64 using: + - AES-NI (AES New Instructions) +- - SSE2 (Streaming SIMD Extensions 2) ++ - SSE4.1 (Streaming SIMD Extensions 4.1) + + config 
CRYPTO_NHPOLY1305_SSE2 + tristate "Hash functions: NHPoly1305 (SSE2)" +diff --git a/arch/x86/crypto/aegis128-aesni-asm.S b/arch/x86/crypto/aegis128-aesni-asm.S +index 2de859173940..7294dc0ee7ba 100644 +--- a/arch/x86/crypto/aegis128-aesni-asm.S ++++ b/arch/x86/crypto/aegis128-aesni-asm.S +@@ -1,14 +1,13 @@ + /* SPDX-License-Identifier: GPL-2.0-only */ + /* +- * AES-NI + SSE2 implementation of AEGIS-128 ++ * AES-NI + SSE4.1 implementation of AEGIS-128 + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. ++ * Copyright 2024 Google LLC + */ + + #include +-#include +-#include + + #define STATE0 %xmm0 + #define STATE1 %xmm1 +@@ -20,11 +19,6 @@ + #define T0 %xmm6 + #define T1 %xmm7 + +-#define STATEP %rdi +-#define LEN %esi +-#define SRC %rdx +-#define DST %rcx +- + .section .rodata.cst16.aegis128_const, "aM", @progbits, 32 + .align 16 + .Laegis128_const_0: +@@ -34,11 +28,11 @@ + .byte 0xdb, 0x3d, 0x18, 0x55, 0x6d, 0xc2, 0x2f, 0xf1 + .byte 0x20, 0x11, 0x31, 0x42, 0x73, 0xb5, 0x28, 0xdd + +-.section .rodata.cst16.aegis128_counter, "aM", @progbits, 16 +-.align 16 +-.Laegis128_counter: +- .byte 0x00, 0x01, 0x02, 0x03, 0x04, 0x05, 0x06, 0x07 +- .byte 0x08, 0x09, 0x0a, 0x0b, 0x0c, 0x0d, 0x0e, 0x0f ++.section .rodata.cst32.zeropad_mask, "aM", @progbits, 32 ++.align 32 ++.Lzeropad_mask: ++ .octa 0xffffffffffffffffffffffffffffffff ++ .octa 0 + + .text + +@@ -61,140 +55,102 @@ + .endm + + /* +- * __load_partial: internal ABI +- * input: +- * LEN - bytes +- * SRC - src +- * output: +- * MSG - message block +- * changed: +- * T0 +- * %r8 +- * %r9 ++ * Load 1 <= LEN (%ecx) <= 15 bytes from the pointer SRC into the xmm register ++ * MSG and zeroize any remaining bytes. Clobbers %rax, %rcx, and %r8. + */ +-SYM_FUNC_START_LOCAL(__load_partial) +- xor %r9d, %r9d +- pxor MSG, MSG +- +- mov LEN, %r8d +- and $0x1, %r8 +- jz .Lld_partial_1 +- +- mov LEN, %r8d +- and $0x1E, %r8 +- add SRC, %r8 +- mov (%r8), %r9b +- +-.Lld_partial_1: +- mov LEN, %r8d +- and $0x2, %r8 +- jz .Lld_partial_2 +- +- mov LEN, %r8d +- and $0x1C, %r8 +- add SRC, %r8 +- shl $0x10, %r9 +- mov (%r8), %r9w +- +-.Lld_partial_2: +- mov LEN, %r8d +- and $0x4, %r8 +- jz .Lld_partial_4 +- +- mov LEN, %r8d +- and $0x18, %r8 +- add SRC, %r8 +- shl $32, %r9 +- mov (%r8), %r8d +- xor %r8, %r9 +- +-.Lld_partial_4: +- movq %r9, MSG +- +- mov LEN, %r8d +- and $0x8, %r8 +- jz .Lld_partial_8 +- +- mov LEN, %r8d +- and $0x10, %r8 +- add SRC, %r8 +- pslldq $8, MSG +- movq (%r8), T0 +- pxor T0, MSG +- +-.Lld_partial_8: +- RET +-SYM_FUNC_END(__load_partial) ++.macro load_partial ++ sub $8, %ecx /* LEN - 8 */ ++ jle .Lle8\@ ++ ++ /* Load 9 <= LEN <= 15 bytes: */ ++ movq (SRC), MSG /* Load first 8 bytes */ ++ mov (SRC, %rcx), %rax /* Load last 8 bytes */ ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax /* Discard overlapping bytes */ ++ pinsrq $1, %rax, MSG ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx /* LEN - 4 */ ++ jl .Llt4\@ ++ ++ /* Load 4 <= LEN <= 8 bytes: */ ++ mov (SRC), %eax /* Load first 4 bytes */ ++ mov (SRC, %rcx), %r8d /* Load last 4 bytes */ ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ /* Load 1 <= LEN <= 3 bytes: */ ++ add $2, %ecx /* LEN - 2 */ ++ movzbl (SRC), %eax /* Load first byte */ ++ jl .Lmovq\@ ++ movzwl (SRC, %rcx), %r8d /* Load last 2 bytes */ ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, %r8 ++ or %r8, %rax /* Combine the two parts */ ++.Lmovq\@: ++ movq %rax, MSG ++.Ldone\@: ++.endm + + /* +- * __store_partial: internal ABI +- * input: +- * LEN - bytes +- * DST - dst +- * output: +- * T0 - 
message block +- * changed: +- * %r8 +- * %r9 +- * %r10 ++ * Store 1 <= LEN (%ecx) <= 15 bytes from the xmm register \msg to the pointer ++ * DST. Clobbers %rax, %rcx, and %r8. + */ +-SYM_FUNC_START_LOCAL(__store_partial) +- mov LEN, %r8d +- mov DST, %r9 +- +- movq T0, %r10 +- +- cmp $8, %r8 +- jl .Lst_partial_8 +- +- mov %r10, (%r9) +- psrldq $8, T0 +- movq T0, %r10 +- +- sub $8, %r8 +- add $8, %r9 +- +-.Lst_partial_8: +- cmp $4, %r8 +- jl .Lst_partial_4 +- +- mov %r10d, (%r9) +- shr $32, %r10 +- +- sub $4, %r8 +- add $4, %r9 +- +-.Lst_partial_4: +- cmp $2, %r8 +- jl .Lst_partial_2 +- +- mov %r10w, (%r9) +- shr $0x10, %r10 +- +- sub $2, %r8 +- add $2, %r9 +- +-.Lst_partial_2: +- cmp $1, %r8 +- jl .Lst_partial_1 +- +- mov %r10b, (%r9) +- +-.Lst_partial_1: +- RET +-SYM_FUNC_END(__store_partial) ++.macro store_partial msg ++ sub $8, %ecx /* LEN - 8 */ ++ jl .Llt8\@ ++ ++ /* Store 8 <= LEN <= 15 bytes: */ ++ pextrq $1, \msg, %rax ++ mov %ecx, %r8d ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (DST, %r8) /* Store last LEN - 8 bytes */ ++ movq \msg, (DST) /* Store first 8 bytes */ ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx /* LEN - 4 */ ++ jl .Llt4\@ ++ ++ /* Store 4 <= LEN <= 7 bytes: */ ++ pextrd $1, \msg, %eax ++ mov %ecx, %r8d ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (DST, %r8) /* Store last LEN - 4 bytes */ ++ movd \msg, (DST) /* Store first 4 bytes */ ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ /* Store 1 <= LEN <= 3 bytes: */ ++ pextrb $0, \msg, 0(DST) ++ cmp $-2, %ecx /* LEN - 4 == -2, i.e. LEN == 2? */ ++ jl .Ldone\@ ++ pextrb $1, \msg, 1(DST) ++ je .Ldone\@ ++ pextrb $2, \msg, 2(DST) ++.Ldone\@: ++.endm + + /* +- * void crypto_aegis128_aesni_init(void *state, const void *key, const void *iv); ++ * void aegis128_aesni_init(struct aegis_state *state, ++ * const struct aegis_block *key, ++ * const u8 iv[AEGIS128_NONCE_SIZE]); + */ +-SYM_FUNC_START(crypto_aegis128_aesni_init) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_init) ++ .set STATEP, %rdi ++ .set KEYP, %rsi ++ .set IVP, %rdx + + /* load IV: */ +- movdqu (%rdx), T1 ++ movdqu (IVP), T1 + + /* load key: */ +- movdqa (%rsi), KEY ++ movdqa (KEYP), KEY + pxor KEY, T1 + movdqa T1, STATE0 + movdqa KEY, STATE3 +@@ -224,20 +180,22 @@ SYM_FUNC_START(crypto_aegis128_aesni_init) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_init) ++SYM_FUNC_END(aegis128_aesni_init) + + /* +- * void crypto_aegis128_aesni_ad(void *state, unsigned int length, +- * const void *data); ++ * void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, ++ * unsigned int len); ++ * ++ * len must be a multiple of 16. 
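/*
 * A rough C equivalent (little-endian, not from the patch) of the
 * overlapping-read trick the load_partial macro above uses for the
 * 9 <= LEN <= 15 case: read the first and last 8 bytes, shift the tail to
 * discard the bytes that overlap the head, and combine. Function and
 * variable names here are made up for illustration.
 */
#include <stdint.h>
#include <string.h>

static void sketch_load_partial_9_to_15(const uint8_t *src, unsigned int len,
					 uint64_t out[2])
{
	uint64_t head, tail;

	memcpy(&head, src, 8);			/* bytes 0..7 */
	memcpy(&tail, src + len - 8, 8);	/* bytes len-8..len-1 */
	tail >>= 8 * (16 - len);		/* drop bytes already in head */
	out[0] = head;				/* message block, low half */
	out[1] = tail;				/* high half, zero padded */
}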
+ */ +-SYM_FUNC_START(crypto_aegis128_aesni_ad) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_ad) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set LEN, %edx + +- cmp $0x10, LEN +- jb .Lad_out ++ test LEN, LEN ++ jz .Lad_out + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -246,89 +204,40 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- and $0xF, %r8 +- jnz .Lad_u_loop +- +-.align 8 +-.Lad_a_loop: +- movdqa 0x00(SRC), MSG +- aegis128_update +- pxor MSG, STATE4 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_1 +- +- movdqa 0x10(SRC), MSG +- aegis128_update +- pxor MSG, STATE3 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_2 +- +- movdqa 0x20(SRC), MSG +- aegis128_update +- pxor MSG, STATE2 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_3 +- +- movdqa 0x30(SRC), MSG +- aegis128_update +- pxor MSG, STATE1 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_4 +- +- movdqa 0x40(SRC), MSG +- aegis128_update +- pxor MSG, STATE0 +- sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_0 +- +- add $0x50, SRC +- jmp .Lad_a_loop +- + .align 8 +-.Lad_u_loop: ++.Lad_loop: + movdqu 0x00(SRC), MSG + aegis128_update + pxor MSG, STATE4 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_1 ++ jz .Lad_out_1 + + movdqu 0x10(SRC), MSG + aegis128_update + pxor MSG, STATE3 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_2 ++ jz .Lad_out_2 + + movdqu 0x20(SRC), MSG + aegis128_update + pxor MSG, STATE2 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_3 ++ jz .Lad_out_3 + + movdqu 0x30(SRC), MSG + aegis128_update + pxor MSG, STATE1 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_4 ++ jz .Lad_out_4 + + movdqu 0x40(SRC), MSG + aegis128_update + pxor MSG, STATE0 + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lad_out_0 ++ jz .Lad_out_0 + + add $0x50, SRC +- jmp .Lad_u_loop ++ jmp .Lad_loop + + /* store the state: */ + .Lad_out_0: +@@ -337,7 +246,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_1: +@@ -346,7 +254,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_2: +@@ -355,7 +262,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_3: +@@ -364,7 +270,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Lad_out_4: +@@ -373,41 +278,38 @@ SYM_FUNC_START(crypto_aegis128_aesni_ad) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END +- RET +- + .Lad_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_ad) ++SYM_FUNC_END(aegis128_aesni_ad) + +-.macro encrypt_block a s0 s1 s2 s3 s4 i +- movdq\a (\i * 0x10)(SRC), MSG ++.macro encrypt_block s0 s1 s2 s3 s4 i ++ movdqu (\i * 0x10)(SRC), MSG + movdqa MSG, T0 + pxor \s1, T0 + pxor \s4, T0 + movdqa \s2, T1 + pand \s3, T1 + pxor T1, T0 +- movdq\a T0, (\i * 0x10)(DST) ++ movdqu T0, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Lenc_out_\i ++ jz .Lenc_out_\i + .endm + + /* +- * void crypto_aegis128_aesni_enc(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_enc(struct aegis_state *state, const u8 
*src, u8 *dst, ++ * unsigned int len); ++ * ++ * len must be nonzero and a multiple of 16. + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) +- FRAME_BEGIN +- +- cmp $0x10, LEN +- jb .Lenc_out ++SYM_FUNC_START(aegis128_aesni_enc) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -416,34 +318,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- or DST, %r8 +- and $0xF, %r8 +- jnz .Lenc_u_loop +- + .align 8 +-.Lenc_a_loop: +- encrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +- encrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +- encrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +- encrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +- encrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 ++.Lenc_loop: ++ encrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 ++ encrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 ++ encrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 ++ encrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 ++ encrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST +- jmp .Lenc_a_loop +- +-.align 8 +-.Lenc_u_loop: +- encrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +- encrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +- encrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +- encrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +- encrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +- +- add $0x50, SRC +- add $0x50, DST +- jmp .Lenc_u_loop ++ jmp .Lenc_loop + + /* store the state: */ + .Lenc_out_0: +@@ -452,7 +337,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_1: +@@ -461,7 +345,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_2: +@@ -470,7 +353,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_3: +@@ -479,7 +361,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END + RET + + .Lenc_out_4: +@@ -488,20 +369,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + movdqu STATE4, 0x40(STATEP) +- FRAME_END +- RET +- + .Lenc_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_enc) ++SYM_FUNC_END(aegis128_aesni_enc) + + /* +- * void crypto_aegis128_aesni_enc_tail(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_enc_tail(struct aegis_state *state, const u8 *src, ++ * u8 *dst, unsigned int len); + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_enc_tail) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -511,7 +391,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + movdqu 0x40(STATEP), STATE4 + + /* encrypt message: */ +- call __load_partial ++ mov LEN, %r9d ++ load_partial + + movdqa MSG, T0 + pxor STATE1, T0 +@@ -520,7 +401,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + pand 
STATE3, T1 + pxor T1, T0 + +- call __store_partial ++ mov %r9d, LEN ++ store_partial T0 + + aegis128_update + pxor MSG, STATE4 +@@ -531,37 +413,36 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_enc_tail) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_enc_tail) ++SYM_FUNC_END(aegis128_aesni_enc_tail) + +-.macro decrypt_block a s0 s1 s2 s3 s4 i +- movdq\a (\i * 0x10)(SRC), MSG ++.macro decrypt_block s0 s1 s2 s3 s4 i ++ movdqu (\i * 0x10)(SRC), MSG + pxor \s1, MSG + pxor \s4, MSG + movdqa \s2, T1 + pand \s3, T1 + pxor T1, MSG +- movdq\a MSG, (\i * 0x10)(DST) ++ movdqu MSG, (\i * 0x10)(DST) + + aegis128_update + pxor MSG, \s4 + + sub $0x10, LEN +- cmp $0x10, LEN +- jl .Ldec_out_\i ++ jz .Ldec_out_\i + .endm + + /* +- * void crypto_aegis128_aesni_dec(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, u8 *dst, ++ * unsigned int len); ++ * ++ * len must be nonzero and a multiple of 16. + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) +- FRAME_BEGIN +- +- cmp $0x10, LEN +- jb .Ldec_out ++SYM_FUNC_START(aegis128_aesni_dec) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -570,34 +451,17 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu 0x30(STATEP), STATE3 + movdqu 0x40(STATEP), STATE4 + +- mov SRC, %r8 +- or DST, %r8 +- and $0xF, %r8 +- jnz .Ldec_u_loop +- + .align 8 +-.Ldec_a_loop: +- decrypt_block a STATE0 STATE1 STATE2 STATE3 STATE4 0 +- decrypt_block a STATE4 STATE0 STATE1 STATE2 STATE3 1 +- decrypt_block a STATE3 STATE4 STATE0 STATE1 STATE2 2 +- decrypt_block a STATE2 STATE3 STATE4 STATE0 STATE1 3 +- decrypt_block a STATE1 STATE2 STATE3 STATE4 STATE0 4 ++.Ldec_loop: ++ decrypt_block STATE0 STATE1 STATE2 STATE3 STATE4 0 ++ decrypt_block STATE4 STATE0 STATE1 STATE2 STATE3 1 ++ decrypt_block STATE3 STATE4 STATE0 STATE1 STATE2 2 ++ decrypt_block STATE2 STATE3 STATE4 STATE0 STATE1 3 ++ decrypt_block STATE1 STATE2 STATE3 STATE4 STATE0 4 + + add $0x50, SRC + add $0x50, DST +- jmp .Ldec_a_loop +- +-.align 8 +-.Ldec_u_loop: +- decrypt_block u STATE0 STATE1 STATE2 STATE3 STATE4 0 +- decrypt_block u STATE4 STATE0 STATE1 STATE2 STATE3 1 +- decrypt_block u STATE3 STATE4 STATE0 STATE1 STATE2 2 +- decrypt_block u STATE2 STATE3 STATE4 STATE0 STATE1 3 +- decrypt_block u STATE1 STATE2 STATE3 STATE4 STATE0 4 +- +- add $0x50, SRC +- add $0x50, DST +- jmp .Ldec_u_loop ++ jmp .Ldec_loop + + /* store the state: */ + .Ldec_out_0: +@@ -606,7 +470,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_1: +@@ -615,7 +478,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE0, 0x20(STATEP) + movdqu STATE1, 0x30(STATEP) + movdqu STATE2, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_2: +@@ -624,7 +486,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE4, 0x20(STATEP) + movdqu STATE0, 0x30(STATEP) + movdqu STATE1, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_3: +@@ -633,7 +494,6 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE3, 0x20(STATEP) + movdqu STATE4, 0x30(STATEP) + movdqu STATE0, 0x40(STATEP) +- FRAME_END + RET + + .Ldec_out_4: +@@ -642,20 +502,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec) + movdqu STATE2, 0x20(STATEP) + movdqu STATE3, 0x30(STATEP) + 
movdqu STATE4, 0x40(STATEP) +- FRAME_END +- RET +- + .Ldec_out: +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_dec) ++SYM_FUNC_END(aegis128_aesni_dec) + + /* +- * void crypto_aegis128_aesni_dec_tail(void *state, unsigned int length, +- * const void *src, void *dst); ++ * void aegis128_aesni_dec_tail(struct aegis_state *state, const u8 *src, ++ * u8 *dst, unsigned int len); + */ +-SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_dec_tail) ++ .set STATEP, %rdi ++ .set SRC, %rsi ++ .set DST, %rdx ++ .set LEN, %ecx /* {load,store}_partial rely on this being %ecx */ + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -665,7 +524,8 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + movdqu 0x40(STATEP), STATE4 + + /* decrypt message: */ +- call __load_partial ++ mov LEN, %r9d ++ load_partial + + pxor STATE1, MSG + pxor STATE4, MSG +@@ -673,17 +533,13 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + pand STATE3, T1 + pxor T1, MSG + +- movdqa MSG, T0 +- call __store_partial ++ mov %r9d, LEN ++ store_partial MSG + + /* mask with byte count: */ +- movd LEN, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- punpcklbw T0, T0 +- movdqa .Laegis128_counter(%rip), T1 +- pcmpgtb T1, T0 ++ lea .Lzeropad_mask+16(%rip), %rax ++ sub %r9, %rax ++ movdqu (%rax), T0 + pand T0, MSG + + aegis128_update +@@ -695,18 +551,19 @@ SYM_TYPED_FUNC_START(crypto_aegis128_aesni_dec_tail) + movdqu STATE1, 0x20(STATEP) + movdqu STATE2, 0x30(STATEP) + movdqu STATE3, 0x40(STATEP) +- +- FRAME_END + RET +-SYM_FUNC_END(crypto_aegis128_aesni_dec_tail) ++SYM_FUNC_END(aegis128_aesni_dec_tail) + + /* +- * void crypto_aegis128_aesni_final(void *state, void *tag_xor, +- * unsigned int assoclen, +- * unsigned int cryptlen); ++ * void aegis128_aesni_final(struct aegis_state *state, ++ * struct aegis_block *tag_xor, ++ * unsigned int assoclen, unsigned int cryptlen); + */ +-SYM_FUNC_START(crypto_aegis128_aesni_final) +- FRAME_BEGIN ++SYM_FUNC_START(aegis128_aesni_final) ++ .set STATEP, %rdi ++ .set TAG_XOR, %rsi ++ .set ASSOCLEN, %edx ++ .set CRYPTLEN, %ecx + + /* load the state: */ + movdqu 0x00(STATEP), STATE0 +@@ -716,10 +573,8 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + movdqu 0x40(STATEP), STATE4 + + /* prepare length block: */ +- movd %edx, MSG +- movd %ecx, T0 +- pslldq $8, T0 +- pxor T0, MSG ++ movd ASSOCLEN, MSG ++ pinsrd $2, CRYPTLEN, MSG + psllq $3, MSG /* multiply by 8 (to get bit count) */ + + pxor STATE3, MSG +@@ -734,7 +589,7 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + aegis128_update; pxor MSG, STATE3 + + /* xor tag: */ +- movdqu (%rsi), MSG ++ movdqu (TAG_XOR), MSG + + pxor STATE0, MSG + pxor STATE1, MSG +@@ -742,8 +597,6 @@ SYM_FUNC_START(crypto_aegis128_aesni_final) + pxor STATE3, MSG + pxor STATE4, MSG + +- movdqu MSG, (%rsi) +- +- FRAME_END ++ movdqu MSG, (TAG_XOR) + RET +-SYM_FUNC_END(crypto_aegis128_aesni_final) ++SYM_FUNC_END(aegis128_aesni_final) +diff --git a/arch/x86/crypto/aegis128-aesni-glue.c b/arch/x86/crypto/aegis128-aesni-glue.c +index 4623189000d8..c19d8e3d96a3 100644 +--- a/arch/x86/crypto/aegis128-aesni-glue.c ++++ b/arch/x86/crypto/aegis128-aesni-glue.c +@@ -1,7 +1,7 @@ + // SPDX-License-Identifier: GPL-2.0-or-later + /* + * The AEGIS-128 Authenticated-Encryption Algorithm +- * Glue for AES-NI + SSE2 implementation ++ * Glue for AES-NI + SSE4.1 implementation + * + * Copyright (c) 2017-2018 Ondrej Mosnacek + * Copyright (C) 2017-2018 Red Hat, Inc. All rights reserved. 
+@@ -23,27 +23,6 @@ + #define AEGIS128_MIN_AUTH_SIZE 8 + #define AEGIS128_MAX_AUTH_SIZE 16 + +-asmlinkage void crypto_aegis128_aesni_init(void *state, void *key, void *iv); +- +-asmlinkage void crypto_aegis128_aesni_ad( +- void *state, unsigned int length, const void *data); +- +-asmlinkage void crypto_aegis128_aesni_enc( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_dec( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_enc_tail( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_dec_tail( +- void *state, unsigned int length, const void *src, void *dst); +- +-asmlinkage void crypto_aegis128_aesni_final( +- void *state, void *tag_xor, unsigned int cryptlen, +- unsigned int assoclen); +- + struct aegis_block { + u8 bytes[AEGIS128_BLOCK_SIZE] __aligned(AEGIS128_BLOCK_ALIGN); + }; +@@ -56,15 +35,31 @@ struct aegis_ctx { + struct aegis_block key; + }; + +-struct aegis_crypt_ops { +- int (*skcipher_walk_init)(struct skcipher_walk *walk, +- struct aead_request *req, bool atomic); ++asmlinkage void aegis128_aesni_init(struct aegis_state *state, ++ const struct aegis_block *key, ++ const u8 iv[AEGIS128_NONCE_SIZE]); + +- void (*crypt_blocks)(void *state, unsigned int length, const void *src, +- void *dst); +- void (*crypt_tail)(void *state, unsigned int length, const void *src, +- void *dst); +-}; ++asmlinkage void aegis128_aesni_ad(struct aegis_state *state, const u8 *data, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_enc(struct aegis_state *state, const u8 *src, ++ u8 *dst, unsigned int len); ++ ++asmlinkage void aegis128_aesni_dec(struct aegis_state *state, const u8 *src, ++ u8 *dst, unsigned int len); ++ ++asmlinkage void aegis128_aesni_enc_tail(struct aegis_state *state, ++ const u8 *src, u8 *dst, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_dec_tail(struct aegis_state *state, ++ const u8 *src, u8 *dst, ++ unsigned int len); ++ ++asmlinkage void aegis128_aesni_final(struct aegis_state *state, ++ struct aegis_block *tag_xor, ++ unsigned int assoclen, ++ unsigned int cryptlen); + + static void crypto_aegis128_aesni_process_ad( + struct aegis_state *state, struct scatterlist *sg_src, +@@ -85,16 +80,15 @@ static void crypto_aegis128_aesni_process_ad( + if (pos > 0) { + unsigned int fill = AEGIS128_BLOCK_SIZE - pos; + memcpy(buf.bytes + pos, src, fill); +- crypto_aegis128_aesni_ad(state, +- AEGIS128_BLOCK_SIZE, +- buf.bytes); ++ aegis128_aesni_ad(state, buf.bytes, ++ AEGIS128_BLOCK_SIZE); + pos = 0; + left -= fill; + src += fill; + } + +- crypto_aegis128_aesni_ad(state, left, src); +- ++ aegis128_aesni_ad(state, src, ++ left & ~(AEGIS128_BLOCK_SIZE - 1)); + src += left & ~(AEGIS128_BLOCK_SIZE - 1); + left &= AEGIS128_BLOCK_SIZE - 1; + } +@@ -110,24 +104,37 @@ static void crypto_aegis128_aesni_process_ad( + + if (pos > 0) { + memset(buf.bytes + pos, 0, AEGIS128_BLOCK_SIZE - pos); +- crypto_aegis128_aesni_ad(state, AEGIS128_BLOCK_SIZE, buf.bytes); ++ aegis128_aesni_ad(state, buf.bytes, AEGIS128_BLOCK_SIZE); + } + } + +-static void crypto_aegis128_aesni_process_crypt( +- struct aegis_state *state, struct skcipher_walk *walk, +- const struct aegis_crypt_ops *ops) ++static __always_inline void ++crypto_aegis128_aesni_process_crypt(struct aegis_state *state, ++ struct skcipher_walk *walk, bool enc) + { + while (walk->nbytes >= AEGIS128_BLOCK_SIZE) { +- ops->crypt_blocks(state, +- 
round_down(walk->nbytes, AEGIS128_BLOCK_SIZE), +- walk->src.virt.addr, walk->dst.virt.addr); ++ if (enc) ++ aegis128_aesni_enc(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ round_down(walk->nbytes, ++ AEGIS128_BLOCK_SIZE)); ++ else ++ aegis128_aesni_dec(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ round_down(walk->nbytes, ++ AEGIS128_BLOCK_SIZE)); + skcipher_walk_done(walk, walk->nbytes % AEGIS128_BLOCK_SIZE); + } + + if (walk->nbytes) { +- ops->crypt_tail(state, walk->nbytes, walk->src.virt.addr, +- walk->dst.virt.addr); ++ if (enc) ++ aegis128_aesni_enc_tail(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ walk->nbytes); ++ else ++ aegis128_aesni_dec_tail(state, walk->src.virt.addr, ++ walk->dst.virt.addr, ++ walk->nbytes); + skcipher_walk_done(walk, 0); + } + } +@@ -162,42 +169,39 @@ static int crypto_aegis128_aesni_setauthsize(struct crypto_aead *tfm, + return 0; + } + +-static void crypto_aegis128_aesni_crypt(struct aead_request *req, +- struct aegis_block *tag_xor, +- unsigned int cryptlen, +- const struct aegis_crypt_ops *ops) ++static __always_inline void ++crypto_aegis128_aesni_crypt(struct aead_request *req, ++ struct aegis_block *tag_xor, ++ unsigned int cryptlen, bool enc) + { + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_ctx *ctx = crypto_aegis128_aesni_ctx(tfm); + struct skcipher_walk walk; + struct aegis_state state; + +- ops->skcipher_walk_init(&walk, req, true); ++ if (enc) ++ skcipher_walk_aead_encrypt(&walk, req, true); ++ else ++ skcipher_walk_aead_decrypt(&walk, req, true); + + kernel_fpu_begin(); + +- crypto_aegis128_aesni_init(&state, ctx->key.bytes, req->iv); ++ aegis128_aesni_init(&state, &ctx->key, req->iv); + crypto_aegis128_aesni_process_ad(&state, req->src, req->assoclen); +- crypto_aegis128_aesni_process_crypt(&state, &walk, ops); +- crypto_aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); ++ crypto_aegis128_aesni_process_crypt(&state, &walk, enc); ++ aegis128_aesni_final(&state, tag_xor, req->assoclen, cryptlen); + + kernel_fpu_end(); + } + + static int crypto_aegis128_aesni_encrypt(struct aead_request *req) + { +- static const struct aegis_crypt_ops OPS = { +- .skcipher_walk_init = skcipher_walk_aead_encrypt, +- .crypt_blocks = crypto_aegis128_aesni_enc, +- .crypt_tail = crypto_aegis128_aesni_enc_tail, +- }; +- + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag = {}; + unsigned int authsize = crypto_aead_authsize(tfm); + unsigned int cryptlen = req->cryptlen; + +- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); ++ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, true); + + scatterwalk_map_and_copy(tag.bytes, req->dst, + req->assoclen + cryptlen, authsize, 1); +@@ -208,12 +212,6 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) + { + static const struct aegis_block zeros = {}; + +- static const struct aegis_crypt_ops OPS = { +- .skcipher_walk_init = skcipher_walk_aead_decrypt, +- .crypt_blocks = crypto_aegis128_aesni_dec, +- .crypt_tail = crypto_aegis128_aesni_dec_tail, +- }; +- + struct crypto_aead *tfm = crypto_aead_reqtfm(req); + struct aegis_block tag; + unsigned int authsize = crypto_aead_authsize(tfm); +@@ -222,27 +220,16 @@ static int crypto_aegis128_aesni_decrypt(struct aead_request *req) + scatterwalk_map_and_copy(tag.bytes, req->src, + req->assoclen + cryptlen, authsize, 0); + +- crypto_aegis128_aesni_crypt(req, &tag, cryptlen, &OPS); ++ crypto_aegis128_aesni_crypt(req, &tag, cryptlen, false); + + return crypto_memneq(tag.bytes, 
zeros.bytes, authsize) ? -EBADMSG : 0; + } + +-static int crypto_aegis128_aesni_init_tfm(struct crypto_aead *aead) +-{ +- return 0; +-} +- +-static void crypto_aegis128_aesni_exit_tfm(struct crypto_aead *aead) +-{ +-} +- + static struct aead_alg crypto_aegis128_aesni_alg = { + .setkey = crypto_aegis128_aesni_setkey, + .setauthsize = crypto_aegis128_aesni_setauthsize, + .encrypt = crypto_aegis128_aesni_encrypt, + .decrypt = crypto_aegis128_aesni_decrypt, +- .init = crypto_aegis128_aesni_init_tfm, +- .exit = crypto_aegis128_aesni_exit_tfm, + + .ivsize = AEGIS128_NONCE_SIZE, + .maxauthsize = AEGIS128_MAX_AUTH_SIZE, +@@ -267,7 +254,7 @@ static struct simd_aead_alg *simd_alg; + + static int __init crypto_aegis128_aesni_module_init(void) + { +- if (!boot_cpu_has(X86_FEATURE_XMM2) || ++ if (!boot_cpu_has(X86_FEATURE_XMM4_1) || + !boot_cpu_has(X86_FEATURE_AES) || + !cpu_has_xfeatures(XFEATURE_MASK_SSE, NULL)) + return -ENODEV; +@@ -286,6 +273,6 @@ module_exit(crypto_aegis128_aesni_module_exit); + + MODULE_LICENSE("GPL"); + MODULE_AUTHOR("Ondrej Mosnacek "); +-MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE2 implementation"); ++MODULE_DESCRIPTION("AEGIS-128 AEAD algorithm -- AESNI+SSE4.1 implementation"); + MODULE_ALIAS_CRYPTO("aegis128"); + MODULE_ALIAS_CRYPTO("aegis128-aesni"); +diff --git a/arch/x86/crypto/crc32c-intel_glue.c b/arch/x86/crypto/crc32c-intel_glue.c +index feccb5254c7e..52c5d47ef5a1 100644 +--- a/arch/x86/crypto/crc32c-intel_glue.c ++++ b/arch/x86/crypto/crc32c-intel_glue.c +@@ -41,7 +41,7 @@ + */ + #define CRC32C_PCL_BREAKEVEN 512 + +-asmlinkage unsigned int crc_pcl(const u8 *buffer, int len, ++asmlinkage unsigned int crc_pcl(const u8 *buffer, unsigned int len, + unsigned int crc_init); + #endif /* CONFIG_X86_64 */ + +diff --git a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +index bbcff1fb78cb..752812bc4991 100644 +--- a/arch/x86/crypto/crc32c-pcl-intel-asm_64.S ++++ b/arch/x86/crypto/crc32c-pcl-intel-asm_64.S +@@ -7,6 +7,7 @@ + * http://www.intel.com/content/dam/www/public/us/en/documents/white-papers/fast-crc-computation-paper.pdf + * + * Copyright (C) 2012 Intel Corporation. ++ * Copyright 2024 Google LLC + * + * Authors: + * Wajdi Feghali +@@ -44,185 +45,129 @@ + */ + + #include +-#include + + ## ISCSI CRC 32 Implementation with crc32 and pclmulqdq Instruction + +-.macro LABEL prefix n +-.L\prefix\n\(): +-.endm +- +-.macro JMPTBL_ENTRY i +-.quad .Lcrc_\i +-.endm +- +-.macro JNC_LESS_THAN j +- jnc .Lless_than_\j +-.endm +- +-# Define threshold where buffers are considered "small" and routed to more +-# efficient "by-1" code. This "by-1" code only handles up to 255 bytes, so +-# SMALL_SIZE can be no larger than 255. +- ++# Define threshold below which buffers are considered "small" and routed to ++# regular CRC code that does not interleave the CRC instructions. 
+ #define SMALL_SIZE 200 + +-.if (SMALL_SIZE > 255) +-.error "SMALL_ SIZE must be < 256" +-.endif +- +-# unsigned int crc_pcl(u8 *buffer, int len, unsigned int crc_init); ++# unsigned int crc_pcl(const u8 *buffer, unsigned int len, unsigned int crc_init); + + .text + SYM_FUNC_START(crc_pcl) +-#define bufp rdi +-#define bufp_dw %edi +-#define bufp_w %di +-#define bufp_b %dil +-#define bufptmp %rcx +-#define block_0 %rcx +-#define block_1 %rdx +-#define block_2 %r11 +-#define len %rsi +-#define len_dw %esi +-#define len_w %si +-#define len_b %sil +-#define crc_init_arg %rdx +-#define tmp %rbx +-#define crc_init %r8 +-#define crc_init_dw %r8d +-#define crc1 %r9 +-#define crc2 %r10 +- +- pushq %rbx +- pushq %rdi +- pushq %rsi +- +- ## Move crc_init for Linux to a different +- mov crc_init_arg, crc_init ++#define bufp %rdi ++#define bufp_d %edi ++#define len %esi ++#define crc_init %edx ++#define crc_init_q %rdx ++#define n_misaligned %ecx /* overlaps chunk_bytes! */ ++#define n_misaligned_q %rcx ++#define chunk_bytes %ecx /* overlaps n_misaligned! */ ++#define chunk_bytes_q %rcx ++#define crc1 %r8 ++#define crc2 %r9 ++ ++ cmp $SMALL_SIZE, len ++ jb .Lsmall + + ################################################################ + ## 1) ALIGN: + ################################################################ +- +- mov %bufp, bufptmp # rdi = *buf +- neg %bufp +- and $7, %bufp # calculate the unalignment amount of ++ mov bufp_d, n_misaligned ++ neg n_misaligned ++ and $7, n_misaligned # calculate the misalignment amount of + # the address +- je .Lproc_block # Skip if aligned +- +- ## If len is less than 8 and we're unaligned, we need to jump +- ## to special code to avoid reading beyond the end of the buffer +- cmp $8, len +- jae .Ldo_align +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $32-3+1, len_dw +- jmp .Lless_than_8_post_shl1 ++ je .Laligned # Skip if aligned + ++ # Process 1 <= n_misaligned <= 7 bytes individually in order to align ++ # the remaining data to an 8-byte boundary. 
+ .Ldo_align: +- #### Calculate CRC of unaligned bytes of the buffer (if any) +- movq (bufptmp), tmp # load a quadward from the buffer +- add %bufp, bufptmp # align buffer pointer for quadword +- # processing +- sub %bufp, len # update buffer length ++ movq (bufp), %rax ++ add n_misaligned_q, bufp ++ sub n_misaligned, len + .Lalign_loop: +- crc32b %bl, crc_init_dw # compute crc32 of 1-byte +- shr $8, tmp # get next byte +- dec %bufp ++ crc32b %al, crc_init # compute crc32 of 1-byte ++ shr $8, %rax # get next byte ++ dec n_misaligned + jne .Lalign_loop +- +-.Lproc_block: ++.Laligned: + + ################################################################ +- ## 2) PROCESS BLOCKS: ++ ## 2) PROCESS BLOCK: + ################################################################ + +- ## compute num of bytes to be processed +- movq len, tmp # save num bytes in tmp +- +- cmpq $128*24, len ++ cmp $128*24, len + jae .Lfull_block + +-.Lcontinue_block: +- cmpq $SMALL_SIZE, len +- jb .Lsmall +- +- ## len < 128*24 +- movq $2731, %rax # 2731 = ceil(2^16 / 24) +- mul len_dw +- shrq $16, %rax +- +- ## eax contains floor(bytes / 24) = num 24-byte chunks to do +- +- ## process rax 24-byte chunks (128 >= rax >= 0) +- +- ## compute end address of each block +- ## block 0 (base addr + RAX * 8) +- ## block 1 (base addr + RAX * 16) +- ## block 2 (base addr + RAX * 24) +- lea (bufptmp, %rax, 8), block_0 +- lea (block_0, %rax, 8), block_1 +- lea (block_1, %rax, 8), block_2 ++.Lpartial_block: ++ # Compute floor(len / 24) to get num qwords to process from each lane. ++ imul $2731, len, %eax # 2731 = ceil(2^16 / 24) ++ shr $16, %eax ++ jmp .Lcrc_3lanes + +- xor crc1, crc1 +- xor crc2, crc2 +- +- ## branch into array +- leaq jump_table(%rip), %bufp +- mov (%bufp,%rax,8), %bufp +- JMP_NOSPEC bufp +- +- ################################################################ +- ## 2a) PROCESS FULL BLOCKS: +- ################################################################ + .Lfull_block: +- movl $128,%eax +- lea 128*8*2(block_0), block_1 +- lea 128*8*3(block_0), block_2 +- add $128*8*1, block_0 +- +- xor crc1,crc1 +- xor crc2,crc2 +- +- # Fall through into top of crc array (crc_128) ++ # Processing 128 qwords from each lane. ++ mov $128, %eax + + ################################################################ +- ## 3) CRC Array: ++ ## 3) CRC each of three lanes: + ################################################################ + +- i=128 +-.rept 128-1 +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init +- crc32q -i*8(block_1), crc1 +- crc32q -i*8(block_2), crc2 +- i=(i-1) +-.endr +- +-.altmacro +-LABEL crc_ %i +-.noaltmacro +- ENDBR +- crc32q -i*8(block_0), crc_init +- crc32q -i*8(block_1), crc1 +-# SKIP crc32 -i*8(block_2), crc2 ; Don't do this one yet +- +- mov block_2, block_0 ++.Lcrc_3lanes: ++ xor crc1,crc1 ++ xor crc2,crc2 ++ mov %eax, chunk_bytes ++ shl $3, chunk_bytes # num bytes to process from each lane ++ sub $5, %eax # 4 for 4x_loop, 1 for special last iter ++ jl .Lcrc_3lanes_4x_done ++ ++ # Unroll the loop by a factor of 4 to reduce the overhead of the loop ++ # bookkeeping instructions, which can compete with crc32q for the ALUs. 
++.Lcrc_3lanes_4x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ crc32q 8(bufp), crc_init_q ++ crc32q 8(bufp,chunk_bytes_q), crc1 ++ crc32q 8(bufp,chunk_bytes_q,2), crc2 ++ crc32q 16(bufp), crc_init_q ++ crc32q 16(bufp,chunk_bytes_q), crc1 ++ crc32q 16(bufp,chunk_bytes_q,2), crc2 ++ crc32q 24(bufp), crc_init_q ++ crc32q 24(bufp,chunk_bytes_q), crc1 ++ crc32q 24(bufp,chunk_bytes_q,2), crc2 ++ add $32, bufp ++ sub $4, %eax ++ jge .Lcrc_3lanes_4x_loop ++ ++.Lcrc_3lanes_4x_done: ++ add $4, %eax ++ jz .Lcrc_3lanes_last_qword ++ ++.Lcrc_3lanes_1x_loop: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++ crc32q (bufp,chunk_bytes_q,2), crc2 ++ add $8, bufp ++ dec %eax ++ jnz .Lcrc_3lanes_1x_loop ++ ++.Lcrc_3lanes_last_qword: ++ crc32q (bufp), crc_init_q ++ crc32q (bufp,chunk_bytes_q), crc1 ++# SKIP crc32q (bufp,chunk_bytes_q,2), crc2 ; Don't do this one yet + + ################################################################ + ## 4) Combine three results: + ################################################################ + +- lea (K_table-8)(%rip), %bufp # first entry is for idx 1 +- shlq $3, %rax # rax *= 8 +- pmovzxdq (%bufp,%rax), %xmm0 # 2 consts: K1:K2 +- leal (%eax,%eax,2), %eax # rax *= 3 (total *24) +- subq %rax, tmp # tmp -= rax*24 ++ lea (K_table-8)(%rip), %rax # first entry is for idx 1 ++ pmovzxdq (%rax,chunk_bytes_q), %xmm0 # 2 consts: K1:K2 ++ lea (chunk_bytes,chunk_bytes,2), %eax # chunk_bytes * 3 ++ sub %eax, len # len -= chunk_bytes * 3 + +- movq crc_init, %xmm1 # CRC for block 1 ++ movq crc_init_q, %xmm1 # CRC for block 1 + pclmulqdq $0x00, %xmm0, %xmm1 # Multiply by K2 + + movq crc1, %xmm2 # CRC for block 2 +@@ -230,103 +175,54 @@ LABEL crc_ %i + + pxor %xmm2,%xmm1 + movq %xmm1, %rax +- xor -i*8(block_2), %rax +- mov crc2, crc_init +- crc32 %rax, crc_init ++ xor (bufp,chunk_bytes_q,2), %rax ++ mov crc2, crc_init_q ++ crc32 %rax, crc_init_q ++ lea 8(bufp,chunk_bytes_q,2), bufp + + ################################################################ +- ## 5) Check for end: ++ ## 5) If more blocks remain, goto (2): + ################################################################ + +-LABEL crc_ 0 +- ENDBR +- mov tmp, len +- cmp $128*24, tmp +- jae .Lfull_block +- cmp $24, tmp +- jae .Lcontinue_block +- +-.Lless_than_24: +- shl $32-4, len_dw # less_than_16 expects length +- # in upper 4 bits of len_dw +- jnc .Lless_than_16 +- crc32q (bufptmp), crc_init +- crc32q 8(bufptmp), crc_init +- jz .Ldo_return +- add $16, bufptmp +- # len is less than 8 if we got here +- # less_than_8 expects length in upper 3 bits of len_dw +- # less_than_8_post_shl1 expects length = carryflag * 8 + len_dw[31:30] +- shl $2, len_dw +- jmp .Lless_than_8_post_shl1 ++ cmp $128*24, len ++ jae .Lfull_block ++ cmp $SMALL_SIZE, len ++ jae .Lpartial_block + + ####################################################################### +- ## 6) LESS THAN 256-bytes REMAIN AT THIS POINT (8-bits of len are full) ++ ## 6) Process any remainder without interleaving: + ####################################################################### + .Lsmall: +- shl $32-8, len_dw # Prepare len_dw for less_than_256 +- j=256 +-.rept 5 # j = {256, 128, 64, 32, 16} +-.altmacro +-LABEL less_than_ %j # less_than_j: Length should be in +- # upper lg(j) bits of len_dw +- j=(j/2) +- shl $1, len_dw # Get next MSB +- JNC_LESS_THAN %j +-.noaltmacro +- i=0 +-.rept (j/8) +- crc32q i(bufptmp), crc_init # Compute crc32 of 8-byte data +- i=i+8 +-.endr +- jz .Ldo_return # 
Return if remaining length is zero +- add $j, bufptmp # Advance buf +-.endr +- +-.Lless_than_8: # Length should be stored in +- # upper 3 bits of len_dw +- shl $1, len_dw +-.Lless_than_8_post_shl1: +- jnc .Lless_than_4 +- crc32l (bufptmp), crc_init_dw # CRC of 4 bytes +- jz .Ldo_return # return if remaining data is zero +- add $4, bufptmp +-.Lless_than_4: # Length should be stored in +- # upper 2 bits of len_dw +- shl $1, len_dw +- jnc .Lless_than_2 +- crc32w (bufptmp), crc_init_dw # CRC of 2 bytes +- jz .Ldo_return # return if remaining data is zero +- add $2, bufptmp +-.Lless_than_2: # Length should be stored in the MSB +- # of len_dw +- shl $1, len_dw +- jnc .Lless_than_1 +- crc32b (bufptmp), crc_init_dw # CRC of 1 byte +-.Lless_than_1: # Length should be zero +-.Ldo_return: +- movq crc_init, %rax +- popq %rsi +- popq %rdi +- popq %rbx ++ test len, len ++ jz .Ldone ++ mov len, %eax ++ shr $3, %eax ++ jz .Ldo_dword ++.Ldo_qwords: ++ crc32q (bufp), crc_init_q ++ add $8, bufp ++ dec %eax ++ jnz .Ldo_qwords ++.Ldo_dword: ++ test $4, len ++ jz .Ldo_word ++ crc32l (bufp), crc_init ++ add $4, bufp ++.Ldo_word: ++ test $2, len ++ jz .Ldo_byte ++ crc32w (bufp), crc_init ++ add $2, bufp ++.Ldo_byte: ++ test $1, len ++ jz .Ldone ++ crc32b (bufp), crc_init ++.Ldone: ++ mov crc_init, %eax + RET + SYM_FUNC_END(crc_pcl) + + .section .rodata, "a", @progbits +- ################################################################ +- ## jump table Table is 129 entries x 2 bytes each +- ################################################################ +-.align 4 +-jump_table: +- i=0 +-.rept 129 +-.altmacro +-JMPTBL_ENTRY %i +-.noaltmacro +- i=i+1 +-.endr +- +- + ################################################################ + ## PCLMULQDQ tables + ## Table is 128 entries x 2 words (8 bytes) each +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0007-fixes.patch b/sys-kernel/gentoo-sources-6.12/0007-fixes.patch new file mode 100644 index 0000000..ec28dd2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0007-fixes.patch @@ -0,0 +1,955 @@ +From 3e7168943409ace243e6d4b10896d6e71b5e0c4d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:51:12 +0100 +Subject: [PATCH 07/12] fixes + +Signed-off-by: Peter Jung +--- + arch/Kconfig | 4 +- + arch/x86/include/asm/futex.h | 8 ++- + arch/x86/mm/tlb.c | 2 +- + drivers/bluetooth/btmtk.c | 4 +- + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 50 +++++++++++++++-- + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 6 ++- + drivers/gpu/drm/drm_edid.c | 47 ++++++++++++++-- + drivers/hid/hid-ids.h | 1 + + fs/ntfs3/bitmap.c | 62 ++++++---------------- + fs/ntfs3/file.c | 32 ++++++----- + fs/ntfs3/frecord.c | 1 - + fs/ntfs3/fsntfs.c | 2 +- + fs/ntfs3/record.c | 16 ++++-- + fs/ntfs3/run.c | 6 +-- + kernel/futex/core.c | 22 -------- + kernel/futex/futex.h | 59 +++++++++++++++++++- + kernel/kprobes.c | 23 ++++---- + kernel/workqueue.c | 22 ++++++-- + scripts/package/PKGBUILD | 5 ++ + sound/pci/hda/patch_realtek.c | 2 + + 21 files changed, 256 insertions(+), 119 deletions(-) + +diff --git a/arch/Kconfig b/arch/Kconfig +index 00551f340dbe..833b2344ce79 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -1128,7 +1128,7 @@ config ARCH_MMAP_RND_BITS + int "Number of bits to use for ASLR of mmap base address" if EXPERT + range ARCH_MMAP_RND_BITS_MIN ARCH_MMAP_RND_BITS_MAX + default ARCH_MMAP_RND_BITS_DEFAULT if ARCH_MMAP_RND_BITS_DEFAULT +- default ARCH_MMAP_RND_BITS_MIN ++ default ARCH_MMAP_RND_BITS_MAX + depends on 
HAVE_ARCH_MMAP_RND_BITS + help + This value can be used to select the number of bits to use to +@@ -1162,7 +1162,7 @@ config ARCH_MMAP_RND_COMPAT_BITS + int "Number of bits to use for ASLR of mmap base address for compatible applications" if EXPERT + range ARCH_MMAP_RND_COMPAT_BITS_MIN ARCH_MMAP_RND_COMPAT_BITS_MAX + default ARCH_MMAP_RND_COMPAT_BITS_DEFAULT if ARCH_MMAP_RND_COMPAT_BITS_DEFAULT +- default ARCH_MMAP_RND_COMPAT_BITS_MIN ++ default ARCH_MMAP_RND_COMPAT_BITS_MAX + depends on HAVE_ARCH_MMAP_RND_COMPAT_BITS + help + This value can be used to select the number of bits to use to +diff --git a/arch/x86/include/asm/futex.h b/arch/x86/include/asm/futex.h +index 99d345b686fa..6e2458088800 100644 +--- a/arch/x86/include/asm/futex.h ++++ b/arch/x86/include/asm/futex.h +@@ -48,7 +48,9 @@ do { \ + static __always_inline int arch_futex_atomic_op_inuser(int op, int oparg, int *oval, + u32 __user *uaddr) + { +- if (!user_access_begin(uaddr, sizeof(u32))) ++ if (can_do_masked_user_access()) ++ uaddr = masked_user_access_begin(uaddr); ++ else if (!user_access_begin(uaddr, sizeof(u32))) + return -EFAULT; + + switch (op) { +@@ -84,7 +86,9 @@ static inline int futex_atomic_cmpxchg_inatomic(u32 *uval, u32 __user *uaddr, + { + int ret = 0; + +- if (!user_access_begin(uaddr, sizeof(u32))) ++ if (can_do_masked_user_access()) ++ uaddr = masked_user_access_begin(uaddr); ++ else if (!user_access_begin(uaddr, sizeof(u32))) + return -EFAULT; + asm volatile("\n" + "1:\t" LOCK_PREFIX "cmpxchgl %3, %2\n" +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index b0678d59ebdb..a2becb85bea7 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -569,7 +569,7 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + * mm_cpumask. The TLB shootdown code can figure out from + * cpu_tlbstate_shared.is_lazy whether or not to send an IPI. + */ +- if (WARN_ON_ONCE(prev != &init_mm && ++ if (IS_ENABLED(CONFIG_DEBUG_VM) && WARN_ON_ONCE(prev != &init_mm && + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + +diff --git a/drivers/bluetooth/btmtk.c b/drivers/bluetooth/btmtk.c +index 85e99641eaae..c1b6bcc6f7dd 100644 +--- a/drivers/bluetooth/btmtk.c ++++ b/drivers/bluetooth/btmtk.c +@@ -1329,7 +1329,6 @@ int btmtk_usb_setup(struct hci_dev *hdev) + fwname = FIRMWARE_MT7668; + break; + case 0x7922: +- case 0x7961: + case 0x7925: + /* Reset the device to ensure it's in the initial state before + * downloading the firmware to ensure. 
+@@ -1337,7 +1336,8 @@ int btmtk_usb_setup(struct hci_dev *hdev) + + if (!test_bit(BTMTK_FIRMWARE_LOADED, &btmtk_data->flags)) + btmtk_usb_subsys_reset(hdev, dev_id); +- ++ fallthrough; ++ case 0x7961: + btmtk_fw_get_filename(fw_bin_name, sizeof(fw_bin_name), dev_id, + fw_version, fw_flavor); + +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index 7617963901fa..03933b2c5ebc 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -855,6 +855,7 @@ struct amdgpu_device { + bool need_swiotlb; + bool accel_working; + struct notifier_block acpi_nb; ++ struct notifier_block pm_nb; + struct amdgpu_i2c_chan *i2c_bus[AMDGPU_MAX_I2C_BUS]; + struct debugfs_blob_wrapper debugfs_vbios_blob; + struct debugfs_blob_wrapper debugfs_discovery_blob; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index 51904906545e..d5d3391cc788 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -145,6 +145,8 @@ const char *amdgpu_asic_name[] = { + }; + + static inline void amdgpu_device_stop_pending_resets(struct amdgpu_device *adev); ++static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, ++ void *data); + + /** + * DOC: pcie_replay_count +@@ -4507,6 +4509,11 @@ int amdgpu_device_init(struct amdgpu_device *adev, + + amdgpu_device_check_iommu_direct_map(adev); + ++ adev->pm_nb.notifier_call = amdgpu_device_pm_notifier; ++ r = register_pm_notifier(&adev->pm_nb); ++ if (r) ++ goto failed; ++ + return 0; + + release_ras_con: +@@ -4571,6 +4578,8 @@ void amdgpu_device_fini_hw(struct amdgpu_device *adev) + drain_workqueue(adev->mman.bdev.wq); + adev->shutdown = true; + ++ unregister_pm_notifier(&adev->pm_nb); ++ + /* make sure IB test finished before entering exclusive mode + * to avoid preemption on IB test + */ +@@ -4688,8 +4697,8 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) + { + int ret; + +- /* No need to evict vram on APUs for suspend to ram or s2idle */ +- if ((adev->in_s3 || adev->in_s0ix) && (adev->flags & AMD_IS_APU)) ++ /* No need to evict vram on APUs unless going to S4 */ ++ if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) + return 0; + + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); +@@ -4701,6 +4710,41 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) + /* + * Suspend & resume. + */ ++/** ++ * amdgpu_device_pm_notifier - Notification block for Suspend/Hibernate events ++ * @nb: notifier block ++ * @mode: suspend mode ++ * @data: data ++ * ++ * This function is called when the system is about to suspend or hibernate. ++ * It is used to evict resources from the device before the system goes to ++ * sleep while there is still access to swap. ++ */ ++static int amdgpu_device_pm_notifier(struct notifier_block *nb, unsigned long mode, ++ void *data) ++{ ++ struct amdgpu_device *adev = container_of(nb, struct amdgpu_device, pm_nb); ++ int r; ++ ++ switch (mode) { ++ case PM_HIBERNATION_PREPARE: ++ adev->in_s4 = true; ++ fallthrough; ++ case PM_SUSPEND_PREPARE: ++ r = amdgpu_device_evict_resources(adev); ++ /* ++ * This is considered non-fatal at this time because ++ * amdgpu_device_prepare() will also fatally evict resources. 
++ * See https://gitlab.freedesktop.org/drm/amd/-/issues/3781 ++ */ ++ if (r) ++ drm_warn(adev_to_drm(adev), "Failed to evict resources, freeze active processes if problems occur: %d\n", r); ++ break; ++ } ++ ++ return NOTIFY_DONE; ++} ++ + /** + * amdgpu_device_prepare - prepare for device suspend + * +@@ -4740,7 +4784,7 @@ int amdgpu_device_prepare(struct drm_device *dev) + return 0; + + unprepare: +- adev->in_s0ix = adev->in_s3 = false; ++ adev->in_s0ix = adev->in_s3 = adev->in_s4 = false; + + return r; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index 852e6f315576..94a9a9266f8e 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -2639,7 +2639,6 @@ static int amdgpu_pmops_freeze(struct device *dev) + struct amdgpu_device *adev = drm_to_adev(drm_dev); + int r; + +- adev->in_s4 = true; + r = amdgpu_device_suspend(drm_dev, true); + adev->in_s4 = false; + if (r) +@@ -3078,6 +3077,11 @@ static int __init amdgpu_init(void) + /* Ignore KFD init failures. Normal when CONFIG_HSA_AMD is not set. */ + amdgpu_amdkfd_init(); + ++ if (amdgpu_pp_feature_mask & PP_OVERDRIVE_MASK) { ++ add_taint(TAINT_CPU_OUT_OF_SPEC, LOCKDEP_STILL_OK); ++ pr_crit("Overdrive is enabled, please disable it before reporting any bugs.\n"); ++ } ++ + /* let modprobe override vga console setting */ + return pci_register_driver(&amdgpu_kms_pci_driver); + +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 855beafb76ff..ad78059ee954 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -94,6 +94,8 @@ static int oui(u8 first, u8 second, u8 third) + #define EDID_QUIRK_NON_DESKTOP (1 << 12) + /* Cap the DSC target bitrate to 15bpp */ + #define EDID_QUIRK_CAP_DSC_15BPP (1 << 13) ++/* Fix up a particular 5120x1440@240Hz timing */ ++#define EDID_QUIRK_FIXUP_5120_1440_240 (1 << 14) + + #define MICROSOFT_IEEE_OUI 0xca125c + +@@ -182,6 +184,12 @@ static const struct edid_quirk { + EDID_QUIRK('S', 'A', 'M', 596, EDID_QUIRK_PREFER_LARGE_60), + EDID_QUIRK('S', 'A', 'M', 638, EDID_QUIRK_PREFER_LARGE_60), + ++ /* Samsung C49G95T */ ++ EDID_QUIRK('S', 'A', 'M', 0x7053, EDID_QUIRK_FIXUP_5120_1440_240), ++ ++ /* Samsung S49AG95 */ ++ EDID_QUIRK('S', 'A', 'M', 0x71ac, EDID_QUIRK_FIXUP_5120_1440_240), ++ + /* Sony PVM-2541A does up to 12 bpc, but only reports max 8 bpc */ + EDID_QUIRK('S', 'N', 'Y', 0x2541, EDID_QUIRK_FORCE_12BPC), + +@@ -6753,7 +6761,37 @@ static void update_display_info(struct drm_connector *connector, + drm_edid_to_eld(connector, drm_edid); + } + +-static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *dev, ++static void drm_mode_displayid_detailed_edid_quirks(struct drm_connector *connector, ++ struct drm_display_mode *mode) ++{ ++ unsigned int hsync_width; ++ unsigned int vsync_width; ++ ++ if (connector->display_info.quirks & EDID_QUIRK_FIXUP_5120_1440_240) { ++ if (mode->hdisplay == 5120 && mode->vdisplay == 1440 && ++ mode->clock == 1939490) { ++ hsync_width = mode->hsync_end - mode->hsync_start; ++ vsync_width = mode->vsync_end - mode->vsync_start; ++ ++ mode->clock = 2018490; ++ mode->hdisplay = 5120; ++ mode->hsync_start = 5120 + 8; ++ mode->hsync_end = 5120 + 8 + hsync_width; ++ mode->htotal = 5200; ++ ++ mode->vdisplay = 1440; ++ mode->vsync_start = 1440 + 165; ++ mode->vsync_end = 1440 + 165 + vsync_width; ++ mode->vtotal = 1619; ++ ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] Samsung 240Hz mode quirk applied\n", ++ 
connector->base.id, connector->name); ++ } ++ } ++} ++ ++static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_connector *connector, + struct displayid_detailed_timings_1 *timings, + bool type_7) + { +@@ -6772,7 +6810,7 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d + bool hsync_positive = (timings->hsync[1] >> 7) & 0x1; + bool vsync_positive = (timings->vsync[1] >> 7) & 0x1; + +- mode = drm_mode_create(dev); ++ mode = drm_mode_create(connector->dev); + if (!mode) + return NULL; + +@@ -6795,6 +6833,9 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d + + if (timings->flags & 0x80) + mode->type |= DRM_MODE_TYPE_PREFERRED; ++ ++ drm_mode_displayid_detailed_edid_quirks(connector, mode); ++ + drm_mode_set_name(mode); + + return mode; +@@ -6817,7 +6858,7 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, + for (i = 0; i < num_timings; i++) { + struct displayid_detailed_timings_1 *timings = &det->timings[i]; + +- newmode = drm_mode_displayid_detailed(connector->dev, timings, type_7); ++ newmode = drm_mode_displayid_detailed(connector, timings, type_7); + if (!newmode) + continue; + +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index 0f23be98c56e..1b92729bd378 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -210,6 +210,7 @@ + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD2 0x19b6 + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3 0x1a30 + #define USB_DEVICE_ID_ASUSTEK_ROG_Z13_LIGHTBAR 0x18c6 ++#define USB_DEVICE_ID_ASUSTEK_ROG_RAIKIRI_PAD 0x1abb + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY 0x1abe + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X 0x1b4c + #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD 0x196b +diff --git a/fs/ntfs3/bitmap.c b/fs/ntfs3/bitmap.c +index cf4fe21a5039..04107b950717 100644 +--- a/fs/ntfs3/bitmap.c ++++ b/fs/ntfs3/bitmap.c +@@ -710,20 +710,17 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) + { + int err = 0; + struct super_block *sb = wnd->sb; +- size_t bits0 = bits; + u32 wbits = 8 * sb->s_blocksize; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; ++ u32 op; + +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { + if (iw + 1 == wnd->nwnd) + wbits = wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { +@@ -736,20 +733,15 @@ int wnd_set_free(struct wnd_bitmap *wnd, size_t bit, size_t bits) + ntfs_bitmap_clear_le(bh->b_data, wbit, op); + + wnd->free_bits[iw] += op; ++ wnd->total_zeroes += op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + +- wnd->total_zeroes += op; +- bits -= op; +- wbit = 0; +- iw += 1; ++ wnd_add_free_ext(wnd, bit, op, false); + } +- +- wnd_add_free_ext(wnd, bit, bits0, false); +- + return err; + } + +@@ -760,20 +752,17 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + { + int err = 0; + struct super_block *sb = wnd->sb; +- size_t bits0 = bits; + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); + struct buffer_head *bh; ++ u32 op; + +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bit += op, bits -= op, wbit = 0) { + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = 
wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + bh = wnd_map(wnd, iw); + if (IS_ERR(bh)) { +@@ -785,21 +774,16 @@ int wnd_set_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + + ntfs_bitmap_set_le(bh->b_data, wbit, op); + wnd->free_bits[iw] -= op; ++ wnd->total_zeroes -= op; + + set_buffer_uptodate(bh); + mark_buffer_dirty(bh); + unlock_buffer(bh); + put_bh(bh); + +- wnd->total_zeroes -= op; +- bits -= op; +- wbit = 0; +- iw += 1; ++ if (!RB_EMPTY_ROOT(&wnd->start_tree)) ++ wnd_remove_free_ext(wnd, bit, op); + } +- +- if (!RB_EMPTY_ROOT(&wnd->start_tree)) +- wnd_remove_free_ext(wnd, bit, bits0); +- + return err; + } + +@@ -852,15 +836,13 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); ++ u32 op; + +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + if (wbits != wnd->free_bits[iw]) { + bool ret; +@@ -875,10 +857,6 @@ static bool wnd_is_free_hlp(struct wnd_bitmap *wnd, size_t bit, size_t bits) + if (!ret) + return false; + } +- +- bits -= op; +- wbit = 0; +- iw += 1; + } + + return true; +@@ -928,6 +906,7 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + size_t iw = bit >> (sb->s_blocksize_bits + 3); + u32 wbits = 8 * sb->s_blocksize; + u32 wbit = bit & (wbits - 1); ++ u32 op; + size_t end; + struct rb_node *n; + struct e_node *e; +@@ -945,14 +924,11 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + return false; + + use_wnd: +- while (iw < wnd->nwnd && bits) { +- u32 tail, op; +- ++ for (; iw < wnd->nwnd && bits; iw++, bits -= op, wbit = 0) { + if (unlikely(iw + 1 == wnd->nwnd)) + wbits = wnd->bits_last; + +- tail = wbits - wbit; +- op = min_t(u32, tail, bits); ++ op = min_t(u32, wbits - wbit, bits); + + if (wnd->free_bits[iw]) { + bool ret; +@@ -966,10 +942,6 @@ bool wnd_is_used(struct wnd_bitmap *wnd, size_t bit, size_t bits) + if (!ret) + goto out; + } +- +- bits -= op; +- wbit = 0; +- iw += 1; + } + ret = true; + +diff --git a/fs/ntfs3/file.c b/fs/ntfs3/file.c +index f704ceef9539..3f96a11804c9 100644 +--- a/fs/ntfs3/file.c ++++ b/fs/ntfs3/file.c +@@ -182,13 +182,15 @@ static int ntfs_extend_initialized_size(struct file *file, + loff_t pos = valid; + int err; + ++ if (valid >= new_valid) ++ return 0; ++ + if (is_resident(ni)) { + ni->i_valid = new_valid; + return 0; + } + + WARN_ON(is_compressed(ni)); +- WARN_ON(valid >= new_valid); + + for (;;) { + u32 zerofrom, len; +@@ -987,6 +989,7 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + u64 frame_vbo; + pgoff_t index; + bool frame_uptodate; ++ struct folio *folio; + + if (frame_size < PAGE_SIZE) { + /* +@@ -1041,8 +1044,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + if (err) { + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_unlock(folio); ++ folio_put(folio); + } + goto out; + } +@@ -1052,9 +1056,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + off = offset_in_page(valid); + for (; ip < pages_per_frame; ip++, off = 0) { + page = pages[ip]; ++ folio = 
page_folio(page); + zero_user_segment(page, off, PAGE_SIZE); + flush_dcache_page(page); +- SetPageUptodate(page); ++ folio_mark_uptodate(folio); + } + + ni_lock(ni); +@@ -1063,9 +1068,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; +- SetPageUptodate(page); +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_mark_uptodate(folio); ++ folio_unlock(folio); ++ folio_put(folio); + } + + if (err) +@@ -1107,8 +1113,9 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + for (ip = 0; ip < pages_per_frame; + ip++) { + page = pages[ip]; +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_unlock(folio); ++ folio_put(folio); + } + goto out; + } +@@ -1149,9 +1156,10 @@ static ssize_t ntfs_compress_write(struct kiocb *iocb, struct iov_iter *from) + for (ip = 0; ip < pages_per_frame; ip++) { + page = pages[ip]; + ClearPageDirty(page); +- SetPageUptodate(page); +- unlock_page(page); +- put_page(page); ++ folio = page_folio(page); ++ folio_mark_uptodate(folio); ++ folio_unlock(folio); ++ folio_put(folio); + } + + if (err) +diff --git a/fs/ntfs3/frecord.c b/fs/ntfs3/frecord.c +index c33e818b3164..8b39d0ce5f28 100644 +--- a/fs/ntfs3/frecord.c ++++ b/fs/ntfs3/frecord.c +@@ -1958,7 +1958,6 @@ int ni_fiemap(struct ntfs_inode *ni, struct fiemap_extent_info *fieinfo, + if (end > alloc_size) + end = alloc_size; + +- + while (vbo < end) { + if (idx == -1) { + ok = run_lookup_entry(&run, vcn, &lcn, &clen, &idx); +diff --git a/fs/ntfs3/fsntfs.c b/fs/ntfs3/fsntfs.c +index 0fa636038b4e..03471bc9371c 100644 +--- a/fs/ntfs3/fsntfs.c ++++ b/fs/ntfs3/fsntfs.c +@@ -2699,4 +2699,4 @@ int ntfs_set_label(struct ntfs_sb_info *sbi, u8 *label, int len) + out: + __putname(uni); + return err; +-} +\ No newline at end of file ++} +diff --git a/fs/ntfs3/record.c b/fs/ntfs3/record.c +index f810f0419d25..61d53d39f3b9 100644 +--- a/fs/ntfs3/record.c ++++ b/fs/ntfs3/record.c +@@ -212,7 +212,7 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) + return NULL; + + if (off >= used || off < MFTRECORD_FIXUP_OFFSET_1 || +- !IS_ALIGNED(off, 4)) { ++ !IS_ALIGNED(off, 8)) { + return NULL; + } + +@@ -236,8 +236,11 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) + off += asize; + } + +- /* Can we use the first field (attr->type). */ +- /* NOTE: this code also checks attr->size availability. */ ++ /* ++ * Can we use the first fields: ++ * attr->type, ++ * attr->size ++ */ + if (off + 8 > used) { + static_assert(ALIGN(sizeof(enum ATTR_TYPE), 8) == 8); + return NULL; +@@ -259,10 +262,17 @@ struct ATTRIB *mi_enum_attr(struct mft_inode *mi, struct ATTRIB *attr) + + asize = le32_to_cpu(attr->size); + ++ if (!IS_ALIGNED(asize, 8)) ++ return NULL; ++ + /* Check overflow and boundary. */ + if (off + asize < off || off + asize > used) + return NULL; + ++ /* Can we use the field attr->non_res. */ ++ if (off + 9 > used) ++ return NULL; ++ + /* Check size of attribute. */ + if (!attr->non_res) { + /* Check resident fields. */ +diff --git a/fs/ntfs3/run.c b/fs/ntfs3/run.c +index 48566dff0dc9..6e86d66197ef 100644 +--- a/fs/ntfs3/run.c ++++ b/fs/ntfs3/run.c +@@ -1112,9 +1112,9 @@ int run_unpack_ex(struct runs_tree *run, struct ntfs_sb_info *sbi, CLST ino, + err = wnd_set_used_safe(wnd, lcn, len, &done); + if (zone) { + /* Restore zone. Lock mft run. */ +- struct rw_semaphore *lock; +- lock = is_mounted(sbi) ? 
&sbi->mft.ni->file.run_lock : +- NULL; ++ struct rw_semaphore *lock = ++ is_mounted(sbi) ? &sbi->mft.ni->file.run_lock : ++ NULL; + if (lock) + down_read(lock); + ntfs_refresh_zone(sbi); +diff --git a/kernel/futex/core.c b/kernel/futex/core.c +index 136768ae2637..9107704a6574 100644 +--- a/kernel/futex/core.c ++++ b/kernel/futex/core.c +@@ -451,28 +451,6 @@ struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key * + return NULL; + } + +-int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) +-{ +- int ret; +- +- pagefault_disable(); +- ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); +- pagefault_enable(); +- +- return ret; +-} +- +-int futex_get_value_locked(u32 *dest, u32 __user *from) +-{ +- int ret; +- +- pagefault_disable(); +- ret = __get_user(*dest, from); +- pagefault_enable(); +- +- return ret ? -EFAULT : 0; +-} +- + /** + * wait_for_owner_exiting - Block until the owner has exited + * @ret: owner's current futex lock status +diff --git a/kernel/futex/futex.h b/kernel/futex/futex.h +index 8b195d06f4e8..618ce1fe870e 100644 +--- a/kernel/futex/futex.h ++++ b/kernel/futex/futex.h +@@ -6,6 +6,7 @@ + #include + #include + #include ++#include + + #ifdef CONFIG_PREEMPT_RT + #include +@@ -225,10 +226,64 @@ extern bool __futex_wake_mark(struct futex_q *q); + extern void futex_wake_mark(struct wake_q_head *wake_q, struct futex_q *q); + + extern int fault_in_user_writeable(u32 __user *uaddr); +-extern int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval); +-extern int futex_get_value_locked(u32 *dest, u32 __user *from); + extern struct futex_q *futex_top_waiter(struct futex_hash_bucket *hb, union futex_key *key); + ++static inline int futex_cmpxchg_value_locked(u32 *curval, u32 __user *uaddr, u32 uval, u32 newval) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = futex_atomic_cmpxchg_inatomic(curval, uaddr, uval, newval); ++ pagefault_enable(); ++ ++ return ret; ++} ++ ++/* ++ * This does a plain atomic user space read, and the user pointer has ++ * already been verified earlier by get_futex_key() to be both aligned ++ * and actually in user space, just like futex_atomic_cmpxchg_inatomic(). ++ * ++ * We still want to avoid any speculation, and while __get_user() is ++ * the traditional model for this, it's actually slower than doing ++ * this manually these days. ++ * ++ * We could just have a per-architecture special function for it, ++ * the same way we do futex_atomic_cmpxchg_inatomic(), but rather ++ * than force everybody to do that, write it out long-hand using ++ * the low-level user-access infrastructure. ++ * ++ * This looks a bit overkill, but generally just results in a couple ++ * of instructions. 
++ */ ++static __always_inline int futex_read_inatomic(u32 *dest, u32 __user *from) ++{ ++ u32 val; ++ ++ if (can_do_masked_user_access()) ++ from = masked_user_access_begin(from); ++ else if (!user_read_access_begin(from, sizeof(*from))) ++ return -EFAULT; ++ unsafe_get_user(val, from, Efault); ++ user_access_end(); ++ *dest = val; ++ return 0; ++Efault: ++ user_access_end(); ++ return -EFAULT; ++} ++ ++static inline int futex_get_value_locked(u32 *dest, u32 __user *from) ++{ ++ int ret; ++ ++ pagefault_disable(); ++ ret = futex_read_inatomic(dest, from); ++ pagefault_enable(); ++ ++ return ret; ++} ++ + extern void __futex_unqueue(struct futex_q *q); + extern void __futex_queue(struct futex_q *q, struct futex_hash_bucket *hb); + extern int futex_unqueue(struct futex_q *q); +diff --git a/kernel/kprobes.c b/kernel/kprobes.c +index da59c68df841..55d0835ea0cf 100644 +--- a/kernel/kprobes.c ++++ b/kernel/kprobes.c +@@ -1570,16 +1570,25 @@ static int check_kprobe_address_safe(struct kprobe *p, + if (ret) + return ret; + jump_label_lock(); +- preempt_disable(); + + /* Ensure the address is in a text area, and find a module if exists. */ + *probed_mod = NULL; + if (!core_kernel_text((unsigned long) p->addr)) { ++ guard(preempt)(); + *probed_mod = __module_text_address((unsigned long) p->addr); + if (!(*probed_mod)) { + ret = -EINVAL; + goto out; + } ++ ++ /* ++ * We must hold a refcount of the probed module while updating ++ * its code to prohibit unexpected unloading. ++ */ ++ if (unlikely(!try_module_get(*probed_mod))) { ++ ret = -ENOENT; ++ goto out; ++ } + } + /* Ensure it is not in reserved area. */ + if (in_gate_area_no_mm((unsigned long) p->addr) || +@@ -1588,21 +1597,13 @@ static int check_kprobe_address_safe(struct kprobe *p, + static_call_text_reserved(p->addr, p->addr) || + find_bug((unsigned long)p->addr) || + is_cfi_preamble_symbol((unsigned long)p->addr)) { ++ module_put(*probed_mod); + ret = -EINVAL; + goto out; + } + + /* Get module refcount and reject __init functions for loaded modules. */ + if (IS_ENABLED(CONFIG_MODULES) && *probed_mod) { +- /* +- * We must hold a refcount of the probed module while updating +- * its code to prohibit unexpected unloading. +- */ +- if (unlikely(!try_module_get(*probed_mod))) { +- ret = -ENOENT; +- goto out; +- } +- + /* + * If the module freed '.init.text', we couldn't insert + * kprobes in there. +@@ -1610,13 +1611,11 @@ static int check_kprobe_address_safe(struct kprobe *p, + if (within_module_init((unsigned long)p->addr, *probed_mod) && + !module_is_coming(*probed_mod)) { + module_put(*probed_mod); +- *probed_mod = NULL; + ret = -ENOENT; + } + } + + out: +- preempt_enable(); + jump_label_unlock(); + + return ret; +diff --git a/kernel/workqueue.c b/kernel/workqueue.c +index 9949ffad8df0..8b07576814a5 100644 +--- a/kernel/workqueue.c ++++ b/kernel/workqueue.c +@@ -3833,16 +3833,28 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, + { + bool wait = false; + struct pool_workqueue *pwq; ++ struct worker_pool *current_pool = NULL; + + if (flush_color >= 0) { + WARN_ON_ONCE(atomic_read(&wq->nr_pwqs_to_flush)); + atomic_set(&wq->nr_pwqs_to_flush, 1); + } + ++ /* ++ * For unbound workqueue, pwqs will map to only a few pools. ++ * Most of the time, pwqs within the same pool will be linked ++ * sequentially to wq->pwqs by cpu index. So in the majority ++ * of pwq iters, the pool is the same, only doing lock/unlock ++ * if the pool has changed. This can largely reduce expensive ++ * lock operations. 
++ */ + for_each_pwq(pwq, wq) { +- struct worker_pool *pool = pwq->pool; +- +- raw_spin_lock_irq(&pool->lock); ++ if (current_pool != pwq->pool) { ++ if (likely(current_pool)) ++ raw_spin_unlock_irq(¤t_pool->lock); ++ current_pool = pwq->pool; ++ raw_spin_lock_irq(¤t_pool->lock); ++ } + + if (flush_color >= 0) { + WARN_ON_ONCE(pwq->flush_color != -1); +@@ -3859,9 +3871,11 @@ static bool flush_workqueue_prep_pwqs(struct workqueue_struct *wq, + pwq->work_color = work_color; + } + +- raw_spin_unlock_irq(&pool->lock); + } + ++ if (current_pool) ++ raw_spin_unlock_irq(¤t_pool->lock); ++ + if (flush_color >= 0 && atomic_dec_and_test(&wq->nr_pwqs_to_flush)) + complete(&wq->first_flusher->done); + +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index f83493838cf9..4010899652b8 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -91,6 +91,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" +diff --git a/sound/pci/hda/patch_realtek.c b/sound/pci/hda/patch_realtek.c +index 192fc75b51e6..d88fc0ca893d 100644 +--- a/sound/pci/hda/patch_realtek.c ++++ b/sound/pci/hda/patch_realtek.c +@@ -10604,6 +10604,8 @@ static const struct hda_quirk alc269_fixup_tbl[] = { + SND_PCI_QUIRK(0x1043, 0x1e1f, "ASUS Vivobook 15 X1504VAP", ALC2XX_FIXUP_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1e51, "ASUS Zephyrus M15", ALC294_FIXUP_ASUS_GU502_PINS), + SND_PCI_QUIRK(0x1043, 0x1e5e, "ASUS ROG Strix G513", ALC294_FIXUP_ASUS_G513_PINS), ++ SND_PCI_QUIRK(0x1043, 0x1e63, "ASUS H7606W", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC), ++ SND_PCI_QUIRK(0x1043, 0x1e83, "ASUS GA605W", ALC285_FIXUP_ASUS_GA403U_HEADSET_MIC), + SND_PCI_QUIRK(0x1043, 0x1e8e, "ASUS Zephyrus G15", ALC289_FIXUP_ASUS_GA401), + SND_PCI_QUIRK(0x1043, 0x1eb3, "ASUS Ally RCLA72", ALC287_FIXUP_TAS2781_I2C), + SND_PCI_QUIRK(0x1043, 0x1ed3, "ASUS HN7306W", ALC287_FIXUP_CS35L41_I2C_2), +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.10.3/0009-ntsync.patch b/sys-kernel/gentoo-sources-6.12/0008-ntsync.patch similarity index 84% rename from sys-kernel/gentoo-sources-6.10.3/0009-ntsync.patch rename to sys-kernel/gentoo-sources-6.12/0008-ntsync.patch index 4a16758..1efe29d 100644 --- a/sys-kernel/gentoo-sources-6.10.3/0009-ntsync.patch +++ b/sys-kernel/gentoo-sources-6.12/0008-ntsync.patch @@ -1,22 +1,22 @@ -From 36ef0070410e229e52c9de58d6021df36a4b1707 Mon Sep 17 00:00:00 2001 +From 46225020f04e55a29ae30473a9a8cf0d15f0979e Mon Sep 17 00:00:00 2001 From: Peter Jung -Date: Sat, 3 Aug 2024 09:34:15 +0200 -Subject: [PATCH 09/12] ntsync +Date: Thu, 19 Dec 2024 18:51:57 +0100 +Subject: [PATCH 08/12] ntsync Signed-off-by: Peter Jung --- Documentation/userspace-api/index.rst | 1 + - Documentation/userspace-api/ntsync.rst | 398 +++++ + Documentation/userspace-api/ntsync.rst | 385 +++++ MAINTAINERS | 9 + drivers/misc/Kconfig | 1 - - drivers/misc/ntsync.c | 989 +++++++++++- - include/uapi/linux/ntsync.h | 39 + + drivers/misc/ntsync.c | 992 +++++++++++- + include/uapi/linux/ntsync.h | 42 +- tools/testing/selftests/Makefile | 1 + .../selftests/drivers/ntsync/.gitignore | 1 + .../testing/selftests/drivers/ntsync/Makefile | 7 + tools/testing/selftests/drivers/ntsync/config | 1 + - 
.../testing/selftests/drivers/ntsync/ntsync.c | 1407 +++++++++++++++++ - 11 files changed, 2850 insertions(+), 4 deletions(-) + .../testing/selftests/drivers/ntsync/ntsync.c | 1343 +++++++++++++++++ + 11 files changed, 2767 insertions(+), 16 deletions(-) create mode 100644 Documentation/userspace-api/ntsync.rst create mode 100644 tools/testing/selftests/drivers/ntsync/.gitignore create mode 100644 tools/testing/selftests/drivers/ntsync/Makefile @@ -24,10 +24,10 @@ Signed-off-by: Peter Jung create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst -index 8a251d71fa6e..02bea81fb4bf 100644 +index 274cc7546efc..9c1b15cd89ab 100644 --- a/Documentation/userspace-api/index.rst +++ b/Documentation/userspace-api/index.rst -@@ -64,6 +64,7 @@ Everything else +@@ -63,6 +63,7 @@ Everything else vduse futex2 perf_ring_buffer @@ -37,10 +37,10 @@ index 8a251d71fa6e..02bea81fb4bf 100644 diff --git a/Documentation/userspace-api/ntsync.rst b/Documentation/userspace-api/ntsync.rst new file mode 100644 -index 000000000000..767844637a7d +index 000000000000..25e7c4aef968 --- /dev/null +++ b/Documentation/userspace-api/ntsync.rst -@@ -0,0 +1,398 @@ +@@ -0,0 +1,385 @@ +=================================== +NT synchronization primitive driver +=================================== @@ -116,19 +116,16 @@ index 000000000000..767844637a7d +structures used in ioctl calls:: + + struct ntsync_sem_args { -+ __u32 sem; + __u32 count; + __u32 max; + }; + + struct ntsync_mutex_args { -+ __u32 mutex; + __u32 owner; + __u32 count; + }; + + struct ntsync_event_args { -+ __u32 event; + __u32 signaled; + __u32 manual; + }; @@ -145,7 +142,7 @@ index 000000000000..767844637a7d + }; + +Depending on the ioctl, members of the structure may be used as input, -+output, or not at all. All ioctls return 0 on success. ++output, or not at all. + +The ioctls on the device file are as follows: + @@ -156,14 +153,13 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``sem`` -+ - On output, contains a file descriptor to the created semaphore. + * - ``count`` + - Initial count of the semaphore. + * - ``max`` + - Maximum count of the semaphore. + + Fails with ``EINVAL`` if ``count`` is greater than ``max``. ++ On success, returns a file descriptor the created semaphore. + +.. c:macro:: NTSYNC_IOC_CREATE_MUTEX + @@ -172,8 +168,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``mutex`` -+ - On output, contains a file descriptor to the created mutex. + * - ``count`` + - Initial recursion count of the mutex. + * - ``owner`` @@ -181,6 +175,7 @@ index 000000000000..767844637a7d + + If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is + zero and ``count`` is nonzero, the function fails with ``EINVAL``. ++ On success, returns a file descriptor the created mutex. + +.. c:macro:: NTSYNC_IOC_CREATE_EVENT + @@ -189,8 +184,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``event`` -+ - On output, contains a file descriptor to the created event. + * - ``signaled`` + - If nonzero, the event is initially signaled, otherwise + nonsignaled. @@ -198,6 +191,8 @@ index 000000000000..767844637a7d + - If nonzero, the event is a manual-reset event, otherwise + auto-reset. + ++ On success, returns a file descriptor the created event. ++ +The ioctls on the individual objects are as follows: + +.. c:macro:: NTSYNC_IOC_SEM_POST @@ -220,8 +215,6 @@ index 000000000000..767844637a7d + + .. 
list-table:: + -+ * - ``mutex`` -+ - Ignored. + * - ``owner`` + - Specifies the owner trying to release this mutex. + * - ``count`` @@ -270,8 +263,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``sem`` -+ - Ignored. + * - ``count`` + - On output, contains the current count of the semaphore. + * - ``max`` @@ -284,8 +275,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``mutex`` -+ - Ignored. + * - ``owner`` + - On output, contains the current owner of the mutex, or zero + if the mutex is not currently owned. @@ -303,8 +292,6 @@ index 000000000000..767844637a7d + + .. list-table:: + -+ * - ``event`` -+ - Ignored. + * - ``signaled`` + - On output, contains the current state of the event. + * - ``manual`` @@ -440,10 +427,10 @@ index 000000000000..767844637a7d + ``objs`` and in ``alert``. If this is attempted, the function fails + with ``EINVAL``. diff --git a/MAINTAINERS b/MAINTAINERS -index b27470be2e6a..4112729fc23a 100644 +index a2d251917629..a30770b6f75a 100644 --- a/MAINTAINERS +++ b/MAINTAINERS -@@ -15983,6 +15983,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git +@@ -16501,6 +16501,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git F: Documentation/filesystems/ntfs3.rst F: fs/ntfs3/ @@ -460,10 +447,10 @@ index b27470be2e6a..4112729fc23a 100644 M: Finn Thain L: linux-m68k@lists.linux-m68k.org diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig -index faf983680040..2907b5c23368 100644 +index 3fe7e2a9bd29..6c8b999a5e08 100644 --- a/drivers/misc/Kconfig +++ b/drivers/misc/Kconfig -@@ -507,7 +507,6 @@ config OPEN_DICE +@@ -517,7 +517,6 @@ config OPEN_DICE config NTSYNC tristate "NT synchronization primitive emulation" @@ -472,7 +459,7 @@ index faf983680040..2907b5c23368 100644 This module provides kernel support for emulation of Windows NT synchronization primitives. It is not a hardware driver. diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c -index 3c2f743c58b0..87a24798a5c7 100644 +index 4954553b7baa..457ff28b789f 100644 --- a/drivers/misc/ntsync.c +++ b/drivers/misc/ntsync.c @@ -6,11 +6,17 @@ @@ -516,7 +503,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 enum ntsync_type type; -@@ -46,13 +57,335 @@ struct ntsync_obj { +@@ -46,22 +57,344 @@ struct ntsync_obj { __u32 count; __u32 max; } sem; @@ -852,7 +839,9 @@ index 3c2f743c58b0..87a24798a5c7 100644 /* * Actually change the semaphore state, returning -EOVERFLOW if it is made * invalid. 
-@@ -61,7 +394,7 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) + */ +-static int post_sem_state(struct ntsync_obj *sem, __u32 count) ++static int release_sem_state(struct ntsync_obj *sem, __u32 count) { __u32 sum; @@ -861,9 +850,12 @@ index 3c2f743c58b0..87a24798a5c7 100644 if (check_add_overflow(sem->u.sem.count, count, &sum) || sum > sem->u.sem.max) -@@ -73,9 +406,11 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) +@@ -71,11 +404,13 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) + return 0; + } - static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) +-static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) ++static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) { + struct ntsync_device *dev = sem->dev; __u32 __user *user_args = argp; @@ -881,7 +873,8 @@ index 3c2f743c58b0..87a24798a5c7 100644 + all = ntsync_lock_obj(dev, sem); prev_count = sem->u.sem.count; - ret = post_sem_state(sem, args); +- ret = post_sem_state(sem, args); ++ ret = release_sem_state(sem, args); + if (!ret) { + if (all) + try_wake_all_obj(dev, sem); @@ -893,7 +886,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 if (!ret && put_user(prev_count, user_args)) ret = -EFAULT; -@@ -97,6 +437,226 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) +@@ -97,6 +437,220 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) return ret; } @@ -1053,8 +1046,6 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + -+ args.sem = 0; -+ + all = ntsync_lock_obj(dev, sem); + + args.count = sem->u.sem.count; @@ -1078,8 +1069,6 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (mutex->type != NTSYNC_TYPE_MUTEX) + return -EINVAL; + -+ args.mutex = 0; -+ + all = ntsync_lock_obj(dev, mutex); + + args.count = mutex->u.mutex.count; @@ -1103,8 +1092,6 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (event->type != NTSYNC_TYPE_EVENT) + return -EINVAL; + -+ args.event = 0; -+ + all = ntsync_lock_obj(dev, event); + + args.manual = event->u.event.manual; @@ -1120,10 +1107,14 @@ index 3c2f743c58b0..87a24798a5c7 100644 static int ntsync_obj_release(struct inode *inode, struct file *file) { struct ntsync_obj *obj = file->private_data; -@@ -116,6 +676,22 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, +@@ -114,8 +668,24 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, + void __user *argp = (void __user *)parm; + switch (cmd) { - case NTSYNC_IOC_SEM_POST: - return ntsync_sem_post(obj, argp); +- case NTSYNC_IOC_SEM_POST: +- return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_SEM_RELEASE: ++ return ntsync_sem_release(obj, argp); + case NTSYNC_IOC_SEM_READ: + return ntsync_sem_read(obj, argp); + case NTSYNC_IOC_MUTEX_UNLOCK: @@ -1143,7 +1134,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 default: return -ENOIOCTLCMD; } -@@ -141,6 +717,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, +@@ -140,6 +710,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, obj->dev = dev; get_file(dev->file); spin_lock_init(&obj->lock); @@ -1153,13 +1144,28 @@ index 3c2f743c58b0..87a24798a5c7 100644 return obj; } -@@ -191,6 +770,400 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) - return put_user(fd, &user_args->sem); - } +@@ -165,7 +738,6 @@ static int ntsync_obj_get_fd(struct ntsync_obj *obj) + static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + { +- 
struct ntsync_sem_args __user *user_args = argp; + struct ntsync_sem_args args; + struct ntsync_obj *sem; + int fd; +@@ -182,12 +754,398 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + sem->u.sem.count = args.count; + sem->u.sem.max = args.max; + fd = ntsync_obj_get_fd(sem); +- if (fd < 0) { ++ if (fd < 0) + kfree(sem); +- return fd; ++ ++ return fd; ++} ++ +static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) +{ -+ struct ntsync_mutex_args __user *user_args = argp; + struct ntsync_mutex_args args; + struct ntsync_obj *mutex; + int fd; @@ -1176,17 +1182,14 @@ index 3c2f743c58b0..87a24798a5c7 100644 + mutex->u.mutex.count = args.count; + mutex->u.mutex.owner = args.owner; + fd = ntsync_obj_get_fd(mutex); -+ if (fd < 0) { ++ if (fd < 0) + kfree(mutex); -+ return fd; -+ } + -+ return put_user(fd, &user_args->mutex); ++ return fd; +} + +static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) +{ -+ struct ntsync_event_args __user *user_args = argp; + struct ntsync_event_args args; + struct ntsync_obj *event; + int fd; @@ -1200,12 +1203,10 @@ index 3c2f743c58b0..87a24798a5c7 100644 + event->u.event.manual = args.manual; + event->u.event.signaled = args.signaled; + fd = ntsync_obj_get_fd(event); -+ if (fd < 0) { ++ if (fd < 0) + kfree(event); -+ return fd; -+ } + -+ return put_user(fd, &user_args->event); ++ return fd; +} + +static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) @@ -1219,8 +1220,9 @@ index 3c2f743c58b0..87a24798a5c7 100644 + if (file->f_op != &ntsync_obj_fops) { + fput(file); + return NULL; -+ } -+ + } + +- return put_user(fd, &user_args->sem); + obj = file->private_data; + if (obj->dev != dev) { + fput(file); @@ -1549,12 +1551,10 @@ index 3c2f743c58b0..87a24798a5c7 100644 + + kfree(q); + return ret; -+} -+ + } + static int ntsync_char_open(struct inode *inode, struct file *file) - { - struct ntsync_device *dev; -@@ -199,6 +1172,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) +@@ -198,6 +1156,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) if (!dev) return -ENOMEM; @@ -1563,7 +1563,7 @@ index 3c2f743c58b0..87a24798a5c7 100644 file->private_data = dev; dev->file = file; return nonseekable_open(inode, file); -@@ -220,8 +1195,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, +@@ -219,8 +1179,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, void __user *argp = (void __user *)parm; switch (cmd) { @@ -1581,21 +1581,25 @@ index 3c2f743c58b0..87a24798a5c7 100644 return -ENOIOCTLCMD; } diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h -index dcfa38fdc93c..4a8095a3fc34 100644 +index dcfa38fdc93c..6d06793512b1 100644 --- a/include/uapi/linux/ntsync.h +++ b/include/uapi/linux/ntsync.h -@@ -16,8 +16,47 @@ struct ntsync_sem_args { +@@ -11,13 +11,49 @@ + #include + + struct ntsync_sem_args { +- __u32 sem; + __u32 count; __u32 max; }; +-#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) +struct ntsync_mutex_args { -+ __u32 mutex; + __u32 owner; + __u32 count; +}; + +struct ntsync_event_args { -+ __u32 event; + __u32 manual; + __u32 signaled; +}; @@ -1615,13 +1619,14 @@ index dcfa38fdc93c..4a8095a3fc34 100644 + +#define NTSYNC_MAX_WAIT_COUNT 64 + - #define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) ++#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) +#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) +#define 
NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) -+#define NTSYNC_IOC_CREATE_MUTEX _IOWR('N', 0x84, struct ntsync_sem_args) -+#define NTSYNC_IOC_CREATE_EVENT _IOWR('N', 0x87, struct ntsync_event_args) ++#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) ++#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) - #define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) +-#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) +#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) +#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) +#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) @@ -1633,11 +1638,11 @@ index dcfa38fdc93c..4a8095a3fc34 100644 #endif diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile -index 9039f3709aff..d5aeaa8fe3ca 100644 +index 363d031a16f7..ff18c0361e38 100644 --- a/tools/testing/selftests/Makefile +++ b/tools/testing/selftests/Makefile -@@ -16,6 +16,7 @@ TARGETS += damon - TARGETS += devices +@@ -18,6 +18,7 @@ TARGETS += devices/error_logs + TARGETS += devices/probe TARGETS += dmabuf-heaps TARGETS += drivers/dma-buf +TARGETS += drivers/ntsync @@ -1673,10 +1678,10 @@ index 000000000000..60539c826d06 +CONFIG_WINESYNC=y diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c new file mode 100644 -index 000000000000..5fa2c9a0768c +index 000000000000..3aad311574c4 --- /dev/null +++ b/tools/testing/selftests/drivers/ntsync/ntsync.c -@@ -0,0 +1,1407 @@ +@@ -0,0 +1,1343 @@ +// SPDX-License-Identifier: GPL-2.0-or-later +/* + * Various unit tests for the "ntsync" synchronization primitive driver. @@ -1714,9 +1719,9 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ((max), __max); \ + }) + -+static int post_sem(int sem, __u32 *count) ++static int release_sem(int sem, __u32 *count) +{ -+ return ioctl(sem, NTSYNC_IOC_SEM_POST, count); ++ return ioctl(sem, NTSYNC_IOC_SEM_RELEASE, count); +} + +static int read_mutex_state(int mutex, __u32 *count, __u32 *owner) @@ -1831,28 +1836,24 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 3; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(-1, ret); ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_EQ(-1, sem); + EXPECT_EQ(EINVAL, errno); + + sem_args.count = 2; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ sem = sem_args.sem; ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, sem); + check_sem_state(sem, 2, 2); + + count = 0; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + check_sem_state(sem, 2, 2); + + count = 1; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 2, 2); @@ -1872,13 +1873,13 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(ETIMEDOUT, errno); + + count = 3; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 0, 2); + + count = 2; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(sem, 2, 2); @@ -1889,13 +1890,13 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, ret); + 
+ count = 1; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + check_sem_state(sem, 1, 2); + + count = ~0u; -+ ret = post_sem(sem, &count); ++ ret = release_sem(sem, &count); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOVERFLOW, errno); + check_sem_state(sem, 1, 2); @@ -1919,23 +1920,20 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 123; + mutex_args.count = 0; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(-1, ret); ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 0; + mutex_args.count = 2; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(-1, ret); ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); + EXPECT_EQ(EINVAL, errno); + + mutex_args.owner = 123; + mutex_args.count = 2; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ mutex = mutex_args.mutex; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); + check_mutex_state(mutex, 2, 123); + + ret = unlock_mutex(mutex, 0, &count); @@ -2036,11 +2034,8 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 0; + mutex_args.count = 0; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ mutex = mutex_args.mutex; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); + check_mutex_state(mutex, 0, 0); + + ret = wait_any(fd, 1, &mutex, 123, &index); @@ -2052,11 +2047,8 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 123; + mutex_args.count = ~0u; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ mutex = mutex_args.mutex; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); + check_mutex_state(mutex, ~0u, 123); + + ret = wait_any(fd, 1, &mutex, 123, &index); @@ -2079,11 +2071,8 @@ index 000000000000..5fa2c9a0768c + + event_args.manual = 1; + event_args.signaled = 0; -+ event_args.event = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, event_args.event); -+ event = event_args.event; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + check_event_state(event, 0, 1); + + signaled = 0xdeadbeef; @@ -2147,11 +2136,8 @@ index 000000000000..5fa2c9a0768c + + event_args.manual = 0; + event_args.signaled = 1; -+ event_args.event = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, event_args.event); -+ event = event_args.event; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + + check_event_state(event, 1, 0); + @@ -2210,62 +2196,55 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 2; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 0; + mutex_args.count = 0; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ 
EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 0, 0); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 0, 0); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 0, 0); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); + + count = 1; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); + + ret = wait_any(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 2, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); + + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + + owner = 123; -+ ret = ioctl(mutex_args.mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + ret = wait_any(fd, 2, objs, 456, &index); @@ -2277,24 +2256,27 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + ++ close(objs[1]); ++ + /* test waiting on the same object twice */ ++ + count = 2; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + -+ objs[0] = objs[1] = sem_args.sem; ++ objs[1] = objs[0]; + ret = wait_any(fd, 2, objs, 456, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 1, 3); ++ check_sem_state(objs[0], 1, 3); + + ret = wait_any(fd, 0, NULL, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ for (i = 0; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) -+ objs[i] = sem_args.sem; ++ for (i = 1; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) ++ objs[i] = objs[0]; + + ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT, objs, 123, &index); + EXPECT_EQ(0, ret); @@ -2308,8 +2290,7 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); ++ close(objs[0]); + + close(fd); +} @@ -2327,88 +2308,81 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 2; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 0; + mutex_args.count = 0; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ -+ event_args.manual = true; -+ event_args.signaled = 
true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); + + ret = wait_all(fd, 2, objs, 456, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 2, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_mutex_state(mutex_args.mutex, 2, 123); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); + + count = 3; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 2, 3); -+ check_mutex_state(mutex_args.mutex, 3, 123); ++ check_sem_state(objs[0], 2, 3); ++ check_mutex_state(objs[1], 3, 123); + + owner = 123; -+ ret = ioctl(mutex_args.mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); + EXPECT_EQ(0, ret); + + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EOWNERDEAD, errno); -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 123); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ close(objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); + -+ objs[0] = sem_args.sem; -+ objs[1] = event_args.event; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); -+ check_sem_state(sem_args.sem, 0, 3); -+ check_event_state(event_args.event, 1, 1); ++ check_sem_state(objs[0], 0, 3); ++ check_event_state(objs[1], 1, 1); ++ ++ close(objs[1]); + + /* test waiting on the same object twice */ -+ objs[0] = objs[1] = sem_args.sem; ++ objs[1] = objs[0]; + ret = wait_all(fd, 2, objs, 123, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(EINVAL, errno); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); -+ close(event_args.event); ++ close(objs[0]); + + close(fd); +} @@ -2469,20 +2443,13 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 0; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 123; + mutex_args.count = 1; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, 
&mutex_args); ++ EXPECT_LE(0, objs[1]); + + /* test waking the semaphore */ + @@ -2501,10 +2468,10 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(ETIMEDOUT, ret); + + count = 1; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); -+ check_sem_state(sem_args.sem, 0, 3); ++ check_sem_state(objs[0], 0, 3); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); @@ -2514,7 +2481,7 @@ index 000000000000..5fa2c9a0768c + /* test waking the mutex */ + + /* first grab it again for owner 123 */ -+ ret = wait_any(fd, 1, &mutex_args.mutex, 123, &index); ++ ret = wait_any(fd, 1, &objs[1], 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + @@ -2526,31 +2493,32 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = unlock_mutex(mutex_args.mutex, 123, &count); ++ ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + -+ ret = unlock_mutex(mutex_args.mutex, 123, &count); ++ ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, mutex_args.count); -+ check_mutex_state(mutex_args.mutex, 1, 456); ++ check_mutex_state(objs[1], 1, 456); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + ++ close(objs[1]); ++ + /* test waking events */ + + event_args.manual = false; + event_args.signaled = false; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); + -+ objs[1] = event_args.event; + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); @@ -2558,10 +2526,10 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 0, 0); ++ check_event_state(objs[1], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); @@ -2575,24 +2543,23 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 0, 0); ++ check_event_state(objs[1], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + -+ close(event_args.event); ++ close(objs[1]); + + event_args.manual = true; + event_args.signaled = false; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); + -+ objs[1] = event_args.event; + wait_args.timeout = get_abs_timeout(1000); + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); @@ -2600,17 +2567,17 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + 
EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 1, 1); ++ check_event_state(objs[1], 1, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + @@ -2621,31 +2588,28 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); -+ check_event_state(event_args.event, 0, 1); ++ check_event_state(objs[1], 0, 1); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(1, wait_args.index); + -+ close(event_args.event); -+ + /* delete an object while it's being waited on */ + + wait_args.timeout = get_abs_timeout(200); + wait_args.owner = 123; -+ objs[1] = mutex_args.mutex; + ret = pthread_create(&thread, NULL, wait_thread, &thread_args); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); ++ close(objs[0]); ++ close(objs[1]); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); @@ -2672,32 +2636,23 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 0; + sem_args.max = 3; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + mutex_args.owner = 123; + mutex_args.count = 1; -+ mutex_args.mutex = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, mutex_args.mutex); ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); + + manual_event_args.manual = true; + manual_event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); -+ EXPECT_EQ(0, ret); ++ objs[2] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); ++ EXPECT_LE(0, objs[2]); + + auto_event_args.manual = false; + auto_event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); -+ EXPECT_EQ(0, ret); -+ -+ objs[0] = sem_args.sem; -+ objs[1] = mutex_args.mutex; -+ objs[2] = manual_event_args.event; -+ objs[3] = auto_event_args.event; ++ objs[3] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); ++ EXPECT_EQ(0, objs[3]); + + wait_args.timeout = get_abs_timeout(1000); + wait_args.objs = (uintptr_t)objs; @@ -2713,54 +2668,54 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(ETIMEDOUT, ret); + + count = 1; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + -+ check_sem_state(sem_args.sem, 1, 3); ++ check_sem_state(objs[0], 1, 3); + -+ ret = wait_any(fd, 1, &sem_args.sem, 123, &index); ++ ret = wait_any(fd, 1, &objs[0], 123, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = unlock_mutex(mutex_args.mutex, 123, &count); ++ ret = unlock_mutex(objs[1], 123, &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, count); + + ret = pthread_tryjoin_np(thread, NULL); + EXPECT_EQ(EBUSY, ret); + -+ check_mutex_state(mutex_args.mutex, 0, 
0); ++ check_mutex_state(objs[1], 0, 0); + -+ ret = ioctl(manual_event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + + count = 2; -+ ret = post_sem(sem_args.sem, &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, count); -+ check_sem_state(sem_args.sem, 2, 3); ++ check_sem_state(objs[0], 2, 3); + -+ ret = ioctl(auto_event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, signaled); + -+ ret = ioctl(manual_event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + -+ ret = ioctl(auto_event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, signaled); + -+ check_sem_state(sem_args.sem, 1, 3); -+ check_mutex_state(mutex_args.mutex, 1, 456); -+ check_event_state(manual_event_args.event, 1, 1); -+ check_event_state(auto_event_args.event, 0, 0); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 456); ++ check_event_state(objs[2], 1, 1); ++ check_event_state(objs[3], 0, 0); + + ret = wait_for_thread(thread, 100); + EXPECT_EQ(0, ret); @@ -2776,10 +2731,10 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ close(sem_args.sem); -+ close(mutex_args.mutex); -+ close(manual_event_args.event); -+ close(auto_event_args.event); ++ close(objs[0]); ++ close(objs[1]); ++ close(objs[2]); ++ close(objs[3]); + + ret = wait_for_thread(thread, 200); + EXPECT_EQ(0, ret); @@ -2796,7 +2751,7 @@ index 000000000000..5fa2c9a0768c + struct ntsync_sem_args sem_args = {0}; + __u32 index, count, signaled; + struct wait_args thread_args; -+ int objs[2], fd, ret; ++ int objs[2], event, fd, ret; + pthread_t thread; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); @@ -2804,50 +2759,44 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 0; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[0] = sem_args.sem; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + sem_args.count = 1; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[1] = sem_args.sem; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); + + event_args.manual = true; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + -+ ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + -+ ret = wait_any_alert(fd, 0, NULL, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(event, 
NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(1, index); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + /* test wakeup via alert */ + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + wait_args.timeout = get_abs_timeout(1000); @@ -2855,7 +2804,7 @@ index 000000000000..5fa2c9a0768c + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; -+ wait_args.alert = event_args.event; ++ wait_args.alert = event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ANY; @@ -2865,7 +2814,7 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); @@ -2873,32 +2822,32 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + -+ close(event_args.event); ++ close(event); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + + count = 1; -+ ret = post_sem(objs[0], &count); ++ ret = release_sem(objs[0], &count); + EXPECT_EQ(0, ret); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + -+ ret = wait_any_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ close(event_args.event); ++ close(event); + + close(objs[0]); + close(objs[1]); @@ -2913,7 +2862,7 @@ index 000000000000..5fa2c9a0768c + struct ntsync_sem_args sem_args = {0}; + struct wait_args thread_args; + __u32 index, count, signaled; -+ int objs[2], fd, ret; ++ int objs[2], event, fd, ret; + pthread_t thread; + + fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); @@ -2921,36 +2870,30 @@ index 000000000000..5fa2c9a0768c + + sem_args.count = 2; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[0] = sem_args.sem; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); + + sem_args.count = 1; + sem_args.max = 2; -+ sem_args.sem = 0xdeadbeef; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); -+ EXPECT_EQ(0, ret); -+ EXPECT_NE(0xdeadbeef, sem_args.sem); -+ objs[1] = sem_args.sem; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); + + event_args.manual = true; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ 
EXPECT_LE(0, event); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + + /* test wakeup via alert */ + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); + EXPECT_EQ(0, ret); + + wait_args.timeout = get_abs_timeout(1000); @@ -2958,7 +2901,7 @@ index 000000000000..5fa2c9a0768c + wait_args.count = 2; + wait_args.owner = 123; + wait_args.index = 0xdeadbeef; -+ wait_args.alert = event_args.event; ++ wait_args.alert = event; + thread_args.fd = fd; + thread_args.args = &wait_args; + thread_args.request = NTSYNC_IOC_WAIT_ALL; @@ -2968,7 +2911,7 @@ index 000000000000..5fa2c9a0768c + ret = wait_for_thread(thread, 100); + EXPECT_EQ(ETIMEDOUT, ret); + -+ ret = ioctl(event_args.event, NTSYNC_IOC_EVENT_SET, &signaled); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); + EXPECT_EQ(0, ret); + + ret = wait_for_thread(thread, 100); @@ -2976,32 +2919,32 @@ index 000000000000..5fa2c9a0768c + EXPECT_EQ(0, thread_args.ret); + EXPECT_EQ(2, wait_args.index); + -+ close(event_args.event); ++ close(event); + + /* test with an auto-reset event */ + + event_args.manual = false; + event_args.signaled = true; -+ ret = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); + + count = 2; -+ ret = post_sem(objs[1], &count); ++ ret = release_sem(objs[1], &count); + EXPECT_EQ(0, ret); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(0, index); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(0, ret); + EXPECT_EQ(2, index); + -+ ret = wait_all_alert(fd, 2, objs, 123, event_args.event, &index); ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); + EXPECT_EQ(-1, ret); + EXPECT_EQ(ETIMEDOUT, errno); + -+ close(event_args.event); ++ close(event); + + close(objs[0]); + close(objs[1]); @@ -3055,15 +2998,13 @@ index 000000000000..5fa2c9a0768c + + mutex_args.owner = 0; + mutex_args.count = 0; -+ ret = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); -+ EXPECT_EQ(0, ret); -+ stress_mutex = mutex_args.mutex; ++ stress_mutex = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, stress_mutex); + + event_args.manual = 1; + event_args.signaled = 0; -+ ret = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); -+ EXPECT_EQ(0, ret); -+ stress_start_event = event_args.event; ++ stress_start_event = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, stress_start_event); + + for (i = 0; i < STRESS_THREADS; ++i) + pthread_create(&threads[i], NULL, stress_thread, NULL); @@ -3085,5 +3026,5 @@ index 000000000000..5fa2c9a0768c + +TEST_HARNESS_MAIN -- -2.46.0.rc1 +2.47.1 diff --git a/sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch b/sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch new file mode 100644 index 0000000..451afad --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0009-perf-per-core.patch @@ -0,0 +1,997 @@ +From 7bb6922864744721217f728cc01dcb53dcdcc2da Mon Sep 17 00:00:00 2001 
+From: Peter Jung +Date: Thu, 19 Dec 2024 18:52:26 +0100 +Subject: [PATCH 09/12] perf-per-core + +Signed-off-by: Peter Jung +--- + Documentation/arch/x86/topology.rst | 4 + + arch/x86/events/rapl.c | 507 ++++++++++++++------------ + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + include/linux/cpuhotplug.h | 1 - + 7 files changed, 288 insertions(+), 228 deletions(-) + +diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst +index 7352ab89a55a..c12837e61bda 100644 +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in the kernel: + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index a481a939862e..d3bb3865c1b1 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * core counter: consumption of a single physical core ++ * event: rapl_energy_core (power_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -70,18 +74,22 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++#define PERF_RAPL_CORE 0 /* single core */ ++#define PERF_RAPL_CORE_EVENTS_MAX 1 ++#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX ++ ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "psys", + }; + ++static const char *const rapl_core_domain_name __initconst = "core"; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ + * considered as either pkg-scope or die-scope, and we are considering + * them as die-scope. 
+ */ +-#define rapl_pmu_is_pkg_scope() \ ++#define rapl_pkg_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + +@@ -129,7 +139,8 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ unsigned int cntr_mask; ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -139,45 +150,43 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; ++ unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static cpumask_t rapl_cpu_mask; +-static unsigned int rapl_cntr_mask; ++static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static int rapl_core_hw_unit __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* +- * Helper functions to get the correct topology macros according to the ++ * Helper function to get the correct topology id according to the + * RAPL PMU scope. + */ +-static inline unsigned int get_rapl_pmu_idx(int cpu) +-{ +- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : +- topology_logical_die_id(cpu); +-} +- +-static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) ++static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) + { +- return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : +- topology_die_cpumask(cpu); +-} +- +-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) +-{ +- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- + /* +- * The unsigned check also catches the '-1' return value for non +- * existent mappings in the topology map. ++ * Returns unsigned int, which converts the '-1' return value ++ * (for non-existent mappings in topology map) to UINT_MAX, so ++ * the error check in the caller is simplified. + */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; ++ switch (scope) { ++ case PERF_PMU_SCOPE_PKG: ++ return topology_logical_package_id(cpu); ++ case PERF_PMU_SCOPE_DIE: ++ return topology_logical_die_id(cpu); ++ case PERF_PMU_SCOPE_CORE: ++ return topology_logical_core_id(cpu); ++ default: ++ return -EINVAL; ++ } + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -187,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event) + return raw; + } + +-static inline u64 rapl_scale(u64 v, int cfg) ++static inline u64 rapl_scale(u64 v, struct perf_event *event) + { +- if (cfg > NR_RAPL_DOMAINS) { +- pr_warn("Invalid domain %d, failed to scale data\n", cfg); +- return v; +- } ++ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; ++ ++ if (event->pmu->scope == PERF_PMU_SCOPE_CORE) ++ hw_unit = rapl_core_hw_unit; ++ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ +- return v << (32 - rapl_hw_unit[cfg - 1]); ++ return v << (32 - hw_unit); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -226,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event) + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + +- sdelta = rapl_scale(delta, event->hw.config); ++ sdelta = rapl_scale(delta, event); + + local64_add(sdelta, &event->count); + +@@ -241,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -276,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ 
hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -326,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -355,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, ret = 0; +- struct rapl_pmu *pmu; ++ int bit, rapl_pmus_scope, ret = 0; ++ struct rapl_pmu *rapl_pmu; ++ unsigned int rapl_pmu_idx; ++ struct rapl_pmus *rapl_pmus; + +- /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) +- return -ENOENT; ++ /* unsupported modes and filters */ ++ if (event->attr.sample_period) /* no sampling */ ++ return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) +@@ -369,29 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- event->event_caps |= PERF_EV_CAP_READ_ACTIVE_PKG; +- +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ if (!rapl_pmus) ++ return -EINVAL; ++ rapl_pmus_scope = rapl_pmus->pmu.scope; ++ ++ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { ++ /* only look at RAPL package events */ ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { ++ /* only look at RAPL core events */ ++ if (event->attr.type != rapl_pmus_core->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else + return -EINVAL; +- +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); +- bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pmus->cntr_mask & (1 << bit))) + return -EINVAL; + +- /* unsupported modes and filters */ +- if (event->attr.sample_period) /* no sampling */ ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; +- + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); +- if (!pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (!rapl_pmu) + return -EINVAL; +- event->cpu = pmu->cpu; +- event->pmu_private = pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ ++ 
event->pmu_private = rapl_pmu; + event->hw.config = cfg; + event->hw.idx = bit; + +@@ -403,34 +435,19 @@ static void rapl_pmu_event_read(struct perf_event *event) + rapl_event_update(event); + } + +-static ssize_t rapl_get_attr_cpumask(struct device *dev, +- struct device_attribute *attr, char *buf) +-{ +- return cpumap_print_to_pagebuf(true, buf, &rapl_cpu_mask); +-} +- +-static DEVICE_ATTR(cpumask, S_IRUGO, rapl_get_attr_cpumask, NULL); +- +-static struct attribute *rapl_pmu_attrs[] = { +- &dev_attr_cpumask.attr, +- NULL, +-}; +- +-static struct attribute_group rapl_pmu_attr_group = { +- .attrs = rapl_pmu_attrs, +-}; +- + RAPL_EVENT_ATTR_STR(energy-cores, rapl_cores, "event=0x01"); + RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -440,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -467,7 +485,12 @@ static struct attribute_group rapl_pmu_format_group = { + }; + + static const struct attribute_group *rapl_attr_groups[] = { +- &rapl_pmu_attr_group, ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ ++static const struct attribute_group *rapl_core_attr_groups[] = { + &rapl_pmu_format_group, + &rapl_pmu_events_group, + NULL, +@@ -533,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = { + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_core[] = { ++ EVENT_PTR(rapl_core), ++ EVENT_PTR(rapl_core_unit), ++ EVENT_PTR(rapl_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_core_group = { ++ .name = "events", ++ .attrs = rapl_events_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -558,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -570,77 +605,25 @@ static struct perf_msr 
amd_rapl_msrs[] = { + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_cpu_offline(unsigned int cpu) +-{ +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- /* Check if exiting cpu is used for collecting rapl events */ +- if (!cpumask_test_and_clear_cpu(cpu, &rapl_cpu_mask)) +- return 0; +- +- pmu->cpu = -1; +- /* Find a new cpu to collect rapl events */ +- target = cpumask_any_but(get_rapl_pmu_cpumask(cpu), cpu); +- +- /* Migrate rapl events to the new target */ +- if (target < nr_cpu_ids) { +- cpumask_set_cpu(target, &rapl_cpu_mask); +- pmu->cpu = target; +- perf_pmu_migrate_context(pmu->pmu, cpu, target); +- } +- return 0; +-} +- +-static int rapl_cpu_online(unsigned int cpu) +-{ +- s32 rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- if (rapl_pmu_idx < 0) { +- pr_err("topology_logical_(package/die)_id() returned a negative value"); +- return -EINVAL; +- } +- struct rapl_pmu *pmu = cpu_to_rapl_pmu(cpu); +- int target; +- +- if (!pmu) { +- pmu = kzalloc_node(sizeof(*pmu), GFP_KERNEL, cpu_to_node(cpu)); +- if (!pmu) +- return -ENOMEM; +- +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); +- +- rapl_pmus->pmus[rapl_pmu_idx] = pmu; +- } +- +- /* +- * Check if there is an online cpu in the package which collects rapl +- * events already. +- */ +- target = cpumask_any_and(&rapl_cpu_mask, get_rapl_pmu_cpumask(cpu)); +- if (target < nr_cpu_ids) +- return 0; +- +- cpumask_set_cpu(cpu, &rapl_cpu_mask); +- pmu->cpu = cpu; +- return 0; +-} ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) +- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) ++ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ ++ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -648,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + case RAPL_UNIT_QUIRK_INTEL_HSW: +- rapl_hw_unit[PERF_RAPL_RAM] = 16; ++ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. 
*/ + case RAPL_UNIT_QUIRK_INTEL_SPR: +- rapl_hw_unit[PERF_RAPL_PSYS] = 0; ++ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; + } + +- + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter +@@ -667,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; +- if (rapl_hw_unit[0] < 32) { ++ if (rapl_pkg_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); +- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); ++ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); + } + return 0; + } +@@ -677,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + static void __init rapl_advertise(void) + { + int i; ++ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); ++ ++ if (rapl_pmus_core) ++ num_counters += hweight32(rapl_pmus_core->cntr_mask); + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ num_counters, rapl_timer_ms); + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pmus_pkg->cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } ++ ++ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_core_domain_name, rapl_core_hw_unit); + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -707,17 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = { + NULL, + }; + +-static int __init init_rapl_pmus(void) ++static const struct attribute_group *rapl_core_attr_update[] = { ++ &rapl_events_core_group, ++ NULL, ++}; ++ ++static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) ++{ ++ struct rapl_pmu *rapl_pmu; ++ int idx; ++ ++ for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { ++ rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); ++ if (!rapl_pmu) ++ goto free; ++ ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); ++ ++ rapl_pmus->rapl_pmu[idx] = rapl_pmu; ++ } ++ ++ return 0; ++free: ++ for (; idx > 0; idx--) ++ kfree(rapl_pmus->rapl_pmu[idx - 1]); ++ return -ENOMEM; ++} ++ ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) + { + int nr_rapl_pmu = topology_max_packages(); ++ struct rapl_pmus *rapl_pmus; + +- if (!rapl_pmu_is_pkg_scope()) +- nr_rapl_pmu *= topology_max_dies_per_package(); ++ /* ++ * rapl_pmu_scope must be either PKG, DIE or CORE ++ */ ++ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) ++ nr_rapl_pmu *= topology_max_dies_per_package(); ++ else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) ++ nr_rapl_pmu *= topology_num_cores_per_package(); ++ else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) ++ return -EINVAL; + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, 
nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + ++ *rapl_pmus_ptr = rapl_pmus; ++ + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; +@@ -728,77 +761,81 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.start = rapl_pmu_event_start; + rapl_pmus->pmu.stop = rapl_pmu_event_stop; + rapl_pmus->pmu.read = rapl_pmu_event_read; ++ rapl_pmus->pmu.scope = rapl_pmu_scope; + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; +- return 0; ++ ++ return init_rapl_pmu(rapl_pmus); + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -854,57 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; ++ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + ++ if (rapl_pkg_pmu_is_pkg_scope()) ++ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; ++ + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = 
(struct rapl_model *) id->driver_data; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_msrs = rm->rapl_msrs; +- +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); +- +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, ++ rapl_attr_update); + if (ret) + return ret; + +- /* +- * Install callbacks. Core will call them for each online cpu. +- */ +- ret = cpuhp_setup_state(CPUHP_AP_PERF_X86_RAPL_ONLINE, +- "perf/x86/rapl:online", +- rapl_cpu_online, rapl_cpu_offline); ++ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, ++ PERF_RAPL_PKG_EVENTS_MAX, false, ++ (void *) &rapl_model->pkg_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); +- if (ret) +- goto out1; ++ if (rapl_model->core_events) { ++ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, ++ rapl_core_attr_groups, ++ rapl_core_attr_update); ++ if (ret) { ++ pr_warn("power-core PMU initialization failed (%d)\n", ret); ++ goto core_init_failed; ++ } ++ ++ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); ++ if (ret) { ++ pr_warn("power-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ } + ++core_init_failed: + rapl_advertise(); + return 0; + +-out1: +- cpuhp_remove_state(CPUHP_AP_PERF_X86_RAPL_ONLINE); + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- cpuhp_remove_state_nocalls(CPUHP_AP_PERF_X86_RAPL_ONLINE); +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ if (rapl_pmus_core) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + } + module_exit(intel_rapl_exit); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 20e6009381ed..c0cd10182e90 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index fd41103ad342..3973cb9bb2e6 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 
10719aba6276..cacfd3f6abef 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 8277c64f88db..b5a5e1411469 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ +diff --git a/include/linux/cpuhotplug.h b/include/linux/cpuhotplug.h +index 2361ed4d2b15..37a9afffb59e 100644 +--- a/include/linux/cpuhotplug.h ++++ b/include/linux/cpuhotplug.h +@@ -208,7 +208,6 @@ enum cpuhp_state { + CPUHP_AP_PERF_X86_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_UNCORE_ONLINE, + CPUHP_AP_PERF_X86_AMD_POWER_ONLINE, +- CPUHP_AP_PERF_X86_RAPL_ONLINE, + CPUHP_AP_PERF_S390_CF_ONLINE, + CPUHP_AP_PERF_S390_SF_ONLINE, + CPUHP_AP_PERF_ARM_CCI_ONLINE, +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0010-pksm.patch b/sys-kernel/gentoo-sources-6.12/0010-pksm.patch new file mode 100644 index 0000000..051549c --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0010-pksm.patch @@ -0,0 +1,433 @@ +From 607312cadf367e1baf1362f85e0568ebae5b6d59 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:52:41 +0100 +Subject: [PATCH 10/12] pksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 138 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 206 insertions(+), 1 deletion(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index 74720667fe09..e6a11f3c0a2e 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -502,3 +502,6 @@ + 570 common lsm_set_self_attr sys_lsm_set_self_attr + 571 common lsm_list_modules sys_lsm_list_modules + 572 common mseal sys_mseal ++573 common process_ksm_enable 
sys_process_ksm_enable ++574 common process_ksm_disable sys_process_ksm_disable ++575 common process_ksm_status sys_process_ksm_status +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 23c98203c40f..10a3099decbe 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -477,3 +477,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index 22a3cbd4c602..12d2c7594bf0 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -462,3 +462,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 2b81a6bd78b2..e2a93c856eed 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -468,3 +468,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 953f5b7dc723..b921fbf56fa6 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -401,3 +401,6 @@ + 460 n32 lsm_set_self_attr sys_lsm_set_self_attr + 461 n32 lsm_list_modules sys_lsm_list_modules + 462 n32 mseal sys_mseal ++463 n32 process_ksm_enable sys_process_ksm_enable ++464 n32 process_ksm_disable sys_process_ksm_disable ++465 n32 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index 1464c6be6eb3..8d7f9ddd66f4 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -377,3 +377,6 @@ + 460 n64 lsm_set_self_attr sys_lsm_set_self_attr + 461 n64 lsm_list_modules sys_lsm_list_modules + 462 n64 mseal sys_mseal ++463 n64 process_ksm_enable sys_process_ksm_enable ++464 n64 process_ksm_disable sys_process_ksm_disable ++465 n64 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 2439a2491cff..9d6142739954 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -450,3 +450,6 @@ + 460 o32 lsm_set_self_attr sys_lsm_set_self_attr + 461 o32 lsm_list_modules sys_lsm_list_modules + 462 o32 mseal sys_mseal ++463 o32 process_ksm_enable sys_process_ksm_enable ++464 o32 process_ksm_disable sys_process_ksm_disable ++465 o32 process_ksm_status sys_process_ksm_status +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl b/arch/parisc/kernel/syscalls/syscall.tbl +index 66dc406b12e4..9d46476fd908 100644 +--- 
a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -461,3 +461,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index c55fd7696d40..b9fc31221b87 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index cfdfb3707c16..0d79fd772854 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -508,3 +508,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 534c74b14fab..c546a30575f1 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -468,3 +468,6 @@ + 460 i386 lsm_set_self_attr sys_lsm_set_self_attr + 461 i386 lsm_list_modules sys_lsm_list_modules + 462 i386 mseal sys_mseal ++463 i386 process_ksm_enable sys_process_ksm_enable ++464 i386 process_ksm_disable sys_process_ksm_disable ++465 i386 process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 7093ee21c0d1..0fcd10ba8dfe 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -386,6 +386,9 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr 
+ 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 67083fc1b2f5..c1aecee4ad9b 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -433,3 +433,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index 5758104921e6..cc9c4fac2412 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -818,6 +818,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 5bf6148cac2b..613e559ad6e0 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -841,8 +841,15 @@ __SYSCALL(__NR_lsm_list_modules, sys_lsm_list_modules) + #define __NR_mseal 462 + __SYSCALL(__NR_mseal, sys_mseal) + ++#define __NR_process_ksm_enable 463 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 464 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 465 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 463 ++#define __NR_syscalls 466 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys.c b/kernel/sys.c +index 4da31f28fda8..fcd3aeaddd05 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2791,6 +2791,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index c00a86931f8c..d82213d68522 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl +index 845e24eb372e..227d9cc12365 100644 +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -403,3 +403,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules 
sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +index ebae8415dfbb..16f71bc2f6f0 100644 +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -553,3 +553,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +index 01071182763e..7394bad8178e 100644 +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 460 common lsm_set_self_attr sys_lsm_set_self_attr sys_lsm_set_self_attr + 461 common lsm_list_modules sys_lsm_list_modules sys_lsm_list_modules + 462 common mseal sys_mseal sys_mseal ++463 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++464 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++465 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/0012-zstd.patch b/sys-kernel/gentoo-sources-6.12/0012-zstd.patch new file mode 100644 index 0000000..7518743 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/0012-zstd.patch @@ -0,0 +1,18652 @@ +From a7a211a9bf51bfd07e645e8362b40f249d00d13b Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 19 Dec 2024 18:53:34 +0100 +Subject: [PATCH 12/12] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 850 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 127 +- + lib/zstd/common/compiler.h | 134 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 34 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 84 +- + lib/zstd/common/fse.h | 94 +- + lib/zstd/common/fse_decompress.c | 130 +- + lib/zstd/common/huf.h | 237 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 28 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 109 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 441 ++-- + lib/zstd/compress/zstd_compress.c | 2111 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 359 ++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 376 ++- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 169 +- + lib/zstd/compress/zstd_double_fast.c | 143 +- + lib/zstd/compress/zstd_double_fast.h | 17 
+- + lib/zstd/compress/zstd_fast.c | 596 +++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 732 +++--- + lib/zstd/compress/zstd_lazy.h | 138 +- + lib/zstd/compress/zstd_ldm.c | 21 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 497 ++-- + lib/zstd/compress/zstd_opt.h | 41 +- + lib/zstd/decompress/huf_decompress.c | 887 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 358 ++- + lib/zstd/decompress/zstd_decompress_block.c | 708 +++--- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 9 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 6577 insertions(+), 3531 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index b2c7cf310c8f..ac59ae9a18d7 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..6320fedcf8a4 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 6 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). 
++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -183,7 +228,7 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + /*= Compression context + * When compressing many times, + * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +@@ -196,9 +241,9 @@ ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer * + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, + * they will all be reset. Only `compressionLevel` remains. + */ +@@ -210,7 +255,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +265,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +281,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. 
+ * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +369,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". */ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximatively targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,7 +461,6 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer +@@ -412,6 +469,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +481,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +490,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +557,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". 
+- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +566,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +609,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +674,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +770,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. 
++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +803,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +811,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +834,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be reused multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -788,13 +864,37 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. 
++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1013,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1025,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1039,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. 
+ * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1050,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1077,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1094,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1120,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. 
+ */
+@@ -1071,24 +1180,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE
+ #endif
+
+-/* Deprecation warnings :
+- * Should these warnings be a problem, it is generally possible to disable them,
+- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual.
+- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS.
+- */
+-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */
+-#else
+-# if (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) || defined(__clang__)
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message)))
+-# elif (__GNUC__ >= 3)
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated))
+-# else
+-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler")
+-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API
+-# endif
+-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */
+-
+ /* **************************************************************************************
+ * experimental API (static linking only)
+ ****************************************************************************************
+@@ -1123,6 +1214,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */
+ #define ZSTD_STRATEGY_MIN ZSTD_fast
+ #define ZSTD_STRATEGY_MAX ZSTD_btultra2
++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */
+
+
+ #define ZSTD_OVERLAPLOG_MIN 0
+@@ -1146,7 +1238,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict);
+ #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN)
+
+ /* Advanced parameter bounds */
+-#define ZSTD_TARGETCBLOCKSIZE_MIN 64
++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */
+ #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX
+ #define ZSTD_SRCSIZEHINT_MIN 0
+ #define ZSTD_SRCSIZEHINT_MAX INT_MAX
+@@ -1303,7 +1395,7 @@ typedef enum {
+ } ZSTD_paramSwitch_e;
+
+ /* *************************************
+-* Frame size functions
++* Frame header and size functions
+ ***************************************/
+
+ /*! ZSTD_findDecompressedSize() :
+@@ -1350,29 +1442,122 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size
+ * or an error code (if srcSize is too small) */
+ ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize);
+
++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e;
++typedef struct {
++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */
++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */
++ unsigned blockSizeMax;
++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */
++ unsigned headerSize;
++ unsigned dictID;
++ unsigned checksumFlag;
++ unsigned _reserved1;
++ unsigned _reserved2;
++} ZSTD_frameHeader;
++
++/*! ZSTD_getFrameHeader() :
++ * decode Frame Header, or requires larger `srcSize`.
++ * @return : 0, `zfhPtr` is correctly filled,
++ * >0, `srcSize` is too small, value is wanted `srcSize` amount,
++ * or an error code, which can be tested using ZSTD_isError() */
++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */
++/*! ZSTD_getFrameHeader_advanced() :
++ * same as ZSTD_getFrameHeader(),
++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */
++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format);
++
++/*! ZSTD_decompressionMargin() :
++ * Zstd supports in-place decompression, where the input and output buffers overlap.
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large,
++ * and the input buffer must be at the end of the output buffer.
++ *
++ * _______________________ Output Buffer ________________________
++ * | |
++ * | ____ Input Buffer ____|
++ * | | |
++ * v v v
++ * |---------------------------------------|-----------|----------|
++ * ^ ^ ^
++ * |___________________ Output_Size ___________________|_ Margin _|
++ *
++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN().
++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or
++ * ZSTD_decompressDCtx().
++ * NOTE: This function supports multi-frame input.
++ *
++ * @param src The compressed frame(s)
++ * @param srcSize The size of the compressed frame(s)
++ * @returns The decompression margin or an error that can be checked with ZSTD_isError().
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize);
++
++/*! ZSTD_DECOMPRESSION_MARGIN() :
++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from
++ * the compressed frame, compute it from the original size and the blockSizeLog.
++ * See ZSTD_decompressionMargin() for details.
++ *
++ * WARNING: This macro does not support multi-frame input, the input must be a single
++ * zstd frame. If you need that support use the function, or implement it yourself.
++ *
++ * @param originalSize The original uncompressed size of the data.
++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX).
++ * Unless you explicitly set the windowLog smaller than
++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX.
++ */
++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \
++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \
++ 4 /* checksum */ + \
++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \
++ (blockSize) /* One block of margin */ \
++ ))
++
+ typedef enum {
+ ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */
+ ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */
+ } ZSTD_sequenceFormat_e;
+
++/*! ZSTD_sequenceBound() :
++ * `srcSize` : size of the input buffer
++ * @return : upper-bound for the number of sequences that can be generated
++ * from a buffer of srcSize bytes
++ *
++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence).
++ */
++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize);
++
+ /*! ZSTD_generateSequences() :
+- * Generate sequences using ZSTD_compress2, given a source buffer.
++ * WARNING: This function is meant for debugging and informational purposes ONLY!
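As a rough illustration of the in-place layout documented for ZSTD_decompressionMargin() above, a sketch only, assuming the userspace <zstd.h> static-linking API; error handling is deliberately minimal and (size_t)-1 is just an illustrative failure marker:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>
    #include <string.h>

    static size_t decompressInPlace(const void* compressed, size_t compressedSize)
    {
        size_t const margin = ZSTD_decompressionMargin(compressed, compressedSize);
        unsigned long long const contentSize = ZSTD_getFrameContentSize(compressed, compressedSize);
        size_t bufferSize, ret;
        char* buffer;

        if (ZSTD_isError(margin)) return margin;
        if (contentSize == ZSTD_CONTENTSIZE_UNKNOWN || contentSize == ZSTD_CONTENTSIZE_ERROR)
            return (size_t)-1;                              /* cannot size the buffer */

        bufferSize = (size_t)contentSize + margin;          /* Output_Size + Margin */
        buffer = malloc(bufferSize);
        if (buffer == NULL) return (size_t)-1;

        /* the compressed input must sit at the very end of the output buffer */
        memcpy(buffer + bufferSize - compressedSize, compressed, compressedSize);
        ret = ZSTD_decompress(buffer, bufferSize,
                              buffer + bufferSize - compressedSize, compressedSize);
        /* on success, ret bytes of regenerated data start at buffer[0] */
        free(buffer);
        return ret;
    }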
++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsSize The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1573,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1600,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. 
+ */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1464,48 +1652,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. ++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. 
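A small sketch of the static-allocation pattern these estimates are meant for, assuming the userspace <zstd.h> static-linking API; the malloc-backed workspace and the level are arbitrary choices for illustration:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>

    /* Carve a one-shot compression context out of a caller-owned workspace.
     * The workspace must outlive the returned ZSTD_CCtx and is freed by the caller. */
    static ZSTD_CCtx* makeStaticCCtx(int maxCompressionLevel, void** workspaceOut)
    {
        size_t const workspaceSize = ZSTD_estimateCCtxSize(maxCompressionLevel);
        void* const workspace = malloc(workspaceSize);
        if (workspace == NULL) return NULL;
        *workspaceOut = workspace;
        /* returns NULL if the workspace is too small or badly aligned */
        return ZSTD_initStaticCCtx(workspace, workspaceSize);
    }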
+ * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1649,22 +1848,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). 
++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1737,11 +1959,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2025,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2042,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. 
+ * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2095,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2145,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. 
++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2374,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2427,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. 
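These experimental bounds are applied through the regular parameter setters; a hedged sketch, assuming the userspace <zstd.h> static-linking API (values are arbitrary, and real code should check each return with ZSTD_isError()):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    static void tuneContexts(ZSTD_CCtx* cctx, ZSTD_DCtx* dctx)
    {
        /* cap the block size on both sides (1KB .. ZSTD_BLOCKSIZE_MAX) */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 16 * 1024);
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, 16 * 1024);
        /* prefetch cold CDict tables when they are used in place */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
        /* opt out of the Huffman assembly path at runtime */
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_disableHuffmanAssembly, 1);
    }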
++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2462,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2499,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2517,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2535,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2550,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2566,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2581,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2591,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2637,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2648,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! 
+@@ -2339,18 +2657,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. 
This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. 
++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. 
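To make the registration flow of the block-level sequence producer API above concrete, a minimal sketch assuming the userspace <zstd.h> static-linking API; the producer below deliberately declines every block and leans on the fallback path:

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Trivial producer: decline every block so zstd's internal parser handles it. */
    static size_t declineEveryBlock(void* state,
                                    ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                    const void* src, size_t srcSize,
                                    const void* dict, size_t dictSize,
                                    int compressionLevel, size_t windowSize)
    {
        (void)state; (void)outSeqs; (void)outSeqsCapacity; (void)src; (void)srcSize;
        (void)dict; (void)dictSize; (void)compressionLevel; (void)windowSize;
        return ZSTD_SEQUENCE_PRODUCER_ERROR;
    }

    static void setupProducer(ZSTD_CCtx* cctx)
    {
        ZSTD_registerSequenceProducer(cctx, NULL /* producer state */, declineEveryBlock);
        /* without fallback, a declined block would fail the whole compression */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }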
+ * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2860,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,36 +2881,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2939,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2959,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). 
+ It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2982,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2993,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3001,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. 
+ Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3028,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3044,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */
++
++/* This file provides custom allocation primitives
++ */
++
++#define ZSTD_DEPS_NEED_MALLOC
++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
++
++#include "compiler.h" /* MEM_STATIC */
++#define ZSTD_STATIC_LINKING_ONLY
++#include <linux/zstd.h> /* ZSTD_customMem */
++
++#ifndef ZSTD_ALLOCATIONS_H
++#define ZSTD_ALLOCATIONS_H
++
++/* custom memory allocation functions */
++
++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc)
++ return customMem.customAlloc(customMem.opaque, size);
++ return ZSTD_malloc(size);
++}
++
++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc) {
++ /* calloc implemented as malloc+memset;
++ * not as efficient as calloc, but next best guess for custom malloc */
++ void* const ptr = customMem.customAlloc(customMem.opaque, size);
++ ZSTD_memset(ptr, 0, size);
++ return ptr;
++ }
++ return ZSTD_calloc(1, size);
++}
++
++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
++{
++ if (ptr!=NULL) {
++ if (customMem.customFree)
++ customMem.customFree(customMem.opaque, ptr);
++ else
++ ZSTD_free(ptr);
++ }
++}
++
++#endif /* ZSTD_ALLOCATIONS_H */
+diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
+new file mode 100644
+index 000000000000..aa3487ec4b6a
+--- /dev/null
++++ b/lib/zstd/common/bits.h
+@@ -0,0 +1,149 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
++/*
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
++ * All rights reserved.
++ *
++ * This source code is licensed under both the BSD-style license (found in the
++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
++ * in the COPYING file in the root directory of this source tree).
++ * You may select, at your option, one of the above-listed licenses.
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..6a13f1f0f1e8 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -79,19 +81,20 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + /*-******************************************** + * bitStream decoding API (read backward) + **********************************************/ ++typedef size_t BitContainerType; + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); + MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); +@@ -101,7 +104,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. 
+ * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. +@@ -122,33 +125,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +154,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ +@@ -187,7 +169,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,35 +248,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 
8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +285,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE size_t BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -325,19 +307,13 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. 
+ * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -360,7 +336,7 @@ MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,7 +345,7 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); +@@ -377,7 +353,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -386,6 +362,21 @@ MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +387,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . +- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. 
+ * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */
+-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD)
+ {
+- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */
++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */
++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) {
++ static const BitContainerType zeroFilled = 0;
++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */
++ /* overflow detected, erroneous scenario or end of stream: no update */
+ return BIT_DStream_overflow;
++ }
++
++ assert(bitD->ptr >= bitD->start);
+
+ if (bitD->ptr >= bitD->limitPtr) {
+- return BIT_reloadDStreamFast(bitD);
++ return BIT_reloadDStream_internal(bitD);
+ }
+ if (bitD->ptr == bitD->start) {
++ /* reached end of bitStream => no update */
+ if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer;
+ return BIT_DStream_completed;
+ }
+- /* start < ptr < limitPtr */
++ /* start < ptr < limitPtr => cautious update */
+ { U32 nbBytes = bitD->bitsConsumed >> 3;
+ BIT_DStream_status result = BIT_DStream_unfinished;
+ if (bitD->ptr - nbBytes < bitD->start) {
+diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h
+index c42d39faf9bd..508ee25537bb 100644
+--- a/lib/zstd/common/compiler.h
++++ b/lib/zstd/common/compiler.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -11,6 +12,8 @@
+ #ifndef ZSTD_COMPILER_H
+ #define ZSTD_COMPILER_H
+
++#include <linux/types.h>
++
+ #include "portability_macros.h"
+
+ /*-*******************************************************
+@@ -41,12 +44,15 @@
+ */
+ #define WIN_CDECL
+
++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
++#define UNUSED_ATTR __attribute__((unused))
++
+ /*
+ * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant
+ * parameters. They must be inlined for the compiler to eliminate the constant
+ * branches.
+ */
+-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR
++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR
+ /*
+ * HINT_INLINE is used to help the compiler generate better code. It is *not*
+ * used for "templates", so it can be tweaked based on the compilers
+@@ -61,11 +67,21 @@
+ #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5
+ # define HINT_INLINE static INLINE_KEYWORD
+ #else
+-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR
++# define HINT_INLINE FORCE_INLINE_TEMPLATE
+ #endif
+
+-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */
+-#define UNUSED_ATTR __attribute__((unused))
++/* "soft" inline :
++ * The compiler is free to select if it's a good idea to inline or not.
++ * The main objective is to silence compiler warnings
++ * when a defined function in included but not used.
++ *
++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit.
++ * Updating the prefix is probably preferable, but requires a fairly large codemod,
++ * since this name is used everywhere.
++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,9 +143,9 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ +@@ -179,6 +196,85 @@ + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without trigging ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. 
++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..226ba3c57ec3 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -82,18 +83,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) 
{ \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) do { } while (0) /* disabled */ + #endif + + +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..0410ca415b54 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -49,8 +50,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +90,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +106,50 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) \ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); ++#define FORWARD_IF_ERROR(err, ...) 
\ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..2185a578617d 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. 
+-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -286,6 +227,7 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY + +@@ -317,16 +259,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +276,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. 
++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +463,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (size_t)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (size_t)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..3a17e84f27bf 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +250,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +270,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +311,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..57462466e188 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). 
+- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ +- ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + +-/* *** Advanced function *** */ + +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +195,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +238,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +254,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif +- +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index c22a2e69bf46..d9bd752fe17b 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..f08638cced6c 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,6 +46,8 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif +@@ -65,7 +68,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. + * + * Only enable assembly for Linux / MacOS, other platforms may +@@ -90,4 +93,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..11da1233e890 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,7 +29,6 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 +@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 + typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 +@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +228,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -289,11 +285,11 @@ typedef enum { + typedef struct { + seqDef* sequencesStart; + seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; + size_t maxNbSeq; + size_t maxNbLit; + +@@ -301,8 +297,8 @@ typedef struct { + * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment + * the existing value of the litLength or matchLength by 0x10000. 
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +@@ -420,13 +357,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream 
flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index 16bb995bc6c4..885167f7e47b 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -168,15 +173,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +260,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +270,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +315,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +364,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +383,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +400,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +496,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +590,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +628,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -613,6 +675,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +691,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +734,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame 
header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return 
(size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -849,7 +922,7 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + + case ZSTD_c_useBlockSplitter: + BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +@@ -864,7 +937,28 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -980,6 +1074,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1112,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); 
++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1168,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1183,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/
++ assert(cctx->cdict == dl->cdict);
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+@@ -1060,26 +1204,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ }
+
+ size_t ZSTD_CCtx_loadDictionary_advanced(
+- ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
++ ZSTD_CCtx* cctx,
++ const void* dict, size_t dictSize,
++ ZSTD_dictLoadMethod_e dictLoadMethod,
++ ZSTD_dictContentType_e dictContentType)
+ {
+- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't load a dictionary when ctx is not in init stage.");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+- ZSTD_clearAllDicts(cctx); /* in case one already exists */
+- if (dict == NULL || dictSize == 0) /* no dictionary mode */
++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
++ "Can't load a dictionary when cctx is not in init stage.");
++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */
++ if (dict == NULL || dictSize == 0) /* no dictionary */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
++ /* copy dictionary content inside CCtx to own its lifetime */
+ void* dictBuffer;
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+- "no malloc for static CCtx");
++ "static CCtx can't allocate for an internal copy of dictionary");
+ dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
+- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
++ "allocation failed for dictionary content");
+ ZSTD_memcpy(dictBuffer, dict, dictSize);
+- cctx->localDict.dictBuffer = dictBuffer;
+- cctx->localDict.dict = dictBuffer;
++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */
++ cctx->localDict.dict = dictBuffer; /* read-only reference */
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+@@ -1149,7 +1297,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't reset parameters only when not in init stage.");
++ "Reset parameters is only possible during init stage.");
+ ZSTD_clearAllDicts(cctx);
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+@@ -1178,11 +1326,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ static ZSTD_compressionParameters
+ ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+ {
+-# define CLAMP_TYPE(cParam, val, type) { \
+- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
+- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
+- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+- }
++# define CLAMP_TYPE(cParam, val, type) \
++ do { \
++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
++ } while (0)
+ # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+ CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+ CLAMP(ZSTD_c_chainLog, cParams.chainLog);
+@@ -1247,12 +1396,55 @@ static ZSTD_compressionParameters
+ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar,
+ unsigned long long srcSize,
+ size_t dictSize,
+- ZSTD_cParamMode_e mode)
++ ZSTD_cParamMode_e mode,
++ ZSTD_paramSwitch_e useRowMatchFinder)
+ {
+ const U64 minSrcSize = 513; /* (1<<9) + 1 */
+ const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1);
+
assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1473,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1492,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. 
So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1538,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1569,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1367,10 +1595,10 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) + + ZSTD_cwksp_aligned_alloc_size((1<strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1614,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1628,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1653,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1666,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1685,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1735,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize
+ : 0;
+@@ -1504,7 +1746,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+- ZSTD_CONTENTSIZE_UNKNOWN);
++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+ }
+ }
+
+@@ -1637,6 +1879,19 @@ typedef enum {
+ ZSTD_resetTarget_CCtx
+ } ZSTD_resetTarget_e;
+
++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
++static U64 ZSTD_bitmix(U64 val, U64 len) {
++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
++ val *= 0x9FB21C651E98DF25ULL;
++ val ^= (val >> 35) + len ;
++ val *= 0x9FB21C651E98DF25ULL;
++ return val ^ (val >> 28);
++}
++
++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) {
++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
++}
+
+ static size_t
+ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+@@ -1664,6 +1919,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+
+ ms->hashLog3 = hashLog3;
++ ms->lazySkipping = 0;
+
+ ZSTD_invalidateMatchState(ms);
+
+@@ -1685,22 +1941,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
+- /* opt parser space */
+- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+- DEBUGLOG(4, "reserving optimal parser space");
+- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+- }
+-
+ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+- { /* Row match finder needs an additional table of hashes ("tags") */
+- size_t const tagTableSize = hSize*sizeof(U16);
+- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ /* Row match finder needs an additional table of hashes ("tags") */
++ size_t const tagTableSize = hSize;
++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use
++ * 0 when we reset a Cdict */
++ if(forWho == ZSTD_resetTarget_CCtx) {
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
++ ZSTD_advanceHashSalt(ms);
++ } else {
++ /* When we are not salting we want to always memset the memory */
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
++ ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ ms->hashSalt = 0;
+ }
+ { /* Switch to 32-entry rows if searchLog is 5 (or more) */
+ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+@@ -1709,6 +1962,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+ }
+
++ /* opt parser space */
++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
++ DEBUGLOG(4, "reserving optimal parser space");
++ ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws,
(MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +2041,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2059,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2068,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1838,6 +2101,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2118,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm 
== ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2180,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? 
*/ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2254,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2294,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2345,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2440,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ 
params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2588,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2596,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u<nbSeq; u++) { + U32 const llv = sequences[u].litLength; ++ U32 const ofCode = ZSTD_highbit32(sequences[u].offBase); + U32 const mlv = sequences[u].mlBase; + llCodeTable[u] = (BYTE)ZSTD_LLcode(llv); +- ofCodeTable[u] = (BYTE)ZSTD_highbit32(sequences[u].offBase); ++ ofCodeTable[u] = (BYTE)ofCode; + mlCodeTable[u] = (BYTE)ZSTD_MLcode(mlv); ++ assert(!(MEM_64bits() && ofCode >= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2647,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2658,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2678,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2783,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +-
const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2806,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2856,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2868,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2903,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t*
seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2921,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. + */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. 
++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2635,40 +2947,43 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, +@@ -2681,18 +2996,26 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { + static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- 
ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; + DEBUGLOG(4, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); +@@ -2718,6 +3041,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3116,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3154,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3174,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,10 +3196,74 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } +@@ -2801,29 +3273,38 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const seqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const seqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = seqStore->sequences - inSeqs; ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; + +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. ++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,37 +3313,55 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. 
*/ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +@@ -2871,6 +3370,16 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + const size_t dstCapacity = ZSTD_compressBound(srcSize); + void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + +@@ -2880,8 +3389,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); 
++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3423,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3449,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3458,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3473,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3490,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3507,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? 
don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, 
sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3587,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3600,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3638,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3672,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3699,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3737,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3846,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3864,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3877,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3912,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3948,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3442,8 +3987,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3481,45 +4027,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. 
+ * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4077,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). 
++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4104,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4139,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4178,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4187,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4201,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4211,8 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4230,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4244,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3767,10 +4328,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4340,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. 
+@@ -3836,7 +4398,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4422,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4463,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4001,19 +4565,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4078,31 +4638,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap 
ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4691,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4729,58 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. 
*/ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); + } + ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } ++ } ++ ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4788,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4797,23 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. 
*/ ++#endif + break; + + default: +@@ -4237,11 +4860,10 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + /* We only set the loaded table as valid if it contains all non-zero + * weights. Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + +@@ -4327,6 +4949,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4968,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4984,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4997,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5011,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! 
ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5051,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5096,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5108,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5127,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5143,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, 
cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4537,9 +5168,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5194,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5230,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5348,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4813,7 +5452,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + if (!cdict) + return NULL; + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4908,6 +5547,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4987,12 +5627,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. 
+ */ +@@ -5002,7 +5647,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5199,30 +5844,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5231,8 +5887,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5247,7 +5905,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5264,8 +5922,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5276,6 +5933,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5283,9 +5954,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5293,9 +5963,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5308,19 +5978,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5390,8 +6057,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5410,22 +6079,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ +@@ -5439,9 +6108,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5453,6 +6122,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5479,6 +6151,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5493,8 +6167,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5512,13 +6205,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6241,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5551,64 +6252,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5617,25 +6315,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5644,26 +6372,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5675,6 +6392,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5682,7 +6402,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5690,7 +6410,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5704,7 +6424,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5744,21 +6463,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5781,7 +6502,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5795,6 +6516,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. 
++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. + * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5807,9 +6579,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5829,22 +6598,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. 
Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5853,6 +6629,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5861,11 +6638,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
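
/* A minimal sketch of the 3-byte block header that the raw/RLE/compressed paths above
 * emit via MEM_writeLE24(): bit 0 is the last-block flag, bits 1-2 are the block type
 * (0 = raw, 1 = RLE, 2 = compressed), and bits 3..23 hold the block size. Standalone
 * illustration; the helper name and the sample size are not from the patch. */
#include <stdint.h>
#include <stdio.h>

static void write_block_header(uint8_t out[3], int lastBlock, uint32_t blockType, uint32_t blockSize)
{
    uint32_t const header = (uint32_t)lastBlock + (blockType << 1) + (blockSize << 3);
    out[0] = (uint8_t)header;            /* little-endian, as MEM_writeLE24() does */
    out[1] = (uint8_t)(header >> 8);
    out[2] = (uint8_t)(header >> 16);
}

int main(void)
{
    uint8_t hdr[3];
    write_block_header(hdr, 1, 2 /* compressed */, 1000);
    printf("%02X %02X %02X\n", hdr[0], hdr[1], hdr[2]);  /* 45 1F 00 : last block, compressed, 1000 bytes */
    return 0;
}
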
+ * This is only an issue for zstd <= v1.4.3 +@@ -5876,12 +6653,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5893,11 +6670,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5908,12 +6684,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5923,7 +6702,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5951,26 +6730,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6092,7 +6879,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6127,3 +6914,29 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc ++) { ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..53cb582a8d2b 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. 
++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. + This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,26 +145,33 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -212,8 +222,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. 
*/ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +240,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +348,25 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. */ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -404,6 +447,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +461,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +490,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +549,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +561,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +581,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +617,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
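
/* A minimal sketch of the offBase "sum type" handled by the REPCODE_TO_OFFBASE()/
 * OFFSET_TO_OFFBASE()/OFFBASE_IS_REPCODE() macros above, assuming ZSTD_REP_NUM == 3:
 * values 1..3 name repeat offsets 1..3, anything larger is a real match distance
 * shifted up by 3. Helper names here are illustrative and drop the asserts. */
#include <stdio.h>

#define REP_NUM 3
static unsigned offset_to_offbase(unsigned o)  { return o + REP_NUM; }            /* requires o > 0 */
static unsigned repcode_to_offbase(unsigned r) { return r; }                       /* requires 1 <= r <= 3 */
static int      offbase_is_repcode(unsigned b) { return 1 <= b && b <= REP_NUM; }

int main(void)
{
    printf("%u\n", offset_to_offbase(1024));   /* 1027 : real distance 1024 */
    printf("%u\n", repcode_to_offbase(2));     /* 2    : "repeat offset #2" */
    printf("%d\n", offbase_is_repcode(1027));  /* 0    : too large to be a repcode */
    return 0;
}
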
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +646,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +657,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +678,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +696,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +723,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +735,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +780,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +828,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
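
/* A minimal sketch of the salting scheme used by the ZSTD_hash*PtrS() helpers above:
 * the multiplicative hash is XORed with a salt before the final shift, so a fresh salt
 * re-randomizes bucket placement and lets the row-based tag table be reused without
 * clearing it. The prime constant is copied from the code above; the sample input,
 * table log, and salt value are illustrative. */
#include <stdint.h>
#include <stdio.h>

static uint32_t hash4_salted(uint32_t u, uint32_t hBits, uint32_t salt)
{
    const uint32_t prime4bytes = 2654435761U;
    return ((u * prime4bytes) ^ salt) >> (32 - hBits);
}

int main(void)
{
    uint32_t const v = 0x12345678u;
    printf("%u\n", hash4_salted(v, 17, 0));            /* unsalted bucket */
    printf("%u\n", hash4_salted(v, 17, 0x9E3779B9u));  /* same input, different salt => different bucket */
    return 0;
}
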
+ */ +@@ -1011,7 +1037,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1167,10 +1195,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,7 +1232,9 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, + void const* src, size_t srcSize, + int forceNonContiguous) + { +@@ -1302,6 +1337,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. 
*/ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1381,11 +1452,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1466,55 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
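
/* A minimal sketch of the external sequence producer hook that extSeqProdFunc /
 * ZSTD_hasExtSeqProd() above refer to, assuming upstream zstd's v1.5.x experimental
 * callback shape and names (ZSTD_sequenceProducer_F, ZSTD_SEQUENCE_PRODUCER_ERROR,
 * ZSTD_c_enableSeqProducerFallback); this interface is not part of the kernel's
 * linux/zstd.h wrapper, and the producer below is a deliberately useless stub. */
#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* A producer that always declines, so compression falls back to the internal
 * match finders when the fallback parameter is enabled. */
static size_t declineProducer(void* state,
                              ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                              const void* src, size_t srcSize,
                              const void* dict, size_t dictSize,
                              int compressionLevel, size_t windowSize)
{
    (void)state; (void)outSeqs; (void)outSeqsCapacity; (void)src; (void)srcSize;
    (void)dict; (void)dictSize; (void)compressionLevel; (void)windowSize;
    return ZSTD_SEQUENCE_PRODUCER_ERROR;
}

int main(void)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    /* mirrors the ZSTD_registerSequenceProducer() definition earlier in this patch */
    ZSTD_registerSequenceProducer(cctx, NULL, declineProducer);
    ZSTD_freeCCtx(cctx);
    return 0;
}
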
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
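
/* A minimal sketch of the two gates visible above that decide whether Huffman output is
 * kept for a literals section: the input must reach ZSTD_minLiteralsToCompress() at all,
 * and the compressed size must beat srcSize minus ZSTD_minGain(). The formulas mirror
 * the code above (no reusable table, strategies below btultra, so minlog = 6); the
 * strategy number and sizes are illustrative. */
#include <stddef.h>
#include <stdio.h>

static size_t min_literals_to_compress(int strategy)   /* 1..9, no valid repeat stats */
{
    int const shift = (9 - strategy) < 3 ? (9 - strategy) : 3;
    return (size_t)8 << shift;   /* 64 bytes for fast levels, down to 8 for btultra2 */
}

static size_t min_gain(size_t srcSize)                  /* strategies below btultra */
{
    return (srcSize >> 6) + 2;
}

int main(void)
{
    size_t const srcSize = 4096;
    printf("%zu\n", min_literals_to_compress(1));        /* 64 */
    printf("%zu\n", srcSize - min_gain(srcSize));        /* 4030: Huffman output must be smaller than this */
    return 0;
}
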
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..41f6521b27cd 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? 
HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 /*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) 
return 0;
+ op += cLitSize;
+@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy,
+ sequences, nbSeq,
+ llCode, mlCode, ofCode,
+ cctxParams,
+- op, oend-op,
++ op, (size_t)(oend-op),
+ bmi2, writeSeqEntropy, seqEntropyWritten);
+ FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed");
+ if (cSeqSize == 0) return 0;
+ op += cSeqSize;
+ }
+ /* Write block header */
+- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize;
++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize;
+ U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3);
+ MEM_writeLE24(ostart, cBlockHeader24);
+ }
+- return op-ostart;
++ return (size_t)(op-ostart);
+ }
+
+ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize,
+@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable,
+ return cSeqSizeEstimate + sequencesSectionHeaderSize;
+ }
+
+-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
++typedef struct {
++ size_t estLitSize;
++ size_t estBlockSize;
++} EstimatedBlockSize;
++static EstimatedBlockSize ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+ const BYTE* ofCodeTable,
+ const BYTE* llCodeTable,
+ const BYTE* mlCodeTable,
+@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize,
+ const ZSTD_entropyCTables_t* entropy,
+ const ZSTD_entropyCTablesMetadata_t* entropyMetadata,
+ void* workspace, size_t wkspSize,
+- int writeLitEntropy, int writeSeqEntropy) {
+- size_t cSizeEstimate = 0;
+- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize,
+- &entropy->huf, &entropyMetadata->hufMetadata,
+- workspace, wkspSize, writeLitEntropy);
+- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
++ int writeLitEntropy, int writeSeqEntropy)
++{
++ EstimatedBlockSize ebs;
++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize,
++ &entropy->huf, &entropyMetadata->hufMetadata,
++ workspace, wkspSize, writeLitEntropy);
++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable,
+ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata,
+ workspace, wkspSize, writeSeqEntropy);
+- return cSizeEstimate + ZSTD_blockHeaderSize;
++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize;
++ return ebs;
+ }
+
+ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata)
+@@ -415,13 +427,56 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe
+ return 0;
+ }
+
++static size_t countLiterals(seqStore_t const* seqStore, const seqDef* sp, size_t seqCount)
++{
++ size_t n, total = 0;
++ assert(sp != NULL);
++ for (n=0; n<seqCount; n++) {
++ total += ZSTD_getSequenceLength(seqStore, sp+n).litLength;
++ }
++ DEBUGLOG(6, "countLiterals for %zu sequences from %p => %zu bytes", seqCount, (const void*)sp, total);
++ return total;
++}
++
++#define BYTESCALE 256
++
++static size_t sizeBlockSequences(const seqDef* sp, size_t nbSeqs,
++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost,
++ int firstSubBlock)
++{
++ size_t n, budget = 0, inSize=0;
++ /* entropy headers */
++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */
++ assert(firstSubBlock==0 || firstSubBlock==1);
++ budget += headerSize;
++
++ /* first sequence => at least one sequence*/
++ budget += sp[0].litLength * avgLitCost + avgSeqCost;
++ if (budget > targetBudget) return 1;
++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH);
++
++ /* loop over sequences */
++ for (n=1; n<nbSeqs; n++) {
++ size_t currentCost = sp[n].litLength * avgLitCost + avgSeqCost;
++ budget += currentCost;
++ inSize += sp[n].litLength + (sp[n].mlBase+MINMATCH);
++ /* stop when sub-block budget is reached */
++ if ( (budget > targetBudget)
++ /* though continue to expand until the sub-block is deemed compressible */
++ && (budget < inSize * BYTESCALE) )
++ break;
++ }
++
++ return n;
++}
++
+ /* ZSTD_compressSubBlock_multi() :
+ * Breaks super-block into multiple sub-blocks and compresses them.
+- * Entropy will be written to the first block.
+- * The following blocks will use repeat mode to compress.
+- * All sub-blocks are compressed blocks (no raw or rle blocks).
+- * @return : compressed size of the super block (which is multiple ZSTD blocks)
+- * Or 0 if it failed to compress. */
++ * Entropy will be written into the first block.
++ * The following blocks use repeat_mode to compress.
++ * Sub-blocks are all compressed, except the last one when beneficial.
++ * @return : compressed size of the super block (which features multiple ZSTD blocks)
++ * or 0 if it failed to compress. */
+ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ const ZSTD_compressedBlockState_t* prevCBlock,
+ ZSTD_compressedBlockState_t* nextCBlock,
+@@ -434,10 +489,12 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ {
+ const seqDef* const sstart = seqStorePtr->sequencesStart;
+ const seqDef* const send = seqStorePtr->sequences;
+- const seqDef* sp = sstart;
++ const seqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */
++ size_t const nbSeqs = (size_t)(send - sstart);
+ const BYTE* const lstart = seqStorePtr->litStart;
+ const BYTE* const lend = seqStorePtr->lit;
+ const BYTE* lp = lstart;
++ size_t const nbLiterals = (size_t)(lend - lstart);
+ BYTE const* ip = (BYTE const*)src;
+ BYTE const* const iend = ip + srcSize;
+ BYTE* const ostart = (BYTE*)dst;
+@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr,
+ const BYTE* llCodePtr = seqStorePtr->llCode;
+ const BYTE* mlCodePtr = seqStorePtr->mlCode;
+ const BYTE* ofCodePtr = seqStorePtr->ofCode;
+- size_t targetCBlockSize = cctxParams->targetCBlockSize;
+- size_t litSize, seqCount;
+- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed;
++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */
++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize);
++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed);
+ int writeSeqEntropy = 1;
+- int lastSequence = 0;
+-
+- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)",
+- (unsigned)(lend-lp), (unsigned)(send-sstart));
+-
+- litSize = 0;
+- seqCount = 0;
+- do {
+- size_t cBlockSizeEstimate = 0;
+- if (sstart == send) {
+- lastSequence = 1;
+- } else {
+- const seqDef* const sequence = sp + seqCount;
+- lastSequence = sequence == send - 1;
+- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength;
+- seqCount++;
+- }
+- if (lastSequence) {
+- assert(lp <= lend);
+- assert(litSize <= (size_t)(lend - lp));
+- litSize = (size_t)(lend - lp);
++
++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)",
++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart));
++
++ /* let's start by a general estimation for the full block */
++ if (nbSeqs > 0) {
++ EstimatedBlockSize const ebs =
++ ZSTD_estimateSubBlockSize(lp, nbLiterals,
++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs,
++ &nextCBlock->entropy, entropyMetadata,
++ workspace, wkspSize,
++ writeLitEntropy, writeSeqEntropy);
++ /* quick estimation */
++ size_t const avgLitCost = nbLiterals ?
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; ++ const seqDef* seq; + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..86bc3c2c23c7 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. ++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. 
*/ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. 
*/ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). + */ +@@ -356,18 +396,22 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + + /* + * Aligned on 64 bytes. These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,14 +522,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). 
+ * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +551,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +583,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. +- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..5ff54f17d92f 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. ++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,11 +85,24 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +155,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + 
/* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +186,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +230,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +276,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +302,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,6 +313,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +@@ -275,7 +335,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +345,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +354,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +375,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -328,7 +398,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +410,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +428,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +445,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +462,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +490,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +519,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +532,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -527,7 +598,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) +@@ -585,7 +658,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +669,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +694,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +726,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -694,3 +767,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..b7ddc714f13e 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -15,8 +16,12 @@ + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -27,6 +32,14 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..b7a63ba4ce56 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,8 +144,9 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. 
+ */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls, U32 const hasStep) +@@ -117,7 +173,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +197,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +236,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +257,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +292,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +337,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. 
++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +365,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +385,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +399,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -369,6 +462,7 @@ size_t ZSTD_compressBlock_fast( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) +@@ -380,14 +474,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +491,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +507,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -545,7 +688,9 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { +@@ -553,11 +698,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +714,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd 
= 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +745,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +953,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..3e88d8a1a136 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_matchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,8 +160,9 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( 
(4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,17 +391,18 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +575,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. 
not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +648,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,11 +659,12 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 
ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. 
+ */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. +- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -971,7 +953,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. 
++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. 
++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,20 +1124,21 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input posistion and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every contex reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by intorducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. 
+- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. + */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +@@ -1125,7 +1147,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1165,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1472,8 +1512,9 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, +@@ -1491,7 +1532,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> 
kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1631,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1648,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1667,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = 
STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1684,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. 
*/ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1745,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,166 +1760,181 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, 
srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( + ZSTD_matchState_t* ms, seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, 
mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +2023,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2035,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2055,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2067,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2120,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2045,8 +2136,9 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +@@ -2054,49 +2146,55 @@ size_t ZSTD_compressBlock_greedy_extDict( + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..22c9201f4e63 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
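The zstd_lazy.c hunks above only engage the new lazy-skipping mode once a single skip after a failed search exceeds kLazySkippingStep bytes. Below is a minimal standalone sketch of that trigger, assuming kSearchStrength = 8 and kLazySkippingStep = 8; both constants are inferred from the "2KB without finding any matches" comment in the patch, not quoted from it.

    #include <stddef.h>

    #define kSearchStrength   8   /* assumed: log2 scale of the skip growth */
    #define kLazySkippingStep 8   /* assumed: skip size that flips lazySkipping on */

    /* Sketch of the parser's behaviour after a failed search: the skip grows
     * with the distance since the last stored sequence, and once one skip is
     * larger than kLazySkippingStep (roughly 2KB without a match with the
     * values above) the matcher stops inserting every position into its tables. */
    static size_t advance_after_miss(size_t bytesSinceAnchor, int *lazySkipping)
    {
        size_t const step = (bytesSinceAnchor >> kSearchStrength) + 1;
        *lazySkipping = step > kLazySkippingStep;
        return step;   /* the caller advances: ip += step; */
    }

With these values a skip only exceeds 8 bytes once more than 2048 bytes have passed since the anchor, which matches the trade-off described in the patch comment: normal data keeps full table updates, incompressible runs are skimmed.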
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,98 +23,175 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + + void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( ++size_t ZSTD_compressBlock_greedy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( ++size_t ZSTD_compressBlock_greedy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( ++size_t ZSTD_compressBlock_greedy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++size_t ZSTD_compressBlock_greedy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define 
ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( ++size_t ZSTD_compressBlock_lazy_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++size_t ZSTD_compressBlock_lazy_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState( + ZSTD_matchState_t* 
ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( ++size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..07f3bc6437ce 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
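The zstd_lazy.h hunks above pair each ZSTD_EXCLUDE_*_BLOCK_COMPRESSOR option with ZSTD_COMPRESSBLOCK_* macros that expand either to the real function or to NULL, so a strategy table can always be populated even when a compressor is compiled out. The following is a small compile-clean sketch of that pattern; the names, signature and table are illustrative and are not the patch's actual selector code.

    #include <stddef.h>

    typedef size_t (*blockCompressor_f)(const void *src, size_t srcSize);

    /* stand-in for one strategy; the real functions also take ms/seqStore/rep */
    static size_t greedy_stub(const void *src, size_t srcSize)
    { (void)src; return srcSize; }

    #ifndef EXCLUDE_GREEDY_STUB          /* mirrors ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR */
    #  define BLOCK_GREEDY greedy_stub
    #else
    #  define BLOCK_GREEDY NULL          /* compiled out: the table slot stays NULL */
    #endif

    static const blockCompressor_f strategyTable[1] = { BLOCK_GREEDY };
    /* compression parameters are adjusted so an excluded strategy is never
     * selected, which is why a NULL slot is acceptable at lookup time */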
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -318,7 +323,9 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( + ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { +@@ -549,7 +556,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -689,7 +696,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +708,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +718,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
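ZSTD_ldm_blockCompress above now stores its sequence offsets through OFFSET_TO_OFFBASE, the same "offBase" sumtype the lazy and optimal parsers switch to in this patch: values 1..ZSTD_REP_NUM name repcodes, anything above encodes a real offset shifted by ZSTD_REP_NUM. Here is a self-contained sketch of that convention; the macro bodies are reconstructed from the comments in the patch ("sumtype, using the numeric representation of ZSTD_storeSeq()") rather than copied from zstd_compress_internal.h, so treat the exact arithmetic as an assumption.

    #include <assert.h>

    #define REP_NUM 3                      /* stands in for ZSTD_REP_NUM */

    /* One numeric space carries both cases:
     *   1..REP_NUM           -> repcode 1..3
     *   REP_NUM+1 and above  -> real offset, stored as offset + REP_NUM */
    #define OFFSET_TO_OFFBASE(o)   ((o) + REP_NUM)
    #define REPCODE_TO_OFFBASE(r)  (r)
    #define OFFBASE_IS_OFFSET(ob)  ((ob) > REP_NUM)
    #define OFFBASE_TO_OFFSET(ob)  ((ob) - REP_NUM)

    int main(void)
    {
        unsigned const ob = OFFSET_TO_OFFBASE(42u);
        assert(OFFBASE_IS_OFFSET(ob) && OFFBASE_TO_OFFSET(ob) == 42u);
        assert(!OFFBASE_IS_OFFSET(REPCODE_TO_OFFBASE(1u)));  /* repcodes stay in 1..3 */
        return 0;
    }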
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..a87b66ac8d24 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s<lastEltIndex+1; s++) { +- table[s] = 1 + (table[s] >> shift); +- sum += table[s]; ++ unsigned const base = base1 ?
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,7 +438,9 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( + const ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, +@@ -527,6 +559,7 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( + ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +@@ -535,7 +568,7 @@ void ZSTD_updateTree_internal( + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -553,15 +586,18 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. 
This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -792,7 +828,9 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, + ZSTD_matchState_t* ms, + U32* nextToUpdate3, +@@ -960,7 +998,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +1009,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* 
nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1011,11 +1049,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,7 +1066,13 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t + ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + seqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], +@@ -1059,9 +1098,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1123,139 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip)); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. 
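A compact illustration of the stretch/sequence distinction described above (simplified records with invented field layout; zstd's ZSTD_optimal_t also tracks price, offset and repcode history): the same segmentation of a block can be written either as stretches (match first) for the forward pass or as sequences (literals first) for the seqStore.

    #include <stdio.h>

    typedef struct { unsigned mlen, litlen; } stretch_t;   /* a match, then litlen literals */
    typedef struct { unsigned litlen, mlen; } sequence_t;  /* litlen literals, then a match */

    int main(void)
    {
        /* cutting the 12-symbol run  M M M L L M M M M L L L  both ways */
        stretch_t  st[2] = { {3, 2}, {4, 3} };          /* forward-pass view   */
        sequence_t sq[2] = { {0, 3}, {2, 4} };          /* emission-order view */
        unsigned const trailingLits = st[1].litlen;     /* last literals have no following match */
        printf("stretches : (m%u,l%u)(m%u,l%u)\n",
               st[0].mlen, st[0].litlen, st[1].mlen, st[1].litlen);
        printf("sequences : (l%u,m%u)(l%u,m%u) + %u trailing literals\n",
               sq[0].litlen, sq[0].mlen, sq[1].litlen, sq[1].mlen, trailingLits);
        return 0;
    }
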
*/ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? 
opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { ++ ZSTD_optimal_t const prevMatch = opt[cur]; + DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", + inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f)", ++ inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
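The literal-extension step earlier in this hunk prices "one more literal" incrementally, via LIT_PRICE() plus LL_INCPRICE(litlen). A toy check of why that works, with a made-up length-cost curve standing in for ZSTD_litLengthPrice(): summing the per-step deltas reproduces the cost of pricing the final run length once.

    #include <assert.h>
    #include <stdio.h>

    /* invented literal-length cost curve, standing in for ZSTD_litLengthPrice() */
    static int ll_price(unsigned l)    { return 10 + 3 * (int)l; }
    static int ll_incprice(unsigned l) { return ll_price(l) - ll_price(l - 1); }

    int main(void)
    {
        int const litCost = 7;          /* pretend each literal byte costs 7 (LIT_PRICE stand-in) */
        int running = ll_price(0);      /* price of an empty literal run */
        unsigned l;
        for (l = 1; l <= 5; l++)
            running += litCost + ll_incprice(l);    /* same shape as the patch's update */
        assert(running == 5 * litCost + ll_price(5));
        printf("incremental total %d == direct total %d\n", running, 5 * litCost + ll_price(5));
        return 0;
    }
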
+ */ + ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); ++ repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); + ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,15 +1265,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + +@@ -1208,18 +1284,17 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of longest ML=%u", ++ inr-istart, cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1305,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,52 +1331,86 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
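The backward reconstruction sketched in the comment above can be pictured with a much-simplified table walk (field names invented; the real code stores stretches and fixes up litlen while copying into the tail of @opt): each entry records how its position was reached, so stepping back by mlen + litlen enumerates the chosen sequences from last to first.

    #include <stdio.h>

    typedef struct { unsigned mlen, litlen; } node_t;  /* how position pos was reached */

    int main(void)
    {
        node_t opt[16] = { {0, 0} };
        opt[5]  = (node_t){5, 0};   /* match of 5 starting at 0     */
        opt[9]  = (node_t){3, 1};   /* 1 literal, then a match of 3 */
        opt[14] = (node_t){4, 1};   /* 1 literal, then a match of 4 */

        unsigned pos = 14;          /* walk back from the selected endpoint */
        while (pos > 0) {
            node_t const n = opt[pos];
            printf("sequence: %u literal(s) + match of %u (ends at %u)\n",
                   n.litlen, n.mlen, pos);
            pos -= n.mlen + n.litlen;
        }
        return 0;
    }
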
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,11 +1422,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,21 +1437,27 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return 
ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1342,20 +1465,23 @@ size_t ZSTD_compressBlock_btopt( + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_matchState_t* ms, ++ seqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1494,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,10 +1518,10 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. 
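In miniature, the 2-passes idea reads as: a throwaway first pass only accumulates symbol statistics, its other outputs are discarded, and the real pass then prices symbols from those seeded statistics instead of the predefined defaults. The sketch below uses invented names and only models the statistics-gathering half.

    #include <stdio.h>
    #include <string.h>

    /* invented stand-in for the entropy statistics seeded by the first pass */
    typedef struct { unsigned litFreq[256]; unsigned total; } stats_t;

    static void first_pass_collect(stats_t* s, const unsigned char* src, size_t n)
    {
        size_t i;
        for (i = 0; i < n; i++) s->litFreq[src[i]]++;
        s->total += (unsigned)n;
    }

    int main(void)
    {
        const unsigned char block[] = "abracadabra abracadabra";
        stats_t stats;
        memset(&stats, 0, sizeof(stats));
        first_pass_collect(&stats, block, sizeof(block) - 1);   /* pass 1: statistics only */
        /* a real second pass would now restart compression, reusing `stats` for pricing */
        printf("seeded stats: %u symbols, 'a' seen %u times\n", stats.total, stats.litFreq['a']);
        return 0;
    }
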
+ * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,15 +1530,17 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) +@@ -1420,18 +1548,20 @@ size_t ZSTD_compressBlock_btopt_dictMatchState( + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +@@ -1440,6 +1570,7 @@ size_t ZSTD_compressBlock_btultra_extDict( + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..ac1b743d27cd 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,30 +15,40 @@ + + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ + void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( ++size_t ZSTD_compressBlock_btopt_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( ++size_t ZSTD_compressBlock_btopt_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +@@ -45,6 +56,20 @@ size_t ZSTD_compressBlock_btultra_extDict( + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++ ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + + #endif /* ZSTD_OPT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE 
const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. 
*/ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). +- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void 
HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. 
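The safe-iteration bound computed above follows directly from the constants in the comments: 5 symbols per stream per iteration, at most 11 bits per symbol, hence at most 55 bits (< 7 bytes) of input consumed per stream per iteration. A worked example of that arithmetic (the remaining byte counts are hypothetical):

    #include <stdio.h>

    int main(void)
    {
        unsigned const symbolsPerIter  = 5;
        unsigned const maxBitsPerSym   = 11;
        unsigned const maxBytesPerIter = (symbolsPerIter * maxBitsPerSym + 7) / 8;   /* == 7 */

        size_t const outputLeft = 123;   /* hypothetical oend - op[3]    */
        size_t const inputLeft  = 60;    /* hypothetical ip[0] - ilowest */

        size_t const oiters = outputLeft / symbolsPerIter;           /* 24 */
        size_t const iiters = inputLeft / maxBytesPerIter;           /*  8 */
        size_t const iters  = oiters < iiters ? oiters : iiters;     /*  8 */
        printf("bounds re-checked after %zu iterations (olimit = op[3] + %zu)\n",
               iters, iters * symbolsPerIter);
        return 0;
    }
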
++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. + */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
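The dispatch pattern used by HUF_decompress4X1_usingDTable_internal() above, reduced to its shape (flag names and decoder stubs below are invented): select a fallback and a loop according to the flags, let the fast path decline by returning 0, and only then run the fallback.

    #include <stddef.h>
    #include <stdio.h>

    enum { FLAG_BMI2 = 1, FLAG_DISABLE_FAST = 2 };   /* invented flag bits */

    static size_t decode_fallback(const char* src, size_t n) { (void)src; return n; }
    static size_t decode_fast(const char* src, size_t n)     { (void)src; return n < 16 ? 0 : n; }

    static size_t decompress(const char* src, size_t n, int flags)
    {
        if (!(flags & FLAG_DISABLE_FAST)) {
            size_t const ret = decode_fast(src, n);   /* fast path declines with 0 */
            if (ret != 0) return ret;
        }
        return decode_fallback(src, n);
    }

    int main(void)
    {
        char buf[32] = {0};
        printf("large input: %zu bytes, tiny input: %zu bytes\n",
               decompress(buf, sizeof buf, 0), decompress(buf, 4, 0));
        return 0;
    }
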
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..c9cbc45f6ed9 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* 
note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,61 +570,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +633,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ 
totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +707,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -696,7 +727,7 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ZSTD_frameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,23 +761,26 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : +@@ -760,7 +794,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +807,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -856,6 +932,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +968,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +982,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1013,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1041,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1148,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return 
dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1268,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1338,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1351,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1392,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1492,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1555,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. 
+- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1564,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1671,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1681,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1692,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1764,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1813,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1852,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1871,7 @@ size_t 
ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1888,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. ++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1906,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1918,7 +2047,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2060,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2076,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize 
= ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2103,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2120,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2171,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2186,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. 
*/ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2195,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2209,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2233,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2246,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2284,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..9fe9a12c8a2c 100644 +--- 
a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. 
We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. ++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -125,6 +141,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + { const BYTE* const istart = (const BYTE*) src; + symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -506,14 +548,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i<n; i++) { + tableDecode[position].baseValue = s; + position = (position + step) & tableMask; +- while (position > highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,8 +719,16 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ + { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); + symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); + symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case.
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +-
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,10 +1928,9 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1886,37 +1940,34 @@ typedef size_t (*ZSTD_decompressSequences_t)( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); ++ const ZSTD_longOffset_e isLongOffset); + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- 
const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1982,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2097,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2121,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2133,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2193,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..0f02526be774 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -152,6 +154,7 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; ++ int isFrameDecompression; + #if DYNAMIC_BMI2 != 0 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index bd8784449b31..ceaf352d03e2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index 469fc3059be0..0ae819f0c927 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.47.1 + diff --git a/sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch b/sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch new file mode 100644 index 0000000..0930bbf --- /dev/null +++ b/sys-kernel/gentoo-sources-6.12/9999-workqueue_increase_maximum_concurrency_limit.patch @@ -0,0 +1,11 @@ +--- a/include/linux/workqueue.h 2024-11-18 19:21:27.602930590 +0100 ++++ b/include/linux/workqueue.h 2024-11-19 00:04:41.586700929 +0100 +@@ -412,7 +412,7 @@ + }; + + enum wq_consts { +- WQ_MAX_ACTIVE = 512, /* I like 512, better ideas? */ ++ WQ_MAX_ACTIVE = 2048, /* I like 2048, better ideas? 
*/ + WQ_UNBOUND_MAX_ACTIVE = WQ_MAX_ACTIVE, + WQ_DFL_ACTIVE = WQ_MAX_ACTIVE / 2, + diff --git a/sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch b/sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch new file mode 100644 index 0000000..d5e5f37 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0001-amd-pstate.patch @@ -0,0 +1,885 @@ +From 46a700551a5ff45cbc27671d7ebd176826adb1c6 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:27:14 +0100 +Subject: [PATCH 01/12] amd-pstate + +Signed-off-by: Peter Jung +--- + drivers/cpufreq/amd-pstate-trace.h | 52 +++- + drivers/cpufreq/amd-pstate-ut.c | 12 +- + drivers/cpufreq/amd-pstate.c | 397 +++++++++++++++-------------- + drivers/cpufreq/amd-pstate.h | 3 - + 4 files changed, 259 insertions(+), 205 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate-trace.h b/drivers/cpufreq/amd-pstate-trace.h +index 35f38ae67fb1..8d692415d905 100644 +--- a/drivers/cpufreq/amd-pstate-trace.h ++++ b/drivers/cpufreq/amd-pstate-trace.h +@@ -32,7 +32,6 @@ TRACE_EVENT(amd_pstate_perf, + u64 aperf, + u64 tsc, + unsigned int cpu_id, +- bool changed, + bool fast_switch + ), + +@@ -44,7 +43,6 @@ TRACE_EVENT(amd_pstate_perf, + aperf, + tsc, + cpu_id, +- changed, + fast_switch + ), + +@@ -57,7 +55,6 @@ TRACE_EVENT(amd_pstate_perf, + __field(unsigned long long, aperf) + __field(unsigned long long, tsc) + __field(unsigned int, cpu_id) +- __field(bool, changed) + __field(bool, fast_switch) + ), + +@@ -70,11 +67,10 @@ TRACE_EVENT(amd_pstate_perf, + __entry->aperf = aperf; + __entry->tsc = tsc; + __entry->cpu_id = cpu_id; +- __entry->changed = changed; + __entry->fast_switch = fast_switch; + ), + +- TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u changed=%s fast_switch=%s", ++ TP_printk("amd_min_perf=%lu amd_des_perf=%lu amd_max_perf=%lu freq=%llu mperf=%llu aperf=%llu tsc=%llu cpu_id=%u fast_switch=%s", + (unsigned long)__entry->min_perf, + (unsigned long)__entry->target_perf, + (unsigned long)__entry->capacity, +@@ -83,11 +79,55 @@ TRACE_EVENT(amd_pstate_perf, + (unsigned long long)__entry->aperf, + (unsigned long long)__entry->tsc, + (unsigned int)__entry->cpu_id, +- (__entry->changed) ? "true" : "false", + (__entry->fast_switch) ? 
"true" : "false" + ) + ); + ++TRACE_EVENT(amd_pstate_epp_perf, ++ ++ TP_PROTO(unsigned int cpu_id, ++ unsigned int highest_perf, ++ unsigned int epp, ++ unsigned int min_perf, ++ unsigned int max_perf, ++ bool boost ++ ), ++ ++ TP_ARGS(cpu_id, ++ highest_perf, ++ epp, ++ min_perf, ++ max_perf, ++ boost), ++ ++ TP_STRUCT__entry( ++ __field(unsigned int, cpu_id) ++ __field(unsigned int, highest_perf) ++ __field(unsigned int, epp) ++ __field(unsigned int, min_perf) ++ __field(unsigned int, max_perf) ++ __field(bool, boost) ++ ), ++ ++ TP_fast_assign( ++ __entry->cpu_id = cpu_id; ++ __entry->highest_perf = highest_perf; ++ __entry->epp = epp; ++ __entry->min_perf = min_perf; ++ __entry->max_perf = max_perf; ++ __entry->boost = boost; ++ ), ++ ++ TP_printk("cpu%u: [%u<->%u]/%u, epp=%u, boost=%u", ++ (unsigned int)__entry->cpu_id, ++ (unsigned int)__entry->min_perf, ++ (unsigned int)__entry->max_perf, ++ (unsigned int)__entry->highest_perf, ++ (unsigned int)__entry->epp, ++ (bool)__entry->boost ++ ) ++); ++ + #endif /* _AMD_PSTATE_TRACE_H */ + + /* This part must be outside protection */ +diff --git a/drivers/cpufreq/amd-pstate-ut.c b/drivers/cpufreq/amd-pstate-ut.c +index a261d7300951..3a0a380c3590 100644 +--- a/drivers/cpufreq/amd-pstate-ut.c ++++ b/drivers/cpufreq/amd-pstate-ut.c +@@ -207,7 +207,6 @@ static void amd_pstate_ut_check_freq(u32 index) + int cpu = 0; + struct cpufreq_policy *policy = NULL; + struct amd_cpudata *cpudata = NULL; +- u32 nominal_freq_khz; + + for_each_possible_cpu(cpu) { + policy = cpufreq_cpu_get(cpu); +@@ -215,14 +214,13 @@ static void amd_pstate_ut_check_freq(u32 index) + break; + cpudata = policy->driver_data; + +- nominal_freq_khz = cpudata->nominal_freq*1000; +- if (!((cpudata->max_freq >= nominal_freq_khz) && +- (nominal_freq_khz > cpudata->lowest_nonlinear_freq) && ++ if (!((cpudata->max_freq >= cpudata->nominal_freq) && ++ (cpudata->nominal_freq > cpudata->lowest_nonlinear_freq) && + (cpudata->lowest_nonlinear_freq > cpudata->min_freq) && + (cpudata->min_freq > 0))) { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d max=%d >= nominal=%d > lowest_nonlinear=%d > min=%d > 0, the formula is incorrect!\n", +- __func__, cpu, cpudata->max_freq, nominal_freq_khz, ++ __func__, cpu, cpudata->max_freq, cpudata->nominal_freq, + cpudata->lowest_nonlinear_freq, cpudata->min_freq); + goto skip_test; + } +@@ -236,13 +234,13 @@ static void amd_pstate_ut_check_freq(u32 index) + + if (cpudata->boost_supported) { + if ((policy->max == cpudata->max_freq) || +- (policy->max == nominal_freq_khz)) ++ (policy->max == cpudata->nominal_freq)) + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_PASS; + else { + amd_pstate_ut_cases[index].result = AMD_PSTATE_UT_RESULT_FAIL; + pr_err("%s cpu%d policy_max=%d should be equal cpu_max=%d or cpu_nominal=%d !\n", + __func__, cpu, policy->max, cpudata->max_freq, +- nominal_freq_khz); ++ cpudata->nominal_freq); + goto skip_test; + } + } else { +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index f71057c2cf90..6a1e02389831 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -22,6 +22,7 @@ + + #define pr_fmt(fmt) KBUILD_MODNAME ": " fmt + ++#include + #include + #include + #include +@@ -88,6 +89,11 @@ static bool cppc_enabled; + static bool amd_pstate_prefcore = true; + static struct quirk_entry *quirks; + ++#define AMD_CPPC_MAX_PERF_MASK GENMASK(7, 0) ++#define AMD_CPPC_MIN_PERF_MASK GENMASK(15, 8) ++#define AMD_CPPC_DES_PERF_MASK GENMASK(23, 16) 
++#define AMD_CPPC_EPP_PERF_MASK GENMASK(31, 24) ++ + /* + * AMD Energy Preference Performance (EPP) + * The EPP is used in the CCLK DPM controller to drive +@@ -180,120 +186,145 @@ static inline int get_mode_idx_from_str(const char *str, size_t size) + static DEFINE_MUTEX(amd_pstate_limits_lock); + static DEFINE_MUTEX(amd_pstate_driver_lock); + +-static s16 amd_pstate_get_epp(struct amd_cpudata *cpudata, u64 cppc_req_cached) ++static s16 msr_get_epp(struct amd_cpudata *cpudata) + { +- u64 epp; ++ u64 value; + int ret; + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- if (!cppc_req_cached) { +- epp = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, +- &cppc_req_cached); +- if (epp) +- return epp; +- } +- epp = (cppc_req_cached >> 24) & 0xFF; +- } else { +- ret = cppc_get_epp_perf(cpudata->cpu, &epp); +- if (ret < 0) { +- pr_debug("Could not retrieve energy perf value (%d)\n", ret); +- return -EIO; +- } ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); ++ if (ret < 0) { ++ pr_debug("Could not retrieve energy perf value (%d)\n", ret); ++ return ret; + } + +- return (s16)(epp & 0xff); ++ return FIELD_GET(AMD_CPPC_EPP_PERF_MASK, value); + } + +-static int amd_pstate_get_energy_pref_index(struct amd_cpudata *cpudata) ++DEFINE_STATIC_CALL(amd_pstate_get_epp, msr_get_epp); ++ ++static inline s16 amd_pstate_get_epp(struct amd_cpudata *cpudata) + { +- s16 epp; +- int index = -EINVAL; ++ return static_call(amd_pstate_get_epp)(cpudata); ++} + +- epp = amd_pstate_get_epp(cpudata, 0); +- if (epp < 0) +- return epp; ++static s16 shmem_get_epp(struct amd_cpudata *cpudata) ++{ ++ u64 epp; ++ int ret; + +- switch (epp) { +- case AMD_CPPC_EPP_PERFORMANCE: +- index = EPP_INDEX_PERFORMANCE; +- break; +- case AMD_CPPC_EPP_BALANCE_PERFORMANCE: +- index = EPP_INDEX_BALANCE_PERFORMANCE; +- break; +- case AMD_CPPC_EPP_BALANCE_POWERSAVE: +- index = EPP_INDEX_BALANCE_POWERSAVE; +- break; +- case AMD_CPPC_EPP_POWERSAVE: +- index = EPP_INDEX_POWERSAVE; +- break; +- default: +- break; ++ ret = cppc_get_epp_perf(cpudata->cpu, &epp); ++ if (ret < 0) { ++ pr_debug("Could not retrieve energy perf value (%d)\n", ret); ++ return ret; + } + +- return index; ++ return (s16)(epp & 0xff); + } + +-static void msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, +- u32 des_perf, u32 max_perf, bool fast_switch) ++static int msr_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++ u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) + { +- if (fast_switch) +- wrmsrl(MSR_AMD_CPPC_REQ, READ_ONCE(cpudata->cppc_req_cached)); +- else +- wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, +- READ_ONCE(cpudata->cppc_req_cached)); ++ u64 value, prev; ++ ++ value = prev = READ_ONCE(cpudata->cppc_req_cached); ++ ++ value &= ~(AMD_CPPC_MAX_PERF_MASK | AMD_CPPC_MIN_PERF_MASK | ++ AMD_CPPC_DES_PERF_MASK | AMD_CPPC_EPP_PERF_MASK); ++ value |= FIELD_PREP(AMD_CPPC_MAX_PERF_MASK, max_perf); ++ value |= FIELD_PREP(AMD_CPPC_DES_PERF_MASK, des_perf); ++ value |= FIELD_PREP(AMD_CPPC_MIN_PERF_MASK, min_perf); ++ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); ++ ++ if (value == prev) ++ return 0; ++ ++ if (fast_switch) { ++ wrmsrl(MSR_AMD_CPPC_REQ, value); ++ return 0; ++ } else { ++ int ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ ++ if (ret) ++ return ret; ++ } ++ ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ WRITE_ONCE(cpudata->epp_cached, epp); ++ ++ return 0; + } + + DEFINE_STATIC_CALL(amd_pstate_update_perf, msr_update_perf); + +-static inline void amd_pstate_update_perf(struct amd_cpudata *cpudata, 
++static inline int amd_pstate_update_perf(struct amd_cpudata *cpudata, + u32 min_perf, u32 des_perf, +- u32 max_perf, bool fast_switch) ++ u32 max_perf, u32 epp, ++ bool fast_switch) + { +- static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, +- max_perf, fast_switch); ++ return static_call(amd_pstate_update_perf)(cpudata, min_perf, des_perf, ++ max_perf, epp, fast_switch); + } + +-static int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++static int msr_set_epp(struct amd_cpudata *cpudata, u32 epp) + { ++ u64 value, prev; + int ret; +- struct cppc_perf_ctrls perf_ctrls; + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- u64 value = READ_ONCE(cpudata->cppc_req_cached); +- +- value &= ~GENMASK_ULL(31, 24); +- value |= (u64)epp << 24; +- WRITE_ONCE(cpudata->cppc_req_cached, value); ++ value = prev = READ_ONCE(cpudata->cppc_req_cached); ++ value &= ~AMD_CPPC_EPP_PERF_MASK; ++ value |= FIELD_PREP(AMD_CPPC_EPP_PERF_MASK, epp); + +- ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); +- if (!ret) +- cpudata->epp_cached = epp; +- } else { +- amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, +- cpudata->max_limit_perf, false); ++ if (value == prev) ++ return 0; + +- perf_ctrls.energy_perf = epp; +- ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); +- if (ret) { +- pr_debug("failed to set energy perf value (%d)\n", ret); +- return ret; +- } +- cpudata->epp_cached = epp; ++ ret = wrmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, value); ++ if (ret) { ++ pr_err("failed to set energy perf value (%d)\n", ret); ++ return ret; + } + ++ /* update both so that msr_update_perf() can effectively check */ ++ WRITE_ONCE(cpudata->epp_cached, epp); ++ WRITE_ONCE(cpudata->cppc_req_cached, value); ++ + return ret; + } + +-static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, +- int pref_index) ++DEFINE_STATIC_CALL(amd_pstate_set_epp, msr_set_epp); ++ ++static inline int amd_pstate_set_epp(struct amd_cpudata *cpudata, u32 epp) ++{ ++ return static_call(amd_pstate_set_epp)(cpudata, epp); ++} ++ ++static int shmem_set_epp(struct amd_cpudata *cpudata, u32 epp) + { +- int epp = -EINVAL; + int ret; ++ struct cppc_perf_ctrls perf_ctrls; ++ ++ if (epp == cpudata->epp_cached) ++ return 0; ++ ++ perf_ctrls.energy_perf = epp; ++ ret = cppc_set_epp_perf(cpudata->cpu, &perf_ctrls, 1); ++ if (ret) { ++ pr_debug("failed to set energy perf value (%d)\n", ret); ++ return ret; ++ } ++ WRITE_ONCE(cpudata->epp_cached, epp); ++ ++ return ret; ++} ++ ++static int amd_pstate_set_energy_pref_index(struct cpufreq_policy *policy, ++ int pref_index) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ int epp; + + if (!pref_index) + epp = cpudata->epp_default; +- +- if (epp == -EINVAL) ++ else + epp = epp_values[pref_index]; + + if (epp > 0 && cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) { +@@ -301,9 +332,15 @@ static int amd_pstate_set_energy_pref_index(struct amd_cpudata *cpudata, + return -EBUSY; + } + +- ret = amd_pstate_set_epp(cpudata, epp); ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, ++ epp, ++ FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), ++ FIELD_GET(AMD_CPPC_MAX_PERF_MASK, cpudata->cppc_req_cached), ++ policy->boost_enabled); ++ } + +- return ret; ++ return amd_pstate_set_epp(cpudata, epp); + } + + static inline int msr_cppc_enable(bool enable) +@@ -442,17 +479,23 @@ static inline int amd_pstate_init_perf(struct amd_cpudata *cpudata) + return 
static_call(amd_pstate_init_perf)(cpudata); + } + +-static void shmem_update_perf(struct amd_cpudata *cpudata, +- u32 min_perf, u32 des_perf, +- u32 max_perf, bool fast_switch) ++static int shmem_update_perf(struct amd_cpudata *cpudata, u32 min_perf, ++ u32 des_perf, u32 max_perf, u32 epp, bool fast_switch) + { + struct cppc_perf_ctrls perf_ctrls; + ++ if (cppc_state == AMD_PSTATE_ACTIVE) { ++ int ret = shmem_set_epp(cpudata, epp); ++ ++ if (ret) ++ return ret; ++ } ++ + perf_ctrls.max_perf = max_perf; + perf_ctrls.min_perf = min_perf; + perf_ctrls.desired_perf = des_perf; + +- cppc_set_perf(cpudata->cpu, &perf_ctrls); ++ return cppc_set_perf(cpudata->cpu, &perf_ctrls); + } + + static inline bool amd_pstate_sample(struct amd_cpudata *cpudata) +@@ -493,14 +536,8 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + { + unsigned long max_freq; + struct cpufreq_policy *policy = cpufreq_cpu_get(cpudata->cpu); +- u64 prev = READ_ONCE(cpudata->cppc_req_cached); + u32 nominal_perf = READ_ONCE(cpudata->nominal_perf); +- u64 value = prev; + +- min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); +- max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); + des_perf = clamp_t(unsigned long, des_perf, min_perf, max_perf); + + max_freq = READ_ONCE(cpudata->max_limit_freq); +@@ -511,34 +548,18 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u32 min_perf, + des_perf = 0; + } + +- value &= ~AMD_CPPC_MIN_PERF(~0L); +- value |= AMD_CPPC_MIN_PERF(min_perf); +- +- value &= ~AMD_CPPC_DES_PERF(~0L); +- value |= AMD_CPPC_DES_PERF(des_perf); +- + /* limit the max perf when core performance boost feature is disabled */ + if (!cpudata->boost_supported) + max_perf = min_t(unsigned long, nominal_perf, max_perf); + +- value &= ~AMD_CPPC_MAX_PERF(~0L); +- value |= AMD_CPPC_MAX_PERF(max_perf); +- + if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { + trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, + cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc, +- cpudata->cpu, (value != prev), fast_switch); ++ cpudata->cpu, fast_switch); + } + +- if (value == prev) +- goto cpufreq_policy_put; +- +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- +- amd_pstate_update_perf(cpudata, min_perf, des_perf, +- max_perf, fast_switch); ++ amd_pstate_update_perf(cpudata, min_perf, des_perf, max_perf, 0, fast_switch); + +-cpufreq_policy_put: + cpufreq_cpu_put(policy); + } + +@@ -570,7 +591,7 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + + static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + { +- u32 max_limit_perf, min_limit_perf, lowest_perf, max_perf, max_freq; ++ u32 max_limit_perf, min_limit_perf, max_perf, max_freq; + struct amd_cpudata *cpudata = policy->driver_data; + + max_perf = READ_ONCE(cpudata->highest_perf); +@@ -578,12 +599,8 @@ static int amd_pstate_update_min_max_limit(struct cpufreq_policy *policy) + max_limit_perf = div_u64(policy->max * max_perf, max_freq); + min_limit_perf = div_u64(policy->min * max_perf, max_freq); + +- lowest_perf = READ_ONCE(cpudata->lowest_perf); +- if (min_limit_perf < lowest_perf) +- min_limit_perf = lowest_perf; +- +- if (max_limit_perf < min_limit_perf) +- max_limit_perf = min_limit_perf; ++ if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) ++ min_limit_perf = min(cpudata->nominal_perf, max_limit_perf); + + WRITE_ONCE(cpudata->max_limit_perf, max_limit_perf); + 
WRITE_ONCE(cpudata->min_limit_perf, min_limit_perf); +@@ -682,7 +699,7 @@ static void amd_pstate_adjust_perf(unsigned int cpu, + if (min_perf < lowest_nonlinear_perf) + min_perf = lowest_nonlinear_perf; + +- max_perf = cap_perf; ++ max_perf = cpudata->max_limit_perf; + if (max_perf < min_perf) + max_perf = min_perf; + +@@ -704,8 +721,8 @@ static int amd_pstate_cpu_boost_update(struct cpufreq_policy *policy, bool on) + + if (on) + policy->cpuinfo.max_freq = max_freq; +- else if (policy->cpuinfo.max_freq > nominal_freq * 1000) +- policy->cpuinfo.max_freq = nominal_freq * 1000; ++ else if (policy->cpuinfo.max_freq > nominal_freq) ++ policy->cpuinfo.max_freq = nominal_freq; + + policy->max = policy->cpuinfo.max_freq; + +@@ -730,8 +747,6 @@ static int amd_pstate_set_boost(struct cpufreq_policy *policy, int state) + guard(mutex)(&amd_pstate_driver_lock); + + ret = amd_pstate_cpu_boost_update(policy, state); +- WRITE_ONCE(cpudata->boost_state, !ret ? state : false); +- policy->boost_enabled = !ret ? state : false; + refresh_frequency_limits(policy); + + return ret; +@@ -752,9 +767,6 @@ static int amd_pstate_init_boost_support(struct amd_cpudata *cpudata) + goto exit_err; + } + +- /* at least one CPU supports CPB, even if others fail later on to set up */ +- current_pstate_driver->boost_enabled = true; +- + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_K7_HWCR, &boost_val); + if (ret) { + pr_err_once("failed to read initial CPU boost state!\n"); +@@ -906,29 +918,29 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + return ret; + + if (quirks && quirks->lowest_freq) +- min_freq = quirks->lowest_freq * 1000; ++ min_freq = quirks->lowest_freq; + else +- min_freq = cppc_perf.lowest_freq * 1000; ++ min_freq = cppc_perf.lowest_freq; + + if (quirks && quirks->nominal_freq) +- nominal_freq = quirks->nominal_freq ; ++ nominal_freq = quirks->nominal_freq; + else + nominal_freq = cppc_perf.nominal_freq; + + nominal_perf = READ_ONCE(cpudata->nominal_perf); + + boost_ratio = div_u64(cpudata->highest_perf << SCHED_CAPACITY_SHIFT, nominal_perf); +- max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT) * 1000; ++ max_freq = (nominal_freq * boost_ratio >> SCHED_CAPACITY_SHIFT); + + lowest_nonlinear_perf = READ_ONCE(cpudata->lowest_nonlinear_perf); + lowest_nonlinear_ratio = div_u64(lowest_nonlinear_perf << SCHED_CAPACITY_SHIFT, + nominal_perf); +- lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT) * 1000; ++ lowest_nonlinear_freq = (nominal_freq * lowest_nonlinear_ratio >> SCHED_CAPACITY_SHIFT); + +- WRITE_ONCE(cpudata->min_freq, min_freq); +- WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq); +- WRITE_ONCE(cpudata->nominal_freq, nominal_freq); +- WRITE_ONCE(cpudata->max_freq, max_freq); ++ WRITE_ONCE(cpudata->min_freq, min_freq * 1000); ++ WRITE_ONCE(cpudata->lowest_nonlinear_freq, lowest_nonlinear_freq * 1000); ++ WRITE_ONCE(cpudata->nominal_freq, nominal_freq * 1000); ++ WRITE_ONCE(cpudata->max_freq, max_freq * 1000); + + /** + * Below values need to be initialized correctly, otherwise driver will fail to load +@@ -938,13 +950,13 @@ static int amd_pstate_init_freq(struct amd_cpudata *cpudata) + */ + if (min_freq <= 0 || max_freq <= 0 || nominal_freq <= 0 || min_freq > max_freq) { + pr_err("min_freq(%d) or max_freq(%d) or nominal_freq(%d) value is incorrect\n", +- min_freq, max_freq, nominal_freq * 1000); ++ min_freq, max_freq, nominal_freq); + return -EINVAL; + } + +- if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > 
nominal_freq * 1000) { ++ if (lowest_nonlinear_freq <= min_freq || lowest_nonlinear_freq > nominal_freq) { + pr_err("lowest_nonlinear_freq(%d) value is out of range [min_freq(%d), nominal_freq(%d)]\n", +- lowest_nonlinear_freq, min_freq, nominal_freq * 1000); ++ lowest_nonlinear_freq, min_freq, nominal_freq); + return -EINVAL; + } + +@@ -1161,7 +1173,6 @@ static ssize_t show_energy_performance_available_preferences( + static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) + { +- struct amd_cpudata *cpudata = policy->driver_data; + char str_preference[21]; + ssize_t ret; + +@@ -1175,7 +1186,7 @@ static ssize_t store_energy_performance_preference( + + guard(mutex)(&amd_pstate_limits_lock); + +- ret = amd_pstate_set_energy_pref_index(cpudata, ret); ++ ret = amd_pstate_set_energy_pref_index(policy, ret); + + return ret ? ret : count; + } +@@ -1186,9 +1197,22 @@ static ssize_t show_energy_performance_preference( + struct amd_cpudata *cpudata = policy->driver_data; + int preference; + +- preference = amd_pstate_get_energy_pref_index(cpudata); +- if (preference < 0) +- return preference; ++ switch (cpudata->epp_cached) { ++ case AMD_CPPC_EPP_PERFORMANCE: ++ preference = EPP_INDEX_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_PERFORMANCE: ++ preference = EPP_INDEX_BALANCE_PERFORMANCE; ++ break; ++ case AMD_CPPC_EPP_BALANCE_POWERSAVE: ++ preference = EPP_INDEX_BALANCE_POWERSAVE; ++ break; ++ case AMD_CPPC_EPP_POWERSAVE: ++ preference = EPP_INDEX_POWERSAVE; ++ break; ++ default: ++ return -EINVAL; ++ } + + return sysfs_emit(buf, "%s\n", energy_perf_strings[preference]); + } +@@ -1237,6 +1261,9 @@ static int amd_pstate_register_driver(int mode) + return ret; + } + ++ /* at least one CPU supports CPB */ ++ current_pstate_driver->boost_enabled = cpu_feature_enabled(X86_FEATURE_CPB); ++ + ret = cpufreq_register_driver(current_pstate_driver); + if (ret) { + amd_pstate_driver_cleanup(); +@@ -1448,7 +1475,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + return -ENOMEM; + + cpudata->cpu = policy->cpu; +- cpudata->epp_policy = 0; + + ret = amd_pstate_init_perf(cpudata); + if (ret) +@@ -1474,8 +1500,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + + policy->driver_data = cpudata; + +- cpudata->epp_cached = cpudata->epp_default = amd_pstate_get_epp(cpudata, 0); +- + policy->min = policy->cpuinfo.min_freq; + policy->max = policy->cpuinfo.max_freq; + +@@ -1486,10 +1510,13 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + * the default cpufreq governor is neither powersave nor performance. 
+ */ + if (amd_pstate_acpi_pm_profile_server() || +- amd_pstate_acpi_pm_profile_undefined()) ++ amd_pstate_acpi_pm_profile_undefined()) { + policy->policy = CPUFREQ_POLICY_PERFORMANCE; +- else ++ cpudata->epp_default = amd_pstate_get_epp(cpudata); ++ } else { + policy->policy = CPUFREQ_POLICY_POWERSAVE; ++ cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; ++ } + + if (cpu_feature_enabled(X86_FEATURE_CPPC)) { + ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); +@@ -1502,6 +1529,9 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + return ret; + WRITE_ONCE(cpudata->cppc_cap1_cached, value); + } ++ ret = amd_pstate_set_epp(cpudata, cpudata->epp_default); ++ if (ret) ++ return ret; + + current_pstate_driver->adjust_perf = NULL; + +@@ -1527,51 +1557,24 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + static int amd_pstate_epp_update_limit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +- u32 max_perf, min_perf; +- u64 value; +- s16 epp; ++ u32 epp; + +- max_perf = READ_ONCE(cpudata->highest_perf); +- min_perf = READ_ONCE(cpudata->lowest_perf); + amd_pstate_update_min_max_limit(policy); + +- max_perf = clamp_t(unsigned long, max_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); +- min_perf = clamp_t(unsigned long, min_perf, cpudata->min_limit_perf, +- cpudata->max_limit_perf); +- value = READ_ONCE(cpudata->cppc_req_cached); +- + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- min_perf = min(cpudata->nominal_perf, max_perf); +- +- /* Initial min/max values for CPPC Performance Controls Register */ +- value &= ~AMD_CPPC_MIN_PERF(~0L); +- value |= AMD_CPPC_MIN_PERF(min_perf); +- +- value &= ~AMD_CPPC_MAX_PERF(~0L); +- value |= AMD_CPPC_MAX_PERF(max_perf); +- +- /* CPPC EPP feature require to set zero to the desire perf bit */ +- value &= ~AMD_CPPC_DES_PERF(~0L); +- value |= AMD_CPPC_DES_PERF(0); +- +- cpudata->epp_policy = cpudata->policy; ++ epp = 0; ++ else ++ epp = READ_ONCE(cpudata->epp_cached); + +- /* Get BIOS pre-defined epp value */ +- epp = amd_pstate_get_epp(cpudata, value); +- if (epp < 0) { +- /** +- * This return value can only be negative for shared_memory +- * systems where EPP register read/write not supported. 
+- */ +- return epp; ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, epp, ++ cpudata->min_limit_perf, ++ cpudata->max_limit_perf, ++ policy->boost_enabled); + } + +- if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) +- epp = 0; +- +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- return amd_pstate_set_epp(cpudata, epp); ++ return amd_pstate_update_perf(cpudata, cpudata->min_limit_perf, 0U, ++ cpudata->max_limit_perf, epp, false); + } + + static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) +@@ -1600,8 +1603,9 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy *policy) + return 0; + } + +-static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) ++static int amd_pstate_epp_reenable(struct cpufreq_policy *policy) + { ++ struct amd_cpudata *cpudata = policy->driver_data; + u64 max_perf; + int ret; + +@@ -1611,17 +1615,26 @@ static void amd_pstate_epp_reenable(struct amd_cpudata *cpudata) + + max_perf = READ_ONCE(cpudata->highest_perf); + +- amd_pstate_update_perf(cpudata, 0, 0, max_perf, false); +- amd_pstate_set_epp(cpudata, cpudata->epp_cached); ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, ++ cpudata->epp_cached, ++ FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cpudata->cppc_req_cached), ++ max_perf, policy->boost_enabled); ++ } ++ ++ return amd_pstate_update_perf(cpudata, 0, 0, max_perf, cpudata->epp_cached, false); + } + + static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ int ret; + + pr_debug("AMD CPU Core %d going online\n", cpudata->cpu); + +- amd_pstate_epp_reenable(cpudata); ++ ret = amd_pstate_epp_reenable(policy); ++ if (ret) ++ return ret; + cpudata->suspended = false; + + return 0; +@@ -1639,10 +1652,14 @@ static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) + + guard(mutex)(&amd_pstate_limits_lock); + +- amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, false); +- amd_pstate_set_epp(cpudata, AMD_CPPC_EPP_BALANCE_POWERSAVE); ++ if (trace_amd_pstate_epp_perf_enabled()) { ++ trace_amd_pstate_epp_perf(cpudata->cpu, cpudata->highest_perf, ++ AMD_CPPC_EPP_BALANCE_POWERSAVE, ++ min_perf, min_perf, policy->boost_enabled); ++ } + +- return 0; ++ return amd_pstate_update_perf(cpudata, min_perf, 0, min_perf, ++ AMD_CPPC_EPP_BALANCE_POWERSAVE, false); + } + + static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) +@@ -1673,7 +1690,7 @@ static int amd_pstate_epp_resume(struct cpufreq_policy *policy) + guard(mutex)(&amd_pstate_limits_lock); + + /* enable amd pstate from suspend state*/ +- amd_pstate_epp_reenable(cpudata); ++ amd_pstate_epp_reenable(policy); + + cpudata->suspended = false; + } +@@ -1826,6 +1843,8 @@ static int __init amd_pstate_init(void) + static_call_update(amd_pstate_cppc_enable, shmem_cppc_enable); + static_call_update(amd_pstate_init_perf, shmem_init_perf); + static_call_update(amd_pstate_update_perf, shmem_update_perf); ++ static_call_update(amd_pstate_get_epp, shmem_get_epp); ++ static_call_update(amd_pstate_set_epp, shmem_set_epp); + } + + if (amd_pstate_prefcore) { +diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h +index cd573bc6b6db..9747e3be6cee 100644 +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -57,7 +57,6 @@ struct amd_aperf_mperf { + * @hw_prefcore: check whether HW supports preferred core featue. 
+ * Only when hw_prefcore and early prefcore param are true, + * AMD P-State driver supports preferred core featue. +- * @epp_policy: Last saved policy used to set energy-performance preference + * @epp_cached: Cached CPPC energy-performance preference value + * @policy: Cpufreq policy value + * @cppc_cap1_cached Cached MSR_AMD_CPPC_CAP1 register value +@@ -94,13 +93,11 @@ struct amd_cpudata { + bool hw_prefcore; + + /* EPP feature related attributes*/ +- s16 epp_policy; + s16 epp_cached; + u32 policy; + u64 cppc_cap1_cached; + bool suspended; + s16 epp_default; +- bool boost_state; + }; + + /* +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch b/sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch new file mode 100644 index 0000000..b4fc866 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0002-amd-tlb-broadcast.patch @@ -0,0 +1,1350 @@ +From 379e6b90eecaf17f29691bcfcdd588d03a934b0d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:27:28 +0100 +Subject: [PATCH 02/12] amd-tlb-broadcast + +Signed-off-by: Peter Jung +--- + arch/x86/Kconfig | 2 +- + arch/x86/Kconfig.cpu | 4 + + arch/x86/hyperv/mmu.c | 1 - + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/include/asm/disabled-features.h | 8 +- + arch/x86/include/asm/mmu.h | 12 + + arch/x86/include/asm/mmu_context.h | 10 +- + arch/x86/include/asm/msr-index.h | 2 + + arch/x86/include/asm/paravirt.h | 5 - + arch/x86/include/asm/paravirt_types.h | 2 - + arch/x86/include/asm/tlb.h | 138 +++++++ + arch/x86/include/asm/tlbflush.h | 69 ++++ + arch/x86/kernel/alternative.c | 10 +- + arch/x86/kernel/cpu/amd.c | 10 + + arch/x86/kernel/kvm.c | 1 - + arch/x86/kernel/paravirt.c | 6 - + arch/x86/mm/pgtable.c | 16 +- + arch/x86/mm/tlb.c | 450 ++++++++++++++++++++--- + arch/x86/xen/mmu_pv.c | 1 - + include/linux/mm_types.h | 1 + + mm/memory.c | 1 - + mm/mmap.c | 2 - + mm/swap_state.c | 1 - + mm/vma.c | 2 - + tools/arch/x86/include/asm/msr-index.h | 2 + + 25 files changed, 668 insertions(+), 89 deletions(-) + +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 757333fe82c7..3d143bd2c054 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -273,7 +273,7 @@ config X86 + select HAVE_PCI + select HAVE_PERF_REGS + select HAVE_PERF_USER_STACK_DUMP +- select MMU_GATHER_RCU_TABLE_FREE if PARAVIRT ++ select MMU_GATHER_RCU_TABLE_FREE + select MMU_GATHER_MERGE_VMAS + select HAVE_POSIX_CPU_TIMERS_TASK_WORK + select HAVE_REGS_AND_STACK_ACCESS_API +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index 2a7279d80460..25c55cc17c5e 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -401,6 +401,10 @@ menuconfig PROCESSOR_SELECT + This lets you choose what x86 vendor support code your kernel + will include. 
+ ++config BROADCAST_TLB_FLUSH ++ def_bool y ++ depends on CPU_SUP_AMD && 64BIT ++ + config CPU_SUP_INTEL + default y + bool "Support Intel processors" if PROCESSOR_SELECT +diff --git a/arch/x86/hyperv/mmu.c b/arch/x86/hyperv/mmu.c +index 1cc113200ff5..cbe6c71e17c1 100644 +--- a/arch/x86/hyperv/mmu.c ++++ b/arch/x86/hyperv/mmu.c +@@ -240,5 +240,4 @@ void hyperv_setup_mmu_ops(void) + + pr_info("Using hypercall for remote TLB flush\n"); + pv_ops.mmu.flush_tlb_multi = hyperv_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + } +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 645aa360628d..bf727839326f 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -338,6 +338,7 @@ + #define X86_FEATURE_CLZERO (13*32+ 0) /* "clzero" CLZERO instruction */ + #define X86_FEATURE_IRPERF (13*32+ 1) /* "irperf" Instructions Retired Count */ + #define X86_FEATURE_XSAVEERPTR (13*32+ 2) /* "xsaveerptr" Always save/restore FP error pointers */ ++#define X86_FEATURE_INVLPGB (13*32+ 3) /* INVLPGB and TLBSYNC instructions supported */ + #define X86_FEATURE_RDPRU (13*32+ 4) /* "rdpru" Read processor register at user level */ + #define X86_FEATURE_WBNOINVD (13*32+ 9) /* "wbnoinvd" WBNOINVD instruction */ + #define X86_FEATURE_AMD_IBPB (13*32+12) /* Indirect Branch Prediction Barrier */ +diff --git a/arch/x86/include/asm/disabled-features.h b/arch/x86/include/asm/disabled-features.h +index c492bdc97b05..be8c38855068 100644 +--- a/arch/x86/include/asm/disabled-features.h ++++ b/arch/x86/include/asm/disabled-features.h +@@ -129,6 +129,12 @@ + #define DISABLE_SEV_SNP (1 << (X86_FEATURE_SEV_SNP & 31)) + #endif + ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++#define DISABLE_INVLPGB 0 ++#else ++#define DISABLE_INVLPGB (1 << (X86_FEATURE_INVLPGB & 31)) ++#endif ++ + /* + * Make sure to add features to the correct mask + */ +@@ -146,7 +152,7 @@ + #define DISABLED_MASK11 (DISABLE_RETPOLINE|DISABLE_RETHUNK|DISABLE_UNRET| \ + DISABLE_CALL_DEPTH_TRACKING|DISABLE_USER_SHSTK) + #define DISABLED_MASK12 (DISABLE_FRED|DISABLE_LAM) +-#define DISABLED_MASK13 0 ++#define DISABLED_MASK13 (DISABLE_INVLPGB) + #define DISABLED_MASK14 0 + #define DISABLED_MASK15 0 + #define DISABLED_MASK16 (DISABLE_PKU|DISABLE_OSPKE|DISABLE_LA57|DISABLE_UMIP| \ +diff --git a/arch/x86/include/asm/mmu.h b/arch/x86/include/asm/mmu.h +index 3b496cdcb74b..8b8055a8eb9e 100644 +--- a/arch/x86/include/asm/mmu.h ++++ b/arch/x86/include/asm/mmu.h +@@ -69,6 +69,18 @@ typedef struct { + u16 pkey_allocation_map; + s16 execute_only_pkey; + #endif ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++ /* ++ * The global ASID will be a non-zero value when the process has ++ * the same ASID across all CPUs, allowing it to make use of ++ * hardware-assisted remote TLB invalidation like AMD INVLPGB. ++ */ ++ u16 global_asid; ++ ++ /* The process is transitioning to a new global ASID number. 
*/ ++ bool asid_transition; ++#endif + } mm_context_t; + + #define INIT_MM_CONTEXT(mm) \ +diff --git a/arch/x86/include/asm/mmu_context.h b/arch/x86/include/asm/mmu_context.h +index 795fdd53bd0a..2398058b6e83 100644 +--- a/arch/x86/include/asm/mmu_context.h ++++ b/arch/x86/include/asm/mmu_context.h +@@ -2,7 +2,6 @@ + #ifndef _ASM_X86_MMU_CONTEXT_H + #define _ASM_X86_MMU_CONTEXT_H + +-#include + #include + #include + #include +@@ -13,6 +12,7 @@ + #include + #include + #include ++#include + + extern atomic64_t last_mm_ctx_id; + +@@ -139,6 +139,11 @@ static inline void mm_reset_untag_mask(struct mm_struct *mm) + #define enter_lazy_tlb enter_lazy_tlb + extern void enter_lazy_tlb(struct mm_struct *mm, struct task_struct *tsk); + ++#define mm_init_global_asid mm_init_global_asid ++extern void mm_init_global_asid(struct mm_struct *mm); ++ ++extern void mm_free_global_asid(struct mm_struct *mm); ++ + /* + * Init a new mm. Used on mm copies, like at fork() + * and on mm's that are brand-new, like at execve(). +@@ -161,6 +166,8 @@ static inline int init_new_context(struct task_struct *tsk, + mm->context.execute_only_pkey = -1; + } + #endif ++ ++ mm_init_global_asid(mm); + mm_reset_untag_mask(mm); + init_new_context_ldt(mm); + return 0; +@@ -170,6 +177,7 @@ static inline int init_new_context(struct task_struct *tsk, + static inline void destroy_context(struct mm_struct *mm) + { + destroy_context_ldt(mm); ++ mm_free_global_asid(mm); + } + + extern void switch_mm(struct mm_struct *prev, struct mm_struct *next, +diff --git a/arch/x86/include/asm/msr-index.h b/arch/x86/include/asm/msr-index.h +index 61e991507353..6844ebeed377 100644 +--- a/arch/x86/include/asm/msr-index.h ++++ b/arch/x86/include/asm/msr-index.h +@@ -25,6 +25,7 @@ + #define _EFER_SVME 12 /* Enable virtualization */ + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ + #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ ++#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ + #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ + + #define EFER_SCE (1<<_EFER_SCE) +@@ -34,6 +35,7 @@ + #define EFER_SVME (1<<_EFER_SVME) + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSR (1<<_EFER_FFXSR) ++#define EFER_TCE (1<<_EFER_TCE) + #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) + + /* +diff --git a/arch/x86/include/asm/paravirt.h b/arch/x86/include/asm/paravirt.h +index d4eb9e1d61b8..794ba3647c6c 100644 +--- a/arch/x86/include/asm/paravirt.h ++++ b/arch/x86/include/asm/paravirt.h +@@ -91,11 +91,6 @@ static inline void __flush_tlb_multi(const struct cpumask *cpumask, + PVOP_VCALL2(mmu.flush_tlb_multi, cpumask, info); + } + +-static inline void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- PVOP_VCALL2(mmu.tlb_remove_table, tlb, table); +-} +- + static inline void paravirt_arch_exit_mmap(struct mm_struct *mm) + { + PVOP_VCALL1(mmu.exit_mmap, mm); +diff --git a/arch/x86/include/asm/paravirt_types.h b/arch/x86/include/asm/paravirt_types.h +index 8d4fbe1be489..13405959e4db 100644 +--- a/arch/x86/include/asm/paravirt_types.h ++++ b/arch/x86/include/asm/paravirt_types.h +@@ -136,8 +136,6 @@ struct pv_mmu_ops { + void (*flush_tlb_multi)(const struct cpumask *cpus, + const struct flush_tlb_info *info); + +- void (*tlb_remove_table)(struct mmu_gather *tlb, void *table); +- + /* Hook for intercepting the destruction of an mm_struct. 
*/ + void (*exit_mmap)(struct mm_struct *mm); + void (*notify_page_enc_status_changed)(unsigned long pfn, int npages, bool enc); +diff --git a/arch/x86/include/asm/tlb.h b/arch/x86/include/asm/tlb.h +index 4d3c9d00d6b6..a74b57512761 100644 +--- a/arch/x86/include/asm/tlb.h ++++ b/arch/x86/include/asm/tlb.h +@@ -6,6 +6,9 @@ + static inline void tlb_flush(struct mmu_gather *tlb); + + #include ++#include ++#include ++#include + + static inline void tlb_flush(struct mmu_gather *tlb) + { +@@ -38,4 +41,139 @@ static inline void invlpg(unsigned long addr) + { + asm volatile("invlpg (%0)" ::"r" (addr) : "memory"); + } ++enum addr_stride { ++ PTE_STRIDE = 0, ++ PMD_STRIDE = 1 ++}; ++ ++/* ++ * INVLPGB can be targeted by virtual address, PCID, ASID, or any combination ++ * of the three. For example: ++ * - FLAG_VA | FLAG_INCLUDE_GLOBAL: invalidate all TLB entries at the address ++ * - FLAG_PCID: invalidate all TLB entries matching the PCID ++ * ++ * The first is used to invalidate (kernel) mappings at a particular ++ * address across all processes. ++ * ++ * The latter invalidates all TLB entries matching a PCID. ++ */ ++#define INVLPGB_FLAG_VA BIT(0) ++#define INVLPGB_FLAG_PCID BIT(1) ++#define INVLPGB_FLAG_ASID BIT(2) ++#define INVLPGB_FLAG_INCLUDE_GLOBAL BIT(3) ++#define INVLPGB_FLAG_FINAL_ONLY BIT(4) ++#define INVLPGB_FLAG_INCLUDE_NESTED BIT(5) ++ ++/* The implied mode when all bits are clear: */ ++#define INVLPGB_MODE_ALL_NONGLOBALS 0UL ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++/* ++ * INVLPGB does broadcast TLB invalidation across all the CPUs in the system. ++ * ++ * The INVLPGB instruction is weakly ordered, and a batch of invalidations can ++ * be done in a parallel fashion. ++ * ++ * The instruction takes the number of extra pages to invalidate, beyond the ++ * first page, while __invlpgb gets the more human readable number of pages to ++ * invalidate. ++ * ++ * The bits in rax[0:2] determine respectively which components of the address ++ * (VA, PCID, ASID) get compared when flushing. If neither bits are set, *any* ++ * address in the specified range matches. ++ * ++ * Since it is desired to only flush TLB entries for the ASID that is executing ++ * the instruction (a host/hypervisor or a guest), the ASID valid bit should ++ * always be set. On a host/hypervisor, the hardware will use the ASID value ++ * specified in EDX[15:0] (which should be 0). On a guest, the hardware will ++ * use the actual ASID value of the guest. ++ * ++ * TLBSYNC is used to ensure that pending INVLPGB invalidations initiated from ++ * this CPU have completed. ++ */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, ++ unsigned long addr, u16 nr_pages, ++ enum addr_stride stride, u8 flags) ++{ ++ u64 rax = addr | flags | INVLPGB_FLAG_ASID; ++ u32 ecx = (stride << 31) | (nr_pages - 1); ++ u32 edx = (pcid << 16) | asid; ++ ++ /* The low bits in rax are for flags. Verify addr is clean. */ ++ VM_WARN_ON_ONCE(addr & ~PAGE_MASK); ++ ++ /* INVLPGB; supported in binutils >= 2.36. */ ++ asm volatile(".byte 0x0f, 0x01, 0xfe" :: "a" (rax), "c" (ecx), "d" (edx)); ++} ++ ++static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) ++{ ++ __invlpgb(asid, pcid, 0, 1, 0, flags); ++} ++ ++static inline void __tlbsync(void) ++{ ++ /* ++ * TLBSYNC waits for INVLPGB instructions originating on the same CPU ++ * to have completed. Print a warning if the task has been migrated, ++ * and might not be waiting on all the INVLPGBs issued during this TLB ++ * invalidation sequence. 
++ */ ++ cant_migrate(); ++ ++ /* TLBSYNC: supported in binutils >= 0.36. */ ++ asm volatile(".byte 0x0f, 0x01, 0xff" ::: "memory"); ++} ++#else ++/* Some compilers (I'm looking at you clang!) simply can't do DCE */ ++static inline void __invlpgb(unsigned long asid, unsigned long pcid, ++ unsigned long addr, u16 nr_pages, ++ enum addr_stride s, u8 flags) { } ++static inline void __invlpgb_all(unsigned long asid, unsigned long pcid, u8 flags) { } ++static inline void __tlbsync(void) { } ++#endif ++ ++static inline void invlpgb_flush_user_nr_nosync(unsigned long pcid, ++ unsigned long addr, ++ u16 nr, bool stride) ++{ ++ enum addr_stride str = stride ? PMD_STRIDE : PTE_STRIDE; ++ u8 flags = INVLPGB_FLAG_PCID | INVLPGB_FLAG_VA; ++ ++ __invlpgb(0, pcid, addr, nr, str, flags); ++} ++ ++/* Flush all mappings for a given PCID, not including globals. */ ++static inline void invlpgb_flush_single_pcid_nosync(unsigned long pcid) ++{ ++ __invlpgb_all(0, pcid, INVLPGB_FLAG_PCID); ++} ++ ++/* Flush all mappings, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_all(void) ++{ ++ /* ++ * TLBSYNC at the end needs to make sure all flushes done on the ++ * current CPU have been executed system-wide. Therefore, make ++ * sure nothing gets migrated in-between but disable preemption ++ * as it is cheaper. ++ */ ++ guard(preempt)(); ++ __invlpgb_all(0, 0, INVLPGB_FLAG_INCLUDE_GLOBAL); ++ __tlbsync(); ++} ++ ++/* Flush addr, including globals, for all PCIDs. */ ++static inline void invlpgb_flush_addr_nosync(unsigned long addr, u16 nr) ++{ ++ __invlpgb(0, 0, addr, nr, PTE_STRIDE, INVLPGB_FLAG_INCLUDE_GLOBAL); ++} ++ ++/* Flush all mappings for all PCIDs except globals. */ ++static inline void invlpgb_flush_all_nonglobals(void) ++{ ++ guard(preempt)(); ++ __invlpgb_all(0, 0, INVLPGB_MODE_ALL_NONGLOBALS); ++ __tlbsync(); ++} + #endif /* _ASM_X86_TLB_H */ +diff --git a/arch/x86/include/asm/tlbflush.h b/arch/x86/include/asm/tlbflush.h +index 02fc2aa06e9e..0bc91488c9c2 100644 +--- a/arch/x86/include/asm/tlbflush.h ++++ b/arch/x86/include/asm/tlbflush.h +@@ -6,6 +6,7 @@ + #include + #include + ++#include + #include + #include + #include +@@ -183,6 +184,9 @@ static inline void cr4_init_shadow(void) + extern unsigned long mmu_cr4_features; + extern u32 *trampoline_cr4_features; + ++/* How many pages can be invalidated with one INVLPGB. 
*/ ++extern u16 invlpgb_count_max; ++ + extern void initialize_tlbstate_and_flush(void); + + /* +@@ -231,6 +235,71 @@ void flush_tlb_one_kernel(unsigned long addr); + void flush_tlb_multi(const struct cpumask *cpumask, + const struct flush_tlb_info *info); + ++static inline bool is_dyn_asid(u16 asid) ++{ ++ return asid < TLB_NR_DYN_ASIDS; ++} ++ ++static inline bool is_global_asid(u16 asid) ++{ ++ return !is_dyn_asid(asid); ++} ++ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++static inline u16 mm_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return 0; ++ ++ asid = smp_load_acquire(&mm->context.global_asid); ++ ++ /* mm->context.global_asid is either 0, or a global ASID */ ++ VM_WARN_ON_ONCE(asid && is_dyn_asid(asid)); ++ ++ return asid; ++} ++ ++static inline void mm_init_global_asid(struct mm_struct *mm) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ mm->context.global_asid = 0; ++ mm->context.asid_transition = false; ++ } ++} ++ ++static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) ++{ ++ /* ++ * Notably flush_tlb_mm_range() -> broadcast_tlb_flush() -> ++ * finish_asid_transition() needs to observe asid_transition = true ++ * once it observes global_asid. ++ */ ++ mm->context.asid_transition = true; ++ smp_store_release(&mm->context.global_asid, asid); ++} ++ ++static inline void mm_clear_asid_transition(struct mm_struct *mm) ++{ ++ WRITE_ONCE(mm->context.asid_transition, false); ++} ++ ++static inline bool mm_in_asid_transition(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ return mm && READ_ONCE(mm->context.asid_transition); ++} ++#else ++static inline u16 mm_global_asid(struct mm_struct *mm) { return 0; } ++static inline void mm_init_global_asid(struct mm_struct *mm) { } ++static inline void mm_assign_global_asid(struct mm_struct *mm, u16 asid) { } ++static inline void mm_clear_asid_transition(struct mm_struct *mm) { } ++static inline bool mm_in_asid_transition(struct mm_struct *mm) { return false; } ++#endif /* CONFIG_BROADCAST_TLB_FLUSH */ ++ + #ifdef CONFIG_PARAVIRT + #include + #endif +diff --git a/arch/x86/kernel/alternative.c b/arch/x86/kernel/alternative.c +index 243843e44e89..c71b575bf229 100644 +--- a/arch/x86/kernel/alternative.c ++++ b/arch/x86/kernel/alternative.c +@@ -1854,11 +1854,18 @@ static inline temp_mm_state_t use_temporary_mm(struct mm_struct *mm) + return temp_state; + } + ++__ro_after_init struct mm_struct *poking_mm; ++__ro_after_init unsigned long poking_addr; ++ + static inline void unuse_temporary_mm(temp_mm_state_t prev_state) + { + lockdep_assert_irqs_disabled(); ++ + switch_mm_irqs_off(NULL, prev_state.mm, current); + ++ /* Clear the cpumask, to indicate no TLB flushing is needed anywhere */ ++ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(poking_mm)); ++ + /* + * Restore the breakpoints if they were disabled before the temporary mm + * was loaded. 
+@@ -1867,9 +1874,6 @@ static inline void unuse_temporary_mm(temp_mm_state_t prev_state) + hw_breakpoint_restore(); + } + +-__ro_after_init struct mm_struct *poking_mm; +-__ro_after_init unsigned long poking_addr; +- + static void text_poke_memcpy(void *dst, const void *src, size_t len) + { + memcpy(dst, src, len); +diff --git a/arch/x86/kernel/cpu/amd.c b/arch/x86/kernel/cpu/amd.c +index 79d2e17f6582..05ca61b66461 100644 +--- a/arch/x86/kernel/cpu/amd.c ++++ b/arch/x86/kernel/cpu/amd.c +@@ -29,6 +29,8 @@ + + #include "cpu.h" + ++u16 invlpgb_count_max __ro_after_init; ++ + static inline int rdmsrl_amd_safe(unsigned msr, unsigned long long *p) + { + u32 gprs[8] = { 0 }; +@@ -1069,6 +1071,10 @@ static void init_amd(struct cpuinfo_x86 *c) + + /* AMD CPUs don't need fencing after x2APIC/TSC_DEADLINE MSR writes. */ + clear_cpu_cap(c, X86_FEATURE_APIC_MSRS_FENCE); ++ ++ /* Enable Translation Cache Extension */ ++ if (cpu_has(c, X86_FEATURE_TCE)) ++ msr_set_bit(MSR_EFER, _EFER_TCE); + } + + #ifdef CONFIG_X86_32 +@@ -1135,6 +1141,10 @@ static void cpu_detect_tlb_amd(struct cpuinfo_x86 *c) + tlb_lli_2m[ENTRIES] = eax & mask; + + tlb_lli_4m[ENTRIES] = tlb_lli_2m[ENTRIES] >> 1; ++ ++ /* Max number of pages INVLPGB can invalidate in one shot */ ++ if (cpu_has(c, X86_FEATURE_INVLPGB)) ++ invlpgb_count_max = (cpuid_edx(0x80000008) & 0xffff) + 1; + } + + static const struct cpu_dev amd_cpu_dev = { +diff --git a/arch/x86/kernel/kvm.c b/arch/x86/kernel/kvm.c +index 21e9e4845354..83b7679658b1 100644 +--- a/arch/x86/kernel/kvm.c ++++ b/arch/x86/kernel/kvm.c +@@ -838,7 +838,6 @@ static void __init kvm_guest_init(void) + #ifdef CONFIG_SMP + if (pv_tlb_flush_supported()) { + pv_ops.mmu.flush_tlb_multi = kvm_flush_tlb_multi; +- pv_ops.mmu.tlb_remove_table = tlb_remove_table; + pr_info("KVM setup pv remote TLB flush\n"); + } + +diff --git a/arch/x86/kernel/paravirt.c b/arch/x86/kernel/paravirt.c +index fec381533555..c019771e0123 100644 +--- a/arch/x86/kernel/paravirt.c ++++ b/arch/x86/kernel/paravirt.c +@@ -59,11 +59,6 @@ void __init native_pv_lock_init(void) + static_branch_enable(&virt_spin_lock_key); + } + +-static void native_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +- + struct static_key paravirt_steal_enabled; + struct static_key paravirt_steal_rq_enabled; + +@@ -191,7 +186,6 @@ struct paravirt_patch_template pv_ops = { + .mmu.flush_tlb_kernel = native_flush_tlb_global, + .mmu.flush_tlb_one_user = native_flush_tlb_one_user, + .mmu.flush_tlb_multi = native_flush_tlb_multi, +- .mmu.tlb_remove_table = native_tlb_remove_table, + + .mmu.exit_mmap = paravirt_nop, + .mmu.notify_page_enc_status_changed = paravirt_nop, +diff --git a/arch/x86/mm/pgtable.c b/arch/x86/mm/pgtable.c +index 5745a354a241..3dc4af1f7868 100644 +--- a/arch/x86/mm/pgtable.c ++++ b/arch/x86/mm/pgtable.c +@@ -18,14 +18,6 @@ EXPORT_SYMBOL(physical_mask); + #define PGTABLE_HIGHMEM 0 + #endif + +-#ifndef CONFIG_PARAVIRT +-static inline +-void paravirt_tlb_remove_table(struct mmu_gather *tlb, void *table) +-{ +- tlb_remove_page(tlb, table); +-} +-#endif +- + gfp_t __userpte_alloc_gfp = GFP_PGTABLE_USER | PGTABLE_HIGHMEM; + + pgtable_t pte_alloc_one(struct mm_struct *mm) +@@ -54,7 +46,7 @@ void ___pte_free_tlb(struct mmu_gather *tlb, struct page *pte) + { + pagetable_pte_dtor(page_ptdesc(pte)); + paravirt_release_pte(page_to_pfn(pte)); +- paravirt_tlb_remove_table(tlb, pte); ++ tlb_remove_table(tlb, pte); + } + + #if CONFIG_PGTABLE_LEVELS > 2 +@@ -70,7 +62,7 @@ void ___pmd_free_tlb(struct 
mmu_gather *tlb, pmd_t *pmd) + tlb->need_flush_all = 1; + #endif + pagetable_pmd_dtor(ptdesc); +- paravirt_tlb_remove_table(tlb, ptdesc_page(ptdesc)); ++ tlb_remove_table(tlb, ptdesc_page(ptdesc)); + } + + #if CONFIG_PGTABLE_LEVELS > 3 +@@ -80,14 +72,14 @@ void ___pud_free_tlb(struct mmu_gather *tlb, pud_t *pud) + + pagetable_pud_dtor(ptdesc); + paravirt_release_pud(__pa(pud) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(pud)); ++ tlb_remove_table(tlb, virt_to_page(pud)); + } + + #if CONFIG_PGTABLE_LEVELS > 4 + void ___p4d_free_tlb(struct mmu_gather *tlb, p4d_t *p4d) + { + paravirt_release_p4d(__pa(p4d) >> PAGE_SHIFT); +- paravirt_tlb_remove_table(tlb, virt_to_page(p4d)); ++ tlb_remove_table(tlb, virt_to_page(p4d)); + } + #endif /* CONFIG_PGTABLE_LEVELS > 4 */ + #endif /* CONFIG_PGTABLE_LEVELS > 3 */ +diff --git a/arch/x86/mm/tlb.c b/arch/x86/mm/tlb.c +index 90a9e4740913..7505c2d94bc0 100644 +--- a/arch/x86/mm/tlb.c ++++ b/arch/x86/mm/tlb.c +@@ -74,13 +74,15 @@ + * use different names for each of them: + * + * ASID - [0, TLB_NR_DYN_ASIDS-1] +- * the canonical identifier for an mm ++ * the canonical identifier for an mm, dynamically allocated on each CPU ++ * [TLB_NR_DYN_ASIDS, MAX_ASID_AVAILABLE-1] ++ * the canonical, global identifier for an mm, identical across all CPUs + * +- * kPCID - [1, TLB_NR_DYN_ASIDS] ++ * kPCID - [1, MAX_ASID_AVAILABLE] + * the value we write into the PCID part of CR3; corresponds to the + * ASID+1, because PCID 0 is special. + * +- * uPCID - [2048 + 1, 2048 + TLB_NR_DYN_ASIDS] ++ * uPCID - [2048 + 1, 2048 + MAX_ASID_AVAILABLE] + * for KPTI each mm has two address spaces and thus needs two + * PCID values, but we can still do with a single ASID denomination + * for each mm. Corresponds to kPCID + 2048. +@@ -225,6 +227,20 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + return; + } + ++ /* ++ * TLB consistency for global ASIDs is maintained with hardware assisted ++ * remote TLB flushing. Global ASIDs are always up to date. ++ */ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ u16 global_asid = mm_global_asid(next); ++ ++ if (global_asid) { ++ *new_asid = global_asid; ++ *need_flush = false; ++ return; ++ } ++ } ++ + if (this_cpu_read(cpu_tlbstate.invalidate_other)) + clear_asid_other(); + +@@ -251,6 +267,268 @@ static void choose_new_asid(struct mm_struct *next, u64 next_tlb_gen, + *need_flush = true; + } + ++/* ++ * Global ASIDs are allocated for multi-threaded processes that are ++ * active on multiple CPUs simultaneously, giving each of those ++ * processes the same PCID on every CPU, for use with hardware-assisted ++ * TLB shootdown on remote CPUs, like AMD INVLPGB or Intel RAR. ++ * ++ * These global ASIDs are held for the lifetime of the process. ++ */ ++static DEFINE_RAW_SPINLOCK(global_asid_lock); ++static u16 last_global_asid = MAX_ASID_AVAILABLE; ++static DECLARE_BITMAP(global_asid_used, MAX_ASID_AVAILABLE); ++static DECLARE_BITMAP(global_asid_freed, MAX_ASID_AVAILABLE); ++static int global_asid_available = MAX_ASID_AVAILABLE - TLB_NR_DYN_ASIDS - 1; ++ ++/* ++ * When the search for a free ASID in the global ASID space reaches ++ * MAX_ASID_AVAILABLE, a global TLB flush guarantees that previously ++ * freed global ASIDs are safe to re-use. ++ * ++ * This way the global flush only needs to happen at ASID rollover ++ * time, and not at ASID allocation time. 
++ */ ++static void reset_global_asid_space(void) ++{ ++ lockdep_assert_held(&global_asid_lock); ++ ++ invlpgb_flush_all_nonglobals(); ++ ++ /* ++ * The TLB flush above makes it safe to re-use the previously ++ * freed global ASIDs. ++ */ ++ bitmap_andnot(global_asid_used, global_asid_used, ++ global_asid_freed, MAX_ASID_AVAILABLE); ++ bitmap_clear(global_asid_freed, 0, MAX_ASID_AVAILABLE); ++ ++ /* Restart the search from the start of global ASID space. */ ++ last_global_asid = TLB_NR_DYN_ASIDS; ++} ++ ++static u16 allocate_global_asid(void) ++{ ++ u16 asid; ++ ++ lockdep_assert_held(&global_asid_lock); ++ ++ /* The previous allocation hit the edge of available address space */ ++ if (last_global_asid >= MAX_ASID_AVAILABLE - 1) ++ reset_global_asid_space(); ++ ++ asid = find_next_zero_bit(global_asid_used, MAX_ASID_AVAILABLE, last_global_asid); ++ ++ if (asid >= MAX_ASID_AVAILABLE && !global_asid_available) { ++ /* This should never happen. */ ++ VM_WARN_ONCE(1, "Unable to allocate global ASID despite %d available\n", ++ global_asid_available); ++ return 0; ++ } ++ ++ /* Claim this global ASID. */ ++ __set_bit(asid, global_asid_used); ++ last_global_asid = asid; ++ global_asid_available--; ++ return asid; ++} ++ ++/* ++ * Check whether a process is currently active on more than @threshold CPUs. ++ * This is a cheap estimation on whether or not it may make sense to assign ++ * a global ASID to this process, and use broadcast TLB invalidation. ++ */ ++static bool mm_active_cpus_exceeds(struct mm_struct *mm, int threshold) ++{ ++ int count = 0; ++ int cpu; ++ ++ /* This quick check should eliminate most single threaded programs. */ ++ if (cpumask_weight(mm_cpumask(mm)) <= threshold) ++ return false; ++ ++ /* Slower check to make sure. */ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* Skip the CPUs that aren't really running this process. */ ++ if (per_cpu(cpu_tlbstate.loaded_mm, cpu) != mm) ++ continue; ++ ++ if (per_cpu(cpu_tlbstate_shared.is_lazy, cpu)) ++ continue; ++ ++ if (++count > threshold) ++ return true; ++ } ++ return false; ++} ++ ++/* ++ * Assign a global ASID to the current process, protecting against ++ * races between multiple threads in the process. ++ */ ++static void use_global_asid(struct mm_struct *mm) ++{ ++ u16 asid; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* This process is already using broadcast TLB invalidation. */ ++ if (mm_global_asid(mm)) ++ return; ++ ++ /* ++ * The last global ASID was consumed while waiting for the lock. ++ * ++ * If this fires, a more aggressive ASID reuse scheme might be ++ * needed. ++ */ ++ if (!global_asid_available) { ++ VM_WARN_ONCE(1, "Ran out of global ASIDs\n"); ++ return; ++ } ++ ++ asid = allocate_global_asid(); ++ if (!asid) ++ return; ++ ++ mm_assign_global_asid(mm, asid); ++} ++ ++void mm_free_global_asid(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return; ++ ++ if (!mm_global_asid(mm)) ++ return; ++ ++ guard(raw_spinlock_irqsave)(&global_asid_lock); ++ ++ /* The global ASID can be re-used only after flush at wrap-around. */ ++#ifdef CONFIG_BROADCAST_TLB_FLUSH ++ __set_bit(mm->context.global_asid, global_asid_freed); ++ ++ mm->context.global_asid = 0; ++ global_asid_available++; ++#endif ++} ++ ++/* ++ * Is the mm transitioning from a CPU-local ASID to a global ASID? 
++ */ ++static bool mm_needs_global_asid(struct mm_struct *mm, u16 asid) ++{ ++ u16 global_asid = mm_global_asid(mm); ++ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return false; ++ ++ /* Process is transitioning to a global ASID */ ++ if (global_asid && asid != global_asid) ++ return true; ++ ++ return false; ++} ++ ++/* ++ * x86 has 4k ASIDs (2k when compiled with KPTI), but the largest x86 ++ * systems have over 8k CPUs. Because of this potential ASID shortage, ++ * global ASIDs are handed out to processes that have frequent TLB ++ * flushes and are active on 4 or more CPUs simultaneously. ++ */ ++static void consider_global_asid(struct mm_struct *mm) ++{ ++ if (!cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ return; ++ ++ /* Check every once in a while. */ ++ if ((current->pid & 0x1f) != (jiffies & 0x1f)) ++ return; ++ ++ /* ++ * Assign a global ASID if the process is active on ++ * 4 or more CPUs simultaneously. ++ */ ++ if (mm_active_cpus_exceeds(mm, 3)) ++ use_global_asid(mm); ++} ++ ++static void finish_asid_transition(struct flush_tlb_info *info) ++{ ++ struct mm_struct *mm = info->mm; ++ int bc_asid = mm_global_asid(mm); ++ int cpu; ++ ++ if (!mm_in_asid_transition(mm)) ++ return; ++ ++ for_each_cpu(cpu, mm_cpumask(mm)) { ++ /* ++ * The remote CPU is context switching. Wait for that to ++ * finish, to catch the unlikely case of it switching to ++ * the target mm with an out of date ASID. ++ */ ++ while (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) == LOADED_MM_SWITCHING) ++ cpu_relax(); ++ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm, cpu)) != mm) ++ continue; ++ ++ /* ++ * If at least one CPU is not using the global ASID yet, ++ * send a TLB flush IPI. The IPI should cause stragglers ++ * to transition soon. ++ * ++ * This can race with the CPU switching to another task; ++ * that results in a (harmless) extra IPI. ++ */ ++ if (READ_ONCE(per_cpu(cpu_tlbstate.loaded_mm_asid, cpu)) != bc_asid) { ++ flush_tlb_multi(mm_cpumask(info->mm), info); ++ return; ++ } ++ } ++ ++ /* All the CPUs running this process are using the global ASID. */ ++ mm_clear_asid_transition(mm); ++} ++ ++static void broadcast_tlb_flush(struct flush_tlb_info *info) ++{ ++ bool pmd = info->stride_shift == PMD_SHIFT; ++ unsigned long asid = mm_global_asid(info->mm); ++ unsigned long addr = info->start; ++ ++ /* ++ * TLB flushes with INVLPGB are kicked off asynchronously. ++ * The inc_mm_tlb_gen() guarantees page table updates are done ++ * before these TLB flushes happen. ++ */ ++ if (info->end == TLB_FLUSH_ALL) { ++ invlpgb_flush_single_pcid_nosync(kern_pcid(asid)); ++ /* Do any CPUs supporting INVLPGB need PTI? */ ++ if (cpu_feature_enabled(X86_FEATURE_PTI)) ++ invlpgb_flush_single_pcid_nosync(user_pcid(asid)); ++ } else do { ++ unsigned long nr = 1; ++ ++ if (info->stride_shift <= PMD_SHIFT) { ++ nr = (info->end - addr) >> info->stride_shift; ++ nr = clamp_val(nr, 1, invlpgb_count_max); ++ } ++ ++ invlpgb_flush_user_nr_nosync(kern_pcid(asid), addr, nr, pmd); ++ if (cpu_feature_enabled(X86_FEATURE_PTI)) ++ invlpgb_flush_user_nr_nosync(user_pcid(asid), addr, nr, pmd); ++ ++ addr += nr << info->stride_shift; ++ } while (addr < info->end); ++ ++ finish_asid_transition(info); ++ ++ /* Wait for the INVLPGBs kicked off above to finish. */ ++ __tlbsync(); ++} ++ + /* + * Given an ASID, flush the corresponding user ASID. We can delay this + * until the next time we switch to it. 
+@@ -556,7 +834,8 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + */ + if (prev == next) { + /* Not actually switching mm's */ +- VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != ++ VM_WARN_ON(is_dyn_asid(prev_asid) && ++ this_cpu_read(cpu_tlbstate.ctxs[prev_asid].ctx_id) != + next->context.ctx_id); + + /* +@@ -573,6 +852,20 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + !cpumask_test_cpu(cpu, mm_cpumask(next)))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + ++ /* Check if the current mm is transitioning to a global ASID */ ++ if (mm_needs_global_asid(next, prev_asid)) { ++ next_tlb_gen = atomic64_read(&next->context.tlb_gen); ++ choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); ++ goto reload_tlb; ++ } ++ ++ /* ++ * Broadcast TLB invalidation keeps this ASID up to date ++ * all the time. ++ */ ++ if (is_global_asid(prev_asid)) ++ return; ++ + /* + * If the CPU is not in lazy TLB mode, we are just switching + * from one thread in a process to another thread in the same +@@ -607,30 +900,32 @@ void switch_mm_irqs_off(struct mm_struct *unused, struct mm_struct *next, + cond_mitigation(tsk); + + /* +- * Stop remote flushes for the previous mm. +- * Skip kernel threads; we never send init_mm TLB flushing IPIs, +- * but the bitmap manipulation can cause cache line contention. ++ * Let nmi_uaccess_okay() and finish_asid_transition() ++ * know that CR3 is changing. + */ +- if (prev != &init_mm) { +- VM_WARN_ON_ONCE(!cpumask_test_cpu(cpu, +- mm_cpumask(prev))); +- cpumask_clear_cpu(cpu, mm_cpumask(prev)); +- } ++ this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); ++ barrier(); ++ ++ /* ++ * Leave this CPU in prev's mm_cpumask. Atomic writes to ++ * mm_cpumask can be expensive under contention. The CPU ++ * will be removed lazily at TLB flush time. ++ */ ++ VM_WARN_ON_ONCE(prev != &init_mm && !cpumask_test_cpu(cpu, ++ mm_cpumask(prev))); + + /* Start receiving IPIs and then read tlb_gen (and LAM below) */ +- if (next != &init_mm) ++ if (next != &init_mm && !cpumask_test_cpu(cpu, mm_cpumask(next))) + cpumask_set_cpu(cpu, mm_cpumask(next)); + next_tlb_gen = atomic64_read(&next->context.tlb_gen); + + choose_new_asid(next, next_tlb_gen, &new_asid, &need_flush); +- +- /* Let nmi_uaccess_okay() know that we're changing CR3. */ +- this_cpu_write(cpu_tlbstate.loaded_mm, LOADED_MM_SWITCHING); +- barrier(); + } + ++reload_tlb: + new_lam = mm_lam_cr3_mask(next); + if (need_flush) { ++ VM_WARN_ON_ONCE(is_global_asid(new_asid)); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].ctx_id, next->context.ctx_id); + this_cpu_write(cpu_tlbstate.ctxs[new_asid].tlb_gen, next_tlb_gen); + load_new_mm_cr3(next->pgd, new_asid, new_lam, true); +@@ -749,7 +1044,7 @@ static void flush_tlb_func(void *info) + const struct flush_tlb_info *f = info; + struct mm_struct *loaded_mm = this_cpu_read(cpu_tlbstate.loaded_mm); + u32 loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); +- u64 local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ u64 local_tlb_gen; + bool local = smp_processor_id() == f->initiating_cpu; + unsigned long nr_invalidate = 0; + u64 mm_tlb_gen; +@@ -760,15 +1055,28 @@ static void flush_tlb_func(void *info) + if (!local) { + inc_irq_stat(irq_tlb_count); + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); ++ } + +- /* Can only happen on remote CPUs */ +- if (f->mm && f->mm != loaded_mm) +- return; ++ /* The CPU was left in the mm_cpumask of the target mm. Clear it. 
*/ ++ if (f->mm && f->mm != loaded_mm) { ++ cpumask_clear_cpu(raw_smp_processor_id(), mm_cpumask(f->mm)); ++ trace_tlb_flush(TLB_REMOTE_WRONG_CPU, 0); ++ return; + } + + if (unlikely(loaded_mm == &init_mm)) + return; + ++ /* Reload the ASID if transitioning into or out of a global ASID */ ++ if (mm_needs_global_asid(loaded_mm, loaded_mm_asid)) { ++ switch_mm_irqs_off(NULL, loaded_mm, NULL); ++ loaded_mm_asid = this_cpu_read(cpu_tlbstate.loaded_mm_asid); ++ } ++ ++ /* Broadcast ASIDs are always kept up to date with INVLPGB. */ ++ if (is_global_asid(loaded_mm_asid)) ++ return; ++ + VM_WARN_ON(this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].ctx_id) != + loaded_mm->context.ctx_id); + +@@ -786,6 +1094,8 @@ static void flush_tlb_func(void *info) + return; + } + ++ local_tlb_gen = this_cpu_read(cpu_tlbstate.ctxs[loaded_mm_asid].tlb_gen); ++ + if (unlikely(f->new_tlb_gen != TLB_GENERATION_INVALID && + f->new_tlb_gen <= local_tlb_gen)) { + /* +@@ -953,7 +1263,7 @@ STATIC_NOPV void native_flush_tlb_multi(const struct cpumask *cpumask, + * up on the new contents of what used to be page tables, while + * doing a speculative memory access. + */ +- if (info->freed_tables) ++ if (info->freed_tables || mm_in_asid_transition(info->mm)) + on_each_cpu_mask(cpumask, flush_tlb_func, (void *)info, true); + else + on_each_cpu_cond_mask(should_flush_tlb, flush_tlb_func, +@@ -1000,6 +1310,15 @@ static struct flush_tlb_info *get_flush_tlb_info(struct mm_struct *mm, + BUG_ON(this_cpu_inc_return(flush_tlb_info_idx) != 1); + #endif + ++ /* ++ * If the number of flushes is so large that a full flush ++ * would be faster, do a full flush. ++ */ ++ if ((end - start) >> stride_shift > tlb_single_page_flush_ceiling) { ++ start = 0; ++ end = TLB_FLUSH_ALL; ++ } ++ + info->start = start; + info->end = end; + info->mm = mm; +@@ -1026,17 +1345,8 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + bool freed_tables) + { + struct flush_tlb_info *info; ++ int cpu = get_cpu(); + u64 new_tlb_gen; +- int cpu; +- +- cpu = get_cpu(); +- +- /* Should we flush just the requested range? */ +- if ((end == TLB_FLUSH_ALL) || +- ((end - start) >> stride_shift) > tlb_single_page_flush_ceiling) { +- start = 0; +- end = TLB_FLUSH_ALL; +- } + + /* This is also a barrier that synchronizes with switch_mm(). */ + new_tlb_gen = inc_mm_tlb_gen(mm); +@@ -1049,9 +1359,12 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. + */ +- if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { ++ if (mm_global_asid(mm)) { ++ broadcast_tlb_flush(info); ++ } else if (cpumask_any_but(mm_cpumask(mm), cpu) < nr_cpu_ids) { + info->trim_cpumask = should_trim_cpumask(mm); + flush_tlb_multi(mm_cpumask(mm), info); ++ consider_global_asid(mm); + } else if (mm == this_cpu_read(cpu_tlbstate.loaded_mm)) { + lockdep_assert_irqs_enabled(); + local_irq_disable(); +@@ -1064,7 +1377,6 @@ void flush_tlb_mm_range(struct mm_struct *mm, unsigned long start, + mmu_notifier_arch_invalidate_secondary_tlbs(mm, start, end); + } + +- + static void do_flush_tlb_all(void *info) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH_RECEIVED); +@@ -1074,7 +1386,32 @@ static void do_flush_tlb_all(void *info) + void flush_tlb_all(void) + { + count_vm_tlb_event(NR_TLB_REMOTE_FLUSH); +- on_each_cpu(do_flush_tlb_all, NULL, 1); ++ ++ /* First try (faster) hardware-assisted TLB invalidation. 
*/ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_flush_all(); ++ else ++ /* Fall back to the IPI-based invalidation. */ ++ on_each_cpu(do_flush_tlb_all, NULL, 1); ++} ++ ++/* Flush an arbitrarily large range of memory with INVLPGB. */ ++static void invlpgb_kernel_range_flush(struct flush_tlb_info *info) ++{ ++ unsigned long addr, nr; ++ ++ for (addr = info->start; addr < info->end; addr += nr << PAGE_SHIFT) { ++ nr = (info->end - addr) >> PAGE_SHIFT; ++ ++ /* ++ * INVLPGB has a limit on the size of ranges it can ++ * flush. Break up large flushes. ++ */ ++ nr = clamp_val(nr, 1, invlpgb_count_max); ++ ++ invlpgb_flush_addr_nosync(addr, nr); ++ } ++ __tlbsync(); + } + + static void do_kernel_range_flush(void *info) +@@ -1087,24 +1424,37 @@ static void do_kernel_range_flush(void *info) + flush_tlb_one_kernel(addr); + } + +-void flush_tlb_kernel_range(unsigned long start, unsigned long end) ++static void kernel_tlb_flush_all(struct flush_tlb_info *info) + { +- /* Balance as user space task's flush, a bit conservative */ +- if (end == TLB_FLUSH_ALL || +- (end - start) > tlb_single_page_flush_ceiling << PAGE_SHIFT) { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_flush_all(); ++ else + on_each_cpu(do_flush_tlb_all, NULL, 1); +- } else { +- struct flush_tlb_info *info; +- +- preempt_disable(); +- info = get_flush_tlb_info(NULL, start, end, 0, false, +- TLB_GENERATION_INVALID); ++} + ++static void kernel_tlb_flush_range(struct flush_tlb_info *info) ++{ ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) ++ invlpgb_kernel_range_flush(info); ++ else + on_each_cpu(do_kernel_range_flush, info, 1); ++} + +- put_flush_tlb_info(); +- preempt_enable(); +- } ++void flush_tlb_kernel_range(unsigned long start, unsigned long end) ++{ ++ struct flush_tlb_info *info; ++ ++ guard(preempt)(); ++ ++ info = get_flush_tlb_info(NULL, start, end, PAGE_SHIFT, false, ++ TLB_GENERATION_INVALID); ++ ++ if (info->end == TLB_FLUSH_ALL) ++ kernel_tlb_flush_all(info); ++ else ++ kernel_tlb_flush_range(info); ++ ++ put_flush_tlb_info(); + } + + /* +@@ -1283,7 +1633,9 @@ void arch_tlbbatch_flush(struct arch_tlbflush_unmap_batch *batch) + * a local TLB flush is needed. Optimize this use-case by calling + * flush_tlb_func_local() directly in this case. 
+ */ +- if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { ++ if (cpu_feature_enabled(X86_FEATURE_INVLPGB)) { ++ invlpgb_flush_all_nonglobals(); ++ } else if (cpumask_any_but(&batch->cpumask, cpu) < nr_cpu_ids) { + flush_tlb_multi(&batch->cpumask, info); + } else if (cpumask_test_cpu(cpu, &batch->cpumask)) { + lockdep_assert_irqs_enabled(); +diff --git a/arch/x86/xen/mmu_pv.c b/arch/x86/xen/mmu_pv.c +index d078de2c952b..38971c6dcd4b 100644 +--- a/arch/x86/xen/mmu_pv.c ++++ b/arch/x86/xen/mmu_pv.c +@@ -2189,7 +2189,6 @@ static const typeof(pv_ops) xen_mmu_ops __initconst = { + .flush_tlb_kernel = xen_flush_tlb, + .flush_tlb_one_user = xen_flush_tlb_one_user, + .flush_tlb_multi = xen_flush_tlb_multi, +- .tlb_remove_table = tlb_remove_table, + + .pgd_alloc = xen_pgd_alloc, + .pgd_free = xen_pgd_free, +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 14fc1b39c0cf..a199e299b0d4 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1402,6 +1402,7 @@ enum tlb_flush_reason { + TLB_LOCAL_SHOOTDOWN, + TLB_LOCAL_MM_SHOOTDOWN, + TLB_REMOTE_SEND_IPI, ++ TLB_REMOTE_WRONG_CPU, + NR_TLB_FLUSH_REASONS, + }; + +diff --git a/mm/memory.c b/mm/memory.c +index b6015e230822..eb5fdd558442 100644 +--- a/mm/memory.c ++++ b/mm/memory.c +@@ -1935,7 +1935,6 @@ void zap_page_range_single(struct vm_area_struct *vma, unsigned long address, + struct mmu_notifier_range range; + struct mmu_gather tlb; + +- lru_add_drain(); + mmu_notifier_range_init(&range, MMU_NOTIFY_CLEAR, 0, vma->vm_mm, + address, end); + hugetlb_zap_begin(vma, &range.start, &range.end); +diff --git a/mm/mmap.c b/mm/mmap.c +index aec208f90337..d628b7900d2d 100644 +--- a/mm/mmap.c ++++ b/mm/mmap.c +@@ -1664,7 +1664,6 @@ void exit_mmap(struct mm_struct *mm) + goto destroy; + } + +- lru_add_drain(); + flush_cache_mm(mm); + tlb_gather_mmu_fullmm(&tlb, mm); + /* update_hiwater_rss(mm) here? but nobody should be looking */ +@@ -2107,7 +2106,6 @@ int relocate_vma_down(struct vm_area_struct *vma, unsigned long shift) + vma, new_start, length, false, true)) + return -ENOMEM; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + next = vma_next(&vmi); + if (new_end > old_start) { +diff --git a/mm/swap_state.c b/mm/swap_state.c +index e0c0321b8ff7..ca42b2be64d9 100644 +--- a/mm/swap_state.c ++++ b/mm/swap_state.c +@@ -317,7 +317,6 @@ void free_pages_and_swap_cache(struct encoded_page **pages, int nr) + struct folio_batch folios; + unsigned int refs[PAGEVEC_SIZE]; + +- lru_add_drain(); + folio_batch_init(&folios); + for (int i = 0; i < nr; i++) { + struct folio *folio = page_folio(encoded_page_ptr(pages[i])); +diff --git a/mm/vma.c b/mm/vma.c +index b126683397fc..bf2e91454019 100644 +--- a/mm/vma.c ++++ b/mm/vma.c +@@ -398,7 +398,6 @@ void unmap_region(struct ma_state *mas, struct vm_area_struct *vma, + struct mm_struct *mm = vma->vm_mm; + struct mmu_gather tlb; + +- lru_add_drain(); + tlb_gather_mmu(&tlb, mm); + update_hiwater_rss(mm); + unmap_vmas(&tlb, mas, vma, vma->vm_start, vma->vm_end, vma->vm_end, +@@ -1130,7 +1129,6 @@ static inline void vms_clear_ptes(struct vma_munmap_struct *vms, + * were isolated before we downgraded mmap_lock. 
+ */ + mas_set(mas_detach, 1); +- lru_add_drain(); + tlb_gather_mmu(&tlb, vms->vma->vm_mm); + update_hiwater_rss(vms->vma->vm_mm); + unmap_vmas(&tlb, mas_detach, vms->vma, vms->start, vms->end, +diff --git a/tools/arch/x86/include/asm/msr-index.h b/tools/arch/x86/include/asm/msr-index.h +index 3ae84c3b8e6d..dc1c1057f26e 100644 +--- a/tools/arch/x86/include/asm/msr-index.h ++++ b/tools/arch/x86/include/asm/msr-index.h +@@ -25,6 +25,7 @@ + #define _EFER_SVME 12 /* Enable virtualization */ + #define _EFER_LMSLE 13 /* Long Mode Segment Limit Enable */ + #define _EFER_FFXSR 14 /* Enable Fast FXSAVE/FXRSTOR */ ++#define _EFER_TCE 15 /* Enable Translation Cache Extensions */ + #define _EFER_AUTOIBRS 21 /* Enable Automatic IBRS */ + + #define EFER_SCE (1<<_EFER_SCE) +@@ -34,6 +35,7 @@ + #define EFER_SVME (1<<_EFER_SVME) + #define EFER_LMSLE (1<<_EFER_LMSLE) + #define EFER_FFXSR (1<<_EFER_FFXSR) ++#define EFER_TCE (1<<_EFER_TCE) + #define EFER_AUTOIBRS (1<<_EFER_AUTOIBRS) + + /* +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0003-bbr3.patch b/sys-kernel/gentoo-sources-6.13/0003-bbr3.patch new file mode 100644 index 0000000..889f841 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0003-bbr3.patch @@ -0,0 +1,3386 @@ +From 8e25c43b4f65d6249ffcdd8631af68e32aabe985 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:27:38 +0100 +Subject: [PATCH 03/12] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 9 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2230 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1940 insertions(+), 553 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index f88daaa76d83..b0f79a5888a2 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -368,7 +368,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ ++ unused:2; + u8 thin_lto : 1,/* Use linear timeouts for thin streams */ + fastopen_connect:1, /* FASTOPEN_CONNECT sockopt */ + fastopen_no_cookie:1, /* Allow send/recv SYN+data without a cookie */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c7f42844c79a..170250145598 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index bc04599547c3..1ac0efa5a854 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -793,6 +795,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -898,6 +909,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -987,9 +1003,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1102,6 +1123,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1124,7 +1146,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1144,10 +1170,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1158,7 +1187,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1182,8 +1213,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1249,6 +1283,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1268,6 +1310,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1280,6 +1323,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2431,7 +2489,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index db7254d52d93..38de18d921ea 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -507,12 +507,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..4702cd2f1ffc 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 554804774628..2279e6e7bc9c 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,11 +280,15 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } + ++static void bpf_tcp_ca_skb_marked_lost(struct sock *sk, const struct sk_buff *skb) ++{ ++} ++ + static void bpf_tcp_ca_cong_control(struct sock *sk, u32 ack, int flag, + const struct rate_sample *rs) + { +@@ -315,7 +319,8 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, ++ .skb_marked_lost = bpf_tcp_ca_skb_marked_lost, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index d74281eca14f..61aa756120ad 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3379,6 +3379,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4105,6 +4106,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..a180fa648d5e 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. 
++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? 
*/ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. 
++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. + * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... 
*/ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... */ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. 
*/ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). ++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. 
This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. + */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,7 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
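++ * (Rounds are packet-timed: a new round starts once the ACKed data was
++ * sent after the previous round began, i.e. once rs->prior_delivered has
++ * reached bbr->next_rtt_delivered.)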
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
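++ * (Two alternating windows are kept in extra_acked[]; after full bw is
++ * reached each window spans bbr_extra_acked_win_rtts rounds, but only one
++ * round while still in startup, and the estimate is the max of the two
++ * windows.)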
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
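++ * For example, with the default loss_thresh of 2%, a sample whose skb was
++ * sent with tx_in_flight of 200 packets is judged too high once more than
++ * 4 of those packets are newly marked lost; with the default ecn_thresh of
++ * 50%, it is judged too high once more than half of the newly delivered
++ * packets were CE-marked.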
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
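++ * For example, with the default inflight_headroom of 15% and an
++ * inflight_hi of 100 packets, this yields an operating point of roughly
++ * 85 packets (and never less than cwnd_min_target).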
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
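++ * For example, with the default beta of 30%, each round trip with loss
++ * (while not probing for bw) cuts bw_lo and inflight_lo to 70% of their
++ * prior values, floored at the bw and volume actually delivered in the
++ * latest round (bw_latest and inflight_latest).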
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
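++ * (The ECN EWMA, ecn_alpha, is maintained by bbr_update_ecn_alpha() as
++ * alpha = (1 - gain) * alpha + gain * ce_ratio, where gain is
++ * bbr_ecn_alpha_gain, 1/16 by default.)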
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. 
If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. ++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. 
If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. ++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
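++ *
++ * For example (illustrative numbers): if full_bw is 100 Mbit/s, any sample of
++ * at least 125 Mbit/s restarts the plateau detection and raises full_bw;
++ * otherwise each new round start increments full_bw_cnt, and after 3 such
++ * rounds without ~25% growth the pipe is considered full (full_bw_now is set).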
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
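++ *
++ * Note that try_fast_path is cleared whenever cwnd and pacing must be
++ * recomputed from the model, e.g. on a phase change in bbr_set_cycle_idx(),
++ * when exiting loss recovery, or after undoing a spurious recovery.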
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; 
++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2359,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2396,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0ee22e10fcfa..492c143aed1b 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -387,7 +387,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1126,7 +1126,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1507,6 +1512,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3832,7 +3848,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3849,6 +3866,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3859,6 +3877,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
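++ * The flag set below is consumed by the congestion control module; for
++ * example, BBR keeps (rather than advances) its per-round delivery
++ * filter in bbr_advance_latest_delivery_signals() when this flag is set.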
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3967,6 +3990,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4041,7 +4065,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4065,6 +4089,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4084,7 +4109,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5758,13 +5783,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 789e495d3bd6..dea9123e5c5d 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -466,6 +466,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index bc95d2a5924f..d4c45ca6fe06 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1606,7 +1609,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1681,6 +1684,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2038,13 +2065,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2770,6 +2796,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2982,6 +3009,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index b412ed88ccd9..d70f8b742b21 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -699,6 +699,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0005-crypto.patch b/sys-kernel/gentoo-sources-6.13/0005-crypto.patch new file mode 100644 index 0000000..a508f49 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0005-crypto.patch @@ -0,0 +1,774 @@ +From 0b97cf6a4825ec41c53e59294964c5e94810a593 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:28:58 +0100 +Subject: [PATCH 05/12] crypto + +Signed-off-by: Peter Jung +--- + arch/x86/crypto/aes-gcm-avx10-x86_64.S | 119 ++++----- + arch/x86/crypto/aes-xts-avx-x86_64.S | 329 +++++++++++++------------ + arch/x86/crypto/aesni-intel_glue.c | 10 +- + 3 files changed, 221 insertions(+), 237 deletions(-) + +diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-avx10-x86_64.S +index 97e0ee515fc5..02ee11083d4f 100644 +--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S ++++ b/arch/x86/crypto/aes-gcm-avx10-x86_64.S +@@ -88,7 +88,7 @@ + + // A shuffle mask that reflects the bytes of 16-byte blocks + .Lbswap_mask: +- .octa 0x000102030405060708090a0b0c0d0e0f ++ .octa 0x000102030405060708090a0b0c0d0e0f + + // This is the GHASH reducing polynomial without its constant term, i.e. 
+ // x^128 + x^7 + x^2 + x, represented using the backwards mapping +@@ -384,8 +384,8 @@ + vpshufd $0xd3, H_CUR_XMM, %xmm0 + vpsrad $31, %xmm0, %xmm0 + vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM +- vpand .Lgfpoly_and_internal_carrybit(%rip), %xmm0, %xmm0 +- vpxor %xmm0, H_CUR_XMM, H_CUR_XMM ++ // H_CUR_XMM ^= xmm0 & gfpoly_and_internal_carrybit ++ vpternlogd $0x78, .Lgfpoly_and_internal_carrybit(%rip), %xmm0, H_CUR_XMM + + // Load the gfpoly constant. + vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY +@@ -562,6 +562,32 @@ + vpxord RNDKEY0, V3, V3 + .endm + ++// Do the last AES round for four vectors of counter blocks V0-V3, XOR source ++// data with the resulting keystream, and write the result to DST and ++// GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) ++.macro _aesenclast_and_xor_4x ++ // XOR the source data with the last round key, saving the result in ++ // GHASHDATA[0-3]. This reduces latency by taking advantage of the ++ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). ++ vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 ++ vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 ++ vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 ++ vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 ++ ++ // Do the last AES round. This handles the XOR with the source data ++ // too, as per the optimization described above. ++ vaesenclast GHASHDATA0, V0, GHASHDATA0 ++ vaesenclast GHASHDATA1, V1, GHASHDATA1 ++ vaesenclast GHASHDATA2, V2, GHASHDATA2 ++ vaesenclast GHASHDATA3, V3, GHASHDATA3 ++ ++ // Store the en/decrypted data to DST. ++ vmovdqu8 GHASHDATA0, 0*VL(DST) ++ vmovdqu8 GHASHDATA1, 1*VL(DST) ++ vmovdqu8 GHASHDATA2, 2*VL(DST) ++ vmovdqu8 GHASHDATA3, 3*VL(DST) ++.endm ++ + // void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, + // const u32 le_ctr[4], u8 ghash_acc[16], + // const u8 *src, u8 *dst, int datalen); +@@ -640,7 +666,7 @@ + // LE_CTR contains the next set of little-endian counter blocks. + .set LE_CTR, V12 + +- // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-5] contain cached AES round keys, ++ // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. + .set RNDKEY0, V13 +@@ -650,15 +676,10 @@ + .set RNDKEY_M7, V17 + .set RNDKEY_M6, V18 + .set RNDKEY_M5, V19 +- +- // RNDKEYLAST[0-3] temporarily store the last AES round key XOR'd with +- // the corresponding block of source data. This is useful because +- // vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), and key ^ b can +- // be computed in parallel with the AES rounds. +- .set RNDKEYLAST0, V20 +- .set RNDKEYLAST1, V21 +- .set RNDKEYLAST2, V22 +- .set RNDKEYLAST3, V23 ++ .set RNDKEY_M4, V20 ++ .set RNDKEY_M3, V21 ++ .set RNDKEY_M2, V22 ++ .set RNDKEY_M1, V23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for +@@ -713,7 +734,7 @@ + // Pre-subtracting 4*VL from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. +- sub $4*VL, DATALEN ++ add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. 
+@@ -748,26 +769,15 @@ + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +- vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 +- vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 +- vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 +- vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 +- vaesenclast RNDKEYLAST0, V0, GHASHDATA0 +- vaesenclast RNDKEYLAST1, V1, GHASHDATA1 +- vaesenclast RNDKEYLAST2, V2, GHASHDATA2 +- vaesenclast RNDKEYLAST3, V3, GHASHDATA3 +- vmovdqu8 GHASHDATA0, 0*VL(DST) +- vmovdqu8 GHASHDATA1, 1*VL(DST) +- vmovdqu8 GHASHDATA2, 2*VL(DST) +- vmovdqu8 GHASHDATA3, 3*VL(DST) +- add $4*VL, SRC +- add $4*VL, DST +- sub $4*VL, DATALEN ++ _aesenclast_and_xor_4x ++ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 ++ sub $-4*VL, DST ++ add $-4*VL, DATALEN + jl .Lghash_last_ciphertext_4x\@ + .endif + + // Cache as many additional AES round keys as possible. +-.irp i, 9,8,7,6,5 ++.irp i, 9,8,7,6,5,4,3,2,1 + vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY_M\i + .endr + +@@ -799,50 +809,17 @@ + _vaesenc_4x RNDKEY + 128: + +- // XOR the source data with the last round key, saving the result in +- // RNDKEYLAST[0-3]. This reduces latency by taking advantage of the +- // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +-.if \enc +- vpxord 0*VL(SRC), RNDKEYLAST, RNDKEYLAST0 +- vpxord 1*VL(SRC), RNDKEYLAST, RNDKEYLAST1 +- vpxord 2*VL(SRC), RNDKEYLAST, RNDKEYLAST2 +- vpxord 3*VL(SRC), RNDKEYLAST, RNDKEYLAST3 +-.else +- vpxord GHASHDATA0, RNDKEYLAST, RNDKEYLAST0 +- vpxord GHASHDATA1, RNDKEYLAST, RNDKEYLAST1 +- vpxord GHASHDATA2, RNDKEYLAST, RNDKEYLAST2 +- vpxord GHASHDATA3, RNDKEYLAST, RNDKEYLAST3 +-.endif +- + // Finish the AES encryption of the counter blocks in V0-V3, interleaved + // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. +-.irp i, 9,8,7,6,5 ++.irp i, 9,8,7,6,5,4,3,2,1 ++ _ghash_step_4x (9 - \i) + _vaesenc_4x RNDKEY_M\i +- _ghash_step_4x (9 - \i) +-.endr +-.irp i, 4,3,2,1 +- vbroadcasti32x4 -\i*16(RNDKEYLAST_PTR), RNDKEY +- _vaesenc_4x RNDKEY +- _ghash_step_4x (9 - \i) + .endr + _ghash_step_4x 9 +- +- // Do the last AES round. This handles the XOR with the source data +- // too, as per the optimization described above. +- vaesenclast RNDKEYLAST0, V0, GHASHDATA0 +- vaesenclast RNDKEYLAST1, V1, GHASHDATA1 +- vaesenclast RNDKEYLAST2, V2, GHASHDATA2 +- vaesenclast RNDKEYLAST3, V3, GHASHDATA3 +- +- // Store the en/decrypted data to DST. +- vmovdqu8 GHASHDATA0, 0*VL(DST) +- vmovdqu8 GHASHDATA1, 1*VL(DST) +- vmovdqu8 GHASHDATA2, 2*VL(DST) +- vmovdqu8 GHASHDATA3, 3*VL(DST) +- +- add $4*VL, SRC +- add $4*VL, DST +- sub $4*VL, DATALEN ++ _aesenclast_and_xor_4x ++ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 ++ sub $-4*VL, DST ++ add $-4*VL, DATALEN + jge .Lcrypt_loop_4x\@ + + .if \enc +@@ -856,7 +833,7 @@ + .Lcrypt_loop_4x_done\@: + + // Undo the extra subtraction by 4*VL and check whether data remains. +- add $4*VL, DATALEN ++ sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 + jz .Ldone\@ + + // The data length isn't a multiple of 4*VL. Process the remaining data +@@ -940,7 +917,7 @@ + // GHASH. However, any such blocks are all-zeroes, and the values that + // they're multiplied with are also all-zeroes. Therefore they just add + // 0 * 0 = 0 to the final GHASH result, which makes no difference. 
+- vmovdqu8 (POWERS_PTR), H_POW1 ++ vmovdqu8 (POWERS_PTR), H_POW1 + .if \enc + vmovdqu8 V0, V1{%k1}{z} + .endif +diff --git a/arch/x86/crypto/aes-xts-avx-x86_64.S b/arch/x86/crypto/aes-xts-avx-x86_64.S +index 48f97b79f7a9..8a3e23fbcf85 100644 +--- a/arch/x86/crypto/aes-xts-avx-x86_64.S ++++ b/arch/x86/crypto/aes-xts-avx-x86_64.S +@@ -80,22 +80,6 @@ + .byte 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80, 0x80 + .text + +-// Function parameters +-.set KEY, %rdi // Initially points to crypto_aes_ctx, then is +- // advanced to point to 7th-from-last round key +-.set SRC, %rsi // Pointer to next source data +-.set DST, %rdx // Pointer to next destination data +-.set LEN, %ecx // Remaining length in bytes +-.set LEN8, %cl +-.set LEN64, %rcx +-.set TWEAK, %r8 // Pointer to next tweak +- +-// %rax holds the AES key length in bytes. +-.set KEYLEN, %eax +-.set KEYLEN64, %rax +- +-// %r9-r11 are available as temporaries. +- + .macro _define_Vi i + .if VL == 16 + .set V\i, %xmm\i +@@ -112,41 +96,31 @@ + // Define register aliases V0-V15, or V0-V31 if all 32 SIMD registers + // are available, that map to the xmm, ymm, or zmm registers according + // to the selected Vector Length (VL). +- _define_Vi 0 +- _define_Vi 1 +- _define_Vi 2 +- _define_Vi 3 +- _define_Vi 4 +- _define_Vi 5 +- _define_Vi 6 +- _define_Vi 7 +- _define_Vi 8 +- _define_Vi 9 +- _define_Vi 10 +- _define_Vi 11 +- _define_Vi 12 +- _define_Vi 13 +- _define_Vi 14 +- _define_Vi 15 ++.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15 ++ _define_Vi \i ++.endr + .if USE_AVX10 +- _define_Vi 16 +- _define_Vi 17 +- _define_Vi 18 +- _define_Vi 19 +- _define_Vi 20 +- _define_Vi 21 +- _define_Vi 22 +- _define_Vi 23 +- _define_Vi 24 +- _define_Vi 25 +- _define_Vi 26 +- _define_Vi 27 +- _define_Vi 28 +- _define_Vi 29 +- _define_Vi 30 +- _define_Vi 31 ++.irp i, 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 ++ _define_Vi \i ++.endr + .endif + ++ // Function parameters ++ .set KEY, %rdi // Initially points to crypto_aes_ctx, then is ++ // advanced to point to 7th-from-last round key ++ .set SRC, %rsi // Pointer to next source data ++ .set DST, %rdx // Pointer to next destination data ++ .set LEN, %ecx // Remaining length in bytes ++ .set LEN8, %cl ++ .set LEN64, %rcx ++ .set TWEAK, %r8 // Pointer to next tweak ++ ++ // %rax holds the AES key length in bytes. ++ .set KEYLEN, %eax ++ .set KEYLEN64, %rax ++ ++ // %r9-r11 are available as temporaries. ++ + // V0-V3 hold the data blocks during the main loop, or temporary values + // otherwise. V4-V5 hold temporary values. + +@@ -214,6 +188,7 @@ + .endm + + // Move a vector between memory and a register. ++// The register operand must be in the first 16 vector registers. + .macro _vmovdqu src, dst + .if VL < 64 + vmovdqu \src, \dst +@@ -234,11 +209,12 @@ + .endm + + // XOR two vectors together. ++// Any register operands must be in the first 16 vector registers. + .macro _vpxor src1, src2, dst +-.if USE_AVX10 +- vpxord \src1, \src2, \dst +-.else ++.if VL < 64 + vpxor \src1, \src2, \dst ++.else ++ vpxord \src1, \src2, \dst + .endif + .endm + +@@ -259,8 +235,12 @@ + vpshufd $0x13, \src, \tmp + vpaddq \src, \src, \dst + vpsrad $31, \tmp, \tmp ++.if USE_AVX10 ++ vpternlogd $0x78, GF_POLY_XMM, \tmp, \dst ++.else + vpand GF_POLY_XMM, \tmp, \tmp + vpxor \tmp, \dst, \dst ++.endif + .endm + + // Given the XTS tweak(s) in the vector \src, compute the next vector of +@@ -369,9 +349,14 @@ + + // Do one step in computing the next set of tweaks using the VPCLMULQDQ method + // (the same method _next_tweakvec uses for VL > 16). 
This means multiplying +-// each tweak by x^(4*VL/16) independently. Since 4*VL/16 is a multiple of 8 +-// when VL > 16 (which it is here), the needed shift amounts are byte-aligned, +-// which allows the use of vpsrldq and vpslldq to do 128-bit wide shifts. ++// each tweak by x^(4*VL/16) independently. ++// ++// Since 4*VL/16 is a multiple of 8 when VL > 16 (which it is here), the needed ++// shift amounts are byte-aligned, which allows the use of vpsrldq and vpslldq ++// to do 128-bit wide shifts. The 128-bit left shift (vpslldq) saves ++// instructions directly. The 128-bit right shift (vpsrldq) performs better ++// than a 64-bit right shift on Intel CPUs in the context where it is used here, ++// because it runs on a different execution port from the AES instructions. + .macro _tweak_step_pclmul i + .if \i == 0 + vpsrldq $(128 - 4*VL/16) / 8, TWEAK0, NEXT_TWEAK0 +@@ -406,7 +391,7 @@ + // \i that include at least 0 through 19, then 1000 which signals the last step. + // + // This is used to interleave the computation of the next set of tweaks with the +-// AES en/decryptions, which increases performance in some cases. ++// AES en/decryptions, which increases performance in some cases. Clobbers V5. + .macro _tweak_step i + .if VL == 16 + _tweak_step_mulx \i +@@ -443,9 +428,10 @@ + // the last round needs different instructions. + // + // An alternative approach would be to roll up all the round loops. We +- // don't do that because it isn't compatible with caching the round keys +- // in registers which we do when possible (see below), and also because +- // it seems unwise to rely *too* heavily on the CPU's branch predictor. ++ // don't do that because (a) it isn't compatible with caching the round ++ // keys in registers which we do when possible (see below), (b) we ++ // interleave the AES rounds with the XTS tweak computation, and (c) it ++ // seems unwise to rely *too* heavily on the CPU's branch predictor. + lea OFFS-16(KEY, KEYLEN64, 4), KEY + + // If all 32 SIMD registers are available, cache all the round keys. +@@ -472,90 +458,94 @@ + .endif + .endm + +-// Do a single round of AES encryption (if \enc==1) or decryption (if \enc==0) +-// on the block(s) in \data using the round key(s) in \key. The register length +-// determines the number of AES blocks en/decrypted. +-.macro _vaes enc, last, key, data ++// Do a single non-last round of AES encryption (if \enc==1) or decryption (if ++// \enc==0) on the block(s) in \data using the round key(s) in \key. The ++// register length determines the number of AES blocks en/decrypted. ++.macro _vaes enc, key, data + .if \enc +-.if \last +- vaesenclast \key, \data, \data +-.else + vaesenc \key, \data, \data +-.endif +-.else +-.if \last +- vaesdeclast \key, \data, \data + .else + vaesdec \key, \data, \data + .endif ++.endm ++ ++// Same as _vaes, but does the last round. ++.macro _vaeslast enc, key, data ++.if \enc ++ vaesenclast \key, \data, \data ++.else ++ vaesdeclast \key, \data, \data + .endif + .endm + +-// Do a single round of AES en/decryption on the block(s) in \data, using the +-// same key for all block(s). The round key is loaded from the appropriate +-// register or memory location for round \i. May clobber V4. +-.macro _vaes_1x enc, last, i, xmm_suffix, data ++// Do a single non-last round of AES en/decryption on the block(s) in \data, ++// using the same key for all block(s). The round key is loaded from the ++// appropriate register or memory location for round \i. May clobber \tmp. 
++.macro _vaes_1x enc, i, xmm_suffix, data, tmp + .if USE_AVX10 +- _vaes \enc, \last, KEY\i\xmm_suffix, \data ++ _vaes \enc, KEY\i\xmm_suffix, \data + .else + .ifnb \xmm_suffix +- _vaes \enc, \last, (\i-7)*16(KEY), \data ++ _vaes \enc, (\i-7)*16(KEY), \data + .else +- _vbroadcast128 (\i-7)*16(KEY), V4 +- _vaes \enc, \last, V4, \data ++ _vbroadcast128 (\i-7)*16(KEY), \tmp ++ _vaes \enc, \tmp, \data + .endif + .endif + .endm + +-// Do a single round of AES en/decryption on the blocks in registers V0-V3, +-// using the same key for all blocks. The round key is loaded from the ++// Do a single non-last round of AES en/decryption on the blocks in registers ++// V0-V3, using the same key for all blocks. The round key is loaded from the + // appropriate register or memory location for round \i. In addition, does two +-// steps of the computation of the next set of tweaks. May clobber V4. +-.macro _vaes_4x enc, last, i ++// steps of the computation of the next set of tweaks. May clobber V4 and V5. ++.macro _vaes_4x enc, i + .if USE_AVX10 + _tweak_step (2*(\i-5)) +- _vaes \enc, \last, KEY\i, V0 +- _vaes \enc, \last, KEY\i, V1 ++ _vaes \enc, KEY\i, V0 ++ _vaes \enc, KEY\i, V1 + _tweak_step (2*(\i-5) + 1) +- _vaes \enc, \last, KEY\i, V2 +- _vaes \enc, \last, KEY\i, V3 ++ _vaes \enc, KEY\i, V2 ++ _vaes \enc, KEY\i, V3 + .else + _vbroadcast128 (\i-7)*16(KEY), V4 + _tweak_step (2*(\i-5)) +- _vaes \enc, \last, V4, V0 +- _vaes \enc, \last, V4, V1 ++ _vaes \enc, V4, V0 ++ _vaes \enc, V4, V1 + _tweak_step (2*(\i-5) + 1) +- _vaes \enc, \last, V4, V2 +- _vaes \enc, \last, V4, V3 ++ _vaes \enc, V4, V2 ++ _vaes \enc, V4, V3 + .endif + .endm + + // Do tweaked AES en/decryption (i.e., XOR with \tweak, then AES en/decrypt, + // then XOR with \tweak again) of the block(s) in \data. To process a single + // block, use xmm registers and set \xmm_suffix=_XMM. To process a vector of +-// length VL, use V* registers and leave \xmm_suffix empty. May clobber V4. +-.macro _aes_crypt enc, xmm_suffix, tweak, data ++// length VL, use V* registers and leave \xmm_suffix empty. Clobbers \tmp. ++.macro _aes_crypt enc, xmm_suffix, tweak, data, tmp + _xor3 KEY0\xmm_suffix, \tweak, \data + cmp $24, KEYLEN + jl .Laes128\@ + je .Laes192\@ +- _vaes_1x \enc, 0, 1, \xmm_suffix, \data +- _vaes_1x \enc, 0, 2, \xmm_suffix, \data ++ _vaes_1x \enc, 1, \xmm_suffix, \data, tmp=\tmp ++ _vaes_1x \enc, 2, \xmm_suffix, \data, tmp=\tmp + .Laes192\@: +- _vaes_1x \enc, 0, 3, \xmm_suffix, \data +- _vaes_1x \enc, 0, 4, \xmm_suffix, \data ++ _vaes_1x \enc, 3, \xmm_suffix, \data, tmp=\tmp ++ _vaes_1x \enc, 4, \xmm_suffix, \data, tmp=\tmp + .Laes128\@: +- _vaes_1x \enc, 0, 5, \xmm_suffix, \data +- _vaes_1x \enc, 0, 6, \xmm_suffix, \data +- _vaes_1x \enc, 0, 7, \xmm_suffix, \data +- _vaes_1x \enc, 0, 8, \xmm_suffix, \data +- _vaes_1x \enc, 0, 9, \xmm_suffix, \data +- _vaes_1x \enc, 0, 10, \xmm_suffix, \data +- _vaes_1x \enc, 0, 11, \xmm_suffix, \data +- _vaes_1x \enc, 0, 12, \xmm_suffix, \data +- _vaes_1x \enc, 0, 13, \xmm_suffix, \data +- _vaes_1x \enc, 1, 14, \xmm_suffix, \data +- _vpxor \tweak, \data, \data ++.irp i, 5,6,7,8,9,10,11,12,13 ++ _vaes_1x \enc, \i, \xmm_suffix, \data, tmp=\tmp ++.endr ++.if USE_AVX10 ++ vpxord KEY14\xmm_suffix, \tweak, \tmp ++.else ++.ifnb \xmm_suffix ++ vpxor 7*16(KEY), \tweak, \tmp ++.else ++ _vbroadcast128 7*16(KEY), \tmp ++ vpxor \tweak, \tmp, \tmp ++.endif ++.endif ++ _vaeslast \enc, \tmp, \data + .endm + + .macro _aes_xts_crypt enc +@@ -581,7 +571,7 @@ + // Compute the first set of tweaks TWEAK[0-3]. 
+ _compute_first_set_of_tweaks + +- sub $4*VL, LEN ++ add $-4*VL, LEN // shorter than 'sub 4*VL' when VL=32 + jl .Lhandle_remainder\@ + + .Lmain_loop\@: +@@ -589,10 +579,10 @@ + + // XOR each source block with its tweak and the zero-th round key. + .if USE_AVX10 +- vmovdqu8 0*VL(SRC), V0 +- vmovdqu8 1*VL(SRC), V1 +- vmovdqu8 2*VL(SRC), V2 +- vmovdqu8 3*VL(SRC), V3 ++ _vmovdqu 0*VL(SRC), V0 ++ _vmovdqu 1*VL(SRC), V1 ++ _vmovdqu 2*VL(SRC), V2 ++ _vmovdqu 3*VL(SRC), V3 + vpternlogd $0x96, TWEAK0, KEY0, V0 + vpternlogd $0x96, TWEAK1, KEY0, V1 + vpternlogd $0x96, TWEAK2, KEY0, V2 +@@ -612,28 +602,43 @@ + je .Laes192\@ + // Do all the AES rounds on the data blocks, interleaved with + // the computation of the next set of tweaks. +- _vaes_4x \enc, 0, 1 +- _vaes_4x \enc, 0, 2 ++ _vaes_4x \enc, 1 ++ _vaes_4x \enc, 2 + .Laes192\@: +- _vaes_4x \enc, 0, 3 +- _vaes_4x \enc, 0, 4 ++ _vaes_4x \enc, 3 ++ _vaes_4x \enc, 4 + .Laes128\@: +- _vaes_4x \enc, 0, 5 +- _vaes_4x \enc, 0, 6 +- _vaes_4x \enc, 0, 7 +- _vaes_4x \enc, 0, 8 +- _vaes_4x \enc, 0, 9 +- _vaes_4x \enc, 0, 10 +- _vaes_4x \enc, 0, 11 +- _vaes_4x \enc, 0, 12 +- _vaes_4x \enc, 0, 13 +- _vaes_4x \enc, 1, 14 +- +- // XOR in the tweaks again. +- _vpxor TWEAK0, V0, V0 +- _vpxor TWEAK1, V1, V1 +- _vpxor TWEAK2, V2, V2 +- _vpxor TWEAK3, V3, V3 ++.irp i, 5,6,7,8,9,10,11,12,13 ++ _vaes_4x \enc, \i ++.endr ++ // Do the last AES round, then XOR the results with the tweaks again. ++ // Reduce latency by doing the XOR before the vaesenclast, utilizing the ++ // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a) ++ // (and likewise for vaesdeclast). ++.if USE_AVX10 ++ _tweak_step 18 ++ _tweak_step 19 ++ vpxord TWEAK0, KEY14, V4 ++ vpxord TWEAK1, KEY14, V5 ++ _vaeslast \enc, V4, V0 ++ _vaeslast \enc, V5, V1 ++ vpxord TWEAK2, KEY14, V4 ++ vpxord TWEAK3, KEY14, V5 ++ _vaeslast \enc, V4, V2 ++ _vaeslast \enc, V5, V3 ++.else ++ _vbroadcast128 7*16(KEY), V4 ++ _tweak_step 18 // uses V5 ++ _tweak_step 19 // uses V5 ++ vpxor TWEAK0, V4, V5 ++ _vaeslast \enc, V5, V0 ++ vpxor TWEAK1, V4, V5 ++ _vaeslast \enc, V5, V1 ++ vpxor TWEAK2, V4, V5 ++ vpxor TWEAK3, V4, V4 ++ _vaeslast \enc, V5, V2 ++ _vaeslast \enc, V4, V3 ++.endif + + // Store the destination blocks. + _vmovdqu V0, 0*VL(DST) +@@ -644,9 +649,9 @@ + // Finish computing the next set of tweaks. + _tweak_step 1000 + +- add $4*VL, SRC +- add $4*VL, DST +- sub $4*VL, LEN ++ sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 ++ sub $-4*VL, DST ++ add $-4*VL, LEN + jge .Lmain_loop\@ + + // Check for the uncommon case where the data length isn't a multiple of +@@ -670,7 +675,7 @@ + jl .Lvec_at_a_time_done\@ + .Lvec_at_a_time\@: + _vmovdqu (SRC), V0 +- _aes_crypt \enc, , TWEAK0, V0 ++ _aes_crypt \enc, , TWEAK0, V0, tmp=V1 + _vmovdqu V0, (DST) + _next_tweakvec TWEAK0, V0, V1, TWEAK0 + add $VL, SRC +@@ -687,7 +692,7 @@ + jl .Lblock_at_a_time_done\@ + .Lblock_at_a_time\@: + vmovdqu (SRC), %xmm0 +- _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + _next_tweak TWEAK0_XMM, %xmm0, TWEAK0_XMM + add $16, SRC +@@ -715,7 +720,7 @@ + // Do it now by advancing the tweak and decrypting the last full block. + _next_tweak TWEAK0_XMM, %xmm0, TWEAK1_XMM + vmovdqu (SRC), %xmm0 +- _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK1_XMM, %xmm0, tmp=%xmm1 + .endif + + .if USE_AVX10 +@@ -758,47 +763,49 @@ + vpblendvb %xmm3, %xmm0, %xmm1, %xmm0 + .endif + // En/decrypt again and store the last full block. 
+- _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0 ++ _aes_crypt \enc, _XMM, TWEAK0_XMM, %xmm0, tmp=%xmm1 + vmovdqu %xmm0, (DST) + jmp .Ldone\@ + .endm + + // void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + // u8 iv[AES_BLOCK_SIZE]); ++// ++// Encrypt |iv| using the AES key |tweak_key| to get the first tweak. Assumes ++// that the CPU supports AES-NI and AVX, but not necessarily VAES or AVX10. + SYM_TYPED_FUNC_START(aes_xts_encrypt_iv) +- vmovdqu (%rsi), %xmm0 +- vpxor (%rdi), %xmm0, %xmm0 +- movl 480(%rdi), %eax // AES key length +- lea -16(%rdi, %rax, 4), %rdi +- cmp $24, %eax ++ .set TWEAK_KEY, %rdi ++ .set IV, %rsi ++ .set KEYLEN, %eax ++ .set KEYLEN64, %rax ++ ++ vmovdqu (IV), %xmm0 ++ vpxor (TWEAK_KEY), %xmm0, %xmm0 ++ movl 480(TWEAK_KEY), KEYLEN ++ lea -16(TWEAK_KEY, KEYLEN64, 4), TWEAK_KEY ++ cmp $24, KEYLEN + jl .Lencrypt_iv_aes128 + je .Lencrypt_iv_aes192 +- vaesenc -6*16(%rdi), %xmm0, %xmm0 +- vaesenc -5*16(%rdi), %xmm0, %xmm0 ++ vaesenc -6*16(TWEAK_KEY), %xmm0, %xmm0 ++ vaesenc -5*16(TWEAK_KEY), %xmm0, %xmm0 + .Lencrypt_iv_aes192: +- vaesenc -4*16(%rdi), %xmm0, %xmm0 +- vaesenc -3*16(%rdi), %xmm0, %xmm0 ++ vaesenc -4*16(TWEAK_KEY), %xmm0, %xmm0 ++ vaesenc -3*16(TWEAK_KEY), %xmm0, %xmm0 + .Lencrypt_iv_aes128: +- vaesenc -2*16(%rdi), %xmm0, %xmm0 +- vaesenc -1*16(%rdi), %xmm0, %xmm0 +- vaesenc 0*16(%rdi), %xmm0, %xmm0 +- vaesenc 1*16(%rdi), %xmm0, %xmm0 +- vaesenc 2*16(%rdi), %xmm0, %xmm0 +- vaesenc 3*16(%rdi), %xmm0, %xmm0 +- vaesenc 4*16(%rdi), %xmm0, %xmm0 +- vaesenc 5*16(%rdi), %xmm0, %xmm0 +- vaesenc 6*16(%rdi), %xmm0, %xmm0 +- vaesenclast 7*16(%rdi), %xmm0, %xmm0 +- vmovdqu %xmm0, (%rsi) ++.irp i, -2,-1,0,1,2,3,4,5,6 ++ vaesenc \i*16(TWEAK_KEY), %xmm0, %xmm0 ++.endr ++ vaesenclast 7*16(TWEAK_KEY), %xmm0, %xmm0 ++ vmovdqu %xmm0, (IV) + RET + SYM_FUNC_END(aes_xts_encrypt_iv) + + // Below are the actual AES-XTS encryption and decryption functions, + // instantiated from the above macro. They all have the following prototype: + // +-// void (*xts_asm_func)(const struct crypto_aes_ctx *key, +-// const u8 *src, u8 *dst, unsigned int len, +-// u8 tweak[AES_BLOCK_SIZE]); ++// void (*xts_crypt_func)(const struct crypto_aes_ctx *key, ++// const u8 *src, u8 *dst, int len, ++// u8 tweak[AES_BLOCK_SIZE]); + // + // |key| is the data key. |tweak| contains the next tweak; the encryption of + // the original IV with the tweak key was already done. This function supports +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index fbf43482e1f5..11e95fc62636 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -505,7 +505,7 @@ static int xts_setkey_aesni(struct crypto_skcipher *tfm, const u8 *key, + typedef void (*xts_encrypt_iv_func)(const struct crypto_aes_ctx *tweak_key, + u8 iv[AES_BLOCK_SIZE]); + typedef void (*xts_crypt_func)(const struct crypto_aes_ctx *key, +- const u8 *src, u8 *dst, unsigned int len, ++ const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]); + + /* This handles cases where the source and/or destination span pages. 
*/ +@@ -624,14 +624,14 @@ static void aesni_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + } + + static void aesni_xts_encrypt(const struct crypto_aes_ctx *key, +- const u8 *src, u8 *dst, unsigned int len, ++ const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) + { + aesni_xts_enc(key, dst, src, len, tweak); + } + + static void aesni_xts_decrypt(const struct crypto_aes_ctx *key, +- const u8 *src, u8 *dst, unsigned int len, ++ const u8 *src, u8 *dst, int len, + u8 tweak[AES_BLOCK_SIZE]) + { + aesni_xts_dec(key, dst, src, len, tweak); +@@ -790,10 +790,10 @@ asmlinkage void aes_xts_encrypt_iv(const struct crypto_aes_ctx *tweak_key, + \ + asmlinkage void \ + aes_xts_encrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ +- u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ ++ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ + asmlinkage void \ + aes_xts_decrypt_##suffix(const struct crypto_aes_ctx *key, const u8 *src, \ +- u8 *dst, unsigned int len, u8 tweak[AES_BLOCK_SIZE]); \ ++ u8 *dst, int len, u8 tweak[AES_BLOCK_SIZE]); \ + \ + static int xts_encrypt_##suffix(struct skcipher_request *req) \ + { \ +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch b/sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch new file mode 100644 index 0000000..eda5a21 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0007-itmt-core-ranking.patch @@ -0,0 +1,365 @@ +From 226e2a915189fff660383f067038534fe0346694 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:31:52 +0100 +Subject: [PATCH 07/12] itmt-core-ranking + +Signed-off-by: Peter Jung +--- + arch/x86/include/asm/topology.h | 4 +- + arch/x86/kernel/itmt.c | 81 ++++++++++++++------------------- + arch/x86/kernel/smpboot.c | 8 +--- + kernel/sched/fair.c | 42 +++++++++++++---- + kernel/sched/sched.h | 1 - + kernel/sched/topology.c | 15 +----- + 6 files changed, 69 insertions(+), 82 deletions(-) + +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index fd41103ad342..63bab25a4896 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ -250,7 +250,7 @@ extern bool x86_topology_update; + #include + + DECLARE_PER_CPU_READ_MOSTLY(int, sched_core_priority); +-extern unsigned int __read_mostly sysctl_sched_itmt_enabled; ++extern bool __read_mostly sysctl_sched_itmt_enabled; + + /* Interface to set priority of a cpu */ + void sched_set_itmt_core_prio(int prio, int core_cpu); +@@ -263,7 +263,7 @@ void sched_clear_itmt_support(void); + + #else /* CONFIG_SCHED_MC_PRIO */ + +-#define sysctl_sched_itmt_enabled 0 ++#define sysctl_sched_itmt_enabled false + static inline void sched_set_itmt_core_prio(int prio, int core_cpu) + { + } +diff --git a/arch/x86/kernel/itmt.c b/arch/x86/kernel/itmt.c +index 51b805c727fc..9cea1fc36c18 100644 +--- a/arch/x86/kernel/itmt.c ++++ b/arch/x86/kernel/itmt.c +@@ -19,6 +19,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -34,49 +35,38 @@ static bool __read_mostly sched_itmt_capable; + * of higher turbo frequency for cpus supporting Intel Turbo Boost Max + * Technology 3.0. 
+ * +- * It can be set via /proc/sys/kernel/sched_itmt_enabled ++ * It can be set via /sys/kernel/debug/x86/sched_itmt_enabled + */ +-unsigned int __read_mostly sysctl_sched_itmt_enabled; ++bool __read_mostly sysctl_sched_itmt_enabled; + +-static int sched_itmt_update_handler(const struct ctl_table *table, int write, +- void *buffer, size_t *lenp, loff_t *ppos) ++static ssize_t sched_itmt_enabled_write(struct file *filp, ++ const char __user *ubuf, ++ size_t cnt, loff_t *ppos) + { +- unsigned int old_sysctl; +- int ret; ++ ssize_t result; ++ bool orig; + +- mutex_lock(&itmt_update_mutex); ++ guard(mutex)(&itmt_update_mutex); + +- if (!sched_itmt_capable) { +- mutex_unlock(&itmt_update_mutex); +- return -EINVAL; +- } +- +- old_sysctl = sysctl_sched_itmt_enabled; +- ret = proc_dointvec_minmax(table, write, buffer, lenp, ppos); ++ orig = sysctl_sched_itmt_enabled; ++ result = debugfs_write_file_bool(filp, ubuf, cnt, ppos); + +- if (!ret && write && old_sysctl != sysctl_sched_itmt_enabled) { ++ if (sysctl_sched_itmt_enabled != orig) { + x86_topology_update = true; + rebuild_sched_domains(); + } + +- mutex_unlock(&itmt_update_mutex); +- +- return ret; ++ return result; + } + +-static struct ctl_table itmt_kern_table[] = { +- { +- .procname = "sched_itmt_enabled", +- .data = &sysctl_sched_itmt_enabled, +- .maxlen = sizeof(unsigned int), +- .mode = 0644, +- .proc_handler = sched_itmt_update_handler, +- .extra1 = SYSCTL_ZERO, +- .extra2 = SYSCTL_ONE, +- }, ++static const struct file_operations dfs_sched_itmt_fops = { ++ .read = debugfs_read_file_bool, ++ .write = sched_itmt_enabled_write, ++ .open = simple_open, ++ .llseek = default_llseek, + }; + +-static struct ctl_table_header *itmt_sysctl_header; ++static struct dentry *dfs_sched_itmt; + + /** + * sched_set_itmt_support() - Indicate platform supports ITMT +@@ -97,16 +87,18 @@ static struct ctl_table_header *itmt_sysctl_header; + */ + int sched_set_itmt_support(void) + { +- mutex_lock(&itmt_update_mutex); ++ guard(mutex)(&itmt_update_mutex); + +- if (sched_itmt_capable) { +- mutex_unlock(&itmt_update_mutex); ++ if (sched_itmt_capable) + return 0; +- } + +- itmt_sysctl_header = register_sysctl("kernel", itmt_kern_table); +- if (!itmt_sysctl_header) { +- mutex_unlock(&itmt_update_mutex); ++ dfs_sched_itmt = debugfs_create_file_unsafe("sched_itmt_enabled", ++ 0644, ++ arch_debugfs_dir, ++ &sysctl_sched_itmt_enabled, ++ &dfs_sched_itmt_fops); ++ if (IS_ERR_OR_NULL(dfs_sched_itmt)) { ++ dfs_sched_itmt = NULL; + return -ENOMEM; + } + +@@ -117,8 +109,6 @@ int sched_set_itmt_support(void) + x86_topology_update = true; + rebuild_sched_domains(); + +- mutex_unlock(&itmt_update_mutex); +- + return 0; + } + +@@ -134,18 +124,15 @@ int sched_set_itmt_support(void) + */ + void sched_clear_itmt_support(void) + { +- mutex_lock(&itmt_update_mutex); ++ guard(mutex)(&itmt_update_mutex); + +- if (!sched_itmt_capable) { +- mutex_unlock(&itmt_update_mutex); ++ if (!sched_itmt_capable) + return; +- } ++ + sched_itmt_capable = false; + +- if (itmt_sysctl_header) { +- unregister_sysctl_table(itmt_sysctl_header); +- itmt_sysctl_header = NULL; +- } ++ debugfs_remove(dfs_sched_itmt); ++ dfs_sched_itmt = NULL; + + if (sysctl_sched_itmt_enabled) { + /* disable sched_itmt if we are no longer ITMT capable */ +@@ -153,8 +140,6 @@ void sched_clear_itmt_support(void) + x86_topology_update = true; + rebuild_sched_domains(); + } +- +- mutex_unlock(&itmt_update_mutex); + } + + int arch_asym_cpu_priority(int cpu) +diff --git a/arch/x86/kernel/smpboot.c b/arch/x86/kernel/smpboot.c 
+index f1fac08fdef2..ef63b1c0b491 100644 +--- a/arch/x86/kernel/smpboot.c ++++ b/arch/x86/kernel/smpboot.c +@@ -482,12 +482,6 @@ static int x86_core_flags(void) + return cpu_core_flags() | x86_sched_itmt_flags(); + } + #endif +-#ifdef CONFIG_SCHED_SMT +-static int x86_smt_flags(void) +-{ +- return cpu_smt_flags(); +-} +-#endif + #ifdef CONFIG_SCHED_CLUSTER + static int x86_cluster_flags(void) + { +@@ -510,7 +504,7 @@ static void __init build_sched_topology(void) + + #ifdef CONFIG_SCHED_SMT + x86_topology[i++] = (struct sched_domain_topology_level){ +- cpu_smt_mask, x86_smt_flags, SD_INIT_NAME(SMT) ++ cpu_smt_mask, cpu_smt_flags, SD_INIT_NAME(SMT) + }; + #endif + #ifdef CONFIG_SCHED_CLUSTER +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cdb81cb0812c..232e2695a2cd 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9866,6 +9866,8 @@ struct sg_lb_stats { + unsigned int group_weight; + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ ++ unsigned int asym_prefer_cpu; /* Group CPU with highest asym priority */ ++ int highest_asym_prio; /* Asym priority of asym_prefer_cpu */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING +@@ -10195,7 +10197,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group + (sgs->group_weight - sgs->idle_cpus != 1)) + return false; + +- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); ++ return sched_asym(env->sd, env->dst_cpu, sgs->asym_prefer_cpu); + } + + /* One group has more than one SMT CPU while the other group does not */ +@@ -10276,6 +10278,17 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++static inline void ++update_sg_pick_asym_prefer(struct sg_lb_stats *sgs, int cpu) ++{ ++ int asym_prio = arch_asym_cpu_priority(cpu); ++ ++ if (asym_prio > sgs->highest_asym_prio) { ++ sgs->asym_prefer_cpu = cpu; ++ sgs->highest_asym_prio = asym_prio; ++ } ++} ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10292,11 +10305,13 @@ static inline void update_sg_lb_stats(struct lb_env *env, + bool *sg_overloaded, + bool *sg_overutilized) + { +- int i, nr_running, local_group; ++ int i, nr_running, local_group, sd_flags = env->sd->flags; ++ bool balancing_at_rd = !env->sd->parent; + + memset(sgs, 0, sizeof(*sgs)); + + local_group = group == sds->local; ++ sgs->highest_asym_prio = INT_MIN; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { + struct rq *rq = cpu_rq(i); +@@ -10310,16 +10325,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, + nr_running = rq->nr_running; + sgs->sum_nr_running += nr_running; + +- if (nr_running > 1) +- *sg_overloaded = 1; ++ if (sd_flags & SD_ASYM_PACKING) ++ update_sg_pick_asym_prefer(sgs, i); + + if (cpu_overutilized(i)) + *sg_overutilized = 1; + +-#ifdef CONFIG_NUMA_BALANCING +- sgs->nr_numa_running += rq->nr_numa_running; +- sgs->nr_preferred_running += rq->nr_preferred_running; +-#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +@@ -10329,10 +10340,21 @@ static inline void update_sg_lb_stats(struct lb_env *env, + continue; + } + ++ /* Overload indicator is only updated at root domain */ ++ if (balancing_at_rd && nr_running > 1) ++ *sg_overloaded = 1; ++ ++#ifdef CONFIG_NUMA_BALANCING ++ /* Only fbq_classify_group() uses this to classify NUMA groups */ ++ if (sd_flags & SD_NUMA) { ++ sgs->nr_numa_running += rq->nr_numa_running; ++ sgs->nr_preferred_running += rq->nr_preferred_running; ++ } ++#endif + if (local_group) + continue; + +- if (env->sd->flags & SD_ASYM_CPUCAPACITY) { ++ if (sd_flags & SD_ASYM_CPUCAPACITY) { + /* Check for a misfit task on the cpu */ + if (sgs->group_misfit_task_load < rq->misfit_task_load) { + sgs->group_misfit_task_load = rq->misfit_task_load; +@@ -10427,7 +10449,7 @@ static bool update_sd_pick_busiest(struct lb_env *env, + + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ +- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); ++ return sched_asym_prefer(busiest->asym_prefer_cpu, sgs->asym_prefer_cpu); + + case group_misfit_task: + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 18f3955ddb8f..6a9efb0fd86f 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2056,7 +2056,6 @@ struct sched_group { + unsigned int group_weight; + unsigned int cores; + struct sched_group_capacity *sgc; +- int asym_prefer_cpu; /* CPU of highest priority in group */ + int flags; + + /* +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 9748a4c8d668..59b8157cb114 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1302,7 +1302,7 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + WARN_ON(!sg); + + do { +- int cpu, cores = 0, max_cpu = -1; ++ int cpu, cores = 0; + + sg->group_weight = cpumask_weight(sched_group_span(sg)); + +@@ -1314,19 +1314,6 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + #endif + } + sg->cores = cores; +- +- if (!(sd->flags & SD_ASYM_PACKING)) +- goto next; +- +- for_each_cpu(cpu, sched_group_span(sg)) { +- if (max_cpu < 0) +- max_cpu = cpu; +- else if (sched_asym_prefer(cpu, max_cpu)) +- max_cpu = cpu; +- } +- sg->asym_prefer_cpu = max_cpu; +- +-next: + sg = sg->next; + } while (sg != sd->groups); + +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0008-ntsync.patch b/sys-kernel/gentoo-sources-6.13/0008-ntsync.patch new file mode 100644 index 0000000..3819a18 --- /dev/null +++ 
b/sys-kernel/gentoo-sources-6.13/0008-ntsync.patch @@ -0,0 +1,3050 @@ +From 5d635a3b91cbeba5def2a1a1bf1fd64b4a511923 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:32:02 +0100 +Subject: [PATCH 08/12] ntsync + +Signed-off-by: Peter Jung +--- + Documentation/userspace-api/index.rst | 1 + + Documentation/userspace-api/ntsync.rst | 385 +++++ + MAINTAINERS | 9 + + drivers/misc/Kconfig | 1 - + drivers/misc/ntsync.c | 1001 +++++++++++- + include/uapi/linux/ntsync.h | 42 +- + tools/testing/selftests/Makefile | 1 + + .../selftests/drivers/ntsync/.gitignore | 1 + + .../testing/selftests/drivers/ntsync/Makefile | 7 + + tools/testing/selftests/drivers/ntsync/config | 1 + + .../testing/selftests/drivers/ntsync/ntsync.c | 1343 +++++++++++++++++ + 11 files changed, 2773 insertions(+), 19 deletions(-) + create mode 100644 Documentation/userspace-api/ntsync.rst + create mode 100644 tools/testing/selftests/drivers/ntsync/.gitignore + create mode 100644 tools/testing/selftests/drivers/ntsync/Makefile + create mode 100644 tools/testing/selftests/drivers/ntsync/config + create mode 100644 tools/testing/selftests/drivers/ntsync/ntsync.c + +diff --git a/Documentation/userspace-api/index.rst b/Documentation/userspace-api/index.rst +index 274cc7546efc..9c1b15cd89ab 100644 +--- a/Documentation/userspace-api/index.rst ++++ b/Documentation/userspace-api/index.rst +@@ -63,6 +63,7 @@ Everything else + vduse + futex2 + perf_ring_buffer ++ ntsync + + .. only:: subproject and html + +diff --git a/Documentation/userspace-api/ntsync.rst b/Documentation/userspace-api/ntsync.rst +new file mode 100644 +index 000000000000..25e7c4aef968 +--- /dev/null ++++ b/Documentation/userspace-api/ntsync.rst +@@ -0,0 +1,385 @@ ++=================================== ++NT synchronization primitive driver ++=================================== ++ ++This page documents the user-space API for the ntsync driver. ++ ++ntsync is a support driver for emulation of NT synchronization ++primitives by user-space NT emulators. It exists because implementation ++in user-space, using existing tools, cannot match Windows performance ++while offering accurate semantics. It is implemented entirely in ++software, and does not drive any hardware device. ++ ++This interface is meant as a compatibility tool only, and should not ++be used for general synchronization. Instead use generic, versatile ++interfaces such as futex(2) and poll(2). ++ ++Synchronization primitives ++========================== ++ ++The ntsync driver exposes three types of synchronization primitives: ++semaphores, mutexes, and events. ++ ++A semaphore holds a single volatile 32-bit counter, and a static 32-bit ++integer denoting the maximum value. It is considered signaled (that is, ++can be acquired without contention, or will wake up a waiting thread) ++when the counter is nonzero. The counter is decremented by one when a ++wait is satisfied. Both the initial and maximum count are established ++when the semaphore is created. ++ ++A mutex holds a volatile 32-bit recursion count, and a volatile 32-bit ++identifier denoting its owner. A mutex is considered signaled when its ++owner is zero (indicating that it is not owned). The recursion count is ++incremented when a wait is satisfied, and ownership is set to the given ++identifier. ++ ++A mutex also holds an internal flag denoting whether its previous owner ++has died; such a mutex is said to be abandoned. 
Owner death is not ++tracked automatically based on thread death, but rather must be ++communicated using ``NTSYNC_IOC_MUTEX_KILL``. An abandoned mutex is ++inherently considered unowned. ++ ++Except for the "unowned" semantics of zero, the actual value of the ++owner identifier is not interpreted by the ntsync driver at all. The ++intended use is to store a thread identifier; however, the ntsync ++driver does not actually validate that a calling thread provides ++consistent or unique identifiers. ++ ++An event is similar to a semaphore with a maximum count of one. It holds ++a volatile boolean state denoting whether it is signaled or not. There ++are two types of events, auto-reset and manual-reset. An auto-reset ++event is designaled when a wait is satisfied; a manual-reset event is ++not. The event type is specified when the event is created. ++ ++Unless specified otherwise, all operations on an object are atomic and ++totally ordered with respect to other operations on the same object. ++ ++Objects are represented by files. When all file descriptors to an ++object are closed, that object is deleted. ++ ++Char device ++=========== ++ ++The ntsync driver creates a single char device /dev/ntsync. Each file ++description opened on the device represents a unique instance intended ++to back an individual NT virtual machine. Objects created by one ntsync ++instance may only be used with other objects created by the same ++instance. ++ ++ioctl reference ++=============== ++ ++All operations on the device are done through ioctls. There are four ++structures used in ioctl calls:: ++ ++ struct ntsync_sem_args { ++ __u32 count; ++ __u32 max; ++ }; ++ ++ struct ntsync_mutex_args { ++ __u32 owner; ++ __u32 count; ++ }; ++ ++ struct ntsync_event_args { ++ __u32 signaled; ++ __u32 manual; ++ }; ++ ++ struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 owner; ++ __u32 index; ++ __u32 alert; ++ __u32 flags; ++ __u32 pad; ++ }; ++ ++Depending on the ioctl, members of the structure may be used as input, ++output, or not at all. ++ ++The ioctls on the device file are as follows: ++ ++.. c:macro:: NTSYNC_IOC_CREATE_SEM ++ ++ Create a semaphore object. Takes a pointer to struct ++ :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``count`` ++ - Initial count of the semaphore. ++ * - ``max`` ++ - Maximum count of the semaphore. ++ ++ Fails with ``EINVAL`` if ``count`` is greater than ``max``. ++ On success, returns a file descriptor the created semaphore. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_MUTEX ++ ++ Create a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``count`` ++ - Initial recursion count of the mutex. ++ * - ``owner`` ++ - Initial owner of the mutex. ++ ++ If ``owner`` is nonzero and ``count`` is zero, or if ``owner`` is ++ zero and ``count`` is nonzero, the function fails with ``EINVAL``. ++ On success, returns a file descriptor the created mutex. ++ ++.. c:macro:: NTSYNC_IOC_CREATE_EVENT ++ ++ Create an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``signaled`` ++ - If nonzero, the event is initially signaled, otherwise ++ nonsignaled. ++ * - ``manual`` ++ - If nonzero, the event is a manual-reset event, otherwise ++ auto-reset. ++ ++ On success, returns a file descriptor the created event. ++ ++The ioctls on the individual objects are as follows: ++ ++.. 
c:macro:: NTSYNC_IOC_SEM_POST ++ ++ Post to a semaphore object. Takes a pointer to a 32-bit integer, ++ which on input holds the count to be added to the semaphore, and on ++ output contains its previous count. ++ ++ If adding to the semaphore's current count would raise the latter ++ past the semaphore's maximum count, the ioctl fails with ++ ``EOVERFLOW`` and the semaphore is not affected. If raising the ++ semaphore's count causes it to become signaled, eligible threads ++ waiting on this semaphore will be woken and the semaphore's count ++ decremented appropriately. ++ ++.. c:macro:: NTSYNC_IOC_MUTEX_UNLOCK ++ ++ Release a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``owner`` ++ - Specifies the owner trying to release this mutex. ++ * - ``count`` ++ - On output, contains the previous recursion count. ++ ++ If ``owner`` is zero, the ioctl fails with ``EINVAL``. If ``owner`` ++ is not the current owner of the mutex, the ioctl fails with ++ ``EPERM``. ++ ++ The mutex's count will be decremented by one. If decrementing the ++ mutex's count causes it to become zero, the mutex is marked as ++ unowned and signaled, and eligible threads waiting on it will be ++ woken as appropriate. ++ ++.. c:macro:: NTSYNC_IOC_SET_EVENT ++ ++ Signal an event object. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ Eligible threads will be woken, and auto-reset events will be ++ designaled appropriately. ++ ++.. c:macro:: NTSYNC_IOC_RESET_EVENT ++ ++ Designal an event object. Takes a pointer to a 32-bit integer, which ++ on output contains the previous state of the event. ++ ++.. c:macro:: NTSYNC_IOC_PULSE_EVENT ++ ++ Wake threads waiting on an event object while leaving it in an ++ unsignaled state. Takes a pointer to a 32-bit integer, which on ++ output contains the previous state of the event. ++ ++ A pulse operation can be thought of as a set followed by a reset, ++ performed as a single atomic operation. If two threads are waiting on ++ an auto-reset event which is pulsed, only one will be woken. If two ++ threads are waiting a manual-reset event which is pulsed, both will ++ be woken. However, in both cases, the event will be unsignaled ++ afterwards, and a simultaneous read operation will always report the ++ event as unsignaled. ++ ++.. c:macro:: NTSYNC_IOC_READ_SEM ++ ++ Read the current state of a semaphore object. Takes a pointer to ++ struct :c:type:`ntsync_sem_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``count`` ++ - On output, contains the current count of the semaphore. ++ * - ``max`` ++ - On output, contains the maximum count of the semaphore. ++ ++.. c:macro:: NTSYNC_IOC_READ_MUTEX ++ ++ Read the current state of a mutex object. Takes a pointer to struct ++ :c:type:`ntsync_mutex_args`, which is used as follows: ++ ++ .. list-table:: ++ ++ * - ``owner`` ++ - On output, contains the current owner of the mutex, or zero ++ if the mutex is not currently owned. ++ * - ``count`` ++ - On output, contains the current recursion count of the mutex. ++ ++ If the mutex is marked as abandoned, the function fails with ++ ``EOWNERDEAD``. In this case, ``count`` and ``owner`` are set to ++ zero. ++ ++.. c:macro:: NTSYNC_IOC_READ_EVENT ++ ++ Read the current state of an event object. Takes a pointer to struct ++ :c:type:`ntsync_event_args`, which is used as follows: ++ ++ .. 
list-table:: ++ ++ * - ``signaled`` ++ - On output, contains the current state of the event. ++ * - ``manual`` ++ - On output, contains 1 if the event is a manual-reset event, ++ and 0 otherwise. ++ ++.. c:macro:: NTSYNC_IOC_KILL_OWNER ++ ++ Mark a mutex as unowned and abandoned if it is owned by the given ++ owner. Takes an input-only pointer to a 32-bit integer denoting the ++ owner. If the owner is zero, the ioctl fails with ``EINVAL``. If the ++ owner does not own the mutex, the function fails with ``EPERM``. ++ ++ Eligible threads waiting on the mutex will be woken as appropriate ++ (and such waits will fail with ``EOWNERDEAD``, as described below). ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ANY ++ ++ Poll on any of a list of objects, atomically acquiring at most one. ++ Takes a pointer to struct :c:type:`ntsync_wait_args`, which is ++ used as follows: ++ ++ .. list-table:: ++ ++ * - ``timeout`` ++ - Absolute timeout in nanoseconds. If ``NTSYNC_WAIT_REALTIME`` ++ is set, the timeout is measured against the REALTIME clock; ++ otherwise it is measured against the MONOTONIC clock. If the ++ timeout is equal to or earlier than the current time, the ++ function returns immediately without sleeping. If ``timeout`` ++ is U64_MAX, the function will sleep until an object is ++ signaled, and will not fail with ``ETIMEDOUT``. ++ * - ``objs`` ++ - Pointer to an array of ``count`` file descriptors ++ (specified as an integer so that the structure has the same ++ size regardless of architecture). If any object is ++ invalid, the function fails with ``EINVAL``. ++ * - ``count`` ++ - Number of objects specified in the ``objs`` array. ++ If greater than ``NTSYNC_MAX_WAIT_COUNT``, the function fails ++ with ``EINVAL``. ++ * - ``owner`` ++ - Mutex owner identifier. If any object in ``objs`` is a mutex, ++ the ioctl will attempt to acquire that mutex on behalf of ++ ``owner``. If ``owner`` is zero, the ioctl fails with ++ ``EINVAL``. ++ * - ``index`` ++ - On success, contains the index (into ``objs``) of the object ++ which was signaled. If ``alert`` was signaled instead, ++ this contains ``count``. ++ * - ``alert`` ++ - Optional event object file descriptor. If nonzero, this ++ specifies an "alert" event object which, if signaled, will ++ terminate the wait. If nonzero, the identifier must point to a ++ valid event. ++ * - ``flags`` ++ - Zero or more flags. Currently the only flag is ++ ``NTSYNC_WAIT_REALTIME``, which causes the timeout to be ++ measured against the REALTIME clock instead of MONOTONIC. ++ * - ``pad`` ++ - Unused, must be set to zero. ++ ++ This function attempts to acquire one of the given objects. If unable ++ to do so, it sleeps until an object becomes signaled, subsequently ++ acquiring it, or the timeout expires. In the latter case the ioctl ++ fails with ``ETIMEDOUT``. The function only acquires one object, even ++ if multiple objects are signaled. ++ ++ A semaphore is considered to be signaled if its count is nonzero, and ++ is acquired by decrementing its count by one. A mutex is considered ++ to be signaled if it is unowned or if its owner matches the ``owner`` ++ argument, and is acquired by incrementing its recursion count by one ++ and setting its owner to the ``owner`` argument. An auto-reset event ++ is acquired by designaling it; a manual-reset event is not affected ++ by acquisition. ++ ++ Acquisition is atomic and totally ordered with respect to other ++ operations on the same object. 
If two wait operations (with different ++ ``owner`` identifiers) are queued on the same mutex, only one is ++ signaled. If two wait operations are queued on the same semaphore, ++ and a value of one is posted to it, only one is signaled. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Although this is a failure return, the function may ++ otherwise be considered successful. The mutex is marked as owned by ++ the given owner (with a recursion count of 1) and as no longer ++ abandoned, and ``index`` is still set to the index of the mutex. ++ ++ The ``alert`` argument is an "extra" event which can terminate the ++ wait, independently of all other objects. ++ ++ It is valid to pass the same object more than once, including by ++ passing the same event in the ``objs`` array and in ``alert``. If a ++ wakeup occurs due to that object being signaled, ``index`` is set to ++ the lowest index corresponding to that object. ++ ++ The function may fail with ``EINTR`` if a signal is received. ++ ++.. c:macro:: NTSYNC_IOC_WAIT_ALL ++ ++ Poll on a list of objects, atomically acquiring all of them. Takes a ++ pointer to struct :c:type:`ntsync_wait_args`, which is used ++ identically to ``NTSYNC_IOC_WAIT_ANY``, except that ``index`` is ++ always filled with zero on success if not woken via alert. ++ ++ This function attempts to simultaneously acquire all of the given ++ objects. If unable to do so, it sleeps until all objects become ++ simultaneously signaled, subsequently acquiring them, or the timeout ++ expires. In the latter case the ioctl fails with ``ETIMEDOUT`` and no ++ objects are modified. ++ ++ Objects may become signaled and subsequently designaled (through ++ acquisition by other threads) while this thread is sleeping. Only ++ once all objects are simultaneously signaled does the ioctl acquire ++ them and return. The entire acquisition is atomic and totally ordered ++ with respect to other operations on any of the given objects. ++ ++ If an abandoned mutex is acquired, the ioctl fails with ++ ``EOWNERDEAD``. Similarly to ``NTSYNC_IOC_WAIT_ANY``, all objects are ++ nevertheless marked as acquired. Note that if multiple mutex objects ++ are specified, there is no way to know which were marked as ++ abandoned. ++ ++ As with "any" waits, the ``alert`` argument is an "extra" event which ++ can terminate the wait. Critically, however, an "all" wait will ++ succeed if all members in ``objs`` are signaled, *or* if ``alert`` is ++ signaled. In the latter case ``index`` will be set to ``count``. As ++ with "any" waits, if both conditions are filled, the former takes ++ priority, and objects in ``objs`` will be acquired. ++ ++ Unlike ``NTSYNC_IOC_WAIT_ANY``, it is not valid to pass the same ++ object more than once, nor is it valid to pass the same object in ++ ``objs`` and in ``alert``. If this is attempted, the function fails ++ with ``EINVAL``. 
+diff --git a/MAINTAINERS b/MAINTAINERS +index 0fa7c5728f1e..efecb59adfe6 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -16709,6 +16709,15 @@ T: git https://github.com/Paragon-Software-Group/linux-ntfs3.git + F: Documentation/filesystems/ntfs3.rst + F: fs/ntfs3/ + ++NTSYNC SYNCHRONIZATION PRIMITIVE DRIVER ++M: Elizabeth Figura ++L: wine-devel@winehq.org ++S: Supported ++F: Documentation/userspace-api/ntsync.rst ++F: drivers/misc/ntsync.c ++F: include/uapi/linux/ntsync.h ++F: tools/testing/selftests/drivers/ntsync/ ++ + NUBUS SUBSYSTEM + M: Finn Thain + L: linux-m68k@lists.linux-m68k.org +diff --git a/drivers/misc/Kconfig b/drivers/misc/Kconfig +index 09cbe3f0ab1e..fb772bfe27c3 100644 +--- a/drivers/misc/Kconfig ++++ b/drivers/misc/Kconfig +@@ -517,7 +517,6 @@ config OPEN_DICE + + config NTSYNC + tristate "NT synchronization primitive emulation" +- depends on BROKEN + help + This module provides kernel support for emulation of Windows NT + synchronization primitives. It is not a hardware driver. +diff --git a/drivers/misc/ntsync.c b/drivers/misc/ntsync.c +index 4954553b7baa..586b86243e1d 100644 +--- a/drivers/misc/ntsync.c ++++ b/drivers/misc/ntsync.c +@@ -6,11 +6,17 @@ + */ + + #include ++#include + #include + #include ++#include ++#include + #include + #include ++#include + #include ++#include ++#include + #include + #include + #include +@@ -19,6 +25,8 @@ + + enum ntsync_type { + NTSYNC_TYPE_SEM, ++ NTSYNC_TYPE_MUTEX, ++ NTSYNC_TYPE_EVENT, + }; + + /* +@@ -30,10 +38,13 @@ enum ntsync_type { + * + * Both rely on struct file for reference counting. Individual + * ntsync_obj objects take a reference to the device when created. ++ * Wait operations take a reference to each object being waited on for ++ * the duration of the wait. + */ + + struct ntsync_obj { + spinlock_t lock; ++ int dev_locked; + + enum ntsync_type type; + +@@ -46,22 +57,344 @@ struct ntsync_obj { + __u32 count; + __u32 max; + } sem; ++ struct { ++ __u32 count; ++ pid_t owner; ++ bool ownerdead; ++ } mutex; ++ struct { ++ bool manual; ++ bool signaled; ++ } event; + } u; ++ ++ /* ++ * any_waiters is protected by the object lock, but all_waiters is ++ * protected by the device wait_all_lock. ++ */ ++ struct list_head any_waiters; ++ struct list_head all_waiters; ++ ++ /* ++ * Hint describing how many tasks are queued on this object in a ++ * wait-all operation. ++ * ++ * Any time we do a wake, we may need to wake "all" waiters as well as ++ * "any" waiters. In order to atomically wake "all" waiters, we must ++ * lock all of the objects, and that means grabbing the wait_all_lock ++ * below (and, due to lock ordering rules, before locking this object). ++ * However, wait-all is a rare operation, and grabbing the wait-all ++ * lock for every wake would create unnecessary contention. ++ * Therefore we first check whether all_hint is zero, and, if it is, ++ * we skip trying to wake "all" waiters. ++ * ++ * Since wait requests must originate from user-space threads, we're ++ * limited here by PID_MAX_LIMIT, so there's no risk of overflow. ++ */ ++ atomic_t all_hint; ++}; ++ ++struct ntsync_q_entry { ++ struct list_head node; ++ struct ntsync_q *q; ++ struct ntsync_obj *obj; ++ __u32 index; ++}; ++ ++struct ntsync_q { ++ struct task_struct *task; ++ __u32 owner; ++ ++ /* ++ * Protected via atomic_try_cmpxchg(). Only the thread that wins the ++ * compare-and-swap may actually change object states and wake this ++ * task. 
++ */ ++ atomic_t signaled; ++ ++ bool all; ++ bool ownerdead; ++ __u32 count; ++ struct ntsync_q_entry entries[]; + }; + + struct ntsync_device { ++ /* ++ * Wait-all operations must atomically grab all objects, and be totally ++ * ordered with respect to each other and wait-any operations. ++ * If one thread is trying to acquire several objects, another thread ++ * cannot touch the object at the same time. ++ * ++ * This device-wide lock is used to serialize wait-for-all ++ * operations, and operations on an object that is involved in a ++ * wait-for-all. ++ */ ++ struct mutex wait_all_lock; ++ + struct file *file; + }; + ++/* ++ * Single objects are locked using obj->lock. ++ * ++ * Multiple objects are 'locked' while holding dev->wait_all_lock. ++ * In this case however, individual objects are not locked by holding ++ * obj->lock, but by setting obj->dev_locked. ++ * ++ * This means that in order to lock a single object, the sequence is slightly ++ * more complicated than usual. Specifically it needs to check obj->dev_locked ++ * after acquiring obj->lock, if set, it needs to drop the lock and acquire ++ * dev->wait_all_lock in order to serialize against the multi-object operation. ++ */ ++ ++static void dev_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ /* ++ * By setting obj->dev_locked inside obj->lock, it is ensured that ++ * anyone holding obj->lock must see the value. ++ */ ++ obj->dev_locked = 1; ++ spin_unlock(&obj->lock); ++} ++ ++static void dev_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev == dev); ++ spin_lock(&obj->lock); ++ obj->dev_locked = 0; ++ spin_unlock(&obj->lock); ++} ++ ++static void obj_lock(struct ntsync_obj *obj) ++{ ++ struct ntsync_device *dev = obj->dev; ++ ++ for (;;) { ++ spin_lock(&obj->lock); ++ if (likely(!obj->dev_locked)) ++ break; ++ ++ spin_unlock(&obj->lock); ++ mutex_lock(&dev->wait_all_lock); ++ spin_lock(&obj->lock); ++ /* ++ * obj->dev_locked should be set and released under the same ++ * wait_all_lock section, since we now own this lock, it should ++ * be clear. 
++ */ ++ lockdep_assert(!obj->dev_locked); ++ spin_unlock(&obj->lock); ++ mutex_unlock(&dev->wait_all_lock); ++ } ++} ++ ++static void obj_unlock(struct ntsync_obj *obj) ++{ ++ spin_unlock(&obj->lock); ++} ++ ++static bool ntsync_lock_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ bool all; ++ ++ obj_lock(obj); ++ all = atomic_read(&obj->all_hint); ++ if (unlikely(all)) { ++ obj_unlock(obj); ++ mutex_lock(&dev->wait_all_lock); ++ dev_lock_obj(dev, obj); ++ } ++ ++ return all; ++} ++ ++static void ntsync_unlock_obj(struct ntsync_device *dev, struct ntsync_obj *obj, bool all) ++{ ++ if (all) { ++ dev_unlock_obj(dev, obj); ++ mutex_unlock(&dev->wait_all_lock); ++ } else { ++ obj_unlock(obj); ++ } ++} ++ ++#define ntsync_assert_held(obj) \ ++ lockdep_assert((lockdep_is_held(&(obj)->lock) != LOCK_STATE_NOT_HELD) || \ ++ ((lockdep_is_held(&(obj)->dev->wait_all_lock) != LOCK_STATE_NOT_HELD) && \ ++ (obj)->dev_locked)) ++ ++static bool is_signaled(struct ntsync_obj *obj, __u32 owner) ++{ ++ ntsync_assert_held(obj); ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ return !!obj->u.sem.count; ++ case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.owner && obj->u.mutex.owner != owner) ++ return false; ++ return obj->u.mutex.count < UINT_MAX; ++ case NTSYNC_TYPE_EVENT: ++ return obj->u.event.signaled; ++ } ++ ++ WARN(1, "bad object type %#x\n", obj->type); ++ return false; ++} ++ ++/* ++ * "locked_obj" is an optional pointer to an object which is already locked and ++ * should not be locked again. This is necessary so that changing an object's ++ * state and waking it can be a single atomic operation. ++ */ ++static void try_wake_all(struct ntsync_device *dev, struct ntsync_q *q, ++ struct ntsync_obj *locked_obj) ++{ ++ __u32 count = q->count; ++ bool can_wake = true; ++ int signaled = -1; ++ __u32 i; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ if (locked_obj) ++ lockdep_assert(locked_obj->dev_locked); ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_lock_obj(dev, q->entries[i].obj); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (!is_signaled(q->entries[i].obj, q->owner)) { ++ can_wake = false; ++ break; ++ } ++ } ++ ++ if (can_wake && atomic_try_cmpxchg(&q->signaled, &signaled, 0)) { ++ for (i = 0; i < count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ obj->u.sem.count--; ++ break; ++ case NTSYNC_TYPE_MUTEX: ++ if (obj->u.mutex.ownerdead) ++ q->ownerdead = true; ++ obj->u.mutex.ownerdead = false; ++ obj->u.mutex.count++; ++ obj->u.mutex.owner = q->owner; ++ break; ++ case NTSYNC_TYPE_EVENT: ++ if (!obj->u.event.manual) ++ obj->u.event.signaled = false; ++ break; ++ } ++ } ++ wake_up_process(q->task); ++ } ++ ++ for (i = 0; i < count; i++) { ++ if (q->entries[i].obj != locked_obj) ++ dev_unlock_obj(dev, q->entries[i].obj); ++ } ++} ++ ++static void try_wake_all_obj(struct ntsync_device *dev, struct ntsync_obj *obj) ++{ ++ struct ntsync_q_entry *entry; ++ ++ lockdep_assert_held(&dev->wait_all_lock); ++ lockdep_assert(obj->dev_locked); ++ ++ list_for_each_entry(entry, &obj->all_waiters, node) ++ try_wake_all(dev, entry->q, obj); ++} ++ ++static void try_wake_any_sem(struct ntsync_obj *sem) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(sem); ++ lockdep_assert(sem->type == NTSYNC_TYPE_SEM); ++ ++ list_for_each_entry(entry, &sem->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!sem->u.sem.count) ++ break; ++ ++ if 
(atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ sem->u.sem.count--; ++ wake_up_process(q->task); ++ } ++ } ++} ++ ++static void try_wake_any_mutex(struct ntsync_obj *mutex) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(mutex); ++ lockdep_assert(mutex->type == NTSYNC_TYPE_MUTEX); ++ ++ list_for_each_entry(entry, &mutex->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (mutex->u.mutex.count == UINT_MAX) ++ break; ++ if (mutex->u.mutex.owner && mutex->u.mutex.owner != q->owner) ++ continue; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (mutex->u.mutex.ownerdead) ++ q->ownerdead = true; ++ mutex->u.mutex.ownerdead = false; ++ mutex->u.mutex.count++; ++ mutex->u.mutex.owner = q->owner; ++ wake_up_process(q->task); ++ } ++ } ++} ++ ++static void try_wake_any_event(struct ntsync_obj *event) ++{ ++ struct ntsync_q_entry *entry; ++ ++ ntsync_assert_held(event); ++ lockdep_assert(event->type == NTSYNC_TYPE_EVENT); ++ ++ list_for_each_entry(entry, &event->any_waiters, node) { ++ struct ntsync_q *q = entry->q; ++ int signaled = -1; ++ ++ if (!event->u.event.signaled) ++ break; ++ ++ if (atomic_try_cmpxchg(&q->signaled, &signaled, entry->index)) { ++ if (!event->u.event.manual) ++ event->u.event.signaled = false; ++ wake_up_process(q->task); ++ } ++ } ++} ++ + /* + * Actually change the semaphore state, returning -EOVERFLOW if it is made + * invalid. + */ +-static int post_sem_state(struct ntsync_obj *sem, __u32 count) ++static int release_sem_state(struct ntsync_obj *sem, __u32 count) + { + __u32 sum; + +- lockdep_assert_held(&sem->lock); ++ ntsync_assert_held(sem); + + if (check_add_overflow(sem->u.sem.count, count, &sum) || + sum > sem->u.sem.max) +@@ -71,11 +404,13 @@ static int post_sem_state(struct ntsync_obj *sem, __u32 count) + return 0; + } + +-static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) ++static int ntsync_sem_release(struct ntsync_obj *sem, void __user *argp) + { ++ struct ntsync_device *dev = sem->dev; + __u32 __user *user_args = argp; + __u32 prev_count; + __u32 args; ++ bool all; + int ret; + + if (copy_from_user(&args, argp, sizeof(args))) +@@ -84,12 +419,17 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) + if (sem->type != NTSYNC_TYPE_SEM) + return -EINVAL; + +- spin_lock(&sem->lock); ++ all = ntsync_lock_obj(dev, sem); + + prev_count = sem->u.sem.count; +- ret = post_sem_state(sem, args); ++ ret = release_sem_state(sem, args); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, sem); ++ try_wake_any_sem(sem); ++ } + +- spin_unlock(&sem->lock); ++ ntsync_unlock_obj(dev, sem, all); + + if (!ret && put_user(prev_count, user_args)) + ret = -EFAULT; +@@ -97,13 +437,229 @@ static int ntsync_sem_post(struct ntsync_obj *sem, void __user *argp) + return ret; + } + +-static int ntsync_obj_release(struct inode *inode, struct file *file) ++/* ++ * Actually change the mutex state, returning -EPERM if not the owner. 
++ */ ++static int unlock_mutex_state(struct ntsync_obj *mutex, ++ const struct ntsync_mutex_args *args) + { +- struct ntsync_obj *obj = file->private_data; ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != args->owner) ++ return -EPERM; ++ ++ if (!--mutex->u.mutex.count) ++ mutex->u.mutex.owner = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_unlock(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ __u32 prev_count; ++ bool all; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ if (!args.owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ prev_count = mutex->u.mutex.count; ++ ret = unlock_mutex_state(mutex, &args); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (!ret && put_user(prev_count, &user_args->count)) ++ ret = -EFAULT; ++ ++ return ret; ++} ++ ++/* ++ * Actually change the mutex state to mark its owner as dead, ++ * returning -EPERM if not the owner. ++ */ ++static int kill_mutex_state(struct ntsync_obj *mutex, __u32 owner) ++{ ++ ntsync_assert_held(mutex); ++ ++ if (mutex->u.mutex.owner != owner) ++ return -EPERM; ++ ++ mutex->u.mutex.ownerdead = true; ++ mutex->u.mutex.owner = 0; ++ mutex->u.mutex.count = 0; ++ return 0; ++} ++ ++static int ntsync_mutex_kill(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_device *dev = mutex->dev; ++ __u32 owner; ++ bool all; ++ int ret; ++ ++ if (get_user(owner, (__u32 __user *)argp)) ++ return -EFAULT; ++ if (!owner) ++ return -EINVAL; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ ret = kill_mutex_state(mutex, owner); ++ if (!ret) { ++ if (all) ++ try_wake_all_obj(dev, mutex); ++ try_wake_any_mutex(mutex); ++ } ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ return ret; ++} ++ ++static int ntsync_event_set(struct ntsync_obj *event, void __user *argp, bool pulse) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = true; ++ if (all) ++ try_wake_all_obj(dev, event); ++ try_wake_any_event(event); ++ if (pulse) ++ event->u.event.signaled = false; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int ntsync_event_reset(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_device *dev = event->dev; ++ __u32 prev_state; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ prev_state = event->u.event.signaled; ++ event->u.event.signaled = false; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (put_user(prev_state, (__u32 __user *)argp)) ++ return -EFAULT; ++ ++ return 0; ++} ++ ++static int ntsync_sem_read(struct ntsync_obj *sem, void __user *argp) ++{ ++ struct ntsync_sem_args __user *user_args = argp; ++ struct ntsync_device *dev = sem->dev; ++ struct ntsync_sem_args args; ++ bool all; ++ ++ if (sem->type != NTSYNC_TYPE_SEM) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, sem); ++ ++ args.count = sem->u.sem.count; ++ 
args.max = sem->u.sem.max; ++ ++ ntsync_unlock_obj(dev, sem, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} + ++static int ntsync_mutex_read(struct ntsync_obj *mutex, void __user *argp) ++{ ++ struct ntsync_mutex_args __user *user_args = argp; ++ struct ntsync_device *dev = mutex->dev; ++ struct ntsync_mutex_args args; ++ bool all; ++ int ret; ++ ++ if (mutex->type != NTSYNC_TYPE_MUTEX) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, mutex); ++ ++ args.count = mutex->u.mutex.count; ++ args.owner = mutex->u.mutex.owner; ++ ret = mutex->u.mutex.ownerdead ? -EOWNERDEAD : 0; ++ ++ ntsync_unlock_obj(dev, mutex, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return ret; ++} ++ ++static int ntsync_event_read(struct ntsync_obj *event, void __user *argp) ++{ ++ struct ntsync_event_args __user *user_args = argp; ++ struct ntsync_device *dev = event->dev; ++ struct ntsync_event_args args; ++ bool all; ++ ++ if (event->type != NTSYNC_TYPE_EVENT) ++ return -EINVAL; ++ ++ all = ntsync_lock_obj(dev, event); ++ ++ args.manual = event->u.event.manual; ++ args.signaled = event->u.event.signaled; ++ ++ ntsync_unlock_obj(dev, event, all); ++ ++ if (copy_to_user(user_args, &args, sizeof(args))) ++ return -EFAULT; ++ return 0; ++} ++ ++static void ntsync_free_obj(struct ntsync_obj *obj) ++{ + fput(obj->dev->file); + kfree(obj); ++} + ++static int ntsync_obj_release(struct inode *inode, struct file *file) ++{ ++ ntsync_free_obj(file->private_data); + return 0; + } + +@@ -114,8 +670,24 @@ static long ntsync_obj_ioctl(struct file *file, unsigned int cmd, + void __user *argp = (void __user *)parm; + + switch (cmd) { +- case NTSYNC_IOC_SEM_POST: +- return ntsync_sem_post(obj, argp); ++ case NTSYNC_IOC_SEM_RELEASE: ++ return ntsync_sem_release(obj, argp); ++ case NTSYNC_IOC_SEM_READ: ++ return ntsync_sem_read(obj, argp); ++ case NTSYNC_IOC_MUTEX_UNLOCK: ++ return ntsync_mutex_unlock(obj, argp); ++ case NTSYNC_IOC_MUTEX_KILL: ++ return ntsync_mutex_kill(obj, argp); ++ case NTSYNC_IOC_MUTEX_READ: ++ return ntsync_mutex_read(obj, argp); ++ case NTSYNC_IOC_EVENT_SET: ++ return ntsync_event_set(obj, argp, false); ++ case NTSYNC_IOC_EVENT_RESET: ++ return ntsync_event_reset(obj, argp); ++ case NTSYNC_IOC_EVENT_PULSE: ++ return ntsync_event_set(obj, argp, true); ++ case NTSYNC_IOC_EVENT_READ: ++ return ntsync_event_read(obj, argp); + default: + return -ENOIOCTLCMD; + } +@@ -140,6 +712,9 @@ static struct ntsync_obj *ntsync_alloc_obj(struct ntsync_device *dev, + obj->dev = dev; + get_file(dev->file); + spin_lock_init(&obj->lock); ++ INIT_LIST_HEAD(&obj->any_waiters); ++ INIT_LIST_HEAD(&obj->all_waiters); ++ atomic_set(&obj->all_hint, 0); + + return obj; + } +@@ -165,7 +740,6 @@ static int ntsync_obj_get_fd(struct ntsync_obj *obj) + + static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + { +- struct ntsync_sem_args __user *user_args = argp; + struct ntsync_sem_args args; + struct ntsync_obj *sem; + int fd; +@@ -182,12 +756,398 @@ static int ntsync_create_sem(struct ntsync_device *dev, void __user *argp) + sem->u.sem.count = args.count; + sem->u.sem.max = args.max; + fd = ntsync_obj_get_fd(sem); +- if (fd < 0) { +- kfree(sem); +- return fd; ++ if (fd < 0) ++ ntsync_free_obj(sem); ++ ++ return fd; ++} ++ ++static int ntsync_create_mutex(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_mutex_args args; ++ struct ntsync_obj *mutex; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ 
return -EFAULT; ++ ++ if (!args.owner != !args.count) ++ return -EINVAL; ++ ++ mutex = ntsync_alloc_obj(dev, NTSYNC_TYPE_MUTEX); ++ if (!mutex) ++ return -ENOMEM; ++ mutex->u.mutex.count = args.count; ++ mutex->u.mutex.owner = args.owner; ++ fd = ntsync_obj_get_fd(mutex); ++ if (fd < 0) ++ ntsync_free_obj(mutex); ++ ++ return fd; ++} ++ ++static int ntsync_create_event(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_event_args args; ++ struct ntsync_obj *event; ++ int fd; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ event = ntsync_alloc_obj(dev, NTSYNC_TYPE_EVENT); ++ if (!event) ++ return -ENOMEM; ++ event->u.event.manual = args.manual; ++ event->u.event.signaled = args.signaled; ++ fd = ntsync_obj_get_fd(event); ++ if (fd < 0) ++ ntsync_free_obj(event); ++ ++ return fd; ++} ++ ++static struct ntsync_obj *get_obj(struct ntsync_device *dev, int fd) ++{ ++ struct file *file = fget(fd); ++ struct ntsync_obj *obj; ++ ++ if (!file) ++ return NULL; ++ ++ if (file->f_op != &ntsync_obj_fops) { ++ fput(file); ++ return NULL; ++ } ++ ++ obj = file->private_data; ++ if (obj->dev != dev) { ++ fput(file); ++ return NULL; + } + +- return put_user(fd, &user_args->sem); ++ return obj; ++} ++ ++static void put_obj(struct ntsync_obj *obj) ++{ ++ fput(obj->file); ++} ++ ++static int ntsync_schedule(const struct ntsync_q *q, const struct ntsync_wait_args *args) ++{ ++ ktime_t timeout = ns_to_ktime(args->timeout); ++ clockid_t clock = CLOCK_MONOTONIC; ++ ktime_t *timeout_ptr; ++ int ret = 0; ++ ++ timeout_ptr = (args->timeout == U64_MAX ? NULL : &timeout); ++ ++ if (args->flags & NTSYNC_WAIT_REALTIME) ++ clock = CLOCK_REALTIME; ++ ++ do { ++ if (signal_pending(current)) { ++ ret = -ERESTARTSYS; ++ break; ++ } ++ ++ set_current_state(TASK_INTERRUPTIBLE); ++ if (atomic_read(&q->signaled) != -1) { ++ ret = 0; ++ break; ++ } ++ ret = schedule_hrtimeout_range_clock(timeout_ptr, 0, HRTIMER_MODE_ABS, clock); ++ } while (ret < 0); ++ __set_current_state(TASK_RUNNING); ++ ++ return ret; ++} ++ ++/* ++ * Allocate and initialize the ntsync_q structure, but do not queue us yet. ++ */ ++static int setup_wait(struct ntsync_device *dev, ++ const struct ntsync_wait_args *args, bool all, ++ struct ntsync_q **ret_q) ++{ ++ int fds[NTSYNC_MAX_WAIT_COUNT + 1]; ++ const __u32 count = args->count; ++ struct ntsync_q *q; ++ __u32 total_count; ++ __u32 i, j; ++ ++ if (args->pad || (args->flags & ~NTSYNC_WAIT_REALTIME)) ++ return -EINVAL; ++ ++ if (args->count > NTSYNC_MAX_WAIT_COUNT) ++ return -EINVAL; ++ ++ total_count = count; ++ if (args->alert) ++ total_count++; ++ ++ if (copy_from_user(fds, u64_to_user_ptr(args->objs), ++ array_size(count, sizeof(*fds)))) ++ return -EFAULT; ++ if (args->alert) ++ fds[count] = args->alert; ++ ++ q = kmalloc(struct_size(q, entries, total_count), GFP_KERNEL); ++ if (!q) ++ return -ENOMEM; ++ q->task = current; ++ q->owner = args->owner; ++ atomic_set(&q->signaled, -1); ++ q->all = all; ++ q->ownerdead = false; ++ q->count = count; ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = get_obj(dev, fds[i]); ++ ++ if (!obj) ++ goto err; ++ ++ if (all) { ++ /* Check that the objects are all distinct. 
*/ ++ for (j = 0; j < i; j++) { ++ if (obj == q->entries[j].obj) { ++ put_obj(obj); ++ goto err; ++ } ++ } ++ } ++ ++ entry->obj = obj; ++ entry->q = q; ++ entry->index = i; ++ } ++ ++ *ret_q = q; ++ return 0; ++ ++err: ++ for (j = 0; j < i; j++) ++ put_obj(q->entries[j].obj); ++ kfree(q); ++ return -EINVAL; ++} ++ ++static void try_wake_any_obj(struct ntsync_obj *obj) ++{ ++ switch (obj->type) { ++ case NTSYNC_TYPE_SEM: ++ try_wake_any_sem(obj); ++ break; ++ case NTSYNC_TYPE_MUTEX: ++ try_wake_any_mutex(obj); ++ break; ++ case NTSYNC_TYPE_EVENT: ++ try_wake_any_event(obj); ++ break; ++ } ++} ++ ++static int ntsync_wait_any(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ __u32 i, total_count; ++ struct ntsync_q *q; ++ int signaled; ++ bool all; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, false, &q); ++ if (ret < 0) ++ return ret; ++ ++ total_count = args.count; ++ if (args.alert) ++ total_count++; ++ ++ /* queue ourselves */ ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ ++ /* ++ * Check if we are already signaled. ++ * ++ * Note that the API requires that normal objects are checked before ++ * the alert event. Hence we queue the alert event last, and check ++ * objects in order. ++ */ ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_obj *obj = q->entries[i].obj; ++ ++ if (atomic_read(&q->signaled) != -1) ++ break; ++ ++ all = ntsync_lock_obj(dev, obj); ++ try_wake_any_obj(obj); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ for (i = 0; i < total_count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_del(&entry->node); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = q->ownerdead ? -EOWNERDEAD : 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; ++} ++ ++static int ntsync_wait_all(struct ntsync_device *dev, void __user *argp) ++{ ++ struct ntsync_wait_args args; ++ struct ntsync_q *q; ++ int signaled; ++ __u32 i; ++ int ret; ++ ++ if (copy_from_user(&args, argp, sizeof(args))) ++ return -EFAULT; ++ ++ ret = setup_wait(dev, &args, true, &q); ++ if (ret < 0) ++ return ret; ++ ++ /* queue ourselves */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ atomic_inc(&obj->all_hint); ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire obj->lock ++ * here. 
++ */ ++ list_add_tail(&entry->node, &obj->all_waiters); ++ } ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ dev_lock_obj(dev, obj); ++ list_add_tail(&entry->node, &obj->any_waiters); ++ dev_unlock_obj(dev, obj); ++ } ++ ++ /* check if we are already signaled */ ++ ++ try_wake_all(dev, q, NULL); ++ ++ mutex_unlock(&dev->wait_all_lock); ++ ++ /* ++ * Check if the alert event is signaled, making sure to do so only ++ * after checking if the other objects are signaled. ++ */ ++ ++ if (args.alert) { ++ struct ntsync_obj *obj = q->entries[args.count].obj; ++ ++ if (atomic_read(&q->signaled) == -1) { ++ bool all = ntsync_lock_obj(dev, obj); ++ try_wake_any_obj(obj); ++ ntsync_unlock_obj(dev, obj, all); ++ } ++ } ++ ++ /* sleep */ ++ ++ ret = ntsync_schedule(q, &args); ++ ++ /* and finally, unqueue */ ++ ++ mutex_lock(&dev->wait_all_lock); ++ ++ for (i = 0; i < args.count; i++) { ++ struct ntsync_q_entry *entry = &q->entries[i]; ++ struct ntsync_obj *obj = entry->obj; ++ ++ /* ++ * obj->all_waiters is protected by dev->wait_all_lock rather ++ * than obj->lock, so there is no need to acquire it here. ++ */ ++ list_del(&entry->node); ++ ++ atomic_dec(&obj->all_hint); ++ ++ put_obj(obj); ++ } ++ ++ mutex_unlock(&dev->wait_all_lock); ++ ++ if (args.alert) { ++ struct ntsync_q_entry *entry = &q->entries[args.count]; ++ struct ntsync_obj *obj = entry->obj; ++ bool all; ++ ++ all = ntsync_lock_obj(dev, obj); ++ list_del(&entry->node); ++ ntsync_unlock_obj(dev, obj, all); ++ ++ put_obj(obj); ++ } ++ ++ signaled = atomic_read(&q->signaled); ++ if (signaled != -1) { ++ struct ntsync_wait_args __user *user_args = argp; ++ ++ /* even if we caught a signal, we need to communicate success */ ++ ret = q->ownerdead ? 
-EOWNERDEAD : 0; ++ ++ if (put_user(signaled, &user_args->index)) ++ ret = -EFAULT; ++ } else if (!ret) { ++ ret = -ETIMEDOUT; ++ } ++ ++ kfree(q); ++ return ret; + } + + static int ntsync_char_open(struct inode *inode, struct file *file) +@@ -198,6 +1158,8 @@ static int ntsync_char_open(struct inode *inode, struct file *file) + if (!dev) + return -ENOMEM; + ++ mutex_init(&dev->wait_all_lock); ++ + file->private_data = dev; + dev->file = file; + return nonseekable_open(inode, file); +@@ -219,8 +1181,16 @@ static long ntsync_char_ioctl(struct file *file, unsigned int cmd, + void __user *argp = (void __user *)parm; + + switch (cmd) { ++ case NTSYNC_IOC_CREATE_EVENT: ++ return ntsync_create_event(dev, argp); ++ case NTSYNC_IOC_CREATE_MUTEX: ++ return ntsync_create_mutex(dev, argp); + case NTSYNC_IOC_CREATE_SEM: + return ntsync_create_sem(dev, argp); ++ case NTSYNC_IOC_WAIT_ALL: ++ return ntsync_wait_all(dev, argp); ++ case NTSYNC_IOC_WAIT_ANY: ++ return ntsync_wait_any(dev, argp); + default: + return -ENOIOCTLCMD; + } +@@ -238,6 +1208,7 @@ static struct miscdevice ntsync_misc = { + .minor = MISC_DYNAMIC_MINOR, + .name = NTSYNC_NAME, + .fops = &ntsync_fops, ++ .mode = 0666, // Setting file permissions to 0666 + }; + + module_misc_device(ntsync_misc); +diff --git a/include/uapi/linux/ntsync.h b/include/uapi/linux/ntsync.h +index dcfa38fdc93c..6d06793512b1 100644 +--- a/include/uapi/linux/ntsync.h ++++ b/include/uapi/linux/ntsync.h +@@ -11,13 +11,49 @@ + #include + + struct ntsync_sem_args { +- __u32 sem; + __u32 count; + __u32 max; + }; + +-#define NTSYNC_IOC_CREATE_SEM _IOWR('N', 0x80, struct ntsync_sem_args) ++struct ntsync_mutex_args { ++ __u32 owner; ++ __u32 count; ++}; ++ ++struct ntsync_event_args { ++ __u32 manual; ++ __u32 signaled; ++}; ++ ++#define NTSYNC_WAIT_REALTIME 0x1 ++ ++struct ntsync_wait_args { ++ __u64 timeout; ++ __u64 objs; ++ __u32 count; ++ __u32 index; ++ __u32 flags; ++ __u32 owner; ++ __u32 alert; ++ __u32 pad; ++}; ++ ++#define NTSYNC_MAX_WAIT_COUNT 64 ++ ++#define NTSYNC_IOC_CREATE_SEM _IOW ('N', 0x80, struct ntsync_sem_args) ++#define NTSYNC_IOC_WAIT_ANY _IOWR('N', 0x82, struct ntsync_wait_args) ++#define NTSYNC_IOC_WAIT_ALL _IOWR('N', 0x83, struct ntsync_wait_args) ++#define NTSYNC_IOC_CREATE_MUTEX _IOW ('N', 0x84, struct ntsync_mutex_args) ++#define NTSYNC_IOC_CREATE_EVENT _IOW ('N', 0x87, struct ntsync_event_args) + +-#define NTSYNC_IOC_SEM_POST _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_SEM_RELEASE _IOWR('N', 0x81, __u32) ++#define NTSYNC_IOC_MUTEX_UNLOCK _IOWR('N', 0x85, struct ntsync_mutex_args) ++#define NTSYNC_IOC_MUTEX_KILL _IOW ('N', 0x86, __u32) ++#define NTSYNC_IOC_EVENT_SET _IOR ('N', 0x88, __u32) ++#define NTSYNC_IOC_EVENT_RESET _IOR ('N', 0x89, __u32) ++#define NTSYNC_IOC_EVENT_PULSE _IOR ('N', 0x8a, __u32) ++#define NTSYNC_IOC_SEM_READ _IOR ('N', 0x8b, struct ntsync_sem_args) ++#define NTSYNC_IOC_MUTEX_READ _IOR ('N', 0x8c, struct ntsync_mutex_args) ++#define NTSYNC_IOC_EVENT_READ _IOR ('N', 0x8d, struct ntsync_event_args) + + #endif +diff --git a/tools/testing/selftests/Makefile b/tools/testing/selftests/Makefile +index 2401e973c359..a8c9648e5adc 100644 +--- a/tools/testing/selftests/Makefile ++++ b/tools/testing/selftests/Makefile +@@ -18,6 +18,7 @@ TARGETS += devices/error_logs + TARGETS += devices/probe + TARGETS += dmabuf-heaps + TARGETS += drivers/dma-buf ++TARGETS += drivers/ntsync + TARGETS += drivers/s390x/uvdevice + TARGETS += drivers/net + TARGETS += drivers/net/bonding +diff --git a/tools/testing/selftests/drivers/ntsync/.gitignore 
b/tools/testing/selftests/drivers/ntsync/.gitignore +new file mode 100644 +index 000000000000..848573a3d3ea +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/.gitignore +@@ -0,0 +1 @@ ++ntsync +diff --git a/tools/testing/selftests/drivers/ntsync/Makefile b/tools/testing/selftests/drivers/ntsync/Makefile +new file mode 100644 +index 000000000000..dbf2b055c0b2 +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/Makefile +@@ -0,0 +1,7 @@ ++# SPDX-LICENSE-IDENTIFIER: GPL-2.0-only ++TEST_GEN_PROGS := ntsync ++ ++CFLAGS += $(KHDR_INCLUDES) ++LDLIBS += -lpthread ++ ++include ../../lib.mk +diff --git a/tools/testing/selftests/drivers/ntsync/config b/tools/testing/selftests/drivers/ntsync/config +new file mode 100644 +index 000000000000..60539c826d06 +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/config +@@ -0,0 +1 @@ ++CONFIG_WINESYNC=y +diff --git a/tools/testing/selftests/drivers/ntsync/ntsync.c b/tools/testing/selftests/drivers/ntsync/ntsync.c +new file mode 100644 +index 000000000000..3aad311574c4 +--- /dev/null ++++ b/tools/testing/selftests/drivers/ntsync/ntsync.c +@@ -0,0 +1,1343 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Various unit tests for the "ntsync" synchronization primitive driver. ++ * ++ * Copyright (C) 2021-2022 Elizabeth Figura ++ */ ++ ++#define _GNU_SOURCE ++#include ++#include ++#include ++#include ++#include ++#include ++#include "../../kselftest_harness.h" ++ ++static int read_sem_state(int sem, __u32 *count, __u32 *max) ++{ ++ struct ntsync_sem_args args; ++ int ret; ++ ++ memset(&args, 0xcc, sizeof(args)); ++ ret = ioctl(sem, NTSYNC_IOC_SEM_READ, &args); ++ *count = args.count; ++ *max = args.max; ++ return ret; ++} ++ ++#define check_sem_state(sem, count, max) \ ++ ({ \ ++ __u32 __count, __max; \ ++ int ret = read_sem_state((sem), &__count, &__max); \ ++ EXPECT_EQ(0, ret); \ ++ EXPECT_EQ((count), __count); \ ++ EXPECT_EQ((max), __max); \ ++ }) ++ ++static int release_sem(int sem, __u32 *count) ++{ ++ return ioctl(sem, NTSYNC_IOC_SEM_RELEASE, count); ++} ++ ++static int read_mutex_state(int mutex, __u32 *count, __u32 *owner) ++{ ++ struct ntsync_mutex_args args; ++ int ret; ++ ++ memset(&args, 0xcc, sizeof(args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &args); ++ *count = args.count; ++ *owner = args.owner; ++ return ret; ++} ++ ++#define check_mutex_state(mutex, count, owner) \ ++ ({ \ ++ __u32 __count, __owner; \ ++ int ret = read_mutex_state((mutex), &__count, &__owner); \ ++ EXPECT_EQ(0, ret); \ ++ EXPECT_EQ((count), __count); \ ++ EXPECT_EQ((owner), __owner); \ ++ }) ++ ++static int unlock_mutex(int mutex, __u32 owner, __u32 *count) ++{ ++ struct ntsync_mutex_args args; ++ int ret; ++ ++ args.owner = owner; ++ args.count = 0xdeadbeef; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_UNLOCK, &args); ++ *count = args.count; ++ return ret; ++} ++ ++static int read_event_state(int event, __u32 *signaled, __u32 *manual) ++{ ++ struct ntsync_event_args args; ++ int ret; ++ ++ memset(&args, 0xcc, sizeof(args)); ++ ret = ioctl(event, NTSYNC_IOC_EVENT_READ, &args); ++ *signaled = args.signaled; ++ *manual = args.manual; ++ return ret; ++} ++ ++#define check_event_state(event, signaled, manual) \ ++ ({ \ ++ __u32 __signaled, __manual; \ ++ int ret = read_event_state((event), &__signaled, &__manual); \ ++ EXPECT_EQ(0, ret); \ ++ EXPECT_EQ((signaled), __signaled); \ ++ EXPECT_EQ((manual), __manual); \ ++ }) ++ ++static int wait_objs(int fd, unsigned long request, __u32 count, ++ const int *objs, __u32 owner, int alert, __u32 
*index) ++{ ++ struct ntsync_wait_args args = {0}; ++ struct timespec timeout; ++ int ret; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ args.timeout = timeout.tv_sec * 1000000000 + timeout.tv_nsec; ++ args.count = count; ++ args.objs = (uintptr_t)objs; ++ args.owner = owner; ++ args.index = 0xdeadbeef; ++ args.alert = alert; ++ ret = ioctl(fd, request, &args); ++ *index = args.index; ++ return ret; ++} ++ ++static int wait_any(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, count, objs, owner, 0, index); ++} ++ ++static int wait_all(int fd, __u32 count, const int *objs, __u32 owner, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, count, objs, owner, 0, index); ++} ++ ++static int wait_any_alert(int fd, __u32 count, const int *objs, ++ __u32 owner, int alert, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ANY, ++ count, objs, owner, alert, index); ++} ++ ++static int wait_all_alert(int fd, __u32 count, const int *objs, ++ __u32 owner, int alert, __u32 *index) ++{ ++ return wait_objs(fd, NTSYNC_IOC_WAIT_ALL, ++ count, objs, owner, alert, index); ++} ++ ++TEST(semaphore_state) ++{ ++ struct ntsync_sem_args sem_args; ++ struct timespec timeout; ++ __u32 count, index; ++ int fd, ret, sem; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 3; ++ sem_args.max = 2; ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_EQ(-1, sem); ++ EXPECT_EQ(EINVAL, errno); ++ ++ sem_args.count = 2; ++ sem_args.max = 2; ++ sem = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, sem); ++ check_sem_state(sem, 2, 2); ++ ++ count = 0; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ check_sem_state(sem, 2, 2); ++ ++ count = 1; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOVERFLOW, errno); ++ check_sem_state(sem, 2, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(sem, 1, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(sem, 0, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ count = 3; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOVERFLOW, errno); ++ check_sem_state(sem, 0, 2); ++ ++ count = 2; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(sem, 2, 2); ++ ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ ret = wait_any(fd, 1, &sem, 123, &index); ++ EXPECT_EQ(0, ret); ++ ++ count = 1; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(sem, 1, 2); ++ ++ count = ~0u; ++ ret = release_sem(sem, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOVERFLOW, errno); ++ check_sem_state(sem, 1, 2); ++ ++ close(sem); ++ ++ close(fd); ++} ++ ++TEST(mutex_state) ++{ ++ struct ntsync_mutex_args mutex_args; ++ __u32 owner, count, index; ++ struct timespec timeout; ++ int fd, ret, mutex; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 0; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); ++ EXPECT_EQ(EINVAL, errno); ++ ++ mutex_args.owner = 0; ++ mutex_args.count 
= 2; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_EQ(-1, mutex); ++ EXPECT_EQ(EINVAL, errno); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 2; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); ++ check_mutex_state(mutex, 2, 123); ++ ++ ret = unlock_mutex(mutex, 0, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ ret = unlock_mutex(mutex, 456, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EPERM, errno); ++ check_mutex_state(mutex, 2, 123); ++ ++ ret = unlock_mutex(mutex, 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ check_mutex_state(mutex, 1, 123); ++ ++ ret = unlock_mutex(mutex, 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, count); ++ check_mutex_state(mutex, 0, 0); ++ ++ ret = unlock_mutex(mutex, 123, &count); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EPERM, errno); ++ ++ ret = wait_any(fd, 1, &mutex, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 456); ++ ++ ret = wait_any(fd, 1, &mutex, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 2, 456); ++ ++ ret = unlock_mutex(mutex, 456, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ check_mutex_state(mutex, 1, 456); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ owner = 0; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ owner = 123; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EPERM, errno); ++ check_mutex_state(mutex, 1, 456); ++ ++ owner = 456; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ memset(&mutex_args, 0xcc, sizeof(mutex_args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, mutex_args.count); ++ EXPECT_EQ(0, mutex_args.owner); ++ ++ memset(&mutex_args, 0xcc, sizeof(mutex_args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, mutex_args.count); ++ EXPECT_EQ(0, mutex_args.owner); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 123); ++ ++ owner = 123; ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ memset(&mutex_args, 0xcc, sizeof(mutex_args)); ++ ret = ioctl(mutex, NTSYNC_IOC_MUTEX_READ, &mutex_args); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, mutex_args.count); ++ EXPECT_EQ(0, mutex_args.owner); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 123); ++ ++ close(mutex); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); ++ check_mutex_state(mutex, 0, 0); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_mutex_state(mutex, 1, 123); ++ ++ close(mutex); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = ~0u; ++ mutex = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, mutex); ++ check_mutex_state(mutex, ~0u, 123); ++ ++ ret = wait_any(fd, 1, &mutex, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ close(mutex); ++ 
++ close(fd); ++} ++ ++TEST(manual_event_state) ++{ ++ struct ntsync_event_args event_args; ++ __u32 index, signaled; ++ int fd, event, ret; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ event_args.manual = 1; ++ event_args.signaled = 0; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ check_event_state(event, 0, 1); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 1, 1); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 1, 1); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_event_state(event, 1, 1); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 0, 1); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 1); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 0, 1); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 1); ++ ++ close(event); ++ ++ close(fd); ++} ++ ++TEST(auto_event_state) ++{ ++ struct ntsync_event_args event_args; ++ __u32 index, signaled; ++ int fd, event, ret; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ event_args.manual = 0; ++ event_args.signaled = 1; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ check_event_state(event, 1, 0); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 1, 0); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_event_state(event, 0, 0); ++ ++ signaled = 0xdeadbeef; ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 0); ++ ++ ret = wait_any(fd, 1, &event, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ check_event_state(event, 0, 0); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(event, 0, 0); ++ ++ close(event); ++ ++ close(fd); ++} ++ ++TEST(test_wait_any) ++{ ++ int objs[NTSYNC_MAX_WAIT_COUNT + 1], fd, ret; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ __u32 owner, index, count, i; ++ struct timespec timeout; ++ ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 2; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ 
EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 0, 0); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 0, 0); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ ret = wait_any(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); ++ ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ owner = 123; ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ EXPECT_EQ(1, index); ++ ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ ++ close(objs[1]); ++ ++ /* test waiting on the same object twice */ ++ ++ count = 2; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ objs[1] = objs[0]; ++ ret = wait_any(fd, 2, objs, 456, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 1, 3); ++ ++ ret = wait_any(fd, 0, NULL, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ for (i = 1; i < NTSYNC_MAX_WAIT_COUNT + 1; ++i) ++ objs[i] = objs[0]; ++ ++ ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_any(fd, NTSYNC_MAX_WAIT_COUNT + 1, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ ret = wait_any(fd, -1, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ close(objs[0]); ++ ++ close(fd); ++} ++ ++TEST(test_wait_all) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ __u32 owner, index, count; ++ int objs[2], fd, ret; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 2; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ ret = wait_all(fd, 2, objs, 456, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(-1, 
ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ check_sem_state(objs[0], 0, 3); ++ check_mutex_state(objs[1], 2, 123); ++ ++ count = 3; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 2, 3); ++ check_mutex_state(objs[1], 3, 123); ++ ++ owner = 123; ++ ret = ioctl(objs[1], NTSYNC_IOC_MUTEX_KILL, &owner); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EOWNERDEAD, errno); ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 123); ++ ++ close(objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); ++ ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ check_sem_state(objs[0], 0, 3); ++ check_event_state(objs[1], 1, 1); ++ ++ close(objs[1]); ++ ++ /* test waiting on the same object twice */ ++ objs[1] = objs[0]; ++ ret = wait_all(fd, 2, objs, 123, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(EINVAL, errno); ++ ++ close(objs[0]); ++ ++ close(fd); ++} ++ ++struct wake_args { ++ int fd; ++ int obj; ++}; ++ ++struct wait_args { ++ int fd; ++ unsigned long request; ++ struct ntsync_wait_args *args; ++ int ret; ++ int err; ++}; ++ ++static void *wait_thread(void *arg) ++{ ++ struct wait_args *args = arg; ++ ++ args->ret = ioctl(args->fd, args->request, args->args); ++ args->err = errno; ++ return NULL; ++} ++ ++static __u64 get_abs_timeout(unsigned int ms) ++{ ++ struct timespec timeout; ++ clock_gettime(CLOCK_MONOTONIC, &timeout); ++ return (timeout.tv_sec * 1000000000) + timeout.tv_nsec + (ms * 1000000); ++} ++ ++static int wait_for_thread(pthread_t thread, unsigned int ms) ++{ ++ struct timespec timeout; ++ ++ clock_gettime(CLOCK_REALTIME, &timeout); ++ timeout.tv_nsec += ms * 1000000; ++ timeout.tv_sec += (timeout.tv_nsec / 1000000000); ++ timeout.tv_nsec %= 1000000000; ++ return pthread_timedjoin_np(thread, NULL, &timeout); ++} ++ ++TEST(wake_any) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ struct wait_args thread_args; ++ __u32 count, index, signaled; ++ int objs[2], fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 0; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 1; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ /* test waking the semaphore */ ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 2; ++ wait_args.owner = 456; ++ wait_args.index = 0xdeadbeef; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ANY; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(objs[0], 0, 3); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(0, wait_args.index); ++ ++ /* test waking the 
mutex */ ++ ++ /* first grab it again for owner 123 */ ++ ret = wait_any(fd, 1, &objs[1], 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.owner = 456; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = unlock_mutex(objs[1], 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, count); ++ ++ ret = pthread_tryjoin_np(thread, NULL); ++ EXPECT_EQ(EBUSY, ret); ++ ++ ret = unlock_mutex(objs[1], 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, mutex_args.count); ++ check_mutex_state(objs[1], 1, 456); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ close(objs[1]); ++ ++ /* test waking events */ ++ ++ event_args.manual = false; ++ event_args.signaled = false; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 0, 0); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 0, 0); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ close(objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = false; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, objs[1]); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 1, 1); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(objs[1], NTSYNC_IOC_EVENT_PULSE, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ check_event_state(objs[1], 0, 1); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(1, wait_args.index); ++ ++ /* delete an object while it's being waited on */ ++ ++ wait_args.timeout = get_abs_timeout(200); ++ wait_args.owner = 123; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ 
EXPECT_EQ(ETIMEDOUT, ret); ++ ++ close(objs[0]); ++ close(objs[1]); ++ ++ ret = wait_for_thread(thread, 200); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(-1, thread_args.ret); ++ EXPECT_EQ(ETIMEDOUT, thread_args.err); ++ ++ close(fd); ++} ++ ++TEST(wake_all) ++{ ++ struct ntsync_event_args manual_event_args = {0}; ++ struct ntsync_event_args auto_event_args = {0}; ++ struct ntsync_mutex_args mutex_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ struct wait_args thread_args; ++ __u32 count, index, signaled; ++ int objs[4], fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 0; ++ sem_args.max = 3; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ mutex_args.owner = 123; ++ mutex_args.count = 1; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, objs[1]); ++ ++ manual_event_args.manual = true; ++ manual_event_args.signaled = true; ++ objs[2] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &manual_event_args); ++ EXPECT_LE(0, objs[2]); ++ ++ auto_event_args.manual = false; ++ auto_event_args.signaled = true; ++ objs[3] = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &auto_event_args); ++ EXPECT_EQ(0, objs[3]); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 4; ++ wait_args.owner = 456; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ALL; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ ++ ret = pthread_tryjoin_np(thread, NULL); ++ EXPECT_EQ(EBUSY, ret); ++ ++ check_sem_state(objs[0], 1, 3); ++ ++ ret = wait_any(fd, 1, &objs[0], 123, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = unlock_mutex(objs[1], 123, &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, count); ++ ++ ret = pthread_tryjoin_np(thread, NULL); ++ EXPECT_EQ(EBUSY, ret); ++ ++ check_mutex_state(objs[1], 0, 0); ++ ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ ++ count = 2; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, count); ++ check_sem_state(objs[0], 2, 3); ++ ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, signaled); ++ ++ ret = ioctl(objs[2], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ ret = ioctl(objs[3], NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, signaled); ++ ++ check_sem_state(objs[0], 1, 3); ++ check_mutex_state(objs[1], 1, 456); ++ check_event_state(objs[2], 1, 1); ++ check_event_state(objs[3], 0, 0); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ ++ /* delete an object while it's being waited on */ ++ ++ wait_args.timeout = get_abs_timeout(200); ++ wait_args.owner = 123; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ close(objs[0]); ++ close(objs[1]); ++ close(objs[2]); ++ close(objs[3]); ++ ++ ret = wait_for_thread(thread, 200); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(-1, thread_args.ret); ++ EXPECT_EQ(ETIMEDOUT, thread_args.err); ++ ++ 
close(fd); ++} ++ ++TEST(alert_any) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ __u32 index, count, signaled; ++ struct wait_args thread_args; ++ int objs[2], event, fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 0; ++ sem_args.max = 2; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ sem_args.count = 1; ++ sem_args.max = 2; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any_alert(fd, 0, NULL, 123, event, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(1, index); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ /* test wakeup via alert */ ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 2; ++ wait_args.owner = 123; ++ wait_args.index = 0xdeadbeef; ++ wait_args.alert = event; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ANY; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(2, wait_args.index); ++ ++ close(event); ++ ++ /* test with an auto-reset event */ ++ ++ event_args.manual = false; ++ event_args.signaled = true; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ count = 1; ++ ret = release_sem(objs[0], &count); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ ret = wait_any_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ close(event); ++ ++ close(objs[0]); ++ close(objs[1]); ++ ++ close(fd); ++} ++ ++TEST(alert_all) ++{ ++ struct ntsync_event_args event_args = {0}; ++ struct ntsync_wait_args wait_args = {0}; ++ struct ntsync_sem_args sem_args = {0}; ++ struct wait_args thread_args; ++ __u32 index, count, signaled; ++ int objs[2], event, fd, ret; ++ pthread_t thread; ++ ++ fd = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, fd); ++ ++ sem_args.count = 2; ++ sem_args.max = 2; ++ objs[0] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[0]); ++ ++ sem_args.count = 1; ++ sem_args.max = 2; ++ objs[1] = ioctl(fd, NTSYNC_IOC_CREATE_SEM, &sem_args); ++ EXPECT_LE(0, objs[1]); ++ ++ event_args.manual = true; ++ event_args.signaled = true; ++ event = 
ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ /* test wakeup via alert */ ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_RESET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ wait_args.timeout = get_abs_timeout(1000); ++ wait_args.objs = (uintptr_t)objs; ++ wait_args.count = 2; ++ wait_args.owner = 123; ++ wait_args.index = 0xdeadbeef; ++ wait_args.alert = event; ++ thread_args.fd = fd; ++ thread_args.args = &wait_args; ++ thread_args.request = NTSYNC_IOC_WAIT_ALL; ++ ret = pthread_create(&thread, NULL, wait_thread, &thread_args); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(ETIMEDOUT, ret); ++ ++ ret = ioctl(event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_for_thread(thread, 100); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, thread_args.ret); ++ EXPECT_EQ(2, wait_args.index); ++ ++ close(event); ++ ++ /* test with an auto-reset event */ ++ ++ event_args.manual = false; ++ event_args.signaled = true; ++ event = ioctl(fd, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, event); ++ ++ count = 2; ++ ret = release_sem(objs[1], &count); ++ EXPECT_EQ(0, ret); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(0, index); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(0, ret); ++ EXPECT_EQ(2, index); ++ ++ ret = wait_all_alert(fd, 2, objs, 123, event, &index); ++ EXPECT_EQ(-1, ret); ++ EXPECT_EQ(ETIMEDOUT, errno); ++ ++ close(event); ++ ++ close(objs[0]); ++ close(objs[1]); ++ ++ close(fd); ++} ++ ++#define STRESS_LOOPS 10000 ++#define STRESS_THREADS 4 ++ ++static unsigned int stress_counter; ++static int stress_device, stress_start_event, stress_mutex; ++ ++static void *stress_thread(void *arg) ++{ ++ struct ntsync_wait_args wait_args = {0}; ++ __u32 index, count, i; ++ int ret; ++ ++ wait_args.timeout = UINT64_MAX; ++ wait_args.count = 1; ++ wait_args.objs = (uintptr_t)&stress_start_event; ++ wait_args.owner = gettid(); ++ wait_args.index = 0xdeadbeef; ++ ++ ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args); ++ ++ wait_args.objs = (uintptr_t)&stress_mutex; ++ ++ for (i = 0; i < STRESS_LOOPS; ++i) { ++ ioctl(stress_device, NTSYNC_IOC_WAIT_ANY, &wait_args); ++ ++ ++stress_counter; ++ ++ unlock_mutex(stress_mutex, wait_args.owner, &count); ++ } ++ ++ return NULL; ++} ++ ++TEST(stress_wait) ++{ ++ struct ntsync_event_args event_args; ++ struct ntsync_mutex_args mutex_args; ++ pthread_t threads[STRESS_THREADS]; ++ __u32 signaled, i; ++ int ret; ++ ++ stress_device = open("/dev/ntsync", O_CLOEXEC | O_RDONLY); ++ ASSERT_LE(0, stress_device); ++ ++ mutex_args.owner = 0; ++ mutex_args.count = 0; ++ stress_mutex = ioctl(stress_device, NTSYNC_IOC_CREATE_MUTEX, &mutex_args); ++ EXPECT_LE(0, stress_mutex); ++ ++ event_args.manual = 1; ++ event_args.signaled = 0; ++ stress_start_event = ioctl(stress_device, NTSYNC_IOC_CREATE_EVENT, &event_args); ++ EXPECT_LE(0, stress_start_event); ++ ++ for (i = 0; i < STRESS_THREADS; ++i) ++ pthread_create(&threads[i], NULL, stress_thread, NULL); ++ ++ ret = ioctl(stress_start_event, NTSYNC_IOC_EVENT_SET, &signaled); ++ EXPECT_EQ(0, ret); ++ ++ for (i = 0; i < STRESS_THREADS; ++i) { ++ ret = pthread_join(threads[i], NULL); ++ EXPECT_EQ(0, ret); ++ } ++ ++ EXPECT_EQ(STRESS_LOOPS * STRESS_THREADS, 
stress_counter); ++ ++ close(stress_start_event); ++ close(stress_mutex); ++ close(stress_device); ++} ++ ++TEST_HARNESS_MAIN +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch b/sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch new file mode 100644 index 0000000..c3dc64f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0009-perf-per-core.patch @@ -0,0 +1,898 @@ +From 7de62a7c4da5a2b267f3faacc8d50eb24fdfd89e Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:32:14 +0100 +Subject: [PATCH 09/12] perf-per-core + +Signed-off-by: Peter Jung +--- + Documentation/arch/x86/topology.rst | 4 + + arch/x86/events/rapl.c | 415 ++++++++++++++++---------- + arch/x86/include/asm/processor.h | 1 + + arch/x86/include/asm/topology.h | 1 + + arch/x86/kernel/cpu/debugfs.c | 1 + + arch/x86/kernel/cpu/topology_common.c | 1 + + 6 files changed, 273 insertions(+), 150 deletions(-) + +diff --git a/Documentation/arch/x86/topology.rst b/Documentation/arch/x86/topology.rst +index 7352ab89a55a..c12837e61bda 100644 +--- a/Documentation/arch/x86/topology.rst ++++ b/Documentation/arch/x86/topology.rst +@@ -135,6 +135,10 @@ Thread-related topology information in the kernel: + The ID of the core to which a thread belongs. It is also printed in /proc/cpuinfo + "core_id." + ++ - topology_logical_core_id(); ++ ++ The logical core ID to which a thread belongs. ++ + + + System topology examples +diff --git a/arch/x86/events/rapl.c b/arch/x86/events/rapl.c +index a8defc813c36..d3bb3865c1b1 100644 +--- a/arch/x86/events/rapl.c ++++ b/arch/x86/events/rapl.c +@@ -39,6 +39,10 @@ + * event: rapl_energy_psys + * perf code: 0x5 + * ++ * core counter: consumption of a single physical core ++ * event: rapl_energy_core (power_core PMU) ++ * perf code: 0x1 ++ * + * We manage those counters as free running (read-only). They may be + * use simultaneously by other tools, such as turbostat. + * +@@ -70,18 +74,22 @@ MODULE_LICENSE("GPL"); + /* + * RAPL energy status counters + */ +-enum perf_rapl_events { ++enum perf_rapl_pkg_events { + PERF_RAPL_PP0 = 0, /* all cores */ + PERF_RAPL_PKG, /* entire package */ + PERF_RAPL_RAM, /* DRAM */ + PERF_RAPL_PP1, /* gpu */ + PERF_RAPL_PSYS, /* psys */ + +- PERF_RAPL_MAX, +- NR_RAPL_DOMAINS = PERF_RAPL_MAX, ++ PERF_RAPL_PKG_EVENTS_MAX, ++ NR_RAPL_PKG_DOMAINS = PERF_RAPL_PKG_EVENTS_MAX, + }; + +-static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { ++#define PERF_RAPL_CORE 0 /* single core */ ++#define PERF_RAPL_CORE_EVENTS_MAX 1 ++#define NR_RAPL_CORE_DOMAINS PERF_RAPL_CORE_EVENTS_MAX ++ ++static const char *const rapl_pkg_domain_names[NR_RAPL_PKG_DOMAINS] __initconst = { + "pp0-core", + "package", + "dram", +@@ -89,6 +97,8 @@ static const char *const rapl_domain_names[NR_RAPL_DOMAINS] __initconst = { + "psys", + }; + ++static const char *const rapl_core_domain_name __initconst = "core"; ++ + /* + * event code: LSB 8 bits, passed in attr->config + * any other bit is reserved +@@ -112,7 +122,7 @@ static struct perf_pmu_events_attr event_attr_##v = { \ + * considered as either pkg-scope or die-scope, and we are considering + * them as die-scope. 
+ */ +-#define rapl_pmu_is_pkg_scope() \ ++#define rapl_pkg_pmu_is_pkg_scope() \ + (boot_cpu_data.x86_vendor == X86_VENDOR_AMD || \ + boot_cpu_data.x86_vendor == X86_VENDOR_HYGON) + +@@ -129,7 +139,8 @@ struct rapl_pmu { + struct rapl_pmus { + struct pmu pmu; + unsigned int nr_rapl_pmu; +- struct rapl_pmu *pmus[] __counted_by(nr_rapl_pmu); ++ unsigned int cntr_mask; ++ struct rapl_pmu *rapl_pmu[] __counted_by(nr_rapl_pmu); + }; + + enum rapl_unit_quirk { +@@ -139,44 +150,43 @@ enum rapl_unit_quirk { + }; + + struct rapl_model { +- struct perf_msr *rapl_msrs; +- unsigned long events; ++ struct perf_msr *rapl_pkg_msrs; ++ struct perf_msr *rapl_core_msrs; ++ unsigned long pkg_events; ++ unsigned long core_events; + unsigned int msr_power_unit; + enum rapl_unit_quirk unit_quirk; + }; + + /* 1/2^hw_unit Joule */ +-static int rapl_hw_unit[NR_RAPL_DOMAINS] __read_mostly; +-static struct rapl_pmus *rapl_pmus; +-static unsigned int rapl_cntr_mask; ++static int rapl_pkg_hw_unit[NR_RAPL_PKG_DOMAINS] __read_mostly; ++static int rapl_core_hw_unit __read_mostly; ++static struct rapl_pmus *rapl_pmus_pkg; ++static struct rapl_pmus *rapl_pmus_core; + static u64 rapl_timer_ms; +-static struct perf_msr *rapl_msrs; ++static struct rapl_model *rapl_model; + + /* +- * Helper functions to get the correct topology macros according to the ++ * Helper function to get the correct topology id according to the + * RAPL PMU scope. + */ +-static inline unsigned int get_rapl_pmu_idx(int cpu) +-{ +- return rapl_pmu_is_pkg_scope() ? topology_logical_package_id(cpu) : +- topology_logical_die_id(cpu); +-} +- +-static inline const struct cpumask *get_rapl_pmu_cpumask(int cpu) +-{ +- return rapl_pmu_is_pkg_scope() ? topology_core_cpumask(cpu) : +- topology_die_cpumask(cpu); +-} +- +-static inline struct rapl_pmu *cpu_to_rapl_pmu(unsigned int cpu) ++static inline unsigned int get_rapl_pmu_idx(int cpu, int scope) + { +- unsigned int rapl_pmu_idx = get_rapl_pmu_idx(cpu); +- + /* +- * The unsigned check also catches the '-1' return value for non +- * existent mappings in the topology map. ++ * Returns unsigned int, which converts the '-1' return value ++ * (for non-existent mappings in topology map) to UINT_MAX, so ++ * the error check in the caller is simplified. + */ +- return rapl_pmu_idx < rapl_pmus->nr_rapl_pmu ? rapl_pmus->pmus[rapl_pmu_idx] : NULL; ++ switch (scope) { ++ case PERF_PMU_SCOPE_PKG: ++ return topology_logical_package_id(cpu); ++ case PERF_PMU_SCOPE_DIE: ++ return topology_logical_die_id(cpu); ++ case PERF_PMU_SCOPE_CORE: ++ return topology_logical_core_id(cpu); ++ default: ++ return -EINVAL; ++ } + } + + static inline u64 rapl_read_counter(struct perf_event *event) +@@ -186,19 +196,20 @@ static inline u64 rapl_read_counter(struct perf_event *event) + return raw; + } + +-static inline u64 rapl_scale(u64 v, int cfg) ++static inline u64 rapl_scale(u64 v, struct perf_event *event) + { +- if (cfg > NR_RAPL_DOMAINS) { +- pr_warn("Invalid domain %d, failed to scale data\n", cfg); +- return v; +- } ++ int hw_unit = rapl_pkg_hw_unit[event->hw.config - 1]; ++ ++ if (event->pmu->scope == PERF_PMU_SCOPE_CORE) ++ hw_unit = rapl_core_hw_unit; ++ + /* + * scale delta to smallest unit (1/2^32) + * users must then scale back: count * 1/(1e9*2^32) to get Joules + * or use ldexp(count, -32). 
+ * Watts = Joules/Time delta + */ +- return v << (32 - rapl_hw_unit[cfg - 1]); ++ return v << (32 - hw_unit); + } + + static u64 rapl_event_update(struct perf_event *event) +@@ -225,7 +236,7 @@ static u64 rapl_event_update(struct perf_event *event) + delta = (new_raw_count << shift) - (prev_raw_count << shift); + delta >>= shift; + +- sdelta = rapl_scale(delta, event->hw.config); ++ sdelta = rapl_scale(delta, event); + + local64_add(sdelta, &event->count); + +@@ -240,34 +251,34 @@ static void rapl_start_hrtimer(struct rapl_pmu *pmu) + + static enum hrtimer_restart rapl_hrtimer_handle(struct hrtimer *hrtimer) + { +- struct rapl_pmu *pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); ++ struct rapl_pmu *rapl_pmu = container_of(hrtimer, struct rapl_pmu, hrtimer); + struct perf_event *event; + unsigned long flags; + +- if (!pmu->n_active) ++ if (!rapl_pmu->n_active) + return HRTIMER_NORESTART; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + +- list_for_each_entry(event, &pmu->active_list, active_entry) ++ list_for_each_entry(event, &rapl_pmu->active_list, active_entry) + rapl_event_update(event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + +- hrtimer_forward_now(hrtimer, pmu->timer_interval); ++ hrtimer_forward_now(hrtimer, rapl_pmu->timer_interval); + + return HRTIMER_RESTART; + } + +-static void rapl_hrtimer_init(struct rapl_pmu *pmu) ++static void rapl_hrtimer_init(struct rapl_pmu *rapl_pmu) + { +- struct hrtimer *hr = &pmu->hrtimer; ++ struct hrtimer *hr = &rapl_pmu->hrtimer; + + hrtimer_init(hr, CLOCK_MONOTONIC, HRTIMER_MODE_REL); + hr->function = rapl_hrtimer_handle; + } + +-static void __rapl_pmu_event_start(struct rapl_pmu *pmu, ++static void __rapl_pmu_event_start(struct rapl_pmu *rapl_pmu, + struct perf_event *event) + { + if (WARN_ON_ONCE(!(event->hw.state & PERF_HES_STOPPED))) +@@ -275,39 +286,39 @@ static void __rapl_pmu_event_start(struct rapl_pmu *pmu, + + event->hw.state = 0; + +- list_add_tail(&event->active_entry, &pmu->active_list); ++ list_add_tail(&event->active_entry, &rapl_pmu->active_list); + + local64_set(&event->hw.prev_count, rapl_read_counter(event)); + +- pmu->n_active++; +- if (pmu->n_active == 1) +- rapl_start_hrtimer(pmu); ++ rapl_pmu->n_active++; ++ if (rapl_pmu->n_active == 1) ++ rapl_start_hrtimer(rapl_pmu); + } + + static void rapl_pmu_event_start(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); +- __rapl_pmu_event_start(pmu, event); +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); ++ __rapl_pmu_event_start(rapl_pmu, event); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static void rapl_pmu_event_stop(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + /* mark event as deactivated and stopped */ + if (!(hwc->state & PERF_HES_STOPPED)) { +- WARN_ON_ONCE(pmu->n_active <= 0); +- pmu->n_active--; +- if (pmu->n_active == 0) +- hrtimer_cancel(&pmu->hrtimer); ++ WARN_ON_ONCE(rapl_pmu->n_active <= 0); ++ rapl_pmu->n_active--; ++ if (rapl_pmu->n_active == 0) ++ 
hrtimer_cancel(&rapl_pmu->hrtimer); + + list_del(&event->active_entry); + +@@ -325,23 +336,23 @@ static void rapl_pmu_event_stop(struct perf_event *event, int mode) + hwc->state |= PERF_HES_UPTODATE; + } + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + } + + static int rapl_pmu_event_add(struct perf_event *event, int mode) + { +- struct rapl_pmu *pmu = event->pmu_private; ++ struct rapl_pmu *rapl_pmu = event->pmu_private; + struct hw_perf_event *hwc = &event->hw; + unsigned long flags; + +- raw_spin_lock_irqsave(&pmu->lock, flags); ++ raw_spin_lock_irqsave(&rapl_pmu->lock, flags); + + hwc->state = PERF_HES_UPTODATE | PERF_HES_STOPPED; + + if (mode & PERF_EF_START) +- __rapl_pmu_event_start(pmu, event); ++ __rapl_pmu_event_start(rapl_pmu, event); + +- raw_spin_unlock_irqrestore(&pmu->lock, flags); ++ raw_spin_unlock_irqrestore(&rapl_pmu->lock, flags); + + return 0; + } +@@ -354,12 +365,14 @@ static void rapl_pmu_event_del(struct perf_event *event, int flags) + static int rapl_pmu_event_init(struct perf_event *event) + { + u64 cfg = event->attr.config & RAPL_EVENT_MASK; +- int bit, ret = 0; +- struct rapl_pmu *pmu; ++ int bit, rapl_pmus_scope, ret = 0; ++ struct rapl_pmu *rapl_pmu; ++ unsigned int rapl_pmu_idx; ++ struct rapl_pmus *rapl_pmus; + +- /* only look at RAPL events */ +- if (event->attr.type != rapl_pmus->pmu.type) +- return -ENOENT; ++ /* unsupported modes and filters */ ++ if (event->attr.sample_period) /* no sampling */ ++ return -EINVAL; + + /* check only supported bits are set */ + if (event->attr.config & ~RAPL_EVENT_MASK) +@@ -368,26 +381,49 @@ static int rapl_pmu_event_init(struct perf_event *event) + if (event->cpu < 0) + return -EINVAL; + +- if (!cfg || cfg >= NR_RAPL_DOMAINS + 1) ++ rapl_pmus = container_of(event->pmu, struct rapl_pmus, pmu); ++ if (!rapl_pmus) ++ return -EINVAL; ++ rapl_pmus_scope = rapl_pmus->pmu.scope; ++ ++ if (rapl_pmus_scope == PERF_PMU_SCOPE_PKG || rapl_pmus_scope == PERF_PMU_SCOPE_DIE) { ++ /* only look at RAPL package events */ ++ if (event->attr.type != rapl_pmus_pkg->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_PKG_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_pkg_msrs[bit].msr; ++ } else if (rapl_pmus_scope == PERF_PMU_SCOPE_CORE) { ++ /* only look at RAPL core events */ ++ if (event->attr.type != rapl_pmus_core->pmu.type) ++ return -ENOENT; ++ ++ cfg = array_index_nospec((long)cfg, NR_RAPL_CORE_DOMAINS + 1); ++ if (!cfg || cfg >= NR_RAPL_PKG_DOMAINS + 1) ++ return -EINVAL; ++ ++ bit = cfg - 1; ++ event->hw.event_base = rapl_model->rapl_core_msrs[bit].msr; ++ } else + return -EINVAL; +- +- cfg = array_index_nospec((long)cfg, NR_RAPL_DOMAINS + 1); +- bit = cfg - 1; + + /* check event supported */ +- if (!(rapl_cntr_mask & (1 << bit))) ++ if (!(rapl_pmus->cntr_mask & (1 << bit))) + return -EINVAL; + +- /* unsupported modes and filters */ +- if (event->attr.sample_period) /* no sampling */ ++ rapl_pmu_idx = get_rapl_pmu_idx(event->cpu, rapl_pmus_scope); ++ if (rapl_pmu_idx >= rapl_pmus->nr_rapl_pmu) + return -EINVAL; +- + /* must be done before validate_group */ +- pmu = cpu_to_rapl_pmu(event->cpu); +- if (!pmu) ++ rapl_pmu = rapl_pmus->rapl_pmu[rapl_pmu_idx]; ++ if (!rapl_pmu) + return -EINVAL; +- event->pmu_private = pmu; +- event->hw.event_base = rapl_msrs[bit].msr; ++ ++ event->pmu_private = rapl_pmu; + event->hw.config = cfg; + event->hw.idx = bit; + 
+@@ -404,12 +440,14 @@ RAPL_EVENT_ATTR_STR(energy-pkg , rapl_pkg, "event=0x02"); + RAPL_EVENT_ATTR_STR(energy-ram , rapl_ram, "event=0x03"); + RAPL_EVENT_ATTR_STR(energy-gpu , rapl_gpu, "event=0x04"); + RAPL_EVENT_ATTR_STR(energy-psys, rapl_psys, "event=0x05"); ++RAPL_EVENT_ATTR_STR(energy-core, rapl_core, "event=0x01"); + + RAPL_EVENT_ATTR_STR(energy-cores.unit, rapl_cores_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-pkg.unit , rapl_pkg_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-ram.unit , rapl_ram_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-gpu.unit , rapl_gpu_unit, "Joules"); + RAPL_EVENT_ATTR_STR(energy-psys.unit, rapl_psys_unit, "Joules"); ++RAPL_EVENT_ATTR_STR(energy-core.unit, rapl_core_unit, "Joules"); + + /* + * we compute in 0.23 nJ increments regardless of MSR +@@ -419,6 +457,7 @@ RAPL_EVENT_ATTR_STR(energy-pkg.scale, rapl_pkg_scale, "2.3283064365386962890 + RAPL_EVENT_ATTR_STR(energy-ram.scale, rapl_ram_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-gpu.scale, rapl_gpu_scale, "2.3283064365386962890625e-10"); + RAPL_EVENT_ATTR_STR(energy-psys.scale, rapl_psys_scale, "2.3283064365386962890625e-10"); ++RAPL_EVENT_ATTR_STR(energy-core.scale, rapl_core_scale, "2.3283064365386962890625e-10"); + + /* + * There are no default events, but we need to create +@@ -451,6 +490,12 @@ static const struct attribute_group *rapl_attr_groups[] = { + NULL, + }; + ++static const struct attribute_group *rapl_core_attr_groups[] = { ++ &rapl_pmu_format_group, ++ &rapl_pmu_events_group, ++ NULL, ++}; ++ + static struct attribute *rapl_events_cores[] = { + EVENT_PTR(rapl_cores), + EVENT_PTR(rapl_cores_unit), +@@ -511,6 +556,18 @@ static struct attribute_group rapl_events_psys_group = { + .attrs = rapl_events_psys, + }; + ++static struct attribute *rapl_events_core[] = { ++ EVENT_PTR(rapl_core), ++ EVENT_PTR(rapl_core_unit), ++ EVENT_PTR(rapl_core_scale), ++ NULL, ++}; ++ ++static struct attribute_group rapl_events_core_group = { ++ .name = "events", ++ .attrs = rapl_events_core, ++}; ++ + static bool test_msr(int idx, void *data) + { + return test_bit(idx, (unsigned long *) data); +@@ -536,11 +593,11 @@ static struct perf_msr intel_rapl_spr_msrs[] = { + }; + + /* +- * Force to PERF_RAPL_MAX size due to: +- * - perf_msr_probe(PERF_RAPL_MAX) ++ * Force to PERF_RAPL_PKG_EVENTS_MAX size due to: ++ * - perf_msr_probe(PERF_RAPL_PKG_EVENTS_MAX) + * - want to use same event codes across both architectures + */ +-static struct perf_msr amd_rapl_msrs[] = { ++static struct perf_msr amd_rapl_pkg_msrs[] = { + [PERF_RAPL_PP0] = { 0, &rapl_events_cores_group, NULL, false, 0 }, + [PERF_RAPL_PKG] = { MSR_AMD_PKG_ENERGY_STATUS, &rapl_events_pkg_group, test_msr, false, RAPL_MSR_MASK }, + [PERF_RAPL_RAM] = { 0, &rapl_events_ram_group, NULL, false, 0 }, +@@ -548,18 +605,25 @@ static struct perf_msr amd_rapl_msrs[] = { + [PERF_RAPL_PSYS] = { 0, &rapl_events_psys_group, NULL, false, 0 }, + }; + +-static int rapl_check_hw_unit(struct rapl_model *rm) ++static struct perf_msr amd_rapl_core_msrs[] = { ++ [PERF_RAPL_CORE] = { MSR_AMD_CORE_ENERGY_STATUS, &rapl_events_core_group, ++ test_msr, false, RAPL_MSR_MASK }, ++}; ++ ++static int rapl_check_hw_unit(void) + { + u64 msr_rapl_power_unit_bits; + int i; + + /* protect rdmsrl() to handle virtualization */ +- if (rdmsrl_safe(rm->msr_power_unit, &msr_rapl_power_unit_bits)) ++ if (rdmsrl_safe(rapl_model->msr_power_unit, &msr_rapl_power_unit_bits)) + return -1; +- for (i = 0; i < NR_RAPL_DOMAINS; i++) +- rapl_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 
0x1FULL; ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) ++ rapl_pkg_hw_unit[i] = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; + +- switch (rm->unit_quirk) { ++ rapl_core_hw_unit = (msr_rapl_power_unit_bits >> 8) & 0x1FULL; ++ ++ switch (rapl_model->unit_quirk) { + /* + * DRAM domain on HSW server and KNL has fixed energy unit which can be + * different than the unit from power unit MSR. See +@@ -567,17 +631,16 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * of 2. Datasheet, September 2014, Reference Number: 330784-001 " + */ + case RAPL_UNIT_QUIRK_INTEL_HSW: +- rapl_hw_unit[PERF_RAPL_RAM] = 16; ++ rapl_pkg_hw_unit[PERF_RAPL_RAM] = 16; + break; + /* SPR uses a fixed energy unit for Psys domain. */ + case RAPL_UNIT_QUIRK_INTEL_SPR: +- rapl_hw_unit[PERF_RAPL_PSYS] = 0; ++ rapl_pkg_hw_unit[PERF_RAPL_PSYS] = 0; + break; + default: + break; + } + +- + /* + * Calculate the timer rate: + * Use reference of 200W for scaling the timeout to avoid counter +@@ -586,9 +649,9 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + * if hw unit is 32, then we use 2 ms 1/200/2 + */ + rapl_timer_ms = 2; +- if (rapl_hw_unit[0] < 32) { ++ if (rapl_pkg_hw_unit[0] < 32) { + rapl_timer_ms = (1000 / (2 * 100)); +- rapl_timer_ms *= (1ULL << (32 - rapl_hw_unit[0] - 1)); ++ rapl_timer_ms *= (1ULL << (32 - rapl_pkg_hw_unit[0] - 1)); + } + return 0; + } +@@ -596,24 +659,32 @@ static int rapl_check_hw_unit(struct rapl_model *rm) + static void __init rapl_advertise(void) + { + int i; ++ int num_counters = hweight32(rapl_pmus_pkg->cntr_mask); ++ ++ if (rapl_pmus_core) ++ num_counters += hweight32(rapl_pmus_core->cntr_mask); + + pr_info("API unit is 2^-32 Joules, %d fixed counters, %llu ms ovfl timer\n", +- hweight32(rapl_cntr_mask), rapl_timer_ms); ++ num_counters, rapl_timer_ms); + +- for (i = 0; i < NR_RAPL_DOMAINS; i++) { +- if (rapl_cntr_mask & (1 << i)) { ++ for (i = 0; i < NR_RAPL_PKG_DOMAINS; i++) { ++ if (rapl_pmus_pkg->cntr_mask & (1 << i)) { + pr_info("hw unit of domain %s 2^-%d Joules\n", +- rapl_domain_names[i], rapl_hw_unit[i]); ++ rapl_pkg_domain_names[i], rapl_pkg_hw_unit[i]); + } + } ++ ++ if (rapl_pmus_core && (rapl_pmus_core->cntr_mask & (1 << PERF_RAPL_CORE))) ++ pr_info("hw unit of domain %s 2^-%d Joules\n", ++ rapl_core_domain_name, rapl_core_hw_unit); + } + +-static void cleanup_rapl_pmus(void) ++static void cleanup_rapl_pmus(struct rapl_pmus *rapl_pmus) + { + int i; + + for (i = 0; i < rapl_pmus->nr_rapl_pmu; i++) +- kfree(rapl_pmus->pmus[i]); ++ kfree(rapl_pmus->rapl_pmu[i]); + kfree(rapl_pmus); + } + +@@ -626,46 +697,60 @@ static const struct attribute_group *rapl_attr_update[] = { + NULL, + }; + +-static int __init init_rapl_pmu(void) ++static const struct attribute_group *rapl_core_attr_update[] = { ++ &rapl_events_core_group, ++ NULL, ++}; ++ ++static int __init init_rapl_pmu(struct rapl_pmus *rapl_pmus) + { +- struct rapl_pmu *pmu; ++ struct rapl_pmu *rapl_pmu; + int idx; + + for (idx = 0; idx < rapl_pmus->nr_rapl_pmu; idx++) { +- pmu = kzalloc(sizeof(*pmu), GFP_KERNEL); +- if (!pmu) ++ rapl_pmu = kzalloc(sizeof(*rapl_pmu), GFP_KERNEL); ++ if (!rapl_pmu) + goto free; + +- raw_spin_lock_init(&pmu->lock); +- INIT_LIST_HEAD(&pmu->active_list); +- pmu->pmu = &rapl_pmus->pmu; +- pmu->timer_interval = ms_to_ktime(rapl_timer_ms); +- rapl_hrtimer_init(pmu); ++ raw_spin_lock_init(&rapl_pmu->lock); ++ INIT_LIST_HEAD(&rapl_pmu->active_list); ++ rapl_pmu->pmu = &rapl_pmus->pmu; ++ rapl_pmu->timer_interval = ms_to_ktime(rapl_timer_ms); ++ rapl_hrtimer_init(rapl_pmu); + +- rapl_pmus->pmus[idx] = 
pmu; ++ rapl_pmus->rapl_pmu[idx] = rapl_pmu; + } + + return 0; + free: + for (; idx > 0; idx--) +- kfree(rapl_pmus->pmus[idx - 1]); ++ kfree(rapl_pmus->rapl_pmu[idx - 1]); + return -ENOMEM; + } + +-static int __init init_rapl_pmus(void) ++static int __init init_rapl_pmus(struct rapl_pmus **rapl_pmus_ptr, int rapl_pmu_scope, ++ const struct attribute_group **rapl_attr_groups, ++ const struct attribute_group **rapl_attr_update) + { + int nr_rapl_pmu = topology_max_packages(); +- int rapl_pmu_scope = PERF_PMU_SCOPE_PKG; ++ struct rapl_pmus *rapl_pmus; + +- if (!rapl_pmu_is_pkg_scope()) { +- nr_rapl_pmu *= topology_max_dies_per_package(); +- rapl_pmu_scope = PERF_PMU_SCOPE_DIE; +- } ++ /* ++ * rapl_pmu_scope must be either PKG, DIE or CORE ++ */ ++ if (rapl_pmu_scope == PERF_PMU_SCOPE_DIE) ++ nr_rapl_pmu *= topology_max_dies_per_package(); ++ else if (rapl_pmu_scope == PERF_PMU_SCOPE_CORE) ++ nr_rapl_pmu *= topology_num_cores_per_package(); ++ else if (rapl_pmu_scope != PERF_PMU_SCOPE_PKG) ++ return -EINVAL; + +- rapl_pmus = kzalloc(struct_size(rapl_pmus, pmus, nr_rapl_pmu), GFP_KERNEL); ++ rapl_pmus = kzalloc(struct_size(rapl_pmus, rapl_pmu, nr_rapl_pmu), GFP_KERNEL); + if (!rapl_pmus) + return -ENOMEM; + ++ *rapl_pmus_ptr = rapl_pmus; ++ + rapl_pmus->nr_rapl_pmu = nr_rapl_pmu; + rapl_pmus->pmu.attr_groups = rapl_attr_groups; + rapl_pmus->pmu.attr_update = rapl_attr_update; +@@ -680,75 +765,77 @@ static int __init init_rapl_pmus(void) + rapl_pmus->pmu.module = THIS_MODULE; + rapl_pmus->pmu.capabilities = PERF_PMU_CAP_NO_EXCLUDE; + +- return init_rapl_pmu(); ++ return init_rapl_pmu(rapl_pmus); + } + + static struct rapl_model model_snb = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_snbep = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsw = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_hsx = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_knl = { +- .events = BIT(PERF_RAPL_PKG) | ++ .pkg_events = BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_HSW, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_skl = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PP1) | + BIT(PERF_RAPL_PSYS), + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_msrs, ++ .rapl_pkg_msrs = intel_rapl_msrs, + }; + + static struct rapl_model model_spr = { +- .events = BIT(PERF_RAPL_PP0) | ++ .pkg_events = BIT(PERF_RAPL_PP0) | + BIT(PERF_RAPL_PKG) | + 
BIT(PERF_RAPL_RAM) | + BIT(PERF_RAPL_PSYS), + .unit_quirk = RAPL_UNIT_QUIRK_INTEL_SPR, + .msr_power_unit = MSR_RAPL_POWER_UNIT, +- .rapl_msrs = intel_rapl_spr_msrs, ++ .rapl_pkg_msrs = intel_rapl_spr_msrs, + }; + + static struct rapl_model model_amd_hygon = { +- .events = BIT(PERF_RAPL_PKG), ++ .pkg_events = BIT(PERF_RAPL_PKG), ++ .core_events = BIT(PERF_RAPL_CORE), + .msr_power_unit = MSR_AMD_RAPL_POWER_UNIT, +- .rapl_msrs = amd_rapl_msrs, ++ .rapl_pkg_msrs = amd_rapl_pkg_msrs, ++ .rapl_core_msrs = amd_rapl_core_msrs, + }; + + static const struct x86_cpu_id rapl_model_match[] __initconst = { +@@ -804,45 +891,73 @@ MODULE_DEVICE_TABLE(x86cpu, rapl_model_match); + static int __init rapl_pmu_init(void) + { + const struct x86_cpu_id *id; +- struct rapl_model *rm; ++ int rapl_pkg_pmu_scope = PERF_PMU_SCOPE_DIE; + int ret; + ++ if (rapl_pkg_pmu_is_pkg_scope()) ++ rapl_pkg_pmu_scope = PERF_PMU_SCOPE_PKG; ++ + id = x86_match_cpu(rapl_model_match); + if (!id) + return -ENODEV; + +- rm = (struct rapl_model *) id->driver_data; +- +- rapl_msrs = rm->rapl_msrs; ++ rapl_model = (struct rapl_model *) id->driver_data; + +- rapl_cntr_mask = perf_msr_probe(rapl_msrs, PERF_RAPL_MAX, +- false, (void *) &rm->events); +- +- ret = rapl_check_hw_unit(rm); ++ ret = rapl_check_hw_unit(); + if (ret) + return ret; + +- ret = init_rapl_pmus(); ++ ret = init_rapl_pmus(&rapl_pmus_pkg, rapl_pkg_pmu_scope, rapl_attr_groups, ++ rapl_attr_update); + if (ret) + return ret; + +- ret = perf_pmu_register(&rapl_pmus->pmu, "power", -1); ++ rapl_pmus_pkg->cntr_mask = perf_msr_probe(rapl_model->rapl_pkg_msrs, ++ PERF_RAPL_PKG_EVENTS_MAX, false, ++ (void *) &rapl_model->pkg_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_pkg->pmu, "power", -1); + if (ret) + goto out; + ++ if (rapl_model->core_events) { ++ ret = init_rapl_pmus(&rapl_pmus_core, PERF_PMU_SCOPE_CORE, ++ rapl_core_attr_groups, ++ rapl_core_attr_update); ++ if (ret) { ++ pr_warn("power-core PMU initialization failed (%d)\n", ret); ++ goto core_init_failed; ++ } ++ ++ rapl_pmus_core->cntr_mask = perf_msr_probe(rapl_model->rapl_core_msrs, ++ PERF_RAPL_CORE_EVENTS_MAX, false, ++ (void *) &rapl_model->core_events); ++ ++ ret = perf_pmu_register(&rapl_pmus_core->pmu, "power_core", -1); ++ if (ret) { ++ pr_warn("power-core PMU registration failed (%d)\n", ret); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ } ++ ++core_init_failed: + rapl_advertise(); + return 0; + + out: + pr_warn("Initialization failed (%d), disabled\n", ret); +- cleanup_rapl_pmus(); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + return ret; + } + module_init(rapl_pmu_init); + + static void __exit intel_rapl_exit(void) + { +- perf_pmu_unregister(&rapl_pmus->pmu); +- cleanup_rapl_pmus(); ++ if (rapl_pmus_core) { ++ perf_pmu_unregister(&rapl_pmus_core->pmu); ++ cleanup_rapl_pmus(rapl_pmus_core); ++ } ++ perf_pmu_unregister(&rapl_pmus_pkg->pmu); ++ cleanup_rapl_pmus(rapl_pmus_pkg); + } + module_exit(intel_rapl_exit); +diff --git a/arch/x86/include/asm/processor.h b/arch/x86/include/asm/processor.h +index 20e6009381ed..c0cd10182e90 100644 +--- a/arch/x86/include/asm/processor.h ++++ b/arch/x86/include/asm/processor.h +@@ -98,6 +98,7 @@ struct cpuinfo_topology { + // Logical ID mappings + u32 logical_pkg_id; + u32 logical_die_id; ++ u32 logical_core_id; + + // AMD Node ID and Nodes per Package info + u32 amd_node_id; +diff --git a/arch/x86/include/asm/topology.h b/arch/x86/include/asm/topology.h +index 63bab25a4896..ec134b719144 100644 +--- a/arch/x86/include/asm/topology.h ++++ b/arch/x86/include/asm/topology.h +@@ 
-143,6 +143,7 @@ extern const struct cpumask *cpu_clustergroup_mask(int cpu); + #define topology_logical_package_id(cpu) (cpu_data(cpu).topo.logical_pkg_id) + #define topology_physical_package_id(cpu) (cpu_data(cpu).topo.pkg_id) + #define topology_logical_die_id(cpu) (cpu_data(cpu).topo.logical_die_id) ++#define topology_logical_core_id(cpu) (cpu_data(cpu).topo.logical_core_id) + #define topology_die_id(cpu) (cpu_data(cpu).topo.die_id) + #define topology_core_id(cpu) (cpu_data(cpu).topo.core_id) + #define topology_ppin(cpu) (cpu_data(cpu).ppin) +diff --git a/arch/x86/kernel/cpu/debugfs.c b/arch/x86/kernel/cpu/debugfs.c +index 10719aba6276..cacfd3f6abef 100644 +--- a/arch/x86/kernel/cpu/debugfs.c ++++ b/arch/x86/kernel/cpu/debugfs.c +@@ -25,6 +25,7 @@ static int cpu_debug_show(struct seq_file *m, void *p) + seq_printf(m, "cpu_type: %s\n", get_topology_cpu_type_name(c)); + seq_printf(m, "logical_pkg_id: %u\n", c->topo.logical_pkg_id); + seq_printf(m, "logical_die_id: %u\n", c->topo.logical_die_id); ++ seq_printf(m, "logical_core_id: %u\n", c->topo.logical_core_id); + seq_printf(m, "llc_id: %u\n", c->topo.llc_id); + seq_printf(m, "l2c_id: %u\n", c->topo.l2c_id); + seq_printf(m, "amd_node_id: %u\n", c->topo.amd_node_id); +diff --git a/arch/x86/kernel/cpu/topology_common.c b/arch/x86/kernel/cpu/topology_common.c +index 8277c64f88db..b5a5e1411469 100644 +--- a/arch/x86/kernel/cpu/topology_common.c ++++ b/arch/x86/kernel/cpu/topology_common.c +@@ -185,6 +185,7 @@ static void topo_set_ids(struct topo_scan *tscan, bool early) + if (!early) { + c->topo.logical_pkg_id = topology_get_logical_id(apicid, TOPO_PKG_DOMAIN); + c->topo.logical_die_id = topology_get_logical_id(apicid, TOPO_DIE_DOMAIN); ++ c->topo.logical_core_id = topology_get_logical_id(apicid, TOPO_CORE_DOMAIN); + } + + /* Package relative core ID */ +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0010-pksm.patch b/sys-kernel/gentoo-sources-6.13/0010-pksm.patch new file mode 100644 index 0000000..2ec1324 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0010-pksm.patch @@ -0,0 +1,433 @@ +From 9c28765934eafaff8d73a642512b2b6118aea976 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:32:25 +0100 +Subject: [PATCH 10/12] pksm + +Signed-off-by: Peter Jung +--- + arch/alpha/kernel/syscalls/syscall.tbl | 3 + + arch/arm/tools/syscall.tbl | 3 + + arch/m68k/kernel/syscalls/syscall.tbl | 3 + + arch/microblaze/kernel/syscalls/syscall.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n32.tbl | 3 + + arch/mips/kernel/syscalls/syscall_n64.tbl | 3 + + arch/mips/kernel/syscalls/syscall_o32.tbl | 3 + + arch/parisc/kernel/syscalls/syscall.tbl | 3 + + arch/powerpc/kernel/syscalls/syscall.tbl | 3 + + arch/s390/kernel/syscalls/syscall.tbl | 3 + + arch/sh/kernel/syscalls/syscall.tbl | 3 + + arch/sparc/kernel/syscalls/syscall.tbl | 3 + + arch/x86/entry/syscalls/syscall_32.tbl | 3 + + arch/x86/entry/syscalls/syscall_64.tbl | 3 + + arch/xtensa/kernel/syscalls/syscall.tbl | 3 + + include/linux/syscalls.h | 3 + + include/uapi/asm-generic/unistd.h | 9 +- + kernel/sys.c | 138 ++++++++++++++++++ + kernel/sys_ni.c | 3 + + scripts/syscall.tbl | 3 + + .../arch/powerpc/entry/syscalls/syscall.tbl | 3 + + .../perf/arch/s390/entry/syscalls/syscall.tbl | 3 + + 22 files changed, 206 insertions(+), 1 deletion(-) + +diff --git a/arch/alpha/kernel/syscalls/syscall.tbl b/arch/alpha/kernel/syscalls/syscall.tbl +index c59d53d6d3f3..121696f903e8 100644 +--- a/arch/alpha/kernel/syscalls/syscall.tbl ++++ b/arch/alpha/kernel/syscalls/syscall.tbl +@@ -506,3 
+506,6 @@ + 574 common getxattrat sys_getxattrat + 575 common listxattrat sys_listxattrat + 576 common removexattrat sys_removexattrat ++577 common process_ksm_enable sys_process_ksm_enable ++578 common process_ksm_disable sys_process_ksm_disable ++579 common process_ksm_status sys_process_ksm_status +diff --git a/arch/arm/tools/syscall.tbl b/arch/arm/tools/syscall.tbl +index 49eeb2ad8dbd..1ce4d983b5b2 100644 +--- a/arch/arm/tools/syscall.tbl ++++ b/arch/arm/tools/syscall.tbl +@@ -481,3 +481,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/m68k/kernel/syscalls/syscall.tbl b/arch/m68k/kernel/syscalls/syscall.tbl +index f5ed71f1910d..17e865370d37 100644 +--- a/arch/m68k/kernel/syscalls/syscall.tbl ++++ b/arch/m68k/kernel/syscalls/syscall.tbl +@@ -466,3 +466,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/microblaze/kernel/syscalls/syscall.tbl b/arch/microblaze/kernel/syscalls/syscall.tbl +index 680f568b77f2..64740e895587 100644 +--- a/arch/microblaze/kernel/syscalls/syscall.tbl ++++ b/arch/microblaze/kernel/syscalls/syscall.tbl +@@ -472,3 +472,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n32.tbl b/arch/mips/kernel/syscalls/syscall_n32.tbl +index 0b9b7e25b69a..bfafb91a2eda 100644 +--- a/arch/mips/kernel/syscalls/syscall_n32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n32.tbl +@@ -405,3 +405,6 @@ + 464 n32 getxattrat sys_getxattrat + 465 n32 listxattrat sys_listxattrat + 466 n32 removexattrat sys_removexattrat ++467 n32 process_ksm_enable sys_process_ksm_enable ++468 n32 process_ksm_disable sys_process_ksm_disable ++469 n32 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_n64.tbl b/arch/mips/kernel/syscalls/syscall_n64.tbl +index c844cd5cda62..39d446aeac64 100644 +--- a/arch/mips/kernel/syscalls/syscall_n64.tbl ++++ b/arch/mips/kernel/syscalls/syscall_n64.tbl +@@ -381,3 +381,6 @@ + 464 n64 getxattrat sys_getxattrat + 465 n64 listxattrat sys_listxattrat + 466 n64 removexattrat sys_removexattrat ++467 n64 process_ksm_enable sys_process_ksm_enable ++468 n64 process_ksm_disable sys_process_ksm_disable ++469 n64 process_ksm_status sys_process_ksm_status +diff --git a/arch/mips/kernel/syscalls/syscall_o32.tbl b/arch/mips/kernel/syscalls/syscall_o32.tbl +index 349b8aad1159..61536c55715a 100644 +--- a/arch/mips/kernel/syscalls/syscall_o32.tbl ++++ b/arch/mips/kernel/syscalls/syscall_o32.tbl +@@ -454,3 +454,6 @@ + 464 o32 getxattrat sys_getxattrat + 465 o32 listxattrat sys_listxattrat + 466 o32 removexattrat sys_removexattrat ++467 o32 process_ksm_enable sys_process_ksm_enable ++468 o32 process_ksm_disable sys_process_ksm_disable ++469 o32 process_ksm_status sys_process_ksm_status +diff --git a/arch/parisc/kernel/syscalls/syscall.tbl 
b/arch/parisc/kernel/syscalls/syscall.tbl +index d9fc94c86965..85dca5afcf06 100644 +--- a/arch/parisc/kernel/syscalls/syscall.tbl ++++ b/arch/parisc/kernel/syscalls/syscall.tbl +@@ -465,3 +465,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/powerpc/kernel/syscalls/syscall.tbl b/arch/powerpc/kernel/syscalls/syscall.tbl +index d8b4ab78bef0..57aa958c1b97 100644 +--- a/arch/powerpc/kernel/syscalls/syscall.tbl ++++ b/arch/powerpc/kernel/syscalls/syscall.tbl +@@ -557,3 +557,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/s390/kernel/syscalls/syscall.tbl b/arch/s390/kernel/syscalls/syscall.tbl +index e9115b4d8b63..2afc778f2d17 100644 +--- a/arch/s390/kernel/syscalls/syscall.tbl ++++ b/arch/s390/kernel/syscalls/syscall.tbl +@@ -469,3 +469,6 @@ + 464 common getxattrat sys_getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +diff --git a/arch/sh/kernel/syscalls/syscall.tbl b/arch/sh/kernel/syscalls/syscall.tbl +index c8cad33bf250..dfe06a84d902 100644 +--- a/arch/sh/kernel/syscalls/syscall.tbl ++++ b/arch/sh/kernel/syscalls/syscall.tbl +@@ -470,3 +470,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/sparc/kernel/syscalls/syscall.tbl b/arch/sparc/kernel/syscalls/syscall.tbl +index 727f99d333b3..4c43b0d2d09f 100644 +--- a/arch/sparc/kernel/syscalls/syscall.tbl ++++ b/arch/sparc/kernel/syscalls/syscall.tbl +@@ -512,3 +512,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_32.tbl b/arch/x86/entry/syscalls/syscall_32.tbl +index 4d0fb2fba7e2..a63252b84261 100644 +--- a/arch/x86/entry/syscalls/syscall_32.tbl ++++ b/arch/x86/entry/syscalls/syscall_32.tbl +@@ -472,3 +472,6 @@ + 464 i386 getxattrat sys_getxattrat + 465 i386 listxattrat sys_listxattrat + 466 i386 removexattrat sys_removexattrat ++467 i386 process_ksm_enable sys_process_ksm_enable ++468 i386 process_ksm_disable sys_process_ksm_disable ++469 i386 process_ksm_status sys_process_ksm_status +diff --git a/arch/x86/entry/syscalls/syscall_64.tbl b/arch/x86/entry/syscalls/syscall_64.tbl +index 5eb708bff1c7..b5fe77405938 100644 +--- a/arch/x86/entry/syscalls/syscall_64.tbl ++++ b/arch/x86/entry/syscalls/syscall_64.tbl +@@ -390,6 +390,9 @@ + 464 common 
getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status + + # + # Due to a historical design error, certain syscalls are numbered differently +diff --git a/arch/xtensa/kernel/syscalls/syscall.tbl b/arch/xtensa/kernel/syscalls/syscall.tbl +index 37effc1b134e..5c944f0dcc20 100644 +--- a/arch/xtensa/kernel/syscalls/syscall.tbl ++++ b/arch/xtensa/kernel/syscalls/syscall.tbl +@@ -437,3 +437,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/include/linux/syscalls.h b/include/linux/syscalls.h +index c6333204d451..00400d99eef3 100644 +--- a/include/linux/syscalls.h ++++ b/include/linux/syscalls.h +@@ -831,6 +831,9 @@ asmlinkage long sys_madvise(unsigned long start, size_t len, int behavior); + asmlinkage long sys_process_madvise(int pidfd, const struct iovec __user *vec, + size_t vlen, int behavior, unsigned int flags); + asmlinkage long sys_process_mrelease(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_enable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_disable(int pidfd, unsigned int flags); ++asmlinkage long sys_process_ksm_status(int pidfd, unsigned int flags); + asmlinkage long sys_remap_file_pages(unsigned long start, unsigned long size, + unsigned long prot, unsigned long pgoff, + unsigned long flags); +diff --git a/include/uapi/asm-generic/unistd.h b/include/uapi/asm-generic/unistd.h +index 88dc393c2bca..34d73f16b478 100644 +--- a/include/uapi/asm-generic/unistd.h ++++ b/include/uapi/asm-generic/unistd.h +@@ -850,8 +850,15 @@ __SYSCALL(__NR_listxattrat, sys_listxattrat) + #define __NR_removexattrat 466 + __SYSCALL(__NR_removexattrat, sys_removexattrat) + ++#define __NR_process_ksm_enable 467 ++__SYSCALL(__NR_process_ksm_enable, sys_process_ksm_enable) ++#define __NR_process_ksm_disable 468 ++__SYSCALL(__NR_process_ksm_disable, sys_process_ksm_disable) ++#define __NR_process_ksm_status 469 ++__SYSCALL(__NR_process_ksm_status, sys_process_ksm_status) ++ + #undef __NR_syscalls +-#define __NR_syscalls 467 ++#define __NR_syscalls 470 + + /* + * 32 bit systems traditionally used different +diff --git a/kernel/sys.c b/kernel/sys.c +index c4c701c6f0b4..8806d113f5db 100644 +--- a/kernel/sys.c ++++ b/kernel/sys.c +@@ -2816,6 +2816,144 @@ SYSCALL_DEFINE5(prctl, int, option, unsigned long, arg2, unsigned long, arg3, + return error; + } + ++#ifdef CONFIG_KSM ++enum pkc_action { ++ PKSM_ENABLE = 0, ++ PKSM_DISABLE, ++ PKSM_STATUS, ++}; ++ ++static long do_process_ksm_control(int pidfd, enum pkc_action action) ++{ ++ long ret; ++ struct task_struct *task; ++ struct mm_struct *mm; ++ unsigned int f_flags; ++ ++ task = pidfd_get_task(pidfd, &f_flags); ++ if (IS_ERR(task)) { ++ ret = PTR_ERR(task); ++ goto out; ++ } ++ ++ /* Require PTRACE_MODE_READ to avoid leaking ASLR metadata. */ ++ mm = mm_access(task, PTRACE_MODE_READ_FSCREDS); ++ if (IS_ERR_OR_NULL(mm)) { ++ ret = IS_ERR(mm) ? PTR_ERR(mm) : -ESRCH; ++ goto release_task; ++ } ++ ++ /* Require CAP_SYS_NICE for influencing process performance. 
*/ ++ if (!capable(CAP_SYS_NICE)) { ++ ret = -EPERM; ++ goto release_mm; ++ } ++ ++ if (mmap_write_lock_killable(mm)) { ++ ret = -EINTR; ++ goto release_mm; ++ } ++ ++ switch (action) { ++ case PKSM_ENABLE: ++ ret = ksm_enable_merge_any(mm); ++ break; ++ case PKSM_DISABLE: ++ ret = ksm_disable_merge_any(mm); ++ break; ++ case PKSM_STATUS: ++ ret = !!test_bit(MMF_VM_MERGE_ANY, &mm->flags); ++ break; ++ } ++ ++ mmap_write_unlock(mm); ++ ++release_mm: ++ mmput(mm); ++release_task: ++ put_task_struct(task); ++out: ++ return ret; ++} ++#endif /* CONFIG_KSM */ ++ ++SYSCALL_DEFINE2(process_ksm_enable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_ENABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_disable, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_DISABLE); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++SYSCALL_DEFINE2(process_ksm_status, int, pidfd, unsigned int, flags) ++{ ++#ifdef CONFIG_KSM ++ if (flags != 0) ++ return -EINVAL; ++ ++ return do_process_ksm_control(pidfd, PKSM_STATUS); ++#else /* CONFIG_KSM */ ++ return -ENOSYS; ++#endif /* CONFIG_KSM */ ++} ++ ++#ifdef CONFIG_KSM ++static ssize_t process_ksm_enable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_enable); ++} ++static struct kobj_attribute process_ksm_enable_attr = __ATTR_RO(process_ksm_enable); ++ ++static ssize_t process_ksm_disable_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_disable); ++} ++static struct kobj_attribute process_ksm_disable_attr = __ATTR_RO(process_ksm_disable); ++ ++static ssize_t process_ksm_status_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return sprintf(buf, "%u\n", __NR_process_ksm_status); ++} ++static struct kobj_attribute process_ksm_status_attr = __ATTR_RO(process_ksm_status); ++ ++static struct attribute *process_ksm_sysfs_attrs[] = { ++ &process_ksm_enable_attr.attr, ++ &process_ksm_disable_attr.attr, ++ &process_ksm_status_attr.attr, ++ NULL, ++}; ++ ++static const struct attribute_group process_ksm_sysfs_attr_group = { ++ .attrs = process_ksm_sysfs_attrs, ++ .name = "process_ksm", ++}; ++ ++static int __init process_ksm_sysfs_init(void) ++{ ++ return sysfs_create_group(kernel_kobj, &process_ksm_sysfs_attr_group); ++} ++subsys_initcall(process_ksm_sysfs_init); ++#endif /* CONFIG_KSM */ ++ + SYSCALL_DEFINE3(getcpu, unsigned __user *, cpup, unsigned __user *, nodep, + struct getcpu_cache __user *, unused) + { +diff --git a/kernel/sys_ni.c b/kernel/sys_ni.c +index c00a86931f8c..d82213d68522 100644 +--- a/kernel/sys_ni.c ++++ b/kernel/sys_ni.c +@@ -186,6 +186,9 @@ COND_SYSCALL(mincore); + COND_SYSCALL(madvise); + COND_SYSCALL(process_madvise); + COND_SYSCALL(process_mrelease); ++COND_SYSCALL(process_ksm_enable); ++COND_SYSCALL(process_ksm_disable); ++COND_SYSCALL(process_ksm_status); + COND_SYSCALL(remap_file_pages); + COND_SYSCALL(mbind); + COND_SYSCALL(get_mempolicy); +diff --git a/scripts/syscall.tbl b/scripts/syscall.tbl +index ebbdb3c42e9f..b19b6bfe5cd4 100644 +--- a/scripts/syscall.tbl ++++ b/scripts/syscall.tbl +@@ -407,3 +407,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common 
removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +index d8b4ab78bef0..57aa958c1b97 100644 +--- a/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/powerpc/entry/syscalls/syscall.tbl +@@ -557,3 +557,6 @@ + 464 common getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status +diff --git a/tools/perf/arch/s390/entry/syscalls/syscall.tbl b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +index e9115b4d8b63..2afc778f2d17 100644 +--- a/tools/perf/arch/s390/entry/syscalls/syscall.tbl ++++ b/tools/perf/arch/s390/entry/syscalls/syscall.tbl +@@ -469,3 +469,6 @@ + 464 common getxattrat sys_getxattrat sys_getxattrat + 465 common listxattrat sys_listxattrat sys_listxattrat + 466 common removexattrat sys_removexattrat sys_removexattrat ++467 common process_ksm_enable sys_process_ksm_enable sys_process_ksm_enable ++468 common process_ksm_disable sys_process_ksm_disable sys_process_ksm_disable ++469 common process_ksm_status sys_process_ksm_status sys_process_ksm_status +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.13/0012-zstd.patch b/sys-kernel/gentoo-sources-6.13/0012-zstd.patch new file mode 100644 index 0000000..df7f814 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.13/0012-zstd.patch @@ -0,0 +1,23530 @@ +From 0b468cb06e1605b1cdb08b8c16d6d775ce653cf2 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 7 Mar 2025 19:33:03 +0100 +Subject: [PATCH 12/12] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 86 +- + include/linux/zstd_errors.h | 30 +- + include/linux/zstd_lib.h | 1123 ++++-- + lib/zstd/Makefile | 3 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 150 + + lib/zstd/common/bitstream.h | 155 +- + lib/zstd/common/compiler.h | 151 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 37 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 13 +- + lib/zstd/common/error_private.h | 88 +- + lib/zstd/common/fse.h | 103 +- + lib/zstd/common/fse_decompress.c | 132 +- + lib/zstd/common/huf.h | 240 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 45 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 153 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 13 +- + lib/zstd/compress/hist.h | 10 +- + lib/zstd/compress/huf_compress.c | 441 ++- + lib/zstd/compress/zstd_compress.c | 3289 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 621 +++- + lib/zstd/compress/zstd_compress_literals.c | 157 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 21 +- + lib/zstd/compress/zstd_compress_sequences.h | 16 +- + lib/zstd/compress/zstd_compress_superblock.c | 394 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 222 +- + lib/zstd/compress/zstd_double_fast.c | 245 +- + lib/zstd/compress/zstd_double_fast.h | 27 +- + 
lib/zstd/compress/zstd_fast.c | 703 +++- + lib/zstd/compress/zstd_fast.h | 16 +- + lib/zstd/compress/zstd_lazy.c | 840 +++-- + lib/zstd/compress/zstd_lazy.h | 195 +- + lib/zstd/compress/zstd_ldm.c | 102 +- + lib/zstd/compress/zstd_ldm.h | 17 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 571 +-- + lib/zstd/compress/zstd_opt.h | 55 +- + lib/zstd/compress/zstd_preSplit.c | 239 ++ + lib/zstd/compress/zstd_preSplit.h | 34 + + lib/zstd/decompress/huf_decompress.c | 887 +++-- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 377 +- + lib/zstd/decompress/zstd_decompress_block.c | 724 ++-- + lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 19 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 75 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 60 files changed, 8747 insertions(+), 4380 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + create mode 100644 lib/zstd/compress/zstd_preSplit.c + create mode 100644 lib/zstd/compress/zstd_preSplit.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index b2c7cf310c8f..d7be07c887e7 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -160,6 +160,19 @@ typedef ZSTD_parameters zstd_parameters; + zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); + ++typedef ZSTD_CCtx zstd_cctx; ++typedef ZSTD_cParameter zstd_cparameter; ++ ++/** ++ * zstd_cctx_set_param() - sets a compression parameter ++ * @cctx: The context. Must have been initialized with zstd_init_cctx(). ++ * @param: The parameter to set. ++ * @value: The value to set the parameter to. ++ * ++ * Return: Zero or an error, which can be checked using zstd_is_error(). ++ */ ++size_t zstd_cctx_set_param(zstd_cctx *cctx, zstd_cparameter param, int value); ++ + + /** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level +@@ -175,8 +188,6 @@ zstd_compression_parameters zstd_get_cparams(int level, + + /* ====== Single-pass Compression ====== */ + +-typedef ZSTD_CCtx zstd_cctx; +- + /** + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx + * @parameters: The compression parameters to be used. +@@ -190,6 +201,20 @@ typedef ZSTD_CCtx zstd_cctx; + */ + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); + ++/** ++ * zstd_cctx_workspace_bound_with_ext_seq_prod() - max memory needed to ++ * initialize a zstd_cctx when using the block-level external sequence ++ * producer API. ++ * @parameters: The compression parameters to be used. ++ * ++ * If multiple compression parameters might be used, the caller must call ++ * this function for each set of parameters and use the maximum size. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cctx(). ++ */ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *parameters); ++ + /** + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. 
It must outlive +@@ -424,6 +449,16 @@ typedef ZSTD_CStream zstd_cstream; + */ + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); + ++/** ++ * zstd_cstream_workspace_bound_with_ext_seq_prod() - memory needed to initialize ++ * a zstd_cstream when using the block-level external sequence producer API. ++ * @cparams: The compression parameters to be used for compression. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cstream(). ++ */ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *cparams); ++ + /** + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. +@@ -583,6 +618,18 @@ size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + */ + size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + ++/** ++ * zstd_register_sequence_producer() - exposes the zstd library function ++ * ZSTD_registerSequenceProducer(). This is used for the block-level external ++ * sequence producer API. See upstream zstd.h for detailed documentation. ++ */ ++typedef ZSTD_sequenceProducer_F zstd_sequence_producer_f; ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++); ++ + /** + * struct zstd_frame_params - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not +@@ -596,7 +643,7 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + * + * See zstd_lib.h. + */ +-typedef ZSTD_frameHeader zstd_frame_header; ++typedef ZSTD_FrameHeader zstd_frame_header; + + /** + * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame +@@ -611,4 +658,35 @@ typedef ZSTD_frameHeader zstd_frame_header; + size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); + ++/** ++ * struct zstd_sequence - a sequence of literals or a match ++ * ++ * @offset: The offset of the match ++ * @litLength: The literal length of the sequence ++ * @matchLength: The match length of the sequence ++ * @rep: Represents which repeat offset is used ++ */ ++typedef ZSTD_Sequence zstd_sequence; ++ ++/** ++ * zstd_compress_sequences_and_literals() - compress an array of zstd_sequence and literals ++ * ++ * @cctx: The zstd compression context. ++ * @dst: The buffer to compress the data into. ++ * @dst_capacity: The size of the destination buffer. ++ * @in_seqs: The array of zstd_sequence to compress. ++ * @in_seqs_size: The number of sequences in in_seqs. ++ * @literals: The literals associated to the sequences to be compressed. ++ * @lit_size: The size of the literals in the literals buffer. ++ * @lit_capacity: The size of the literals buffer. ++ * @decompressed_size: The size of the input data ++ * ++ * Return: The compressed size or an error, which can be checked using ++ * zstd_is_error(). 
++ */ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size); ++ + #endif /* LINUX_ZSTD_H */ +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..c307fb011132 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,13 +13,18 @@ + #define ZSTD_ERRORS_H_398273423 + + +-/*===== dependency =====*/ +-#include /* size_t */ ++/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ ++#define ZSTDERRORLIB_VISIBLE + ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif + +-/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +49,18 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_cannotProduce_uncompressedBlock = 49, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,18 +68,18 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +-/*! 
ZSTD_getErrorCode() : +- convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, +- which can be used to compare with enum list published above */ +-ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); + ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..e295d4125dde 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,47 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ +-#include /* INT_MAX */ ++ ++/* ====== Dependencies ======*/ + #include /* size_t */ + ++#include /* list of errors */ ++#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) ++#include /* INT_MAX */ ++#endif /* ZSTD_STATIC_LINKING_ONLY */ ++ + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +90,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 7 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -103,11 +128,12 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + + + /* ************************************* +-* Simple API ++* Simple Core API + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -115,47 +141,55 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + int compressionLevel); + + /*! ZSTD_decompress() : +- * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. +- * `dstCapacity` is an upper bound of originalSize to regenerate. +- * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. +- * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), +- * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ++ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. ++ * Multiple compressed frames can be decompressed at once with this method. ++ * The result will be the concatenation of all decompressed frames, back to back. ++ * `dstCapacity` is an upper bound of originalSize to regenerate. ++ * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). ++ * If maximum upper bound isn't known, prefer using streaming mode to decompress data. ++ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), ++ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + ++ ++/*====== Decompression helper functions ======*/ ++ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ +- * `src` should point to the start of a ZSTD encoded frame. +- * `srcSize` must be at least as large as the frame header. +- * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. +- * @return : - decompressed size of `src` frame content, if known +- * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +- * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) +- * note 1 : a 0 return value means the frame is valid but "empty". +- * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. +- * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. +- * In which case, it's necessary to use streaming mode to decompress data. +- * Optionally, application can rely on some implicit limit, +- * as ZSTD_decompress() only needs an upper bound of decompressed size. +- * (For example, data could be necessarily cut into blocks <= 16 KB). +- * note 3 : decompressed size is always present when compression is completed using single-pass functions, +- * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). +- * note 4 : decompressed size can be very large (64-bits value), +- * potentially larger than what local system can handle as a single memory segment. +- * In which case, it's necessary to use streaming mode to decompress data. +- * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. +- * Always ensure return value fits within application's authorized limits. +- * Each application can set its own limits. +- * note 6 : This function replaces ZSTD_getDecompressedSize() */ ++ * `src` should point to the start of a ZSTD encoded frame. ++ * `srcSize` must be at least as large as the frame header. 
++ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. ++ * @return : - decompressed size of `src` frame content, if known ++ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined ++ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) ++ * note 1 : a 0 return value means the frame is valid but "empty". ++ * When invoking this method on a skippable frame, it will return 0. ++ * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). ++ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * Optionally, application can rely on some implicit limit, ++ * as ZSTD_decompress() only needs an upper bound of decompressed size. ++ * (For example, data could be necessarily cut into blocks <= 16 KB). ++ * note 3 : decompressed size is always present when compression is completed using single-pass functions, ++ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). ++ * note 4 : decompressed size can be very large (64-bits value), ++ * potentially larger than what local system can handle as a single memory segment. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. ++ * Always ensure return value fits within application's authorized limits. ++ * Each application can set its own limits. ++ * note 6 : This function replaces ZSTD_getDecompressedSize() */ + #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) + #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +-/*! ZSTD_getDecompressedSize() : +- * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). ++/*! ZSTD_getDecompressedSize() (obsolete): ++ * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") + ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ +@@ -163,18 +197,50 @@ ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, +- * or an error code if input is invalid */ ++ * or an error code if input is invalid ++ * Note 1: this method is called _find*() because it's not enough to read the header, ++ * it may have to scan through the frame's content, to reach its end. ++ * Note 2: this method also works with Skippable Frames. In which case, ++ * it returns the size of the complete skippable frame, ++ * which is always equal to its content size + 8 bytes for headers. */ + ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +-/*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +-ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +-ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +-ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +-ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ ++/*====== Compression helper functions ======*/ ++ ++/*! ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()`, or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize is too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++ ++ ++/*====== Error helper functions ======*/ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ ++ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ ++ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ ++ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ ++ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ++ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ ++ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ + + + /* ************************************* +@@ -182,25 +248,25 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + ***************************************/ + /*= Compression context + * When compressing many times, +- * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. 
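As a concrete illustration of the helpers above, ZSTD_COMPRESSBOUND() can size a static buffer at compile time, and ZSTD_isError()/ZSTD_getErrorName() are the standard way to test any size_t result. The sketch below is illustrative only; the helper name compress_message and the 1 KB limit are assumptions for the example.

#include <stdio.h>
#include <string.h>
#include <zstd.h>

#define MSG_MAX 1024
/* Compile-time worst-case bound, usable for static allocation as described above. */
static char compressedBuf[ZSTD_COMPRESSBOUND(MSG_MAX)];

/* Returns the compressed size stored in compressedBuf, or 0 on any failure (sketch). */
size_t compress_message(const char* msg)
{
    size_t const msgLen = strlen(msg) + 1;
    if (msgLen > MSG_MAX) return 0;
    size_t const r = ZSTD_compress(compressedBuf, sizeof(compressedBuf), msg, msgLen, 1);
    if (ZSTD_isError(r)) {
        /* Any zstd function returning size_t can be checked the same way. */
        fprintf(stderr, "zstd: %s\n", ZSTD_getErrorName(r));
        return 0;
    }
    return r;
}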
+- * This will make workload friendlier for system's memory. ++ * it is recommended to allocate a compression context just once, ++ * and reuse it for each successive compression operation. ++ * This will make the workload easier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +- * Note 2 : In multi-threaded environments, +- * use one different context per thread for parallel execution. ++ * Note 2: For parallel execution in multi-threaded environments, ++ * use one different context per thread . + */ + typedef struct ZSTD_CCtx_s ZSTD_CCtx; + ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +-ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ ++ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, +- * they will all be reset. Only `compressionLevel` remains. ++ * they will all be reset. Only @compressionLevel remains. + */ + ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -210,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +286,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +390,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". 
*/ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximately targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,15 +482,18 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences +- * ZSTD_c_useBlockSplitter ++ * ZSTD_c_blockSplitterLevel ++ * ZSTD_c_splitAfterSequences + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +503,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +512,12 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016, ++ ZSTD_c_experimentalParam20=1017 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +580,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +589,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. 
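A hedged sketch of the advanced one-shot path described in this hunk: parameters (including ZSTD_c_targetCBlockSize, stable only in v1.5.6+ as noted above) are pushed into the context with ZSTD_CCtx_setParameter(), then ZSTD_compress2() starts a new frame. The helper name compress_advanced and the chosen parameter values are illustrative, not part of zstd.

#include <stdlib.h>
#include <zstd.h>

/* Compress with sticky parameters via the advanced API (sketch). */
size_t compress_advanced(const void* src, size_t srcSize, void** dstOut)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (cctx == NULL) return 0;

    size_t const cap = ZSTD_compressBound(srcSize);
    void* const dst = malloc(cap);
    if (dst == NULL) { ZSTD_freeCCtx(cctx); return 0; }

    /* Sticky parameters: they apply to this and all subsequent frames on this cctx.
     * Return values are ignored here for brevity; real code should check them with ZSTD_isError(). */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, 19);
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 1);
    /* Stable since v1.5.6 per the note above; aim for roughly 1400-byte compressed blocks. */
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetCBlockSize, 1400);

    size_t const cSize = ZSTD_compress2(cctx, dst, cap, src, srcSize);
    ZSTD_freeCCtx(cctx);
    if (ZSTD_isError(cSize)) { free(dst); return 0; }
    *dstOut = dst;
    return cSize;
}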
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity`), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +632,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +697,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +793,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. ++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() or ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +826,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported.
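To make the ZSTD_compressStream2() contract above concrete, the following sketch streams one FILE* into another, switching to ZSTD_e_end on the last chunk, and resets the context if a call fails, as the error note above recommends. compress_file is a hypothetical helper, not a zstd API.

#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

/* Stream src file into dst file with ZSTD_compressStream2() (sketch). */
int compress_file(FILE* fin, FILE* fout, int level)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    size_t const inCap  = ZSTD_CStreamInSize();   /* recommended chunk sizes */
    size_t const outCap = ZSTD_CStreamOutSize();
    void* const inBuf  = malloc(inCap);
    void* const outBuf = malloc(outCap);
    int ret = 1;
    if (!cctx || !inBuf || !outBuf) goto cleanup;

    ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);

    for (;;) {
        size_t const readSz = fread(inBuf, 1, inCap, fin);
        int const lastChunk = (readSz < inCap);
        ZSTD_EndDirective const mode = lastChunk ? ZSTD_e_end : ZSTD_e_continue;
        ZSTD_inBuffer input = { inBuf, readSz, 0 };
        int finished = 0;
        while (!finished) {
            ZSTD_outBuffer output = { outBuf, outCap, 0 };
            size_t const remaining = ZSTD_compressStream2(cctx, &output, &input, mode);
            if (ZSTD_isError(remaining)) {
                /* After an error the cctx state is undefined; reset before any reuse. */
                ZSTD_CCtx_reset(cctx, ZSTD_reset_session_only);
                goto cleanup;
            }
            fwrite(outBuf, 1, output.pos, fout);
            /* On the last chunk, loop until the frame is fully flushed (remaining == 0);
             * otherwise, stop once the whole input chunk has been consumed. */
            finished = lastChunk ? (remaining == 0) : (input.pos == input.size);
        }
        if (lastChunk) { ret = 0; break; }
    }

cleanup:
    ZSTD_freeCCtx(cctx);
    free(inBuf); free(outBuf);
    return ret;
}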
+- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +834,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +857,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be re-employed multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -768,16 +867,21 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * The function will update both `pos` fields. + * If `input.pos < input.size`, some input has not been consumed. + * It's up to the caller to present again remaining data. ++* + * The function tries to flush all data decoded immediately, respecting output buffer size. + * If `output.pos < output.size`, decoder has flushed everything it could. +-* But if `output.pos == output.size`, there might be some data left within internal buffers., ++* ++* However, when `output.pos == output.size`, it's more difficult to know. ++* If @return > 0, the frame is not complete, meaning ++* either there is still some data left to flush within internal buffers, ++* or there is more input to read to complete the frame (or both). + * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. + * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : + * the return value is a suggested next input size (just a hint for better latency) +-* that will never request more than the remaining frame size. ++* that will never request more than the remaining content of the compressed frame. + * *******************************************************************************/ + + typedef ZSTD_DCtx ZSTD_DStream; /*< DCtx and DStream are now effectively same object (>= v1.3.0) */ +@@ -788,13 +892,38 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! 
ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder flushed internal output buffer. ++ * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, ++ * check ZSTD_decompressStream() @return value, ++ * if > 0, invoke it again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1042,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1054,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1068,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". 
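The ZSTD_decompressStream() usage pattern documented above corresponds to a loop like this hedged sketch (decompress_file is a made-up helper): keep feeding input and draining output, and treat a final return value of 0 as a completely decoded and flushed frame.

#include <stdio.h>
#include <stdlib.h>
#include <zstd.h>

/* Decompress a stream of zstd frames from fin to fout (sketch). */
int decompress_file(FILE* fin, FILE* fout)
{
    ZSTD_DCtx* const dctx = ZSTD_createDCtx();
    size_t const inCap  = ZSTD_DStreamInSize();
    size_t const outCap = ZSTD_DStreamOutSize();
    void* const inBuf  = malloc(inCap);
    void* const outBuf = malloc(outCap);
    size_t lastRet = 0;
    int ret = 1;
    if (!dctx || !inBuf || !outBuf) goto cleanup;

    for (;;) {
        size_t const readSz = fread(inBuf, 1, inCap, fin);
        if (readSz == 0) break;
        ZSTD_inBuffer input = { inBuf, readSz, 0 };
        while (input.pos < input.size) {
            ZSTD_outBuffer output = { outBuf, outCap, 0 };
            /* 0 == frame done and flushed; > 0 == more to decode or flush; error otherwise. */
            lastRet = ZSTD_decompressStream(dctx, &output, &input);
            if (ZSTD_isError(lastRet)) goto cleanup;
            fwrite(outBuf, 1, output.pos, fout);
        }
    }
    /* Input ended: the last frame must have been reported complete (return value 0). */
    if (lastRet == 0) ret = 0;

cleanup:
    ZSTD_freeDCtx(dctx);
    free(inBuf); free(outBuf);
    return ret;
}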
+- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1079,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1106,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1123,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1149,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). 
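As a brief sketch of the dictionary workflow described here, a prepared CDict builds the tables once and can then be referenced for every subsequent frame. The helper compress_with_cdict and the dictionary buffer it receives are assumptions for the example; it returns 0 if context or dictionary creation fails, otherwise the ZSTD_compress2() result (test with ZSTD_isError()).

#include <stdlib.h>
#include <zstd.h>

/* Compress one input against a prepared dictionary (sketch). */
size_t compress_with_cdict(const void* dictBuf, size_t dictSize,
                           const void* src, size_t srcSize,
                           void* dst, size_t dstCapacity)
{
    /* Build the dictionary tables once; the CDict can serve many contexts and frames. */
    ZSTD_CDict* const cdict = ZSTD_createCDict(dictBuf, dictSize, 3 /* level */);
    ZSTD_CCtx*  const cctx  = ZSTD_createCCtx();
    size_t cSize = 0;
    if (cdict && cctx) {
        /* The reference is sticky: it stays active until reset or replaced. */
        ZSTD_CCtx_refCDict(cctx, cdict);
        cSize = ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    }
    ZSTD_freeCCtx(cctx);
    ZSTD_freeCDict(cdict);
    return cSize;
}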
+ * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1051,6 +1189,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); + ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); + ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + ++ + #endif /* ZSTD_H_235446 */ + + +@@ -1066,29 +1205,12 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) + #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + ++ + /* This can be overridden externally to hide static symbols. */ + #ifndef ZSTDLIB_STATIC_API + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1245,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1269,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1188,7 +1311,7 @@ typedef struct { + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external +- * sequence provider's perspective. 
For example, ZSTD_compressSequences() does not ++ * sequence provider perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). + */ + } ZSTD_Sequence; +@@ -1293,17 +1416,18 @@ typedef enum { + } ZSTD_literalCompressionMode_e; + + typedef enum { +- /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final +- * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable +- * or ZSTD_ps_disable allow for a force enable/disable the feature. ++ /* Note: This enum controls features which are conditionally beneficial. ++ * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), ++ * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +-} ZSTD_paramSwitch_e; ++} ZSTD_ParamSwitch_e; ++#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1345,34 +1469,130 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, + ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + + /*! ZSTD_frameHeaderSize() : +- * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. ++ * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; ++#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_FrameHeader; ++#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. ++ * @return : 0 => header is complete, `zfhPtr` is correctly filled, ++ * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. 
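The ZSTD_FrameHeader structure and ZSTD_getFrameHeader() introduced above are static-linking-only; the sketch below assumes ZSTD_STATIC_LINKING_ONLY is defined before including the header and uses the older ZSTD_frameHeader spelling, which this hunk keeps as an alias. inspect_frame is an invented helper, not part of zstd.

#define ZSTD_STATIC_LINKING_ONLY
#include <stdio.h>
#include <zstd.h>

/* Print basic information about the first frame in (src, srcSize) (sketch). */
int inspect_frame(const void* src, size_t srcSize)
{
    ZSTD_frameHeader zfh;   /* old spelling, kept as an alias of ZSTD_FrameHeader */
    size_t const r = ZSTD_getFrameHeader(&zfh, src, srcSize);
    if (ZSTD_isError(r)) return -1;                          /* not a valid zstd frame */
    if (r > 0) { printf("need %zu header bytes\n", r); return -1; }

    printf("frame type    : %s\n", zfh.frameType == ZSTD_skippableFrame ? "skippable" : "zstd");
    if (zfh.frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN)
        printf("content size  : unknown (use streaming decompression)\n");
    else
        printf("content size  : %llu\n", zfh.frameContentSize);
    printf("window size   : %llu\n", zfh.windowSize);
    printf("dictionary id : %u\n", zfh.dictID);
    printf("checksum flag : %u\n", zfh.checksumFlag);
    return 0;
}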
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { +- ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ +- ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ +-} ZSTD_sequenceFormat_e; ++ ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ ++ ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ ++} ZSTD_SequenceFormat_e; ++#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ ++ ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); + + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! ++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). 
Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsCapacity The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,8 +1608,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. +- * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals ++ * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: +@@ -1398,11 +1620,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain +- * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * ++ * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes ++ * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit ++ * can vary greatly depending on Sequences. 
On the other hand, repcode resolution is an expensive operation. ++ * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). ++ * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. + * +- * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined +- * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for +- * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. ++ * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined ++ * behavior. If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for ++ * specifics regarding offset/matchlength requirements) and then bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. +@@ -1410,14 +1638,42 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md + * +- * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. +- * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, +- * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. +- */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. ++ * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, ++ * and cannot emit an RLE block that disagrees with the repcode history. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); ++ ++ ++/*! ZSTD_compressSequencesAndLiterals() : ++ * This is a variant of ZSTD_compressSequences() which, ++ * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), ++ * aka all the literals, already extracted and laid out into a single continuous buffer. ++ * This can be useful if the process generating the sequences also happens to generate the buffer of literals, ++ * thus skipping an extraction + caching stage. 
++ * It's a speed optimization, useful when the right conditions are met, ++ * but it also features the following limitations: ++ * - Only supports explicit delimiter mode ++ * - Currently does not support Sequences validation (so input Sequences are trusted) ++ * - Not compatible with frame checksum, which must be disabled ++ * - If any block is incompressible, will fail and return an error ++ * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. ++ * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. ++ * @litBufCapacity must be at least 8 bytes larger than @litSize. ++ * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t litBufCapacity, ++ size_t decompressedSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1425,8 +1681,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. +- * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so +- * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. ++ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, ++ * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). +@@ -1434,26 +1690,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * @return : number of bytes written or a ZSTD error. + */ + ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, unsigned magicVariant); ++ const void* src, size_t srcSize, ++ unsigned magicVariant); + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. + * +- * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, +- * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested +- * in the magicVariant. ++ * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, ++ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. ++ * This can be NULL if the caller is not interested in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, ++ const void* src, size_t srcSize); + + /*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +-ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); ++ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + +@@ -1464,48 +1722,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. 
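A small sketch of the skippable-frame helpers covered above (after this patch they are declared under ZSTD_STATIC_LINKING_ONLY): write a metadata blob into a skippable frame, verify it, and read it back. skippable_roundtrip and the embedded JSON string are invented for the example.

#define ZSTD_STATIC_LINKING_ONLY
#include <string.h>
#include <zstd.h>

/* Wrap a small metadata blob in a skippable frame, then read it back (sketch). */
int skippable_roundtrip(void)
{
    const char meta[] = "{\"codec\":\"zstd\",\"version\":1}";
    char frame[sizeof(meta) + 8];            /* skippable frame = content + 8-byte header */
    char back[sizeof(meta)];
    unsigned variant = 0;

    size_t const fSize = ZSTD_writeSkippableFrame(frame, sizeof(frame),
                                                  meta, sizeof(meta), 3 /* magic variant 0-15 */);
    if (ZSTD_isError(fSize)) return 1;

    if (!ZSTD_isSkippableFrame(frame, fSize)) return 1;

    size_t const rSize = ZSTD_readSkippableFrame(back, sizeof(back), &variant, frame, fSize);
    if (ZSTD_isError(rSize) || rSize != sizeof(meta) || variant != 3) return 1;
    return memcmp(back, meta, rSize) != 0;   /* 0 on success */
}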
++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1568,7 +1837,15 @@ typedef void (*ZSTD_freeFunction) (void* opaque, void* address); + typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; + static + __attribute__((__unused__)) ++ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic push ++#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" ++#endif + ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic pop ++#endif + + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +@@ -1649,22 +1926,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! 
ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1725,7 +2025,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * See the comments on that enum for an explanation of the feature. */ + #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +-/* Controlled with ZSTD_paramSwitch_e enum. ++/* Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals +@@ -1737,11 +2037,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2103,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. 
+ * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2120,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1871,22 +2166,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + /* ZSTD_c_validateSequences + * Default is 0 == disabled. Set to 1 to enable sequence validation. + * +- * For use with sequence compression API: ZSTD_compressSequences(). +- * Designates whether or not we validate sequences provided to ZSTD_compressSequences() ++ * For use with sequence compression API: ZSTD_compressSequences*(). ++ * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() + * during function execution. + * +- * Without validation, providing a sequence that does not conform to the zstd spec will cause +- * undefined behavior, and may produce a corrupted block. 
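A cautious sketch of the ZSTD_c_stableInBuffer usage described above: the whole input is already in memory, the same ZSTD_inBuffer object is passed on every call, and the frame is finished with ZSTD_e_end. compress_stable_input is a hypothetical helper, the parameter remains experimental (ZSTD_STATIC_LINKING_ONLY required), and the caller must keep 'src' valid and unmodified until compression ends.

#define ZSTD_STATIC_LINKING_ONLY
#include <zstd.h>

/* Compress an in-memory buffer with ZSTD_c_stableInBuffer enabled (sketch). */
size_t compress_stable_input(const void* src, size_t srcSize, void* dst, size_t dstCapacity)
{
    ZSTD_CCtx* const cctx = ZSTD_createCCtx();
    if (cctx == NULL) return 0;
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_stableInBuffer, 1);

    ZSTD_inBuffer  input  = { src, srcSize, 0 };    /* same struct for every call */
    ZSTD_outBuffer output = { dst, dstCapacity, 0 };
    size_t remaining;
    do {
        /* With a stable input, finish the frame in ZSTD_e_end mode. */
        remaining = ZSTD_compressStream2(cctx, &output, &input, ZSTD_e_end);
        if (ZSTD_isError(remaining)) { ZSTD_freeCCtx(cctx); return 0; }
    } while (remaining != 0 && output.pos < output.size);

    ZSTD_freeCCtx(cctx);
    return (remaining == 0) ? output.pos : 0;       /* 0 signals failure in this sketch */
}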
++ * When Sequence validation is disabled (default), Sequences are compressed as-is, ++ * so they must correct, otherwise it would result in a corruption error. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. ++ * If a Sequence is detected invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. +- * + */ + #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 + +-/* ZSTD_c_useBlockSplitter +- * Controlled with ZSTD_paramSwitch_e enum. ++/* ZSTD_c_blockSplitterLevel ++ * note: this parameter only influences the first splitter stage, ++ * which is active before producing the sequences. ++ * ZSTD_c_splitAfterSequences controls the next splitter stage, ++ * which is active after sequence production. ++ * Note that both can be combined. ++ * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. ++ * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. ++ * 1 means no splitting. ++ * Then, values from 2 to 6 are sorted in increasing cpu load order. ++ * ++ * Note that currently the first block is never split, ++ * to ensure expansion guarantees in presence of incompressible data. ++ */ ++#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 ++#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 ++ ++/* ZSTD_c_splitAfterSequences ++ * This is a stronger splitter algorithm, ++ * based on actual sequences previously produced by the selected parser. ++ * It's also slower, and as a consequence, mostly used for high compression levels. ++ * While the post-splitter does overlap with the pre-splitter, ++ * both can nonetheless be combined, ++ * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, ++ * resulting in higher compression ratio than just one of them. ++ * + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. +@@ -1894,10 +2213,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +-#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 ++#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 + + /* ZSTD_c_useRowMatchFinder +- * Controlled with ZSTD_paramSwitch_e enum. ++ * Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. +@@ -1928,6 +2247,80 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. 
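The interaction between the two splitter stages documented above is easiest to see in a small hedged sketch (the helper name is illustrative, and a cctx created elsewhere is assumed): both the pre-splitter and the post-splitter can be forced on together, which, as noted above, can give a higher ratio than either stage alone at the cost of extra CPU.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: enable both block-splitting stages; returns 0 or a zstd error code. */
    static size_t enable_both_splitters(ZSTD_CCtx* cctx)
    {
        size_t r = ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockSplitterLevel,
                                          ZSTD_BLOCKSPLITTER_LEVEL_MAX);   /* pre-splitter, most thorough level */
        if (ZSTD_isError(r)) return r;
        return ZSTD_CCtx_setParameter(cctx, ZSTD_c_splitAfterSequences,
                                      ZSTD_ps_enable);                     /* post-splitter, sequence-based */
    }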
++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_repcodeResolution ++ * This parameter only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). ++ * ++ * This parameter affects how zstd parses external sequences, ++ * provided via the ZSTD_compressSequences*() API ++ * or from an external block-level sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets within ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences*() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level (currently: level<10 disables, level>=10 enables). 
++ */ ++#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ ++ ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2477,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2530,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2565,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2602,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2620,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! 
ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2638,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2653,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2669,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2684,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2694,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
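For reference, a hedged sketch of the replacement sequence spelled out in the deprecation comments above for ZSTD_initCStream_usingCDict_advanced(); the wrapper name is illustrative, and the static-only header is assumed for ZSTD_CCtx_setFParams().

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Sketch: modern equivalent of the deprecated ZSTD_initCStream_usingCDict_advanced(). */
    static size_t begin_frame_with_cdict(ZSTD_CCtx* zcs, const ZSTD_CDict* cdict,
                                         ZSTD_frameParameters fParams,
                                         unsigned long long pledgedSrcSize)
    {
        size_t r = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
        if (!ZSTD_isError(r)) r = ZSTD_CCtx_setFParams(zcs, fParams);
        if (!ZSTD_isError(r)) r = ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
        if (!ZSTD_isError(r)) r = ZSTD_CCtx_refCDict(zcs, cdict);
        return r;   /* 0 on success, or an error code testable with ZSTD_isError() */
    }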
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2740,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2751,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2339,18 +2760,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). 
The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. 
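To ground the producer contract described above, a hedged sketch of a trivial external sequence producer: it emits the canonical "no matches" parse, a single final sequence with matchLength == 0 and offset == 0 that covers all of src as literals, which satisfies the validity rules listed above. The function name is illustrative; the signature follows the ZSTD_sequenceProducer_F typedef declared further below.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Illustrative producer: a literals-only parse of the block (no matches). */
    static size_t literalsOnlyProducer(void* sequenceProducerState,
                                       ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                       const void* src, size_t srcSize,
                                       const void* dict, size_t dictSize,
                                       int compressionLevel, size_t windowSize)
    {
        (void)sequenceProducerState; (void)src; (void)dict; (void)dictSize;
        (void)compressionLevel; (void)windowSize;
        if (srcSize == 0) return 0;                        /* nothing to parse */
        if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
        outSeqs[0].offset      = 0;                        /* final sequence: offset must be 0 ... */
        outSeqs[0].matchLength = 0;                        /* ... when matchLength is 0 */
        outSeqs[0].litLength   = (unsigned)srcSize;        /* litLengths + matchLengths == srcSize */
        outSeqs[0].rep         = 0;
        return 1;                                          /* number of sequences written */
    }

    /* Registration (sticky until the next parameter reset), state unused here:
     *     ZSTD_registerSequenceProducer(cctx, NULL, literalsOnlyProducer);       */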
++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! 
ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2963,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,39 +2984,49 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + +- It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, ++ It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. 
+ As a consequence, check that values remain within valid application range. +@@ -2428,7 +3042,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +3062,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +3085,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +3096,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3104,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3131,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3147,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +- +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..be218b5e0ed5 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +@@ -26,6 +26,7 @@ zstd_compress-y := \ + compress/zstd_lazy.o \ + compress/zstd_ldm.o \ + compress/zstd_opt.o \ ++ compress/zstd_preSplit.o \ + + zstd_decompress-y := \ + zstd_decompress_module.o \ +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..c5faaa3d7b08 +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,150 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++#else ++ return ZSTD_countTrailingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++#else ++ return ZSTD_countLeadingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..86439da0eea7 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,7 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ +- ++#include "bits.h" /* ZSTD_highbit32 */ + + /*========================================= + * Target specific +@@ -41,12 +42,13 @@ + /*-****************************************** + * bitStream encoding API (write forward) + ********************************************/ ++typedef size_t BitContainerType; + /* bitStream can mix input from multiple sources. + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; +@@ -54,7 +56,7 @@ typedef struct { + } BIT_CStream_t; + + MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +@@ -63,7 +65,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. + * + * bits are first added to a local register. +-* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. ++* Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. + * Writing data into memory is an explicit operation, performed by the flushBits function. + * Hence keep track how many bits are potentially stored into local register to avoid register overflow. + * After a flushBits, a maximum of 7 bits might still be stored into local register. 
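As an aside on the encoding API above (now parameterized by BitContainerType), a hedged sketch of how the writer side is typically driven: fields are pushed LIFO, so the first field added is the last one read back. The helper name and field values are illustrative; ERR_isError() comes from error_private.h, which this header already includes.

    #include "bitstream.h"   /* the internal header being patched here */

    /* Sketch: pack two small fields, then terminate the stream. */
    static size_t toy_bitstream_write(void* dst, size_t dstCapacity)
    {
        BIT_CStream_t bitC;
        if (ERR_isError(BIT_initCStream(&bitC, dst, dstCapacity))) return 0;  /* dst too small */
        BIT_addBits(&bitC, 0x15, 5);     /* written first => will be read back last */
        BIT_addBits(&bitC, 0x0AB, 9);
        BIT_flushBits(&bitC);            /* commit the local register to dst */
        return BIT_closeCStream(&bitC);  /* stream size in bytes, or 0 on overflow */
    }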
+@@ -80,28 +82,28 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * bitStream decoding API (read backward) + **********************************************/ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +-MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); ++MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); + MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); + MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. + * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
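For the decoding side documented above, a matching hedged sketch that reads back the two fields written by the toy_bitstream_write() sketch earlier; note the reverse read order and that BIT_endOfDStream() confirms the stream was consumed exactly. Names are illustrative.

    #include "bitstream.h"

    /* Sketch: read back the fields from toy_bitstream_write(), in reverse order. */
    static int toy_bitstream_read(const void* src, size_t srcSize)
    {
        BIT_DStream_t bitD;
        unsigned hi, lo;
        if (ERR_isError(BIT_initDStream(&bitD, src, srcSize))) return -1;
        hi = (unsigned)BIT_readBits(&bitD, 9);   /* last written comes out first */
        lo = (unsigned)BIT_readBits(&bitD, 5);
        (void)BIT_reloadDStream(&bitD);          /* BIT_DStream_completed expected here */
        return (hi == 0x0AB && lo == 0x15 && BIT_endOfDStream(&bitD)) ? 0 : -1;
    }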
+@@ -113,7 +115,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + /*-**************************************** + * unsafe API + ******************************************/ +-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + + MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,16 +153,22 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ + MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -195,7 +176,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ + MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +@@ -242,7 +223,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); + } + + +@@ -266,35 +247,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +284,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -318,26 +299,20 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +-#if defined(__x86_64__) || defined(_M_X86) ++#if defined(__x86_64__) || defined(_M_X64) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); + #else + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. 
+ * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -353,14 +328,14 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U3 + + /*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +-MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) ++MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + { + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,23 +344,38 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBits(bitD, nbBits); ++ BitContainerType const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ +-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) ++ * unsafe version; only works if nbBits >= 1 */ ++MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBitsFast(bitD, nbBits); ++ BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +386,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . 
+- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +@@ -442,5 +436,4 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); + } + +- + #endif /* BITSTREAM_H_MODULE */ +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..dc9bd15e174e 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. 
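/* Illustrative sketch (assumption): the "C template" idea behind FORCE_INLINE_TEMPLATE in
 * the compiler.h hunk above. A static always-inline helper takes a compile-time constant
 * parameter; each thin wrapper pins that constant so the branch on it folds away in the
 * specialized copy, and the unused attribute (UNUSED_ATTR) silences -Wunused-function for
 * wrappers a translation unit never calls. Names below are placeholders. */
#include <stddef.h>

#define DEMO_FORCE_INLINE_TEMPLATE static inline __attribute__((always_inline, unused))

DEMO_FORCE_INLINE_TEMPLATE void demo_fill_template(unsigned char* dst, size_t n, int zeroFill)
{
    size_t i;
    for (i = 0; i < n; i++)
        dst[i] = zeroFill ? 0x00 : 0xFF;    /* constant-folds once zeroFill is pinned */
}

/* Each wrapper instantiates one specialized copy of the template. */
static void demo_fill_zero(unsigned char* dst, size_t n) { demo_fill_template(dst, n, 1); }
static void demo_fill_ones(unsigned char* dst, size_t n) { demo_fill_template(dst, n, 0); }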
*/ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. ++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,16 +143,13 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ + +-/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ +- +- + /* compile time determination of SIMD support */ + + /* C-language Attributes are added in C23. 
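/* Illustrative sketch (assumption): why the PREFETCH_L1/L2, PREFETCH_AREA and
 * ZSTD_UNREACHABLE hunks above wrap their bodies in do { ... } while (0). A bare brace
 * block followed by ';' terminates the enclosing if, so a later else no longer parses;
 * the do/while form behaves as a single statement in every position. */
#include <stdio.h>

#define DEMO_LOG2_BRACES(msg)  { puts(msg); puts(msg); }
#define DEMO_LOG2_DOWHILE(msg) do { puts(msg); puts(msg); } while (0)

static void demo_macro_hygiene(int cond)
{
    /* if (cond) DEMO_LOG2_BRACES("hit"); else puts("miss");   -- fails to compile:
     * the ';' after the closing brace ends the if and orphans the else. */
    if (cond) DEMO_LOG2_DOWHILE("hit"); else puts("miss");     /* parses as intended */
}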
*/ +@@ -158,9 +172,15 @@ + #define ZSTD_FALLTHROUGH fallthrough + + /*-************************************************************** +-* Alignment check ++* Alignment + *****************************************************************/ + ++/* @return 1 if @u is a 2^n value, 0 otherwise ++ * useful to check a value is valid for alignment restrictions */ ++MEM_STATIC int ZSTD_isPower2(size_t u) { ++ return (u & (u-1)) == 0; ++} ++ + /* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, +@@ -175,10 +195,95 @@ + + #endif /* ZSTD_ALIGNOF */ + ++#ifndef ZSTD_ALIGNED ++/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ ++#define ZSTD_ALIGNED(a) __attribute__((aligned(a))) ++#endif /* ZSTD_ALIGNED */ ++ ++ + /*-************************************************************** + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without triggering ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. ++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. 
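/* Illustrative sketch (assumption): the bit trick behind ZSTD_isPower2() added above.
 * u & (u-1) clears the lowest set bit, so the result is zero exactly when at most one
 * bit is set -- note that 0 also passes, so callers must exclude it where it matters. */
#include <assert.h>
#include <stddef.h>

static int demo_is_pow2(size_t u) { return (u & (u - 1)) == 0; }

static void demo_alignment_check(void)
{
    assert(demo_is_pow2(64));      /* typical cache-line / alignment values pass */
    assert(!demo_is_pow2(48));     /* non powers of two are rejected */
    assert(demo_is_pow2(0));       /* corner case: 0 satisfies the test as well */
}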
*/ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..c8a10281f112 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -33,7 +34,6 @@ + #define DEBUG_H_12987983217 + + +- + /* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +@@ -82,20 +82,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) 
do { } while (0) /* disabled */ + #endif + +- +- + #endif /* DEBUG_H_12987983217 */ +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
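/* Illustrative sketch (assumption): the repeat-count trick used by FSE_readNCount_body
 * above. Each "repeat" flag is the 2-bit pattern 11 at the bottom of bitStream, so
 * counting the trailing zeros of ~bitStream (with a sentinel bit so the argument is
 * never zero) and halving gives the run length in one step. __builtin_ctz stands in
 * for ZSTD_countTrailingZeros32 and is GCC/Clang specific. */
#include <assert.h>
#include <stdint.h>

static int demo_count_repeat_flags(uint32_t bitStream)
{
    return __builtin_ctz(~bitStream | 0x80000000u) >> 1;
}

static void demo_repeats(void)
{
    assert(demo_count_repeat_flags(0x00000000u) == 0);  /* bottom bits 00: no repeat flag */
    assert(demo_count_repeat_flags(0x00000003u) == 1);  /* ...0011: one repeat flag */
    assert(demo_count_repeat_flags(0x0000003Fu) == 3);  /* ...111111: three repeat flags */
}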
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..6c3dbad838b6 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
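/* Illustrative sketch (assumption): the runtime-dispatch shape HUF_readStats_wksp() uses
 * above once the bmi2 int becomes a flags bitset. The same body is compiled twice, one
 * copy carrying a BMI2 target attribute, and the flag picks between them at run time.
 * Names, the flag value, and the target attribute are placeholders and assume GCC/Clang
 * on x86-64; the caller is assumed to have probed CPU support already. */
#define DEMO_DYNAMIC_BMI2 1
#define DEMO_FLAG_BMI2 (1 << 0)   /* stands in for HUF_flags_bmi2 */

__attribute__((target("bmi2"))) static unsigned demo_body_bmi2(unsigned x) { return x + 1; }
static unsigned demo_body_default(unsigned x) { return x + 1; }

static unsigned demo_readStats_dispatch(unsigned x, int flags)
{
#if DEMO_DYNAMIC_BMI2
    if (flags & DEMO_FLAG_BMI2)
        return demo_body_bmi2(x);      /* only taken when the caller set the BMI2 flag */
#endif
    (void)flags;
    return demo_body_default(x);
}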
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,23 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(cannotProduce_uncompressedBlock): return "This mode cannot generate an uncompressed block"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..08ee87b68cca 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,8 +14,6 @@ + #ifndef ERROR_H_MODULE + #define ERROR_H_MODULE + +- +- + /* **************************************** + * Dependencies + ******************************************/ +@@ -23,7 +22,6 @@ + #include "debug.h" + #include "zstd_deps.h" /* size_t */ + +- + /* **************************************** + * Compiler-specific + ******************************************/ +@@ -49,8 +47,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +87,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +103,49 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) 
\ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); +- ++#define FORWARD_IF_ERROR(err, ...) \ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..b36ce7a2a8c3 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -11,8 +12,6 @@ + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ +- +- + #ifndef FSE_H + #define FSE_H + +@@ -22,7 +21,6 @@ + ******************************************/ + #include "zstd_deps.h" /* size_t, ptrdiff_t */ + +- + /*-***************************************** + * FSE_PUBLIC_API : control library symbols visibility + ******************************************/ +@@ -50,34 +48,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. 
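/* Illustrative sketch (assumption): the size_t error convention that ERR_isError(),
 * CHECK_F and FORWARD_IF_ERROR above rely on. Error codes are returned as the two's
 * complement of a small enum value, so they occupy the very top of the size_t range and
 * one comparison separates them from ordinary sizes. The enum value below is a
 * placeholder, not the real maxCode. */
#include <assert.h>
#include <stddef.h>

enum { DEMO_maxCode = 120 };   /* hypothetical last error number */

static size_t   demo_ERROR(unsigned errnum) { return (size_t)0 - errnum; }
static unsigned demo_isError(size_t ret)    { return ret > (size_t)0 - DEMO_maxCode; }

static void demo_error_convention(void)
{
    assert(!demo_isError(4096));            /* normal sizes stay far below the error band */
    assert(demo_isError(demo_ERROR(20)));   /* negated codes land in the top DEMO_maxCode values */
}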
+-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +58,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +117,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +192,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! 
+ Tutorial : +@@ -286,13 +224,11 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY +- +-/* *** Dependency *** */ + #include "bitstream.h" + +- + /* ***************************************** + * Static allocation + *******************************************/ +@@ -317,16 +253,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +270,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. 
*/ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +457,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +@@ -705,7 +623,4 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) + + #define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) + +- + #endif /* FSE_STATIC_LINKING_ONLY */ +- +- +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..15081d8dc607 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
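/* Illustrative sketch (assumption): the 0x0101010101010101 constant in the
 * FSE_buildDTable_internal() hunk above. Adding it once per symbol keeps every byte of
 * sv equal to the current symbol index, so a single 64-bit store spreads eight copies
 * of that symbol into the spread table at a time. */
#include <assert.h>
#include <stdint.h>
#include <string.h>

static void demo_spread_bytes(void)
{
    const uint64_t add = 0x0101010101010101ULL;
    uint64_t sv = 0;
    unsigned char spread[sizeof sv];
    unsigned s;

    for (s = 0; s < 3; s++) sv += add;   /* after three increments every byte of sv is 0x03 */
    memcpy(spread, &sv, sizeof sv);      /* endianness is irrelevant: all bytes are equal */
    assert(spread[0] == 0x03 && spread[7] == 0x03);
}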
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +252,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +272,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +313,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..49736dcd8f49 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -12,105 +13,26 @@ + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ + +- + #ifndef HUF_H_298734234 + #define HUF_H_298734234 + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. 
+- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); +- ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +73,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +128,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +142,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +153,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
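/* Illustrative sketch (assumption): how the HUF_flags_e bitset declared above replaces
 * the old separate bmi2 / preferRepeat / suspectUncompressible int parameters. The helper
 * only composes a mask; it assumes the updated huf.h from the hunk above is on the
 * include path and that the caller probed BMI2 support itself. */
static int demo_make_huf_flags(int cpu_has_bmi2, int favor_sampling)
{
    int flags = 0;
    if (cpu_has_bmi2)
        flags |= HUF_flags_bmi2;                  /* honoured only in DYNAMIC_BMI2 builds */
    if (favor_sampling)
        flags |= HUF_flags_suspectUncompressible; /* sample before histogramming everything */
    return flags;   /* passed as the trailing 'int flags' of the HUF_* entry points above */
}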
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +185,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +193,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +236,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +252,27 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
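+ * With the flags-based API this is expressed through the HUF_flags_e bitset instead,
+ * e.g. (illustrative sketch) :
+ *   int const flags = ZSTD_cpuSupportsBmi2() ? HUF_flags_bmi2 : 0;
+ * and the resulting `flags` value is what the *_usingDTable() / *_wksp() variants below expect.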
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ +- ++#endif /* HUF_H_298734234 */ +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index c22a2e69bf46..d9bd752fe17b 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..05286af72683 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,30 +46,35 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif + ++/* Compile time determination of BMI2 support */ ++ ++ + /* Enable runtime BMI2 dispatch based on the CPU. + * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. + */ + #ifndef DYNAMIC_BMI2 +- #if ((defined(__clang__) && __has_attribute(__target__)) \ ++# if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ + && (__GNUC__ >= 5 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 8)))) \ +- && (defined(__x86_64__) || defined(_M_X64)) \ ++ && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) \ + && !defined(__BMI2__) +- # define DYNAMIC_BMI2 1 +- #else +- # define DYNAMIC_BMI2 0 +- #endif ++# define DYNAMIC_BMI2 1 ++# else ++# define DYNAMIC_BMI2 0 ++# endif + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNU C compatible compilers, + * because other platforms may not support GAS assembly syntax. + * +- * Only enable assembly for Linux / MacOS, other platforms may ++ * Only enable assembly for Linux / MacOS / Win32, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * +@@ -90,4 +96,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! 
ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..52a79435caf6 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,12 +29,10 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 + +- + /* ---- static assert (debug) --- */ + #define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) + #define ZSTD_isError ERR_isError /* for inlining */ +@@ -83,16 +82,17 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 +-typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; ++typedef enum { set_basic, set_rle, set_compressed, set_repeat } SymbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 + + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +227,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +236,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -273,62 +268,6 @@ typedef enum { + /*-******************************************* + * Private declarations + *********************************************/ +-typedef struct seqDef_s { +- U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ +- U16 litLength; +- U16 mlBase; /* mlBase == matchLength - MINMATCH */ +-} seqDef; +- +-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +-typedef enum { +- ZSTD_llt_none = 0, /* no longLengthType */ +- ZSTD_llt_literalLength = 1, /* represents a long literal */ +- ZSTD_llt_matchLength = 2 /* represents a long match */ +-} ZSTD_longLengthType_e; +- +-typedef struct { +- seqDef* sequencesStart; +- seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; +- size_t maxNbSeq; +- size_t maxNbLit; +- +- /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength +- * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment +- * the existing value of the litLength or matchLength by 0x10000. +- */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ +-} seqStore_t; +- +-typedef struct { +- U32 litLength; +- U32 matchLength; +-} ZSTD_sequenceLength; +- +-/* +- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences +- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. 
+- */ +-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +-{ +- ZSTD_sequenceLength seqLen; +- seqLen.litLength = seq->litLength; +- seqLen.matchLength = seq->mlBase + MINMATCH; +- if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { +- if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; +- } +- if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; +- } +- } +- return seqLen; +-} + + /* + * Contains the compressed frame size and an upper-bound for the decompressed frame size. +@@ -337,74 +276,11 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} +- +- + /* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +@@ -420,13 +296,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +@@ -439,5 +315,4 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void) + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); + } + +- + #endif /* ZSTD_CCOMMON_H_MODULE */ +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* 
round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index 16bb995bc6c4..fc0a0f4e71a6 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,13 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" ++#include "../common/error_private.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +29,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -44,7 +47,7 @@ + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. +- * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, ++ * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ + #ifndef ZSTD_HASHLOG3_MAX +@@ -55,14 +58,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -75,12 +81,12 @@ struct ZSTD_CDict_s { + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +- ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use ++ ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ +@@ -130,11 +136,12 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + +- /* statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ +- if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; ++ /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ ++ if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); +- cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); ++ cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); ++ cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; + } +@@ -168,15 +175,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -205,7 +210,7 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) + } + + /* private API call, for dictBuilder only */ +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + + /* Returns true if the strategy supports using a row based matchfinder */ + static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { +@@ -215,32 +220,23 @@ static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. 
+ */ +-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { ++static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); + } + + /* Returns row matchfinder usage given an initial mode and cParams */ +-static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +-#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) +- int const kHasSIMD128 = 1; +-#else +- int const kHasSIMD128 = 0; +-#endif + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; +- if (kHasSIMD128) { +- if (cParams->windowLog > 14) mode = ZSTD_ps_enable; +- } else { +- if (cParams->windowLog > 17) mode = ZSTD_ps_enable; +- } ++ if (cParams->windowLog > 14) mode = ZSTD_ps_enable; + return mode; + } + + /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +-static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +@@ -248,7 +244,7 @@ static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, + + /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ + static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. +@@ -257,16 +253,44 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ +-static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -282,8 +306,12 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); + } +- cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); ++ cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +357,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. 
+ */
+-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel)
++static void
++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams,
++ const ZSTD_parameters* params,
++ int compressionLevel)
+ {
+ assert(!ZSTD_checkCParams(params->cParams));
+ ZSTD_memset(cctxParams, 0, sizeof(*cctxParams));
+@@ -343,10 +374,13 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par
+ */
+ cctxParams->compressionLevel = compressionLevel;
+ cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, &params->cParams);
+- cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, &params->cParams);
++ cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, &params->cParams);
+ cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, &params->cParams);
++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences);
++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize);
++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel);
+ DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d",
+- cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm);
++ cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm);
+ }
+
+ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params)
+@@ -359,7 +393,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete
+
+ /*
+ * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone.
+- * @param param Validated zstd parameters.
++ * @param params Validated zstd parameters.
+ */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +489,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -534,11 +568,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + ++ case ZSTD_c_blockSplitterLevel: ++ bounds.lowerBound = 0; ++ bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; ++ return bounds; ++ + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; +@@ -549,6 +588,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_repcodeResolution: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +626,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -584,6 +644,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: ++ case ZSTD_c_blockSplitterLevel: + return 1; + + case ZSTD_c_format: +@@ -610,9 +671,13 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + default: + return 0; + } +@@ -625,7 +690,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -665,9 +730,14 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: 
++ case ZSTD_c_blockSplitterLevel: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +793,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +811,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +825,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { +- const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +859,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : +- CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); ++ CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ 
CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -843,28 +916,55 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); +- CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value; ++ CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; + +- case ZSTD_c_useBlockSplitter: +- BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +- CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; +- return CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences: ++ BOUNDCHECK(ZSTD_c_splitAfterSequences, value); ++ CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->postBlockSplitter; ++ ++ case ZSTD_c_blockSplitterLevel: ++ BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); ++ CCtxParams->preBlockSplitter_level = value; ++ return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); +- CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; ++ CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ assert(value>=0); ++ CCtxParams->maxBlockSize = (size_t)value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_repcodeResolution: ++ BOUNDCHECK(ZSTD_c_repcodeResolution, value); ++ 
CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -881,7 +981,7 @@ size_t ZSTD_CCtxParams_getParameter( + switch(param) + { + case ZSTD_c_format : +- *value = CCtxParams->format; ++ *value = (int)CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; +@@ -896,16 +996,16 @@ size_t ZSTD_CCtxParams_getParameter( + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : +- *value = CCtxParams->cParams.searchLog; ++ *value = (int)CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : +- *value = CCtxParams->cParams.minMatch; ++ *value = (int)CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : +- *value = CCtxParams->cParams.targetLength; ++ *value = (int)CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : +- *value = (unsigned)CCtxParams->cParams.strategy; ++ *value = (int)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; +@@ -920,10 +1020,10 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : +- *value = CCtxParams->attachDictPref; ++ *value = (int)CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : +- *value = CCtxParams->literalCompressionMode; ++ *value = (int)CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : + assert(CCtxParams->nbWorkers == 0); +@@ -939,19 +1039,19 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : +- *value = CCtxParams->ldmParams.enableLdm; ++ *value = (int)CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : +- *value = CCtxParams->ldmParams.hashLog; ++ *value = (int)CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : +- *value = CCtxParams->ldmParams.minMatchLength; ++ *value = (int)CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : +- *value = CCtxParams->ldmParams.bucketSizeLog; ++ *value = (int)CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : +- *value = CCtxParams->ldmParams.hashRateLog; ++ *value = (int)CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; +@@ -971,8 +1071,11 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; +- case ZSTD_c_useBlockSplitter : +- *value = (int)CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences : ++ *value = (int)CCtxParams->postBlockSplitter; ++ break; ++ case ZSTD_c_blockSplitterLevel : ++ *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; +@@ -980,6 +1083,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_repcodeResolution: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: 
RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1121,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1177,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1192,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/
++ assert(cctx->cdict == dl->cdict);
+ return 0;
+ }
+ assert(dl->dictSize > 0);
+@@ -1060,26 +1213,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx)
+ }
+
+ size_t ZSTD_CCtx_loadDictionary_advanced(
+- ZSTD_CCtx* cctx, const void* dict, size_t dictSize,
+- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType)
++ ZSTD_CCtx* cctx,
++ const void* dict, size_t dictSize,
++ ZSTD_dictLoadMethod_e dictLoadMethod,
++ ZSTD_dictContentType_e dictContentType)
+ {
+- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't load a dictionary when ctx is not in init stage.");
+ DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize);
+- ZSTD_clearAllDicts(cctx); /* in case one already exists */
+- if (dict == NULL || dictSize == 0) /* no dictionary mode */
++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
++ "Can't load a dictionary when cctx is not in init stage.");
++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */
++ if (dict == NULL || dictSize == 0) /* no dictionary */
+ return 0;
+ if (dictLoadMethod == ZSTD_dlm_byRef) {
+ cctx->localDict.dict = dict;
+ } else {
++ /* copy dictionary content inside CCtx to own its lifetime */
+ void* dictBuffer;
+ RETURN_ERROR_IF(cctx->staticSize, memory_allocation,
+- "no malloc for static CCtx");
++ "static CCtx can't allocate for an internal copy of dictionary");
+ dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem);
+- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!");
++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation,
++ "allocation failed for dictionary content");
+ ZSTD_memcpy(dictBuffer, dict, dictSize);
+- cctx->localDict.dictBuffer = dictBuffer;
+- cctx->localDict.dict = dictBuffer;
++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */
++ cctx->localDict.dict = dictBuffer; /* read-only reference */
+ }
+ cctx->localDict.dictSize = dictSize;
+ cctx->localDict.dictContentType = dictContentType;
+@@ -1149,7 +1306,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset)
+ if ( (reset == ZSTD_reset_parameters)
+ || (reset == ZSTD_reset_session_and_parameters) ) {
+ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong,
+- "Can't reset parameters only when not in init stage.");
++ "Reset parameters is only possible during init stage.");
+ ZSTD_clearAllDicts(cctx);
+ return ZSTD_CCtxParams_reset(&cctx->requestedParams);
+ }
+@@ -1168,7 +1325,7 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog);
+ BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch);
+ BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength);
+- BOUNDCHECK(ZSTD_c_strategy, cParams.strategy);
++ BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy);
+ return 0;
+ }
+
+@@ -1178,11 +1335,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams)
+ static ZSTD_compressionParameters
+ ZSTD_clampCParams(ZSTD_compressionParameters cParams)
+ {
+-# define CLAMP_TYPE(cParam, val, type) { \
+- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
+- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
+- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
+- }
++# define CLAMP_TYPE(cParam, val, type) \
++ do { \
++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \
++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \
++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \
++ } while (0)
+# define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned)
+ CLAMP(ZSTD_c_windowLog, cParams.windowLog);
+ CLAMP(ZSTD_c_chainLog, cParams.chainLog);
+@@ 
-1240,19 +1398,62 @@ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. +- * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. ++ * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ + static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_CParamMode_e mode, ++ ZSTD_ParamSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1482,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1501,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. 
++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,11 +1547,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); ++static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, +@@ -1330,24 +1567,25 @@ static void ZSTD_overrideCParams( + } + + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { +- srcSizeHint = CCtxParams->srcSizeHint; ++ assert(CCtxParams->srcSizeHint>=0); ++ srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t + ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, +- const U32 enableDedicatedDictSearch, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, ++ const int enableDedicatedDictSearch, + const U32 forCCtx) + { + /* chain table size should be 0 for fast or row-hash strategies */ +@@ -1363,14 +1601,14 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = +- 
ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32))
+- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t))
+- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
++ ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size((1<<Litbits) * sizeof(U32))
++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t))
++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
+ size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)
+- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16))
++ ? ZSTD_cwksp_aligned64_alloc_size(hSize)
+ : 0;
+ size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt))
+ ? optPotentialSpace
+@@ -1386,30 +1624,38 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams,
+ return tableSpace + optSpace + slackSpace + lazyAdditionalSpace;
+ }
+
++/* Helper function for calculating memory requirements.
++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */
++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) {
++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4;
++ return blockSize / divider;
++}
++
+ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ const ZSTD_compressionParameters* cParams,
+ const ldmParams_t* ldmParams,
+ const int isStatic,
+- const ZSTD_paramSwitch_e useRowMatchFinder,
++ const ZSTD_ParamSwitch_e useRowMatchFinder,
+ const size_t buffInSize,
+ const size_t buffOutSize,
+- const U64 pledgedSrcSize)
++ const U64 pledgedSrcSize,
++ int useSequenceProducer,
++ size_t maxBlockSize)
+ {
+ size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize);
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+- U32 const divider = (cParams->minMatch==3) ? 3 : 4;
+- size_t const maxNbSeq = blockSize / divider;
++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize);
++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer);
+ size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize)
+- + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef))
++ + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef))
+ + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE));
+- size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE);
++ size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE);
+ size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t));
+ size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1);
+
+ size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams);
+ size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize);
+ size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ?
+- ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
++ ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0;
+
+
+ size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize)
+@@ -1417,15 +1663,21 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+
+ size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0;
+
++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
++ size_t const externalSeqSpace = useSequenceProducer
++ ? 
ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence))
++ : 0;
++
+ size_t const neededSpace =
+ cctxSpace +
+- entropySpace +
++ tmpWorkSpace +
+ blockStateSpace +
+ ldmSpace +
+ ldmSeqSpace +
+ matchStateSize +
+ tokenSpace +
+- bufferSpace;
++ bufferSpace +
++ externalSeqSpace;
+
+ DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace);
+ return neededSpace;
+@@ -1435,7 +1687,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ {
+ ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder,
+ &cParams);
+
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+@@ -1443,7 +1695,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ * be needed. However, we still allocate two 0-sized buffers, which can
+ * take space under ASAN. */
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+- &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN);
++ &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+ }
+
+ size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams)
+@@ -1493,18 +1745,18 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params)
+ RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only.");
+ { ZSTD_compressionParameters const cParams =
+ ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict);
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog);
++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog);
+ size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered)
+ ? ((size_t)1 << cParams.windowLog) + blockSize
+ : 0;
+ size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered)
+ ? ZSTD_compressBound(blockSize) + 1
+ : 0;
+- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams);
+
+ return ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize,
+- ZSTD_CONTENTSIZE_UNKNOWN);
++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+ }
+ }
+
+@@ -1600,7 +1852,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs)
+ * Invalidate all the matches in the match finder tables.
+ * Requires nextSrc and base to be set (can be NULL). 
+ */
+-static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms)
++static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms)
+ {
+ ZSTD_window_clear(&ms->window);
+
+@@ -1637,12 +1889,25 @@ typedef enum {
+ ZSTD_resetTarget_CCtx
+ } ZSTD_resetTarget_e;
+
++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */
++static U64 ZSTD_bitmix(U64 val, U64 len) {
++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24);
++ val *= 0x9FB21C651E98DF25ULL;
++ val ^= (val >> 35) + len ;
++ val *= 0x9FB21C651E98DF25ULL;
++ return val ^ (val >> 28);
++}
++
++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */
++static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) {
++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4);
++}
+
+ static size_t
+-ZSTD_reset_matchState(ZSTD_matchState_t* ms,
++ZSTD_reset_matchState(ZSTD_MatchState_t* ms,
+ ZSTD_cwksp* ws,
+ const ZSTD_compressionParameters* cParams,
+- const ZSTD_paramSwitch_e useRowMatchFinder,
++ const ZSTD_ParamSwitch_e useRowMatchFinder,
+ const ZSTD_compResetPolicy_e crp,
+ const ZSTD_indexResetPolicy_e forceResetIndex,
+ const ZSTD_resetTarget_e forWho)
+@@ -1664,6 +1929,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+
+ ms->hashLog3 = hashLog3;
++ ms->lazySkipping = 0;
+
+ ZSTD_invalidateMatchState(ms);
+
+@@ -1685,22 +1951,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ ZSTD_cwksp_clean_tables(ws);
+ }
+
+- /* opt parser space */
+- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
+- DEBUGLOG(4, "reserving optimal parser space");
+- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned));
+- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned));
+- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned));
+- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned));
+- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t));
+- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t));
+- }
+-
+ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) {
+- { /* Row match finder needs an additional table of hashes ("tags") */
+- size_t const tagTableSize = hSize*sizeof(U16);
+- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize);
+- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ /* Row match finder needs an additional table of hashes ("tags") */
++ size_t const tagTableSize = hSize;
++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use
++ * 0 when we reset a Cdict */
++ if(forWho == ZSTD_resetTarget_CCtx) {
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize);
++ ZSTD_advanceHashSalt(ms);
++ } else {
++ /* When we are not salting we want to always memset the memory */
++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize);
++ ZSTD_memset(ms->tagTable, 0, tagTableSize);
++ ms->hashSalt = 0;
+ }
+ { /* Switch to 32-entry rows if searchLog is 5 (or more) */
+ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6);
+@@ -1709,6 +1972,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms,
+ }
+ }
+
++ /* opt parser space */
++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) {
++ DEBUGLOG(4, "reserving optimal parser space");
++ ms->opt.litFreq = 
(unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<<Litbits) * sizeof(unsigned));
++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned));
++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned));
++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned));
++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t));
++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t));
++ }
++
+ ms->cParams = *cParams;
+
+ RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation,
+@@ -1754,7 +2028,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ {
+ ZSTD_cwksp* const ws = &zc->workspace;
+ DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d",
+- (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter);
++ (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter);
+ assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams)));
+
+ zc->isFirstBlock = 1;
+@@ -1766,8 +2040,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ params = &zc->appliedParams;
+
+ assert(params->useRowMatchFinder != ZSTD_ps_auto);
+- assert(params->useBlockSplitter != ZSTD_ps_auto);
++ assert(params->postBlockSplitter != ZSTD_ps_auto);
+ assert(params->ldmParams.enableLdm != ZSTD_ps_auto);
++ assert(params->maxBlockSize != 0);
+ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+ /* Adjust long distance matching parameters */
+ ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams);
+@@ -1776,9 +2051,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ }
+
+ { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize));
+- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize);
+- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4;
+- size_t const maxNbSeq = blockSize / divider;
++ size_t const blockSize = MIN(params->maxBlockSize, windowSize);
++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params));
+ size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered)
+ ? 
ZSTD_compressBound(blockSize) + 1
+ : 0;
+@@ -1795,8 +2069,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ size_t const neededSpace =
+ ZSTD_estimateCCtxSize_usingCCtxParams_internal(
+ &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder,
+- buffInSize, buffOutSize, pledgedSrcSize);
+- int resizeWorkspace;
++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize);
+
+ FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!");
+
+@@ -1805,7 +2078,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ { /* Check if workspace is large enough, alloc a new one if needed */
+ int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace;
+ int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace);
+- resizeWorkspace = workspaceTooSmall || workspaceWasteful;
++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful;
+ DEBUGLOG(4, "Need %zu B workspace", neededSpace);
+ DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize);
+
+@@ -1823,21 +2096,23 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+
+ DEBUGLOG(5, "reserving object space");
+ /* Statically sized space.
+- * entropyWorkspace never moves,
++ * tmpWorkspace never moves,
+ * though prev/next block swap places */
+ assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t)));
+ zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock");
+ zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t));
+ RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock");
+- zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE);
+- RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace");
++ zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE);
++ RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace");
++ zc->tmpWkspSize = TMP_WORKSPACE_SIZE;
+ } }
+
+ ZSTD_cwksp_clear(ws);
+
+ /* init params */
+ zc->blockState.matchState.cParams = params->cParams;
++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable;
+ zc->pledgedSrcSizePlusOne = pledgedSrcSize+1;
+ zc->consumedSrcSize = 0;
+ zc->producedCSize = 0;
+@@ -1845,7 +2120,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ zc->appliedParams.fParams.contentSizeFlag = 0;
+ DEBUGLOG(4, "pledged content size : %u ; flag : %u",
+ (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag);
+- zc->blockSize = blockSize;
++ zc->blockSizeMax = blockSize;
+
+ xxh64_reset(&zc->xxhState, 0);
+ zc->stage = ZSTDcs_init;
+@@ -1854,13 +2129,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+
+ ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock);
+
++ FORWARD_IF_ERROR(ZSTD_reset_matchState(
++ &zc->blockState.matchState,
++ ws,
++ &params->cParams,
++ params->useRowMatchFinder,
++ crp,
++ needsIndexReset,
++ ZSTD_resetTarget_CCtx), "");
++
++ zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef));
++
++ /* ldm hash table */
++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
++ /* TODO: avoid memset? 
*/
++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t));
++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq));
++ zc->maxNbLdmSequences = maxNbLdmSeq;
++
++ ZSTD_window_init(&zc->ldmState.window);
++ zc->ldmState.loadedDictEnd = 0;
++ }
++
++ /* reserve space for block-level external sequences */
++ if (ZSTD_hasExtSeqProd(params)) {
++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize);
++ zc->extSeqBufCapacity = maxNbExternalSeq;
++ zc->extSeqBuf =
++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence));
++ }
++
++ /* buffers */
++
+ /* ZSTD_wildcopy() is used to copy into the literals buffer,
+ * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes.
+ */
+ zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH);
+ zc->seqStore.maxNbLit = blockSize;
+
+- /* buffers */
+ zc->bufferedPolicy = zbuff;
+ zc->inBuffSize = buffInSize;
+ zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize);
+@@ -1883,32 +2191,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc,
+ zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+ zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE));
+- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef));
+-
+- FORWARD_IF_ERROR(ZSTD_reset_matchState(
+- &zc->blockState.matchState,
+- ws,
+- &params->cParams,
+- params->useRowMatchFinder,
+- crp,
+- needsIndexReset,
+- ZSTD_resetTarget_CCtx), "");
+-
+- /* ldm hash table */
+- if (params->ldmParams.enableLdm == ZSTD_ps_enable) {
+- /* TODO: avoid memset? */
+- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog;
+- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t));
+- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t));
+- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq));
+- zc->maxNbLdmSequences = maxNbLdmSeq;
+-
+- ZSTD_window_init(&zc->ldmState.window);
+- zc->ldmState.loadedDictEnd = 0;
+- }
+
+ DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws));
+- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace));
++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace));
+
+ zc->initialized = 1;
+
+@@ -1980,7 +2265,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ }
+
+ params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize,
+- cdict->dictContentSize, ZSTD_cpm_attachDict);
++ cdict->dictContentSize, ZSTD_cpm_attachDict,
++ params.useRowMatchFinder);
+ params.cParams.windowLog = windowLog;
+ params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */
+ FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize,
+@@ -2019,6 +2305,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx,
+ return 0;
+ }
+
++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize,
++ ZSTD_compressionParameters const* cParams) {
++ if (ZSTD_CDictIndicesAreTagged(cParams)){
++ /* Remove tags from the CDict table if they are present. 
++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,26 +2356,29 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + + /* Zero the hashTable3, since the cdict never fills it */ +- { int const h3log = cctx->blockState.matchState.hashLog3; ++ assert(cctx->blockState.matchState.hashLog3 <= 31); ++ { U32 const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); +@@ -2082,8 +2387,8 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ +- { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; +- ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; ++ { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; ++ ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2141,12 +2446,13 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + /* Copy only compression parameters related to tables. 
*/ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); +- assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); ++ assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; +- params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; ++ params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2166,7 +2472,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; +- int const h3log = srcCCtx->blockState.matchState.hashLog3; ++ U32 const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, +@@ -2184,8 +2490,8 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + + /* copy dictionary offsets */ + { +- const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; +- ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; ++ const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; ++ ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2234,7 +2540,7 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ +- assert(size < (1U<<31)); /* can be casted to int */ ++ assert(size < (1U<<31)); /* can be cast to int */ + + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { +@@ -2267,7 +2573,7 @@ static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const + + /*! 
ZSTD_reduceIndex() : + * rescale all indexes to avoid future overflow (indexes are U32) */ +-static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) ++static void ZSTD_reduceIndex (ZSTD_MatchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) + { + { U32 const hSize = (U32)1 << params->cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); +@@ -2294,26 +2600,32 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) + { +- const seqDef* const sequences = seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2333,9 +2645,9 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) + * Returns 1 if true, 0 otherwise. */ + static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) + { +- DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); +- assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); +- return (cctxParams->useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); ++ assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); ++ return (cctxParams->postBlockSplitter == ZSTD_ps_enable); + } + + /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types +@@ -2347,6 +2659,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2670,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const SeqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2690,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2392,7 +2707,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, ++ CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, +@@ -2413,7 +2728,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ +- ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; ++ ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, +@@ -2424,7 +2739,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, ++ CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, +@@ -2454,7 +2769,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, ++ CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, +@@ -2480,22 +2795,23 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; +- const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2819,28 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<= HUF_WORKSPACE_SIZE); 
+ + /* Compress literals */ +- { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ +- unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); +- size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2866,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2878,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2597,104 +2912,146 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + return (size_t)(op - ostart); + } + +-MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++static size_t ++ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ size_t blockSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( +- seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, ++ literals, litSize, ++ seqStorePtr, prevEntropy, nextEntropy, cctxParams, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ +- { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); ++ { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + ++static size_t ++ZSTD_entropyCompressSeqStore( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) ++{ ++ return ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ dst, dstCapacity, ++ seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), ++ srcSize, ++ seqStorePtr, ++ prevEntropy, nextEntropy, ++ cctxParams, ++ entropyWorkspace, entropyWkspSize, ++ bmi2); ++} ++ + /* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) + { +- static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { ++ static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ 
ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, + NULL } + }; +- ZSTD_blockCompressor selectedCompressor; ++ ZSTD_BlockCompressor_f selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); +- DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); ++ DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { +- static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; +- DEBUGLOG(4, "Selecting a row-based matchfinder"); ++ DEBUGLOG(5, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { +@@ -2704,30 +3061,126 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + return selectedCompressor; + } + +-static void 
ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, ++static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) + { + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr) ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr) + { + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthType = ZSTD_llt_none; + } + +-typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ ++/* ++ * Function to validate sequences produced by a block compressor. ++ */ ++static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) ++{ ++#if DEBUGLEVEL >= 1 ++ const SeqDef* seq = seqStore->sequencesStart; ++ const SeqDef* const seqEnd = seqStore->sequences; ++ size_t const matchLenLowerBound = cParams->minMatch == 3 ? 
3 : 4; ++ for (; seq < seqEnd; ++seq) { ++ const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); ++ assert(seqLength.matchLength >= matchLenLowerBound); ++ (void)seqLength; ++ (void)matchLenLowerBound; ++ } ++#else ++ (void)seqStore; ++ (void)cParams; ++#endif ++} ++ ++static size_t ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + { +- ZSTD_matchState_t* const ms = &zc->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3216,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2772,7 +3234,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { +- rawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ RawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; +@@ -2788,42 +3258,116 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_SequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_transferSequences_wBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_BlockCompressor_f const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } ++ ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const SeqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ Repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; +- +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. 
++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,46 +3376,75 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. */ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { + const size_t dstCapacity = ZSTD_compressBound(srcSize); +- void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); ++ void* dst; /* Make C90 happy. 
*/ + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + ++ dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; +@@ -2880,8 +3453,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); ++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3487,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2930,7 +3505,7 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +-static int ZSTD_maybeRLE(seqStore_t const* seqStore) ++static int ZSTD_maybeRLE(SeqStore_t const* seqStore) + { + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); +@@ -2938,7 +3513,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,12 +3522,14 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? 
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); +- DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); ++ DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); + } + + /* ZSTD_buildBlockEntropyStats_literals() : +@@ -2959,13 +3537,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3554,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3571,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; +- } ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; ++ } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3651,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3664,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3103,9 +3691,9 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +- fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; +- fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; +- fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; ++ fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; ++ fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; ++ fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; + } +@@ -3114,23 +3702,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3736,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3763,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,116 +3801,121 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->tmpWorkspace, zc->tmpWkspSize), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->tmpWorkspace, zc->tmpWkspSize, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + + /* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. 
+ */ +-static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +- const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, ++ const SeqStore_t* originalSeqStore, ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ +@@ -3328,13 +3928,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3941,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). 
++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3976,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, ++ const SeqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { +- seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ SeqDef* const seq = seqStore->sequencesStart + idx; ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +4012,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. 
+ */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, +- repcodes_t* const dRep, repcodes_t* const cRep, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const SeqStore_t* const seqStore, ++ Repcodes_t* const dRep, Repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3417,7 +4026,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ +- repcodes_t const dRepOriginal = *dRep; ++ Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); +@@ -3428,7 +4037,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + +@@ -3442,8 +4051,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3451,18 +4061,18 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) +@@ -3481,45 +4091,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. 
++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, +- ZSTD_CCtx* zc, const seqStore_t* origSeqStore) ++ ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4141,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end 
+ } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4168,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3577,36 +4197,37 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ +- repcodes_t dRep; +- repcodes_t cRep; +- ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); ++ Repcodes_t dRep; ++ Repcodes_t cRep; ++ ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4242,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,12 +4251,12 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ +- ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); + return cSize; + } + +@@ -3643,21 +4265,20 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +- assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); ++ assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); +@@ -3673,9 +4294,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4308,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3702,7 +4327,7 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + &zc->appliedParams, + dst, dstCapacity, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && +@@ -3767,10 +4392,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4404,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3807,7 +4433,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + return cSize; + } + +-static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, ++static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, +@@ -3831,39 +4457,82 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + } + } + ++#include "zstd_preSplit.h" ++ ++static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) ++{ ++ /* split level based on compression strategy, from `fast` to `btultra2` */ ++ static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; ++ /* note: conservatively only split full blocks (128 KB) currently. ++ * While it's possible to go lower, let's keep it simple for a first implementation. ++ * Besides, benefits of splitting are reduced when blocks are already small. ++ */ ++ if (srcSize < 128 KB || blockSizeMax < 128 KB) ++ return MIN(srcSize, blockSizeMax); ++ /* do not split incompressible data though: ++ * require verified savings to allow pre-splitting. ++ * Note: as a consequence, the first full block is not split. ++ */ ++ if (savings < 3) { ++ DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); ++ return 128 KB; ++ } ++ /* apply @splitLevel, or use default value (which depends on @strat). ++ * note that splitting heuristic is still conditioned by @savings >= 3, ++ * so the first block will not reach this code path */ ++ if (splitLevel == 1) return 128 KB; ++ if (splitLevel == 0) { ++ assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); ++ splitLevel = splitLevels[strat]; ++ } else { ++ assert(2 <= splitLevel && splitLevel <= 6); ++ splitLevel -= 2; ++ } ++ return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); ++} ++ + /*! ZSTD_compress_frameChunk() : + * Compress a chunk of data into one or multiple blocks. + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. 
+ * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) + { +- size_t blockSize = cctx->blockSize; ++ size_t blockSizeMax = cctx->blockSizeMax; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; ++ S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + +- DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); ++ DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + xxh64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; +- U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); +- +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; ++ size_t const blockSize = ZSTD_optimalBlockSize(cctx, ++ ip, remaining, ++ blockSizeMax, ++ cctx->appliedParams.preBlockSplitter_level, ++ cctx->appliedParams.cParams.strategy, ++ savings); ++ U32 const lastBlock = lastFrameChunk & (blockSize == remaining); ++ assert(blockSize <= remaining); ++ ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); +- if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); +@@ -3899,8 +4568,23 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } +- ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ ++ ++ /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. ++ * Without splitting, the maximum expansion is 3 bytes per full block. ++ * An adversarial input could attempt to fudge the split detector, ++ * and make it split incompressible data, resulting in more block headers. ++ * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, ++ * and the splitter never creates blocks that small (current lower limit is 8 KB), ++ * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. ++ * But if the goal is to not expand by more than 3-bytes per 128 KB full block, ++ * then yes, it becomes possible to make the block splitter oversplit incompressible data. ++ * Using @savings, we enforce an even more conservative condition, ++ * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, ++ * otherwise only full blocks are used. 
++ * But being conservative is fine, ++ * since splitting barely compressible blocks is not fruitful anyway */ ++ savings += (S64)blockSize - (S64)cSize; + + ip += blockSize; + assert(remaining >= blockSize); +@@ -3919,8 +4603,10 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + + + static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, +- const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +-{ BYTE* const op = (BYTE*)dst; ++ const ZSTD_CCtx_params* params, ++ U64 pledgedSrcSize, U32 dictID) ++{ ++ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; +@@ -4001,19 +4687,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4022,7 +4704,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) + { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", +@@ -4057,7 +4739,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + src, (BYTE const*)src + srcSize); + } + +- DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); ++ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); + { size_t const cSize = frame ? 
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); +@@ -4078,58 +4760,90 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +- ldmState_t* ls, +- ZSTD_cwksp* ws, +- ZSTD_CCtx_params const* params, +- const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++static size_t ++ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, ++ ldmState_t* ls, ++ ZSTD_cwksp* ws, ++ ZSTD_CCtx_params const* params, ++ const void* src, size_t srcSize, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4852,59 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); + } + ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } ++ } ++ ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4912,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4921,24 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); ++ DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + default: +@@ -4233,20 +4981,19 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, +- dictEnd-dictPtr, &hasZeroWeights); ++ (size_t)(dictEnd-dictPtr), &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; +- size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); ++ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ +@@ -4261,7 +5008,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; +- size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4275,7 +5022,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; +- size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4309,7 +5056,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + +- return dictPtr - (const BYTE*)dict; ++ return (size_t)(dictPtr - (const BYTE*)dict); + } + + /* Dictionary format : +@@ -4322,11 +5069,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + * dictSize supposed >= 8 + */ + static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +5093,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4354,13 +5102,14 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + * @return : dictID, or an error code */ + static size_t + ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +- 
ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +5122,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5136,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5176,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->tmpWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5221,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5233,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5252,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5268,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4528,7 +5284,7 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ +- return op-ostart; ++ return (size_t)(op-ostart); + } + + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +@@ -4537,9 +5293,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5319,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5355,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, 
dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5473,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4719,14 +5483,16 @@ static size_t ZSTD_initCDict_internal( + return 0; + } + +-static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, +- ZSTD_compressionParameters cParams, +- ZSTD_paramSwitch_e useRowMatchFinder, +- U32 enableDedicatedDictSearch, +- ZSTD_customMem customMem) ++static ZSTD_CDict* ++ZSTD_createCDict_advanced_internal(size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_compressionParameters cParams, ++ ZSTD_ParamSwitch_e useRowMatchFinder, ++ int enableDedicatedDictSearch, ++ ZSTD_customMem customMem) + { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; ++ DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + +@@ -4763,6 +5529,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + { + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; +@@ -4783,7 +5550,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { +@@ -4802,7 +5569,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + +@@ -4813,7 +5580,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + if (!cdict) + return NULL; + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4867,7 +5634,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level +- * into its relevants cParams. ++ * into its relevant cParams. + * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. 
+@@ -4879,7 +5646,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) + { +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +@@ -4890,6 +5657,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + ++ DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { +@@ -4900,14 +5668,13 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + +- DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", +- (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4987,12 +5754,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5002,7 +5774,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! 
ZSTD_compress_usingCDict_advanced(): +@@ -5068,7 +5840,7 @@ size_t ZSTD_CStreamOutSize(void) + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; + } + +-static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) ++static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) + { + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; +@@ -5199,30 +5971,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSizeMax - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSizeMax; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5231,8 +6014,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5243,12 +6028,13 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + + case zcss_load: + if ( (flushMode == ZSTD_e_end) +- && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip) /* Enough output space */ ++ && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, +- op, oend-op, ip, iend-ip); ++ size_t const cSize = ZSTD_compressEnd_public(zcs, ++ op, (size_t)(oend-op), ++ ip, (size_t)(iend-ip)); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; +@@ -5262,10 +6048,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, +- ip, iend-ip); ++ ip, (size_t)(iend-ip)); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5276,16 +6061,29 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; + size_t cSize; +- size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t oSize = (size_t)(oend-op); ++ size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSizeMax); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5293,34 +6091,31 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ +- zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; ++ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) +- zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; ++ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5369,8 +6164,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + } + } + +- input->pos = ip - istart; +- output->pos = op - ostart; ++ input->pos = (size_t)(ip - istart); ++ output->pos = (size_t)(op - ostart); + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); + } +@@ -5390,8 +6185,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. 
+ */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5410,22 +6207,27 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + ++/* ++ * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. ++ * Otherwise, it's ignored. ++ * @return: 0 on success, or a ZSTD_error code otherwise. ++ */ + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ +@@ -5438,21 +6240,24 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } +- DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); +- ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); ++ ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + +- params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); ++ params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5468,7 +6273,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ +- cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); ++ cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } +@@ -5479,6 +6284,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5493,8 +6300,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait 
for the first block of flush() order, for better parameters adaptation */ ++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5512,13 +6338,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6374,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5551,64 +6385,67 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : +- * @offCode : is presumed to follow format required by ZSTD_storeSeq() ++ * @offBase : must use the format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++/* This function scans through an array of ZSTD_Sequence, ++ * storing the sequences it reads, until it reaches a block delimiter. ++ * Note that the block delimiter includes the last literals of the block. ++ * @blockSize must be == sum(sequence_lengths). ++ * @returns @blockSize on success, and a ZSTD_error otherwise. + */ + static size_t +-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +- ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; +- repcodes_t updatedRepcodes; ++ Repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5616,27 +6453,60 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + } else { + dictSize = 0; + } +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = 
ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, ++ seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ++ ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); +@@ -5644,37 +6514,43 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; +- return 0; ++ return blockSize; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. ++/* ++ * This function attempts to scan through @blockSize bytes in @src ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. + * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. ++ * Occasionally, we may want to reduce the actual number of bytes consumed from @src ++ * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. 
+ * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ * @returns the number of bytes consumed from @src, necessarily <= @blockSize. ++ * Otherwise, it may return a ZSTD error if something went wrong. + */ + static size_t +-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; +- BYTE const* ip = (BYTE const*)(src); +- BYTE const* iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ +- repcodes_t updatedRepcodes; ++ const BYTE* const istart = (const BYTE*)(src); ++ const BYTE* ip = istart; ++ const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ ++ Repcodes_t updatedRepcodes; + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5682,15 +6558,15 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { + const ZSTD_Sequence currSeq = inSeqs[idx]; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5704,7 +6580,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5744,58 +6619,113 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ +- U32 lastLLSize = (U32)(iend - ip); ++ U32 const lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + +- return bytesAdjustment; ++ return (size_t)(iend-istart); + } + +-typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); +-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) ++/* @seqPos represents a position within @inSeqs, ++ * it is read and updated by this function, ++ * once the goal to produce a block of size @blockSize is reached. ++ * @return: nb of bytes consumed from @src, necessarily <= @blockSize. 
++ */ ++typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) + { +- ZSTD_sequenceCopier sequenceCopier = NULL; +- assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreExplicitBlockDelim; +- } else if (mode == ZSTD_sf_noBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreNoBlockDelim; ++ return ZSTD_transferSequences_wBlockDelim; ++ } ++ assert(mode == ZSTD_sf_noBlockDelimiters); ++ return ZSTD_transferSequences_noDelim; ++} ++ ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; + } +- assert(sequenceCopier != NULL); +- return sequenceCopier; ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; + } + +-/* Compress, block-by-block, all of the sequences given. ++static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ ZSTD_SequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) { ++ /* Note: more a "target" block size */ ++ return MIN(remaining, blockSize); ++ } ++ assert(mode == ZSTD_sf_explicitBlockDelimiters); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ ++/* Compress all provided sequences, block-by-block. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. 
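For orientation between hunks: the explicit-delimiter convention enforced above (each block's sequences end with an entry whose offset and matchLength are 0, and whose litLength carries the block's trailing literals) is what callers of ZSTD_compressSequences() are expected to supply. A hedged sketch with made-up values follows; cctx, dst, dstCapacity and src are assumed to exist, and this is illustration only, not part of the patch.

    /* src = "abc", then 15 bytes repeating it at offset 3, then 2 trailing literals => srcSize = 20 */
    const ZSTD_Sequence seqs[2] = {
        { 3, 3, 15, 0 },   /* offset=3, litLength=3, matchLength=15, rep unused */
        { 0, 2,  0, 0 },   /* block delimiter: offset==0, matchLength==0, litLength = last literals */
    };
    ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters, ZSTD_sf_explicitBlockDelimiters);
    {   size_t const cSize = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                                    seqs, 2, src, 20);
        if (ZSTD_isError(cSize)) { /* e.g. sequences do not exactly cover srcSize */ }
    }

The sum of litLength + matchLength over a block, including the delimiter's literals, must equal the block size; that is exactly what blockSize_explicitDelimiter() above recomputes and validates before compression proceeds.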
+@@ -5807,15 +6737,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; +- ZSTD_sequencePosition seqPos = {0, 0, 0}; ++ ZSTD_SequencePosition seqPos = {0, 0, 0}; + +- BYTE const* ip = (BYTE const*)src; ++ const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; +- ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); ++ ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ +@@ -5829,22 +6756,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; +- size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSizeMax, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); +- FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); +- blockSize -= additionalByteAdjustment; ++ blockSize = sequenceCopier(cctx, ++ &seqPos, inSeqs, inSeqsSize, ++ ip, blockSize, ++ cctx->appliedParams.searchForExternalRepcodes); ++ FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5853,35 +6787,36 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, +- cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { +- /* We don't want to emit our first block as a RLE even if it qualifies because +- * doing so will cause the decoder (cli only) to throw a "should consume all input error." +- * This is only an issue for zstd <= v1.4.3 +- */ ++ ZSTD_isRLE(ip, blockSize)) { ++ /* Note: don't emit the first block as RLE even if it qualifies because ++ * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error ++ * "should consume all input error." 
++ */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5893,11 +6828,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5908,41 +6842,50 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { + BYTE* op = (BYTE*)dst; + size_t cSize = 0; +- size_t compressedBlocksSize = 0; +- size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); ++ + /* Begin writing output, starting with frame header */ +- frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); +- op += frameHeaderSize; +- dstCapacity -= frameHeaderSize; +- cSize += frameHeaderSize; ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, srcSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + xxh64_update(&cctx->xxhState, src, srcSize); + } +- /* cSize includes block header size and compressed sequences size */ +- compressedBlocksSize = ZSTD_compressSequences_internal(cctx, ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); +- FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!"); +- cSize += compressedBlocksSize; +- dstCapacity -= compressedBlocksSize; ++ 
FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } + ++ /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); +@@ -5951,26 +6894,557 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); ++ return cSize; ++} ++ ++ ++#if defined(__AVX2__) ++ ++#include /* AVX2 intrinsics */ ++ ++/* ++ * Convert 2 sequences per iteration, using AVX2 intrinsics: ++ * - offset -> offBase = offset + 2 ++ * - litLength -> (U16) litLength ++ * - matchLength -> (U16)(matchLength - 3) ++ * - rep is ignored ++ * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). ++ * ++ * At the end, instead of extracting two __m128i, ++ * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, ++ * then store the lower 16 bytes in one go. ++ * ++ * @returns 0 on succes, with no long length detected ++ * @returns > 0 if there is one long length (> 65535), ++ * indicating the position, and type. ++ */ ++static size_t convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ /* ++ * addition: ++ * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) ++ */ ++ const __m256i addition = _mm256_setr_epi32( ++ ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ ++ ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ++ ); ++ ++ /* limit: check if there is a long length */ ++ const __m256i limit = _mm256_set1_epi32(65535); ++ ++ /* ++ * shuffle mask for byte-level rearrangement in each 128-bit half: ++ * ++ * Input layout (after addition) per 128-bit half: ++ * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] ++ * We only need: ++ * offBase (4 bytes) = offset+2 ++ * litLength (2 bytes) = low 2 bytes of litLength ++ * mlBase (2 bytes) = low 2 bytes of (matchLength) ++ * => Bytes [0..3, 4..5, 8..9], zero the rest. ++ */ ++ const __m256i mask = _mm256_setr_epi8( ++ /* For the lower 128 bits => sequence i */ ++ 0, 1, 2, 3, /* offset+2 */ ++ 4, 5, /* litLength (16 bits) */ ++ 8, 9, /* matchLength (16 bits) */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ ++ /* For the upper 128 bits => sequence i+1 */ ++ 16,17,18,19, /* offset+2 */ ++ 20,21, /* litLength */ ++ 24,25, /* matchLength */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ++ ); ++ ++ /* ++ * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). ++ * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. ++ * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1. 
++ */ ++#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ ++ ++ size_t longLen = 0, i = 0; ++ ++ /* AVX permutation depends on the specific definition of target structures */ ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); ++ ++ /* Process 2 sequences per loop iteration */ ++ for (; i + 1 < nbSequences; i += 2) { ++ /* Load 2 ZSTD_Sequence (32 bytes) */ ++ __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); ++ ++ /* Add {2, 0, -3, 0} in each 128-bit half */ ++ __m256i vadd = _mm256_add_epi32(vin, addition); ++ ++ /* Check for long length */ ++ __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ ++ int ll_res = _mm256_movemask_epi8(ll_cmp); ++ ++ /* Shuffle bytes so each half gives us the 8 bytes we need */ ++ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); ++ /* ++ * Now: ++ * Lane0 = seq0's 8 bytes ++ * Lane1 = 0 ++ * Lane2 = seq1's 8 bytes ++ * Lane3 = 0 ++ */ ++ ++ /* Permute 64-bit lanes => move Lane2 down into Lane1. */ ++ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); ++ /* ++ * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. ++ * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. ++ */ ++ ++ /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ ++ _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); ++ /* ++ * This writes out 16 bytes total: ++ * - offset 0..7 => seq0 (offBase, litLength, mlBase) ++ * - offset 8..15 => seq1 (offBase, litLength, mlBase) ++ */ ++ ++ /* check (unlikely) long lengths > 65535 ++ * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] ++ * => combined mask = 0x0FF00FF0 ++ */ ++ if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { ++ /* long length detected: let's figure out which one*/ ++ if (inSeqs[i].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (inSeqs[i].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ if (inSeqs[i+1].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1 + 1; ++ } ++ if (inSeqs[i+1].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + 1 + nbSequences + 1; ++ } ++ } ++ } ++ ++ /* Handle leftover if @nbSequences is odd */ ++ if (i < nbSequences) { ++ /* process last sequence */ ++ assert(i == nbSequences - 1); ++ dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); ++ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; ++ dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); ++ /* check (unlikely) long lengths > 65535 */ ++ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (UNLIKELY(inSeqs[i].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ } ++ ++ return longLen; ++} ++ ++/* the vector implementation could also be ported to SSSE3, ++ * but since this implementation is targeting modern systems (>= Sapphire Rapid), ++ * it's not useful to develop and maintain code for older pre-AVX2 platforms */ ++ ++#else /* no AVX2 */ ++ ++static size_t 
convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ size_t longLen = 0; ++ size_t n; ++ for (n=0; n<nbSequences; n++) { ++ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset); ++ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; ++ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); ++ /* check for long length > 65535 */ ++ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = n + 1; ++ } ++ if (UNLIKELY(inSeqs[n].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = n + nbSequences + 1; ++ } ++ } ++ return longLen; ++} ++ ++#endif ++ ++/* ++ * Precondition: Sequences must end on an explicit Block Delimiter ++ * @return: 0 on success, or an error code. ++ * Note: Sequence validation functionality has been disabled (removed). ++ * This is helpful to generate a lean main pipeline, improving performance. ++ * It may be re-inserted later. ++ */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int repcodeResolution) ++{ ++ Repcodes_t updatedRepcodes; ++ size_t seqNb = 0; ++ ++ DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); ++ ++ RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, ++ "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ++ ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ++ /* check end condition */ ++ assert(nbSequences >= 1); ++ assert(inSeqs[nbSequences-1].matchLength == 0); ++ assert(inSeqs[nbSequences-1].offset == 0); ++ ++ /* Convert Sequences from public format to internal format */ ++ if (!repcodeResolution) { ++ size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); ++ cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; ++ if (longl) { ++ DEBUGLOG(5, "long length"); ++ assert(cctx->seqStore.longLengthType == ZSTD_llt_none); ++ if (longl <= nbSequences-1) { ++ DEBUGLOG(5, "long match length detected at pos %zu", longl-1); ++ cctx->seqStore.longLengthType = ZSTD_llt_matchLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-1); ++ } else { ++ DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); ++ assert(longl <= 2* (nbSequences-1)); ++ cctx->seqStore.longLengthType = ZSTD_llt_literalLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); ++ } ++ } ++ } else { ++ for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { ++ U32 const litLength = inSeqs[seqNb].litLength; ++ U32 const matchLength = inSeqs[seqNb].matchLength; ++ U32 const ll0 = (litLength == 0); ++ U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ if (!repcodeResolution && nbSequences > 1) { ++ U32* const rep = updatedRepcodes.rep; ++ ++ if (nbSequences >= 4) { ++ U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (nbSequences == 3) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[0].offset; ++ rep[0] = inSeqs[1].offset; ++ } else { ++ assert(nbSequences == 2); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[0].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, 
sizeof(Repcodes_t)); ++ ++ return 0; ++} ++ ++#if defined(ZSTD_ARCH_X86_AVX2) ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t i; ++ __m256i const zeroVec = _mm256_setzero_si256(); ++ __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ++ ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ ++ size_t mSum = 0, lSum = 0; ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ++ /* Process 2 structs (32 bytes) at a time */ ++ for (i = 0; i + 2 <= nbSeqs; i += 2) { ++ /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ ++ __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); ++ /* check end of block signal */ ++ __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); ++ int cmp_res = _mm256_movemask_epi8(cmp); ++ /* indices for match lengths correspond to bits [8..11], [24..27] ++ * => combined mask = 0x0F000F00 */ ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ if (cmp_res & 0x0F000F00) break; ++ /* Accumulate in sumVec */ ++ sumVec = _mm256_add_epi32(sumVec, data); ++ } ++ ++ /* Horizontal reduction */ ++ _mm256_store_si256((__m256i*)tmp, sumVec); ++ lSum = tmp[1] + tmp[5]; ++ mSum = tmp[2] + tmp[6]; ++ ++ /* Handle the leftover */ ++ for (; i < nbSeqs; i++) { ++ lSum += seqs[i].litLength; ++ mSum += seqs[i].matchLength; ++ if (seqs[i].matchLength == 0) break; /* end of block */ ++ } ++ ++ if (i==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = i+1; ++ bs.blockSize = lSum + mSum; ++ bs.litSize = lSum; ++ return bs; ++ } ++} ++ ++#else ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t totalMatchSize = 0; ++ size_t litSize = 0; ++ size_t n; ++ assert(seqs); ++ for (n=0; nappliedParams.searchForExternalRepcodes == ZSTD_ps_enable); ++ assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); ++ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); ++ RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); ++ ++ /* Special case: empty frame */ ++ if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { ++ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); ++ MEM_writeLE24(op, cBlockHeader24); ++ op += ZSTD_blockHeaderSize; ++ dstCapacity -= ZSTD_blockHeaderSize; ++ cSize += ZSTD_blockHeaderSize; ++ } ++ ++ while (nbSequences) { ++ size_t compressedSeqsSize, cBlockSize, conversionStatus; ++ BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); ++ U32 const lastBlock = (block.nbSequences == nbSequences); ++ FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); ++ assert(block.nbSequences <= nbSequences); ++ RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); ++ ZSTD_resetSeqStore(&cctx->seqStore); ++ ++ conversionStatus = ZSTD_convertBlockSequences(cctx, ++ inSeqs, block.nbSequences, ++ repcodeResolution); ++ FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); ++ inSeqs += block.nbSequences; ++ nbSequences -= block.nbSequences; ++ remaining -= block.blockSize; ++ ++ /* Note: when blockSize is very 
small, other variant send it uncompressed. ++ * Here, we still send the sequences, because we don't have the original source to send it uncompressed. ++ * One could imagine in theory reproducing the source from the sequences, ++ * but that's complex and costly memory intensive, and goes against the objectives of this variant. */ ++ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); ++ ++ compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( ++ op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, ++ literals, block.litSize, ++ &cctx->seqStore, ++ &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, ++ &cctx->appliedParams, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, ++ cctx->bmi2); ++ FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); ++ /* note: the spec forbids for any compressed block to be larger than maximum block size */ ++ if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); ++ litSize -= block.litSize; ++ literals = (const char*)literals + block.litSize; ++ ++ /* Note: difficult to check source for RLE block when only Literals are provided, ++ * but it could be considered from analyzing the sequence directly */ ++ ++ if (compressedSeqsSize == 0) { ++ /* Sending uncompressed blocks is out of reach, because the source is not provided. ++ * In theory, one could use the sequences to regenerate the source, like a decompressor, ++ * but it's complex, and memory hungry, killing the purpose of this variant. ++ * Current outcome: generate an error code. 
++ */ ++ RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); ++ } else { ++ U32 cBlockHeader; ++ assert(compressedSeqsSize > 1); /* no RLE */ ++ /* Error checking and repcodes update */ ++ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); ++ if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) ++ cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; ++ ++ /* Write block header into beginning of block*/ ++ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); ++ MEM_writeLE24(op, cBlockHeader); ++ cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); ++ } ++ ++ cSize += cBlockSize; ++ op += cBlockSize; ++ dstCapacity -= cBlockSize; ++ cctx->isFirstBlock = 0; ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); ++ ++ if (lastBlock) { ++ assert(nbSequences == 0); ++ break; ++ } ++ } ++ ++ RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); ++ RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); ++ DEBUGLOG(4, "cSize final total: %zu", cSize); ++ return cSize; ++} ++ ++size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* literals, size_t litSize, size_t litCapacity, ++ size_t decompressedSize) ++{ ++ BYTE* op = (BYTE*)dst; ++ size_t cSize = 0; ++ ++ /* Transparent initialization stage, same as compressStream2() */ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); ++ assert(cctx != NULL); ++ if (litCapacity < litSize) { ++ RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); ++ ++ if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { ++ RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); ++ } ++ if (cctx->appliedParams.validateSequences) { ++ RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); ++ } ++ if (cctx->appliedParams.fParams.checksumFlag) { ++ RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); ++ } ++ ++ /* Begin writing output, starting with frame header */ ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, decompressedSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, ++ op, dstCapacity, ++ inSeqs, inSeqsSize, ++ literals, litSize, decompressedSize); ++ FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const 
ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + +- + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6046,7 +7520,7 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + } + } + +-static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + switch (mode) { + case ZSTD_cpm_unknown: +@@ -6070,8 +7544,8 @@ static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMo + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. +- * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */ +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); +@@ -6092,7 +7566,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6109,7 +7583,9 @@ ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long l + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). 
+ * Fields of `ZSTD_frameParameters` are set to default values */ +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ++static ZSTD_parameters ++ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) ++{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); +@@ -6123,7 +7599,34 @@ static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned lo + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +-ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { ++ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) ++{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..b10978385876 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,7 +21,8 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" +- ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ ++#include "zstd_preSplit.h" /* ZSTD_SLIPBLOCK_WORKSPACESIZE */ + + /*-************************************* + * Constants +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. ++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -75,6 +77,70 @@ typedef struct { + ZSTD_fseCTables_t fse; + } ZSTD_entropyCTables_t; + ++/* ********************************************* ++* Sequences * ++***********************************************/ ++typedef struct SeqDef_s { ++ U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ ++ U16 litLength; ++ U16 mlBase; /* mlBase == matchLength - MINMATCH */ ++} SeqDef; ++ ++/* Controls whether seqStore has a single "long" litLength or matchLength. See SeqStore_t. */ ++typedef enum { ++ ZSTD_llt_none = 0, /* no longLengthType */ ++ ZSTD_llt_literalLength = 1, /* represents a long literal */ ++ ZSTD_llt_matchLength = 2 /* represents a long match */ ++} ZSTD_longLengthType_e; ++ ++typedef struct { ++ SeqDef* sequencesStart; ++ SeqDef* sequences; /* ptr to end of sequences */ ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; ++ size_t maxNbSeq; ++ size_t maxNbLit; ++ ++ /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength ++ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment ++ * the existing value of the litLength or matchLength by 0x10000. ++ */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++} SeqStore_t; ++ ++typedef struct { ++ U32 litLength; ++ U32 matchLength; ++} ZSTD_SequenceLength; ++ ++/* ++ * Returns the ZSTD_SequenceLength for the given sequences. It handles the decoding of long sequences ++ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. ++ */ ++MEM_STATIC ZSTD_SequenceLength ZSTD_getSequenceLength(SeqStore_t const* seqStore, SeqDef const* seq) ++{ ++ ZSTD_SequenceLength seqLen; ++ seqLen.litLength = seq->litLength; ++ seqLen.matchLength = seq->mlBase + MINMATCH; ++ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { ++ if (seqStore->longLengthType == ZSTD_llt_literalLength) { ++ seqLen.litLength += 0x10000; ++ } ++ if (seqStore->longLengthType == ZSTD_llt_matchLength) { ++ seqLen.matchLength += 0x10000; ++ } ++ } ++ return seqLen; ++} ++ ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++ ++ + /* ********************************************* + * Entropy buffer statistics structs and funcs * + ***********************************************/ +@@ -84,7 +150,7 @@ typedef struct { + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ + typedef struct { +- symbolEncodingType_e hType; ++ SymbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; + } ZSTD_hufCTablesMetadata_t; +@@ -95,9 +161,9 @@ typedef struct { + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ + typedef struct { +- symbolEncodingType_e llType; +- symbolEncodingType_e ofType; +- symbolEncodingType_e mlType; ++ SymbolEncodingType_e llType; ++ SymbolEncodingType_e ofType; ++ SymbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +@@ -111,12 +177,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -140,28 +207,29 @@ typedef struct { + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +-} rawSeqStore_t; ++} RawSeqStore_t; + +-UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; ++UNUSED_ATTR static const RawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -173,7 +241,7 @@ typedef struct { + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + } optState_t; + + typedef struct { +@@ -195,11 +263,11 @@ typedef struct { + 
+ #define ZSTD_WINDOW_START_INDEX 2 + +-typedef struct ZSTD_matchState_t ZSTD_matchState_t; ++typedef struct ZSTD_MatchState_t ZSTD_MatchState_t; + + #define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +-struct ZSTD_matchState_t { ++struct ZSTD_MatchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. +@@ -212,28 +280,42 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; + U32* chainTable; + +- U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ ++ int forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ + optState_t opt; /* optimal parser state */ +- const ZSTD_matchState_t* dictMatchState; ++ const ZSTD_MatchState_t* dictMatchState; + ZSTD_compressionParameters cParams; +- const rawSeqStore_t* ldmSeqStore; ++ const RawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + } ZSTD_blockState_t; + + typedef struct { +@@ -260,7 +342,7 @@ typedef struct { + } ldmState_t; + + typedef struct { +- ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ ++ ZSTD_ParamSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps_auto by default */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ +@@ -291,7 +373,7 @@ struct ZSTD_CCtx_params_s { + * There is no guarantee that hint is close to actual source size */ + + ZSTD_dictAttachPref_e attachDictPref; +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + + /* Multithreading: used to pass parameters to mtctx */ + int nbWorkers; +@@ -310,24 +392,54 @@ struct ZSTD_CCtx_params_s { + ZSTD_bufferMode_e outBufferMode; + + /* Sequence compression API */ +- ZSTD_sequenceFormat_e blockDelimiters; ++ ZSTD_SequenceFormat_e blockDelimiters; + int validateSequences; + +- /* Block splitting */ +- ZSTD_paramSwitch_e useBlockSplitter; ++ /* Block splitting ++ * @postBlockSplitter executes split analysis after sequences are produced, ++ * it's more accurate but consumes more resources. ++ * @preBlockSplitter_level splits before knowing sequences, ++ * it's more approximative but also cheaper. ++ * Valid @preBlockSplitter_level values range from 0 to 6 (included). ++ * 0 means auto, 1 means do not split, ++ * then levels are sorted in increasing cpu budget, from 2 (fastest) to 6 (slowest). ++ * Highest @preBlockSplitter_level combines well with @postBlockSplitter. ++ */ ++ ZSTD_ParamSwitch_e postBlockSplitter; ++ int preBlockSplitter_level; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; + + /* Param for deciding whether to use row-based matchfinder */ +- ZSTD_paramSwitch_e useRowMatchFinder; ++ ZSTD_ParamSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_ParamSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. 
*/ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_ParamSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) + #define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) ++#define TMP_WORKSPACE_SIZE (MAX(ENTROPY_WORKSPACE_SIZE, ZSTD_SLIPBLOCK_WORKSPACESIZE)) + + /* + * Indicates whether this compression proceeds directly from user-provided +@@ -345,11 +457,11 @@ typedef enum { + */ + #define ZSTD_MAX_NB_BLOCK_SPLITS 196 + typedef struct { +- seqStore_t fullSeqStoreChunk; +- seqStore_t firstHalfSeqStore; +- seqStore_t secondHalfSeqStore; +- seqStore_t currSeqStore; +- seqStore_t nextSeqStore; ++ SeqStore_t fullSeqStoreChunk; ++ SeqStore_t firstHalfSeqStore; ++ SeqStore_t secondHalfSeqStore; ++ SeqStore_t currSeqStore; ++ SeqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +@@ -366,7 +478,7 @@ struct ZSTD_CCtx_s { + size_t dictContentSize; + + ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ +- size_t blockSize; ++ size_t blockSizeMax; + unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */ + unsigned long long consumedSrcSize; + unsigned long long producedCSize; +@@ -378,13 +490,14 @@ struct ZSTD_CCtx_s { + int isFirstBlock; + int initialized; + +- seqStore_t seqStore; /* sequences storage ptrs */ ++ SeqStore_t seqStore; /* sequences storage ptrs */ + ldmState_t ldmState; /* long distance matching state */ + rawSeq* ldmSequences; /* Storage for the ldm output sequences */ + size_t maxNbLdmSequences; +- rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ++ RawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ + ZSTD_blockState_t blockState; +- U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ ++ void* tmpWorkspace; /* used as substitute of stack space - must be aligned for S64 type */ ++ size_t tmpWkspSize; + + /* Whether we are streaming or not */ + ZSTD_buffered_policy_e bufferedPolicy; +@@ -404,6 +517,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +531,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,17 +560,17 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. 
+ */ +-} ZSTD_cParamMode_e; ++} ZSTD_CParamMode_e; + +-typedef size_t (*ZSTD_blockCompressor) ( +- ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++typedef size_t (*ZSTD_BlockCompressor_f) ( ++ ZSTD_MatchState_t* bs, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); + + + MEM_STATIC U32 ZSTD_LLcode(U32 litLength) +@@ -497,12 +616,33 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + return 1; + } + ++/* ZSTD_selectAddr: ++ * @return index >= lowLimit ? candidate : backup, ++ * tries to force branchless codegen. */ ++MEM_STATIC const BYTE* ++ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) ++{ ++#if defined(__x86_64__) ++ __asm__ ( ++ "cmp %1, %2\n" ++ "cmova %3, %0\n" ++ : "+r"(candidate) ++ : "r"(index), "r"(lowLimit), "r"(backup) ++ ); ++ return candidate; ++#else ++ return index >= lowLimit ? candidate : backup; ++#endif ++} ++ + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +650,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +670,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +706,68 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ ++ ++/*! ZSTD_storeSeqOnly() : ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. ++ * Literals themselves are not copied, but @litPtr is updated. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). ++ * @matchLength : must be >= MINMATCH ++*/ ++HINT_INLINE UNUSED_ATTR void ++ZSTD_storeSeqOnly(SeqStore_t* seqStorePtr, ++ size_t litLength, ++ U32 offBase, ++ size_t matchLength) ++{ ++ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); ++ ++ /* literal Length */ ++ assert(litLength <= ZSTD_BLOCKSIZE_MAX); ++ if (UNLIKELY(litLength>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_literalLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].litLength = (U16)litLength; ++ ++ /* match offset */ ++ seqStorePtr->sequences[0].offBase = offBase; ++ ++ /* match Length */ ++ assert(matchLength <= ZSTD_BLOCKSIZE_MAX); ++ assert(matchLength >= MINMATCH); ++ { size_t const mlBase = matchLength - MINMATCH; ++ if (UNLIKELY(mlBase>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_matchLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].mlBase = (U16)mlBase; ++ } ++ ++ seqStorePtr->sequences++; ++} + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. 
++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. + */ + HINT_INLINE UNUSED_ATTR void +-ZSTD_storeSeq(seqStore_t* seqStorePtr, ++ZSTD_storeSeq(SeqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +776,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +787,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -619,44 +799,22 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + } + seqStorePtr->lit += litLength; + +- /* literal Length */ +- if (litLength>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_literalLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].litLength = (U16)litLength; +- +- /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); +- +- /* match Length */ +- assert(matchLength >= MINMATCH); +- { size_t const mlBase = matchLength - MINMATCH; +- if (mlBase>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_matchLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].mlBase = (U16)mlBase; +- } +- +- seqStorePtr->sequences++; ++ ZSTD_storeSeqOnly(seqStorePtr, litLength, offBase, matchLength); + } + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 
const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +@@ -670,14 +828,14 @@ ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) + + typedef struct repcodes_s { + U32 rep[3]; +-} repcodes_t; ++} Repcodes_t; + +-MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++MEM_STATIC Repcodes_t ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- repcodes_t newReps; ++ Repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +843,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -771,8 +876,8 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + size_t const matchLength = ZSTD_count(ip, match, vEnd); + if (match + matchLength != mEnd) return matchLength; + DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); +- DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); +- DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); ++ DEBUGLOG(7, "distance from match beginning to end dictionary = %i", (int)(mEnd - match)); ++ DEBUGLOG(7, "distance from current pos to end buffer = %i", (int)(iEnd - ip)); + DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); + DEBUGLOG(7, "final match length = %zu", matchLength + 
ZSTD_count(ip+matchLength, iStart, iEnd)); + return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); +@@ -783,32 +888,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) 
>> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +936,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. + */ +@@ -881,11 +1015,12 @@ MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 + /*-************************************* + * Round buffer management + ***************************************/ +-#if (ZSTD_WINDOWLOG_MAX_64 > 31) +-# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +-#endif +-/* Max current allowed */ +-#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) ++/* Max @current value allowed: ++ * In 32-bit mode: we want to avoid crossing the 2 GB limit, ++ * reducing risks of side effects in case of signed operations on indexes. ++ * In 64-bit mode: we want to ensure that adding the maximum job size (512 MB) ++ * doesn't overflow U32 index capacity (4 GB) */ ++#define ZSTD_CURRENT_MAX (MEM_64bits() ? 3500U MB : 2000U MB) + /* Maximum chunk size before overflow correction needs to be called again */ + #define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ +@@ -925,7 +1060,7 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +-MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ++MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_MatchState_t *ms) + { + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : +@@ -1011,7 +1146,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1112,7 +1249,7 @@ ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? 
*loadedDictEndPtr : 0; +@@ -1157,7 +1294,7 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); +@@ -1167,10 +1304,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,9 +1341,11 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, +- void const* src, size_t srcSize, +- int forceNonContiguous) ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, ++ const void* src, size_t srcSize, ++ int forceNonContiguous) + { + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; +@@ -1228,8 +1372,9 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { +- ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; +- U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ size_t const highInputIdx = (size_t)((ip + srcSize) - window->dictBase); ++ U32 const lowLimitMax = (highInputIdx > (size_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ assert(highInputIdx < UINT_MAX); + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } +@@ -1239,7 +1384,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +-MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; +@@ -1256,7 +1401,7 @@ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, u + /* + * Returns the lowest allowed match index in the prefix. 
+ */ +-MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; +@@ -1269,6 +1414,13 @@ MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, + return matchLowest; + } + ++/* index_safety_check: ++ * intentional underflow : ensure repIndex isn't overlapping dict + prefix ++ * @return 1 if values are not overlapping, ++ * 0 otherwise */ ++MEM_STATIC int ZSTD_index_overlap_check(const U32 prefixLowestIndex, const U32 repIndex) { ++ return ((U32)((prefixLowestIndex-1) - repIndex) >= 3); ++} + + + /* debug functions */ +@@ -1302,7 +1454,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} + ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. 
*/ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + /* =============================================================== + * Shared internal declarations +@@ -1319,6 +1506,25 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_SequencePosition; ++ ++/* for benchmark */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int const repcodeResolution); ++ ++typedef struct { ++ size_t nbSequences; ++ size_t blockSize; ++ size_t litSize; ++} BlockSummary; ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); ++ + /* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress +@@ -1330,7 +1536,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + * Note: srcSizeHint == 0 means 0! + */ + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + /*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. +@@ -1342,7 +1548,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr); ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr); + + /*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +@@ -1381,11 +1587,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1601,28 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..ec39b4299b6f 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; +- symbolEncodingType_e hType = set_compressed; ++ SymbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..256980c9d85a 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -153,20 +154,20 @@ size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + return cost >> 8; + } + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) + { + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +@@ -241,7 +242,7 @@ typedef struct { + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -293,7 +294,7 @@ ZSTD_encodeSequences_body( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; +@@ -387,7 +388,7 @@ ZSTD_encodeSequences_default( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -405,7 +406,7 @@ ZSTD_encodeSequences_bmi2( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -421,7 +422,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) + { + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); + #if DYNAMIC_BMI2 +diff --git 
a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..14fdccb6547f 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,26 +12,27 @@ + #ifndef ZSTD_COMPRESS_SEQUENCES_H + #define ZSTD_COMPRESS_SEQUENCES_H + ++#include "zstd_compress_internal.h" /* SeqDef */ + #include "../common/fse.h" /* FSE_repeat, FSE_CTable */ +-#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ ++#include "../common/zstd_internal.h" /* SymbolEncodingType_e, ZSTD_strategy */ + + typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +-} ZSTD_defaultPolicy_e; ++} ZSTD_DefaultPolicy_e; + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -42,7 +44,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + + size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dc12d64e935c 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. 
+ * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -50,11 +52,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; +- symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; ++ SymbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(SeqStore_t const* seqStore, ++ const SeqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 
/*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -258,7 +263,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + * Or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- const seqDef* sequences, size_t nbSeq, ++ const SeqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -322,7 +328,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit + return 0; + } + +-static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, ++static size_t ZSTD_estimateSubBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize 
ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,14 +427,57 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe + return 0; + } + ++static size_t countLiterals(SeqStore_t const* seqStore, const SeqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const SeqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; n targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ +-static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. 
*/ ++static size_t ZSTD_compressSubBlock_multi(const SeqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +@@ -432,12 +487,14 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) + { +- const seqDef* const sstart = seqStorePtr->sequencesStart; +- const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const SeqDef* const sstart = seqStorePtr->sequencesStart; ++ const SeqDef* const send = seqStorePtr->sequences; ++ const SeqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; +- +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); +- +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); ++ ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
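
For a feel of the fixed-point bookkeeping in the rewritten splitter above: estimated costs are carried in 1/256ths of a byte (BYTESCALE), the super block is cut into roughly estBlockSize/targetCBlockSize sub-blocks, and each sub-block gets an average byte budget. A standalone sketch using the same formulas; the input numbers are invented purely for illustration.

#include <stdio.h>

#define BYTESCALE 256   /* fixed-point scale used by the sub-block splitter */
#define MAX(a, b) ((a) > (b) ? (a) : (b))

int main(void)
{
    /* hypothetical estimates for one super block */
    size_t const estLitSize       = 20000;    /* estimated compressed literals size   */
    size_t const estBlockSize     = 60000;    /* estimated compressed size, lits incl. */
    size_t const nbLiterals       = 90000;
    size_t const nbSeqs           = 30000;
    size_t const targetCBlockSize = 16 * 1024;

    size_t const avgLitCost = nbLiterals ? (estLitSize * BYTESCALE) / nbLiterals : BYTESCALE;
    size_t const avgSeqCost = ((estBlockSize - estLitSize) * BYTESCALE) / nbSeqs;
    size_t const nbSubBlocks = MAX((estBlockSize + targetCBlockSize/2) / targetCBlockSize, (size_t)1);
    size_t const avgBlockBudget = (estBlockSize * BYTESCALE) / nbSubBlocks;

    printf("avg literal cost : %.2f bytes\n", (double)avgLitCost / BYTESCALE);
    printf("avg sequence cost: %.2f bytes\n", (double)avgSeqCost / BYTESCALE);
    printf("sub-blocks       : %zu, budget %.0f bytes each\n",
           nbSubBlocks, (double)avgBlockBudget / BYTESCALE);
    return 0;
}

sizeBlockSequences() then walks sequences until the accumulated scaled cost exceeds that budget, which is what replaces the old per-sequence calls to ZSTD_estimateSubBlockSize() noted as wasteful in the removed comment.
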
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; +- repcodes_t rep; ++ const SeqDef* seq; ++ Repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +@@ -559,7 +675,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, +@@ -569,5 +685,5 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..dce42f653bae 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,8 +15,10 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" +- ++#include "../common/portability_macros.h" ++#include "../common/compiler.h" /* ZS2_isPower2 */ + + /*-************************************* + * Constants +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
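
The layout described above is essentially a double-ended bump allocator: objects and tables grow from the front of one big buffer, while init-once/aligned/buffer reservations grow down from the back, and a reservation fails when the two ends would cross. A simplified standalone sketch of that idea only; the real ZSTD_cwksp also tracks allocation phases, table validity and static/dynamic ownership.

#include <stddef.h>
#include <stdio.h>

typedef struct {
    char* begin;       /* start of the workspace                  */
    char* end;         /* one past the end of the workspace       */
    char* frontTop;    /* objects/tables allocated up to here     */
    char* backBottom;  /* buffers/aligned allocated down to here  */
} toy_wksp;

static void toy_init(toy_wksp* ws, void* mem, size_t size)
{
    ws->begin = (char*)mem;
    ws->end = ws->begin + size;
    ws->frontTop = ws->begin;
    ws->backBottom = ws->end;
}

/* tables grow upward from the front */
static void* toy_reserve_front(toy_wksp* ws, size_t bytes)
{
    if ((size_t)(ws->backBottom - ws->frontTop) < bytes) return NULL;
    { void* const p = ws->frontTop; ws->frontTop += bytes; return p; }
}

/* buffers grow downward from the back */
static void* toy_reserve_back(toy_wksp* ws, size_t bytes)
{
    if ((size_t)(ws->backBottom - ws->frontTop) < bytes) return NULL;
    ws->backBottom -= bytes;
    return ws->backBottom;
}

int main(void)
{
    static char mem[1024];
    toy_wksp ws;
    toy_init(&ws, mem, sizeof(mem));
    printf("table  : %p\n", toy_reserve_front(&ws, 256));
    printf("buffer : %p\n", toy_reserve_back(&ws, 512));
    printf("too big: %p\n", toy_reserve_back(&ws, 512)); /* only 256 bytes left -> NULL */
    return 0;
}

Resetting for the next compression then only has to move the front pointer back to the end of the objects and reset the back pointer, which is what lets init-once buffers and tables be reused across compressions without re-initializing them each time.
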
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,14 +184,16 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* + * Align must be a power of 2. + */ +-MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { ++MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t align) { + size_t const mask = align - 1; +- assert((align & mask) == 0); ++ assert(ZSTD_isPower2(align)); + return (size + mask) & ~mask; + } + +@@ -189,7 +207,7 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + * to figure out how much space you need for the matchState tables. Everything + * else is though. + * +- * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). ++ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned64_alloc_size(). + */ + MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + if (size == 0) +@@ -197,12 +215,16 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + return size; + } + ++MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size, size_t alignment) { ++ return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, alignment)); ++} ++ + /* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +-MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { +- return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); ++MEM_STATIC size_t ZSTD_cwksp_aligned64_alloc_size(size_t size) { ++ return ZSTD_cwksp_aligned_alloc_size(size, ZSTD_CWKSP_ALIGNMENT_BYTES); + } + + /* +@@ -210,14 +232,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -229,11 +247,23 @@ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; +- assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(ZSTD_isPower2(alignBytes)); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) ++{ ++ char* endPtr = (char*)ws->workspaceEnd; ++ assert(ZSTD_isPower2(ZSTD_CWKSP_ALIGNMENT_BYTES)); ++ endPtr = endPtr - ((size_t)endPtr % ZSTD_CWKSP_ALIGNMENT_BYTES); ++ return (void*)endPtr; ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -246,7 +276,7 @@ ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) + { + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; +- DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", ++ DEBUGLOG(5, "cwksp: reserving [0x%p]:%zd bytes; %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); +@@ -274,27 +304,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +335,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -345,29 +366,61 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). + */ +-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) + { +- void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), +- ZSTD_cwksp_alloc_aligned); +- assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned64(ZSTD_cwksp* ws, size_t bytes) ++{ ++ void* const ptr = ZSTD_cwksp_reserve_internal(ws, ++ ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), ++ ZSTD_cwksp_alloc_aligned); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return ptr; + } + + /* + * Aligned on 64 bytes. 
These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -387,7 +440,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + + + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); +- assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return alloc; + } + +@@ -421,6 +474,20 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) + + return alloc; + } ++/* ++ * with alignment control ++ * Note : should happen only once, at workspace first initialization ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_object_aligned(ZSTD_cwksp* ws, size_t byteSize, size_t alignment) ++{ ++ size_t const mask = alignment - 1; ++ size_t const surplus = (alignment > sizeof(void*)) ? alignment - sizeof(void*) : 0; ++ void* const start = ZSTD_cwksp_reserve_object(ws, byteSize + surplus); ++ if (start == NULL) return NULL; ++ if (surplus == 0) return start; ++ assert(ZSTD_isPower2(alignment)); ++ return (void*)(((size_t)start + surplus) & ~mask); ++} + + MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) + { +@@ -451,7 +518,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -460,7 +527,8 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + * Invalidates table allocations. + * All other allocations remain valid. 
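
The new ZSTD_cwksp_reserve_object_aligned() above relies on a common over-allocate-then-round trick: reserve byteSize plus (alignment - sizeof(void*)) extra bytes, then snap the returned pointer to the requested alignment, counting on the object area already being pointer-aligned. A small numeric illustration of that rounding; the start addresses are invented for the example and the printed waste assumes 8-byte pointers.

#include <stdint.h>
#include <stdio.h>

int main(void)
{
    size_t const alignment = 64;
    size_t const mask = alignment - 1;
    size_t const surplus = alignment - sizeof(void*);   /* extra bytes reserved       */
    uintptr_t const starts[] = { 0x1000, 0x1008, 0x1030 }; /* pointer-aligned starts  */
    size_t i;
    for (i = 0; i < sizeof(starts)/sizeof(starts[0]); i++) {
        /* same expression as the hunk: add the surplus, then round down */
        uintptr_t const aligned = (starts[i] + surplus) & ~(uintptr_t)mask;
        printf("start=0x%lx -> aligned=0x%lx (skipped %lu bytes)\n",
               (unsigned long)starts[i], (unsigned long)aligned,
               (unsigned long)(aligned - starts[i]));
    }
    return 0;
}
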
+ */ +-MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { ++MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) ++{ + DEBUGLOG(4, "cwksp: clearing tables!"); + + +@@ -478,14 +546,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +575,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +607,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +619,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +@@ -591,5 +654,4 @@ MEM_STATIC void ZSTD_cwksp_bump_oversized_duration( + } + } + +- + #endif /* ZSTD_CWKSP_H */ +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..995e83f3a183 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,13 +85,26 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) + { + ZSTD_compressionParameters const* cParams = &ms->cParams; +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -88,9 +143,14 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ ++ const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ ++ /* Array of ~random data, should have low probability of matching data ++ * we load from here instead of from tables, if matchl0/matchl1 are ++ * invalid indices. Used to avoid unpredictable branches. 
*/ ++ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + +@@ -100,8 +160,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,30 +191,35 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + +- if (idxl0 > prefixLowestIndex) { ++ /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. ++ * However expression below complies into conditional move. Since ++ * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex ++ * if there is a match, all branches become predictable. */ ++ { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); ++ + /* check prefix long match */ +- if (MEM_read64(matchl0) == MEM_read64(ip)) { ++ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; +- } +- } ++ } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + +- if (idxs0 > prefixLowestIndex) { +- /* check prefix short match */ +- if (MEM_read32(matchs0) == MEM_read32(ip)) { +- goto _search_next_long; +- } ++ /* Same optimization as matchl0 above */ ++ matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); ++ ++ /* check prefix short match */ ++ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { ++ goto _search_next_long; + } + + if (ip1 >= nextStep) { +@@ -175,30 +240,36 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + + _search_next_long: + +- /* check prefix long +1 match */ +- if (idxl1 > prefixLowestIndex) { +- if (MEM_read64(matchl1) == MEM_read64(ip1)) { ++ /* short match found: let's check for a longer one */ ++ mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; ++ offset = (U32)(ip - matchs0); ++ ++ /* check long match at +1 position */ ++ if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { ++ size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; ++ if (l1len > mLength) { ++ /* use the long match instead */ + ip = ip1; +- mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; ++ mLength = l1len; + offset = (U32)(ip-matchl1); +- while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ +- goto _match_found; ++ matchs0 = matchl1; + } + } + +- /* if no long +1 match, explore the short match we found */ +- mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; +- offset = (U32)(ip - matchs0); +- while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ ++ while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +@@ -217,7 +288,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +314,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,8 +325,9 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -275,9 +347,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; +@@ -286,8 +357,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +366,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +387,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -323,26 +405,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* 
repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +- if (matchIndexL > prefixLowestIndex) { ++ if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + /* check prefix long match */ +- if (MEM_read64(matchLong) == MEM_read64(ip)) { +- mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; +- offset = (U32)(ip-matchLong); +- while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; ++ offset = (U32)(ip-matchLong); ++ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -354,13 +434,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } } + + if (matchIndexS > prefixLowestIndex) { +- /* check prefix short match */ ++ /* short match candidate */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,25 +455,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + + /* check prefix long +1 match */ +- if (matchIndexL3 > prefixLowestIndex) { +- if (MEM_read64(matchL3) == MEM_read64(ip+1)) { +- mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; +- ip++; +- offset = (U32)(ip-matchL3); +- while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { ++ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; ++ ip++; ++ offset = (U32)(ip-matchL3); ++ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + 
dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +498,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -443,12 +522,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +540,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -470,7 +549,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + + #define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ +@@ -488,7 +567,7 @@ ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -508,7 +587,7 @@ size_t ZSTD_compressBlock_doubleFast( + + + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -527,8 +606,10 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -579,13 +660,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + size_t mLength; + hashSmall[hSmall] = 
hashLong[hLong] = curr; /* update hash table */ + +- if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ ++ if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +677,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +702,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -647,13 +728,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -677,7 +758,7 @@ ZSTD_GEN_DFAST_FN(extDict, 6) + ZSTD_GEN_DFAST_FN(extDict, 7) + + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -694,3 +775,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..011556ce56f7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,22 +12,32 @@ + #ifndef ZSTD_DOUBLE_FAST_H + #define ZSTD_DOUBLE_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..60e07e839e5f 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann 
Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,60 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ ++ ++typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); ++ ++static int ++ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* Array of ~random data, should have low probability of matching data. ++ * Load from here if the index is invalid. ++ * Used to avoid unpredictable branches. */ ++ static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; ++ ++ /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. ++ * However expression below compiles into conditional move. 
++ */ ++ const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); ++ /* Note: this used to be written as : return test1 && test2; ++ * Unfortunately, once inlined, these tests become branches, ++ * in which case it becomes critical that they are executed in the right order (test1 then test2). ++ * So we have to write these tests in a specific manner to ensure their ordering. ++ */ ++ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; ++ /* force ordering of these tests, which matters once the function is inlined, as they become branches */ ++ __asm__(""); ++ return matchIdx >= idxLowLimit; ++} ++ ++static int ++ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* using a branch instead of a cmov, ++ * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, ++ * aka almost all candidates are within range */ ++ U32 mval; ++ if (matchIdx >= idxLowLimit) { ++ mval = MEM_read32(matchAddress); ++ } else { ++ mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ ++ } ++ ++ return (MEM_read32(currentPtr) == mval); ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,17 +186,17 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. + */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +- U32 const mls, U32 const hasStep) ++ U32 const mls, int useCmov) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; +- /* support stepSize of 0 */ +- size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); +@@ -117,12 +214,11 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +- U32 idx; /* match idx for ip0 */ +- U32 mval; /* src value at match idx */ ++ U32 matchIdx; /* match idx for ip0 */ + + U32 offcode; + const BYTE* match0; +@@ -135,14 +231,15 @@ ZSTD_compressBlock_fast_noDict_generic( + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); ++ const ZSTD_match4Found matchFound = useCmov ? 
ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -163,7 +260,7 @@ ZSTD_compressBlock_fast_noDict_generic( + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + +- idx = hashTable[hash0]; ++ matchIdx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ +@@ -180,26 +277,28 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* Write next hash table entry: it's already calculated. ++ * This write is known to be safe because ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry (it's already calculated). ++ * This write is known to be safe because the ip1 == ip0 + 1, ++ * so searching will resume after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); + +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -214,21 +313,19 @@ ZSTD_compressBlock_fast_noDict_generic( + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } +- +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry, since it's already calculated */ ++ if (step <= 4) { ++ /* Avoid writing an index if it's >= position where search will resume. ++ * The minimum possible match has length 4, so search can resume at ip0 + 4. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -250,13 +347,28 @@ ZSTD_compressBlock_fast_noDict_generic( + } while (ip3 < ilimit); + + _cleanup: +- /* Note that there are probably still a couple positions we could search. ++ /* Note that there are probably still a couple positions one could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. 
We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -264,10 +376,10 @@ ZSTD_compressBlock_fast_noDict_generic( + _offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ +- match0 = base + idx; ++ match0 = base + matchIdx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +399,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +413,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) 
*/ + } } } +@@ -314,12 +421,12 @@ ZSTD_compressBlock_fast_noDict_generic( + goto _start; + } + +-#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ +- static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ ++ static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ +- return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ ++ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ + } + + ZSTD_GEN_FAST_FN(noDict, 4, 1) +@@ -333,13 +440,15 @@ ZSTD_GEN_FAST_FN(noDict, 6, 0) + ZSTD_GEN_FAST_FN(noDict, 7, 0) + + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- U32 const mls = ms->cParams.minMatch; ++ U32 const mml = ms->cParams.minMatch; ++ /* use cmov when "candidate in range" branch is likely unpredictable */ ++ int const useCmov = ms->cParams.windowLog < 19; + assert(ms->dictMatchState == NULL); +- if (ms->cParams.targetLength > 1) { +- switch(mls) ++ if (useCmov) { ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -352,7 +461,8 @@ size_t ZSTD_compressBlock_fast( + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { +- switch(mls) ++ /* use a branch instead */ ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -364,13 +474,13 @@ size_t ZSTD_compressBlock_fast( + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } +- + } + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -380,16 +490,16 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; + const U32* const dictHashTable = dms->hashTable; + const U32 dictStartIndex = dms->window.dictLimit; +@@ -397,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - 
prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +523,154 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? ++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) { ++ /* found a regular match of size >= 4 */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* 
repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; +- if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -525,7 +683,7 @@ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) + ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -545,19 +703,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +729,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const 
BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +760,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -656,10 +964,11 @@ ZSTD_GEN_FAST_FN(extDict, 6, 0) + ZSTD_GEN_FAST_FN(extDict, 7, 0) + + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..04fde0a72a4e 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,21 +12,20 @@ + #ifndef ZSTD_FAST_H + #define ZSTD_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..88e2501fe3ef 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,9 +160,10 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, +@@ -159,7 +171,7 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +- const ZSTD_matchState_t * const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static 
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,24 +391,25 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* + * Dedicated dict search + ***********************************/ + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); +@@ -514,7 +528,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B + */ + FORCE_INLINE_TEMPLATE + size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, +- const ZSTD_matchState_t* const dms, ++ const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { +@@ -561,7 +575,7 @@ size_t 
ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +- ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( ++ ZSTD_MatchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,21 +648,25 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; + } + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +@@ -670,7 +690,7 @@ size_t ZSTD_HcFindBestMatch( + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? 
dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. 
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
+- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -965,13 +947,41 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 
0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,29 +1124,30 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every context reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by introducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. +- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. 
+ */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,11 +1165,14 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1301,7 +1341,7 @@ size_t ZSTD_RowFindBestMatch( + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a +- * bunch of constants from the ZSTD_matchState_t. These computations could be ++ * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. +@@ -1325,7 +1365,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ +@@ -1335,7 +1375,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1345,7 +1385,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1446,7 +1486,7 @@ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searc + * If a match is found its offset is stored in @p offsetPtr. 
+ */ + FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, +@@ -1472,9 +1512,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth, +@@ -1491,12 +1532,13 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1548,7 +1591,7 @@ ZSTD_compressBlock_lazy_generic( + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,34 +1631,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,34 +1667,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1682,12 +1741,12 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatch = repIndex < prefixLowestIndex ? 
+ dictBase - dictIndexDelta + repIndex : + base + repIndex; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,168 +1760,183 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return 
ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t 
ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || 
!defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1912,7 +1987,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = (U32)(curr+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,30 +2023,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,50 +2055,57 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2023,14 +2114,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -2045,58 +2136,65 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return 
ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..987a036d8bde 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LAZY_H + #define ZSTD_LAZY_H + +- + #include "zstd_compress_internal.h" + + /* +@@ -22,98 +22,173 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip); ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip); + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void 
const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, 
SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- + ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..54eefad9cae6 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ + #include "zstd_ldm_geartab.h" + +-#define LDM_BUCKET_SIZE_LOG 3 ++#define LDM_BUCKET_SIZE_LOG 4 + #define LDM_MIN_MATCH_LENGTH 64 + #define LDM_HASH_RLOG 7 + +@@ -133,21 +134,35 @@ static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + } + + void ZSTD_ldm_adjustParameters(ldmParams_t* params, +- ZSTD_compressionParameters const* cParams) ++ const ZSTD_compressionParameters* cParams) + { + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); +- if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; +- if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (params->hashRateLog == 0) { ++ if (params->hashLog > 0) { ++ /* if params->hashLog is set, derive hashRateLog from it */ ++ assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ if (params->windowLog > params->hashLog) { ++ params->hashRateLog = params->windowLog - params->hashLog; ++ } ++ } else { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ /* mapping from [fast, rate7] to [btultra2, rate4] */ ++ params->hashRateLog = 7 - (cParams->strategy/3); ++ } ++ } + if (params->hashLog == 0) { +- params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); +- assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); + } +- if (params->hashRateLog == 0) { +- params->hashRateLog = params->windowLog < params->hashLog +- ? 0 +- : params->windowLog - params->hashLog; ++ if (params->minMatchLength == 0) { ++ params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (cParams->strategy >= ZSTD_btultra) ++ params->minMatchLength /= 2; ++ } ++ if (params->bucketSizeLog==0) { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); + } +@@ -170,22 +185,22 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) + /* ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ + static ldmEntry_t* ZSTD_ldm_getBucket( +- ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) ++ const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) + { +- return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); ++ return ldmState->hashTable + (hash << bucketSizeLog); + } + + /* ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ + static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, +- ldmParams_t const ldmParams) ++ U32 const bucketSizeLog) + { + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + +- *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; +- *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); ++ *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; ++ *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); + + } + +@@ -234,7 +249,7 @@ static size_t ZSTD_ldm_countBackwardsMatch_2segments( + * + * The tables for the other strategies are filled within their + * block compressors. */ +-static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, ++static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, + void const* end) + { + const BYTE* const iend = (const BYTE*)end; +@@ -242,11 +257,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -269,7 +288,8 @@ void ZSTD_ldm_fillHashTable( + const BYTE* iend, ldmParams_t const* params) + { + U32 const minMatchLength = params->minMatchLength; +- U32 const hBits = params->hashLog - params->bucketSizeLog; ++ U32 const bucketSizeLog = params->bucketSizeLog; ++ U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; +@@ -284,7 +304,7 @@ void ZSTD_ldm_fillHashTable( + unsigned n; + + numSplits = 0; +- hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); ++ hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { +@@ -295,7 +315,7 @@ void ZSTD_ldm_fillHashTable( + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); +- ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, entry, params->bucketSizeLog); + } + } + +@@ -309,7 +329,7 @@ void ZSTD_ldm_fillHashTable( + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +-static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) ++static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) + { + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { +@@ -318,8 +338,10 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( +- ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( ++ ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { + /* LDM parameters */ +@@ -373,7 +395,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); +- candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); ++ candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); + } + +@@ -396,7 +418,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -443,7 +465,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -464,7 +486,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + + anchor = split + forwardMatchLength; + +@@ -503,7 +525,7 @@ static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + } + + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldmState, rawSeqStore_t* sequences, ++ ldmState_t* ldmState, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) + { + U32 const maxDist = 1U << params->windowLog; +@@ -549,7 +571,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -580,7 +602,7 @@ size_t ZSTD_ldm_generateSequences( + } + + void +-ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) ++ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) + { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; +@@ -616,7 +638,7 @@ ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const min + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. 
+ */ +-static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, ++static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) + { + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; +@@ -640,7 +662,7 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + return sequence; + } + +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; +@@ -657,14 +679,14 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { + } + } + +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; +- ZSTD_blockCompressor const blockCompressor = ++ ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; +@@ -689,7 +711,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +723,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +733,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..41400a7191b2 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LDM_H + #define ZSTD_LDM_H + +- + #include "zstd_compress_internal.h" /* ldmParams_t, U32 */ + #include <linux/zstd.h> /* ZSTD_CCtx, size_t */ + +@@ -40,7 +40,7 @@ void ZSTD_ldm_fillHashTable( + * sequences. + */ + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldms, rawSeqStore_t* sequences, ++ ldmState_t* ldms, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + + /* +@@ -61,9 +61,9 @@ size_t ZSTD_ldm_generateSequences( + * two.
We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize); + + /* +@@ -73,7 +73,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, ++void ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + /* ZSTD_ldm_skipRawSeqStoreBytes(): +@@ -81,7 +81,7 @@ void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes); ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes); + + /* ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is +@@ -107,5 +107,4 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..b62fd1b0d83e 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s<lastEltIndex+1; s++) { +- table[s] = 1 + (table[s] >> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
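
Illustration only, not part of the patch: the hunks above keep the fixed-point cost model that WEIGHT()/ZSTD_fracWeight() implement, where a weight is roughly 256 * log2(stat) and a symbol's price is the difference of two such weights. The standalone sketch below mirrors only the shape of ZSTD_fracWeight() from the hunk above; the helper names and the example numbers are invented.

#include <assert.h>
#include <stdio.h>

#define BITCOST_ACCURACY   8
#define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY)

/* floor(log2(v)) for v > 0; portable stand-in for ZSTD_highbit32() */
static unsigned highbit32(unsigned v)
{
    unsigned n = 0;
    assert(v > 0);
    while (v >>= 1) n++;
    return n;
}

/* same shape as ZSTD_fracWeight(): ~256 * log2(stat), in fixed point */
static unsigned fracWeight(unsigned rawStat)
{
    unsigned const stat = rawStat + 1;
    unsigned const hb = highbit32(stat);
    unsigned const BWeight = hb * BITCOST_MULTIPLIER;
    unsigned const FWeight = (stat << BITCOST_ACCURACY) >> hb;  /* in [256,512) */
    return BWeight + FWeight;
}

int main(void)
{
    /* a symbol seen 125 times out of 1000 costs about log2(1000/125) = 3 bits */
    unsigned const price = fracWeight(1000) - fracWeight(125);
    printf("price = %u (1/256-bit units), ~%.2f bits\n", price, price / 256.0);
    return 0;
}
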
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,8 +438,10 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, + U32 const mls, const int extDict) +@@ -527,15 +559,16 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -548,20 +581,23 @@ void ZSTD_updateTree_internal( + ms->nextToUpdate = target; + } + +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -590,7 +626,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + +- const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; ++ const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? 
dms->window.base : NULL; +@@ -629,13 +665,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ +- & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ +- & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -784,7 +820,7 @@ U32 
ZSTD_insertBtAndGetAllMatches ( + + typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, +- ZSTD_matchState_t*, ++ ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, +@@ -792,9 +828,11 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, +@@ -817,7 +855,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + #define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ +@@ -849,7 +887,7 @@ GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + } + + static ZSTD_getAllMatchesFn +-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) ++ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) + { + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), +@@ -868,7 +906,7 @@ ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const di + + /* Struct containing info needed to make decision about ldm inclusion */ + typedef struct { +- rawSeqStore_t seqStore; /* External match candidates store for this block */ ++ RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +@@ -878,7 +916,7 @@ typedef struct { + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) ++static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) + { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { +@@ -935,7 +973,7 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + return; + } + +- /* Matches may be < MINMATCH by this process. In that case, we will reject them ++ /* Matches may be < minMatch by this process. In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; +@@ -957,25 +995,26 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + * into 'matches'. Maintains the correct ordering of 'matches'. 
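
Illustration only, not part of the patch: many hunks in this file are the rename from offCode/STORE_* to offBase/*_TO_OFFBASE (for example candidateOffBase above). The sketch below shows the "sumtype" convention those names refer to, with the macros re-declared locally for the example; the real definitions live elsewhere in the zstd library and may differ in detail (asserts, exact spelling).

#include <assert.h>

#define ZSTD_REP_NUM 3
/* one U32 carries either a repcode (1..3) or a real offset, shifted by 3 */
#define REPCODE_TO_OFFBASE(r)  (r)                    /* expects 1 <= r <= 3 */
#define OFFSET_TO_OFFBASE(o)   ((o) + ZSTD_REP_NUM)   /* expects o > 0 */
#define OFFBASE_IS_REPCODE(ob) ((ob) >= 1 && (ob) <= ZSTD_REP_NUM)
#define OFFBASE_TO_OFFSET(ob)  ((ob) - ZSTD_REP_NUM)

int main(void)
{
    unsigned const fromRep    = REPCODE_TO_OFFBASE(2);    /* repcode 2   -> 2    */
    unsigned const fromOffset = OFFSET_TO_OFFBASE(1024);  /* real offset -> 1027 */
    assert(OFFBASE_IS_REPCODE(fromRep));
    assert(!OFFBASE_IS_REPCODE(fromOffset));
    assert(OFFBASE_TO_OFFSET(fromOffset) == 1024);
    return 0;
}
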
+ */ + static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, +- const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) ++ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, ++ U32 minMatch) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock +- || candidateMatchLength < MINMATCH) { ++ || candidateMatchLength < minMatch) { + return; + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -986,7 +1025,8 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + static void + ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, +- U32 currPosInBlock, U32 remainingBytes) ++ U32 currPosInBlock, U32 remainingBytes, ++ U32 minMatch) + { + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; +@@ -1003,7 +1043,7 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } +- ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); ++ ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); + } + + +@@ -1011,11 +1051,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,9 +1068,15 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ++ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, +@@ -1059,9 +1100,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ 
ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1125,140 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip), ++ minMatch); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. */ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = 
pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, ++ ZSTD_optimal_t const prevMatch = opt[cur]; ++ DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
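
Illustration only, not part of the patch: the LIT_PRICE/LL_INCPRICE bookkeeping introduced above extends a literal run one position at a time, adding the literal's own cost plus the marginal litLength cost. The toy program below, with made-up cost functions, shows why the running total equals pricing the whole run at once (the litLength terms telescope).

#include <assert.h>

/* toy costs, purely illustrative */
static int lit_price(unsigned char c) { return 6 + (c & 3); }
static int ll_price(unsigned litlen)  { return (int)(2 * litlen + (litlen ? 4 : 0)); }
#define LL_INCPRICE(l) (ll_price(l) - ll_price((l) - 1))

int main(void)
{
    unsigned char const run[] = { 'a', 'b', 'c', 'd' };
    unsigned const n = (unsigned)sizeof(run);
    int incremental = ll_price(0);               /* start from an empty run */
    int fromScratch = ll_price(n);
    unsigned i;
    for (i = 0; i < n; i++) {
        incremental += lit_price(run[i]) + LL_INCPRICE(i + 1);  /* one literal at a time */
        fromScratch += lit_price(run[i]);
    }
    assert(incremental == fromScratch);          /* litLength terms telescope */
    return 0;
}
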
+ */ +- ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); ++ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); +- ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); ++ ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,38 +1268,37 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(inr-istart), (U32)(iend-inr)); ++ (U32)(inr-istart), (U32)(iend-inr), ++ minMatch); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", ++ (int)(inr-istart), cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1309,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,55 +1335,89 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; +- DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", +- anchor - istart, (unsigned)llen, (unsigned)mlen); ++ DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", ++ (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ +@@ -1308,11 +1426,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,42 +1441,51 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return 
(size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1498,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1378,7 +1508,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + } + + size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); +@@ -1386,16 +1516,16 @@ size_t ZSTD_compressBlock_btultra( + } + + size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. 
++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,42 +1534,47 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..fbdc540ec9d1 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,40 +12,62 @@ + #ifndef ZSTD_OPT_H + #define ZSTD_OPT_H + +- + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + #endif /* ZSTD_OPT_H */ +diff --git 
a/lib/zstd/compress/zstd_preSplit.c b/lib/zstd/compress/zstd_preSplit.c +new file mode 100644 +index 000000000000..7d9403c9a3bc +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.c +@@ -0,0 +1,239 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#include "../common/compiler.h" /* ZSTD_ALIGNOF */ ++#include "../common/mem.h" /* S64 */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/zstd_internal.h" /* ZSTD_STATIC_ASSERT */ ++#include "hist.h" /* HIST_add */ ++#include "zstd_preSplit.h" ++ ++ ++#define BLOCKSIZE_MIN 3500 ++#define THRESHOLD_PENALTY_RATE 16 ++#define THRESHOLD_BASE (THRESHOLD_PENALTY_RATE - 2) ++#define THRESHOLD_PENALTY 3 ++ ++#define HASHLENGTH 2 ++#define HASHLOG_MAX 10 ++#define HASHTABLESIZE (1 << HASHLOG_MAX) ++#define HASHMASK (HASHTABLESIZE - 1) ++#define KNUTH 0x9e3779b9 ++ ++/* for hashLog > 8, hash 2 bytes. ++ * for hashLog == 8, just take the byte, no hashing. ++ * The speed of this method relies on compile-time constant propagation */ ++FORCE_INLINE_TEMPLATE unsigned hash2(const void *p, unsigned hashLog) ++{ ++ assert(hashLog >= 8); ++ if (hashLog == 8) return (U32)((const BYTE*)p)[0]; ++ assert(hashLog <= HASHLOG_MAX); ++ return (U32)(MEM_read16(p)) * KNUTH >> (32 - hashLog); ++} ++ ++ ++typedef struct { ++ unsigned events[HASHTABLESIZE]; ++ size_t nbEvents; ++} Fingerprint; ++typedef struct { ++ Fingerprint pastEvents; ++ Fingerprint newEvents; ++} FPStats; ++ ++static void initStats(FPStats* fpstats) ++{ ++ ZSTD_memset(fpstats, 0, sizeof(FPStats)); ++} ++ ++FORCE_INLINE_TEMPLATE void ++addEvents_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ const char* p = (const char*)src; ++ size_t limit = srcSize - HASHLENGTH + 1; ++ size_t n; ++ assert(srcSize >= HASHLENGTH); ++ for (n = 0; n < limit; n+=samplingRate) { ++ fp->events[hash2(p+n, hashLog)]++; ++ } ++ fp->nbEvents += limit/samplingRate; ++} ++ ++FORCE_INLINE_TEMPLATE void ++recordFingerprint_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ ZSTD_memset(fp, 0, sizeof(unsigned) * ((size_t)1 << hashLog)); ++ fp->nbEvents = 0; ++ addEvents_generic(fp, src, srcSize, samplingRate, hashLog); ++} ++ ++typedef void (*RecordEvents_f)(Fingerprint* fp, const void* src, size_t srcSize); ++ ++#define FP_RECORD(_rate) ZSTD_recordFingerprint_##_rate ++ ++#define ZSTD_GEN_RECORD_FINGERPRINT(_rate, _hSize) \ ++ static void FP_RECORD(_rate)(Fingerprint* fp, const void* src, size_t srcSize) \ ++ { \ ++ recordFingerprint_generic(fp, src, srcSize, _rate, _hSize); \ ++ } ++ ++ZSTD_GEN_RECORD_FINGERPRINT(1, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(5, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(11, 9) ++ZSTD_GEN_RECORD_FINGERPRINT(43, 8) ++ ++ ++static U64 abs64(S64 s64) { return (U64)((s64 < 0) ? 
-s64 : s64); } ++ ++static U64 fpDistance(const Fingerprint* fp1, const Fingerprint* fp2, unsigned hashLog) ++{ ++ U64 distance = 0; ++ size_t n; ++ assert(hashLog <= HASHLOG_MAX); ++ for (n = 0; n < ((size_t)1 << hashLog); n++) { ++ distance += ++ abs64((S64)fp1->events[n] * (S64)fp2->nbEvents - (S64)fp2->events[n] * (S64)fp1->nbEvents); ++ } ++ return distance; ++} ++ ++/* Compare newEvents with pastEvents ++ * return 1 when considered "too different" ++ */ ++static int compareFingerprints(const Fingerprint* ref, ++ const Fingerprint* newfp, ++ int penalty, ++ unsigned hashLog) ++{ ++ assert(ref->nbEvents > 0); ++ assert(newfp->nbEvents > 0); ++ { U64 p50 = (U64)ref->nbEvents * (U64)newfp->nbEvents; ++ U64 deviation = fpDistance(ref, newfp, hashLog); ++ U64 threshold = p50 * (U64)(THRESHOLD_BASE + penalty) / THRESHOLD_PENALTY_RATE; ++ return deviation >= threshold; ++ } ++} ++ ++static void mergeEvents(Fingerprint* acc, const Fingerprint* newfp) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ acc->events[n] += newfp->events[n]; ++ } ++ acc->nbEvents += newfp->nbEvents; ++} ++ ++static void flushEvents(FPStats* fpstats) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ fpstats->pastEvents.events[n] = fpstats->newEvents.events[n]; ++ } ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents; ++ ZSTD_memset(&fpstats->newEvents, 0, sizeof(fpstats->newEvents)); ++} ++ ++static void removeEvents(Fingerprint* acc, const Fingerprint* slice) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ assert(acc->events[n] >= slice->events[n]); ++ acc->events[n] -= slice->events[n]; ++ } ++ acc->nbEvents -= slice->nbEvents; ++} ++ ++#define CHUNKSIZE (8 << 10) ++static size_t ZSTD_splitBlock_byChunks(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ static const RecordEvents_f records_fs[] = { ++ FP_RECORD(43), FP_RECORD(11), FP_RECORD(5), FP_RECORD(1) ++ }; ++ static const unsigned hashParams[] = { 8, 9, 10, 10 }; ++ const RecordEvents_f record_f = (assert(0<=level && level<=3), records_fs[level]); ++ FPStats* const fpstats = (FPStats*)workspace; ++ const char* p = (const char*)blockStart; ++ int penalty = THRESHOLD_PENALTY; ++ size_t pos = 0; ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ record_f(&fpstats->pastEvents, p, CHUNKSIZE); ++ for (pos = CHUNKSIZE; pos <= blockSize - CHUNKSIZE; pos += CHUNKSIZE) { ++ record_f(&fpstats->newEvents, p + pos, CHUNKSIZE); ++ if (compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, penalty, hashParams[level])) { ++ return pos; ++ } else { ++ mergeEvents(&fpstats->pastEvents, &fpstats->newEvents); ++ if (penalty > 0) penalty--; ++ } ++ } ++ assert(pos == blockSize); ++ return blockSize; ++ (void)flushEvents; (void)removeEvents; ++} ++ ++/* ZSTD_splitBlock_fromBorders(): very fast strategy : ++ * compare fingerprint from beginning and end of the block, ++ * derive from their difference if it's preferable to split in the middle, ++ * repeat the process a second time, for finer grained decision. ++ * 3 times did not brought improvements, so I stopped at 2. ++ * Benefits are good enough for a cheap heuristic. ++ * More accurate splitting saves more, but speed impact is also more perceptible. 
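
Illustration only, not part of the patch: fpDistance() above compares two event histograms after cross-scaling each bucket by the other histogram's total, so segments of different lengths stay comparable, and compareFingerprints() flags a split when that deviation reaches a fraction of nbEvents1 * nbEvents2. A standalone sketch of the same idea, using plain byte histograms and leaving the penalty term out; all names here are invented.

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint32_t events[256]; uint64_t nbEvents; } Hist;

static void histAdd(Hist* h, const void* src, size_t size)
{
    const unsigned char* p = (const unsigned char*)src;
    size_t n;
    for (n = 0; n < size; n++) h->events[p[n]]++;
    h->nbEvents += size;
}

/* sum over buckets of |e1[n]*N2 - e2[n]*N1|, as in fpDistance() */
static uint64_t histDistance(const Hist* a, const Hist* b)
{
    uint64_t d = 0;
    int n;
    for (n = 0; n < 256; n++) {
        int64_t const diff = (int64_t)a->events[n] * (int64_t)b->nbEvents
                           - (int64_t)b->events[n] * (int64_t)a->nbEvents;
        d += (uint64_t)(diff < 0 ? -diff : diff);
    }
    return d;
}

int main(void)
{
    unsigned char text[512], zeros[512];
    Hist h1, h2;
    size_t i;
    for (i = 0; i < sizeof(text); i++) text[i] = (unsigned char)('a' + (i % 26));
    memset(zeros, 0, sizeof(zeros));
    memset(&h1, 0, sizeof(h1));
    memset(&h2, 0, sizeof(h2));
    histAdd(&h1, text, sizeof(text));
    histAdd(&h2, zeros, sizeof(zeros));
    /* threshold mirrors compareFingerprints() with penalty == 0: 14/16 of N1*N2 */
    printf("distance=%llu split=%d\n",
           (unsigned long long)histDistance(&h1, &h2),
           histDistance(&h1, &h2) >= h1.nbEvents * h2.nbEvents * 14 / 16);
    return 0;
}
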
++ * For better accuracy, use more elaborate variant *_byChunks. ++ */ ++static size_t ZSTD_splitBlock_fromBorders(const void* blockStart, size_t blockSize, ++ void* workspace, size_t wkspSize) ++{ ++#define SEGMENT_SIZE 512 ++ FPStats* const fpstats = (FPStats*)workspace; ++ Fingerprint* middleEvents = (Fingerprint*)(void*)((char*)workspace + 512 * sizeof(unsigned)); ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ HIST_add(fpstats->pastEvents.events, blockStart, SEGMENT_SIZE); ++ HIST_add(fpstats->newEvents.events, (const char*)blockStart + blockSize - SEGMENT_SIZE, SEGMENT_SIZE); ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents = SEGMENT_SIZE; ++ if (!compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, 0, 8)) ++ return blockSize; ++ ++ HIST_add(middleEvents->events, (const char*)blockStart + blockSize/2 - SEGMENT_SIZE/2, SEGMENT_SIZE); ++ middleEvents->nbEvents = SEGMENT_SIZE; ++ { U64 const distFromBegin = fpDistance(&fpstats->pastEvents, middleEvents, 8); ++ U64 const distFromEnd = fpDistance(&fpstats->newEvents, middleEvents, 8); ++ U64 const minDistance = SEGMENT_SIZE * SEGMENT_SIZE / 3; ++ if (abs64((S64)distFromBegin - (S64)distFromEnd) < minDistance) ++ return 64 KB; ++ return (distFromBegin > distFromEnd) ? 32 KB : 96 KB; ++ } ++} ++ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level); ++ assert(0<=level && level<=4); ++ if (level == 0) ++ return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize); ++ /* level >= 1*/ ++ return ZSTD_splitBlock_byChunks(blockStart, blockSize, level-1, workspace, wkspSize); ++} +diff --git a/lib/zstd/compress/zstd_preSplit.h b/lib/zstd/compress/zstd_preSplit.h +new file mode 100644 +index 000000000000..f98f797fe191 +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.h +@@ -0,0 +1,34 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#ifndef ZSTD_PRESPLIT_H ++#define ZSTD_PRESPLIT_H ++ ++#include /* size_t */ ++ ++#define ZSTD_SLIPBLOCK_WORKSPACESIZE 8208 ++ ++/* ZSTD_splitBlock(): ++ * @level must be a value between 0 and 4. ++ * higher levels spend more energy to detect block boundaries. ++ * @workspace must be aligned for size_t. ++ * @wkspSize must be at least >= ZSTD_SLIPBLOCK_WORKSPACESIZE ++ * note: ++ * For the time being, this function only accepts full 128 KB blocks. ++ * Therefore, @blockSize must be == 128 KB. ++ * While this could be extended to smaller sizes in the future, ++ * it is not yet clear if this would be useful. TBD. 
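
Illustration only, not part of the patch: a hypothetical call site for the contract documented just above. ZSTD_splitBlock() is an internal helper, so the caller name and the workspace handling here are invented; only the documented constraints are assumed, namely a full 128 KB block, a level in 0..4, and a suitably aligned workspace of at least ZSTD_SLIPBLOCK_WORKSPACESIZE (8208) bytes.

#include <stddef.h>

size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize,
                       int level,
                       void* workspace, size_t wkspSize);

/* hypothetical helper: returns where a 128 KB block should be cut */
size_t pickSplitPoint(const void* block128K)
{
    /* a size_t array satisfies the documented alignment requirement */
    static size_t wksp[(8208 + sizeof(size_t) - 1) / sizeof(size_t)];  /* ZSTD_SLIPBLOCK_WORKSPACESIZE */
    return ZSTD_splitBlock(block128K, 128 * 1024, 0 /* cheapest heuristic */,
                           wksp, sizeof(wksp));
}
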
++ */ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize); ++ ++#endif /* ZSTD_PRESPLIT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. 
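For readers unfamiliar with this pattern: the hot loop is compiled with a per-function target attribute, and the call is gated on a runtime flag so the attributed code never executes on CPUs that lack the extension. A hedged sketch in plain C (the attribute is gcc/clang specific; the flag names below are invented, the real ones are HUF_flags_bmi2 and HUF_flags_disableFast):

#include <stdio.h>

#if defined(__GNUC__) && defined(__x86_64__)
#  define FAST_BMI2_ATTRS __attribute__((target("bmi2")))
#else
#  define FAST_BMI2_ATTRS            /* attribute only exists for x86-64 gcc/clang */
#endif

enum { FLAG_BMI2 = 1, FLAG_DISABLE_FAST = 2 };

static FAST_BMI2_ATTRS unsigned decode_fast(unsigned x)     { return x * 2u; }  /* placeholder body */
static unsigned                 decode_fallback(unsigned x) { return x + x;  }  /* portable fallback */

static unsigned decode(unsigned x, int flags)
{
    /* The fast body is only entered when the caller's flags say the CPU has
     * BMI2 and the fast path is not disabled; otherwise take the fallback. */
    if ((flags & FLAG_BMI2) && !(flags & FLAG_DISABLE_FAST))
        return decode_fast(x);
    return decode_fallback(x);
}

int main(void) { printf("%u\n", decode(21, 0)); return 0; }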
++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. 
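The init above is easiest to see with a concrete container: OR a 1 into the lowest bit, and every left shift that consumes bits from the MSB side leaves an equal number of trailing zeros at the bottom, so one count-trailing-zeros later recovers the total consumption and lets the loop rewind the input pointer by whole bytes. A toy demonstration (little-endian host assumed, __builtin_ctzll standing in for ZSTD_countTrailingZeros64, data is made up):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

static uint64_t read_le64(const uint8_t* p) { uint64_t v; memcpy(&v, p, 8); return v; } /* host assumed little-endian */

int main(void)
{
    uint8_t buf[16] = { 0x11,0x22,0x33,0x44,0x55,0x66,0x77,0x88,
                        0x99,0xaa,0xbb,0xcc,0xdd,0xee,0xff,0x01 };
    const uint8_t* ip = buf + 8;              /* container currently loaded from buf[8..15] */
    uint64_t bits = read_le64(ip) | 1;        /* sentinel in the lowest bit */
    int consumed = 0;

    /* pretend the decoder consumed 11 + 9 + 7 = 27 bits from the MSB side */
    bits <<= 11; consumed += 11;
    bits <<=  9; consumed +=  9;
    bits <<=  7; consumed +=  7;

    {   int const ctz     = __builtin_ctzll(bits);  /* == consumed, thanks to the sentinel */
        int const nbBits  = ctz & 7;                /* bits consumed past the last byte boundary */
        int const nbBytes = ctz >> 3;               /* whole bytes consumed */
        printf("ctz=%d (expected %d)\n", ctz, consumed);
        ip  -= nbBytes;                             /* rewind, then reload a full 8 bytes */
        bits = read_le64(ip) | 1;
        bits <<= nbBits;                            /* restore the sub-byte alignment */
        printf("reloaded from offset %d, %d bits pre-shifted\n", (int)(ip - buf), nbBits);
    }
    return 0;
}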
+- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). 
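The layout being parsed here is worth spelling out: the four Huffman streams are stored back to back, prefixed by a 6-byte jump table holding three little-endian 16-bit lengths, the fourth length being whatever remains. A self-contained sketch of that parsing and its bounds checks (helper names and buffer contents are invented):

#include <stddef.h>
#include <stdint.h>
#include <stdio.h>

static unsigned read_le16(const uint8_t* p) { return (unsigned)p[0] | ((unsigned)p[1] << 8); }

/* Fills start[4]/len[4]; returns 0 on success, -1 on corruption. */
static int split4(const uint8_t* src, size_t srcSize,
                  const uint8_t* start[4], size_t len[4])
{
    if (srcSize < 10) return -1;              /* jump table + at least 1 byte per stream */
    len[0] = read_le16(src);
    len[1] = read_le16(src + 2);
    len[2] = read_le16(src + 4);
    if (6 + len[0] + len[1] + len[2] > srcSize) return -1;   /* stream 4 would underflow */
    len[3] = srcSize - 6 - len[0] - len[1] - len[2];
    start[0] = src + 6;
    start[1] = start[0] + len[0];
    start[2] = start[1] + len[1];
    start[3] = start[2] + len[2];
    return 0;
}

int main(void)
{
    uint8_t blob[10] = { 1,0, 1,0, 1,0, 'a','b','c','d' };   /* three 1-byte streams + 1 byte left over */
    const uint8_t* start[4]; size_t len[4];
    if (split4(blob, sizeof(blob), start, len) == 0)
        printf("lengths: %zu %zu %zu %zu\n", len[0], len[1], len[2], len[3]);
    return 0;
}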
+- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) 
return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
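The point of olimit is to hoist all bounds checks out of the unrolled loop: compute once how many iterations are guaranteed safe given the remaining output and the input headroom, then run check-free until that budget is spent. A minimal sketch of that budget computation, with the X1 constants (5 symbols produced, at most 7 bytes consumed per stream per iteration); everything else is illustrative:

#include <stddef.h>
#include <stdio.h>

#define SYMBOLS_PER_ITER   5   /* bytes written per stream per iteration */
#define MAX_BYTES_PER_ITER 7   /* worst-case bytes read per stream per iteration */

static size_t min_sz(size_t a, size_t b) { return a < b ? a : b; }

/* outRemaining: bytes left in the last (smallest) output segment.
 * inHeadroom:   distance from the first stream's read pointer down to the
 *               lowest address the decoders are allowed to touch (ilowest). */
static size_t safe_iterations(size_t outRemaining, size_t inHeadroom)
{
    size_t const oiters = outRemaining / SYMBOLS_PER_ITER;
    size_t const iiters = inHeadroom   / MAX_BYTES_PER_ITER;
    return min_sz(oiters, iiters);
}

int main(void)
{
    /* e.g. 1000 output bytes left but only 100 bytes of input headroom:
     * the input side is the binding constraint here. */
    printf("%zu safe iterations\n", safe_iterations(1000, 100));
    return 0;
}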
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. 
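Each HUF_4X1_DECODE_SYMBOL step is a single lookup: the top 11 bits of the container index a table whose 16-bit entries pack the decoded byte in bits 8..15 and the bit cost in the low bits. The following toy builds such a table for a degenerate 1-bit code (0 decodes to 'A', 1 to 'B'; real tables come from HUF_readDTableX1_wksp) and decodes a few symbols with exactly the index/shift arithmetic used above:

#include <stdint.h>
#include <stdio.h>

#define TABLE_LOG 11

int main(void)
{
    static uint16_t dtable[1u << TABLE_LOG];
    uint64_t bits;
    int i;

    /* a 1-bit symbol owns half of the 2048 entries: the top bit of the
     * 11-bit index equals the first bit of the stream */
    for (i = 0; i < (1 << TABLE_LOG); i++) {
        uint8_t const symbol = (i >> (TABLE_LOG - 1)) ? 'B' : 'A';
        dtable[i] = (uint16_t)((symbol << 8) | 1 /* nbBits */);
    }

    /* container holds the bit sequence 0,1,1,0,... starting at the MSB */
    bits = 0x6ull << 60;
    for (i = 0; i < 4; i++) {
        unsigned const index = (unsigned)(bits >> (64 - TABLE_LOG)); /* bits >> 53 in the real loop */
        unsigned const entry = dtable[index];
        putchar((int)((entry >> 8) & 0xFF));
        bits <<= (entry & 0x3F);   /* consume nbBits */
    }
    putchar('\n');   /* prints ABBA */
    return 0;
}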
+ */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. */ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- 
return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
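The X2 variant always stores two bytes per lookup but advances the output pointer only by entry.length, so the second byte of a single-symbol entry is simply overwritten by the next write. A minimal illustration of that emit step with hand-built entries (real entries come from HUF_readDTableX2_wksp, and the nbBits fields are unused in this toy):

#include <stdint.h>
#include <stdio.h>
#include <string.h>

typedef struct { uint16_t sequence; uint8_t nbBits; uint8_t length; } EltX2;

int main(void)
{
    /* "ab" as one 2-symbol entry, then 'c' as a 1-symbol entry */
    EltX2 e_ab, e_c;
    uint8_t out[8] = {0};
    uint8_t* op = out;

    memcpy(&e_ab.sequence, "ab", 2); e_ab.nbBits = 9; e_ab.length = 2;
    memcpy(&e_c.sequence,  "c?", 2); e_c.nbBits  = 5; e_c.length  = 1;

    memcpy(op, &e_ab.sequence, 2); op += e_ab.length;  /* writes "ab", advances 2 */
    memcpy(op, &e_c.sequence,  2); op += e_c.length;   /* writes "c?", advances 1 */
    memcpy(op, &e_ab.sequence, 2); op += e_ab.length;  /* overwrites the '?' with "ab" */

    printf("%.*s\n", (int)(op - out), out);            /* prints "abcab" */
    return 0;
}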
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..da8b4cf116e3 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) ++** or an error code, which can be tested using ZSTD_isError() */ ++size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, 
"ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -438,8 +468,10 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); +- zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; ++ zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; ++ zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; ++ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + return 0; + } + RETURN_ERROR(prefix_unknown, ""); +@@ -508,7 +540,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) ++size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) + { + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); + } +@@ -520,7 +552,7 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ + unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) + { +- { ZSTD_frameHeader zfh; ++ { ZSTD_FrameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { +@@ -540,61 +572,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. + */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible 
with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +635,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +709,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -693,10 +726,10 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,28 +763,31 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : + * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame or a skippeable frame ++ * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +@@ -760,7 +796,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +809,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_FrameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -815,7 +893,7 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + return regenSize; + } + +-static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) ++static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) + { + (void)dctx; + (void)uncompressedSize; +@@ -856,6 +934,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +970,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +984,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1015,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1043,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ 
FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1150,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1270,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1340,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1353,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1394,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1494,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, 
repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1557,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1566,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1673,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1683,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1694,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1766,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1815,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1854,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case 
ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1873,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1890,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. 
++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1908,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1793,7 +1924,7 @@ size_t ZSTD_estimateDStreamSize(size_t windowSize) + size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) + { + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); +@@ -1888,6 +2019,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); ++ assert(zds != NULL); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, +@@ -1918,7 +2050,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2063,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2079,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2106,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2123,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2174,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2189,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2198,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2212,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2236,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2249,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2287,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..710eb0ffd5a3 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. 
++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -124,7 +140,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; +- symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -317,7 +359,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables +- * - pretify output, report below, test with fuzzer to ensure it's correct */ ++ * - prettify output, report below, test with fuzzer to ensure it's correct */ + + /* Default FSE distribution table for Literal Lengths */ + static const ZSTD_seqSymbol LL_defaultDTable[(1<=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. 
+- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,11 +719,19 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ +- { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); +- symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); +- symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ ++ { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); ++ SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); ++ SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; + op += leftoverLit; + } + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; i<ZSTD_REP_NUM; i++) dctx->entropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +-
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,50 +1928,40 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + #endif /* DYNAMIC_BMI2 */ + +-typedef size_t (*ZSTD_decompressSequences_t)( +- ZSTD_DCtx* dctx, +- void* dst, size_t maxDstSize, +- const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); +- + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const 
ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1976,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2091,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2115,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2127,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2187,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..2a225d1811c4 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -135,7 +137,7 @@ struct ZSTD_DCtx_s + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; +- ZSTD_frameHeader fParams; ++ ZSTD_FrameHeader fParams; + U64 processedCSize; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ +@@ -152,7 +154,8 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; +-#if DYNAMIC_BMI2 != 0 ++ int isFrameDecompression; ++#if DYNAMIC_BMI2 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif + +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. 
Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +@@ -199,11 +204,11 @@ struct ZSTD_DCtx_s + }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +-#if DYNAMIC_BMI2 != 0 +- return dctx->bmi2; ++#if DYNAMIC_BMI2 ++ return dctx->bmi2; + #else + (void)dctx; +- return 0; ++ return 0; + #endif + } + +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index bd8784449b31..a788ebfcb111 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,6 +16,7 @@ + + #include "common/zstd_deps.h" + #include "common/zstd_internal.h" ++#include "compress/zstd_compress_internal.h" + + #define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ +@@ -85,6 +86,12 @@ zstd_parameters zstd_get_params(int level, + } + EXPORT_SYMBOL(zstd_get_params); + ++size_t zstd_cctx_set_param(zstd_cctx *cctx, ZSTD_cParameter param, int value) ++{ ++ return ZSTD_CCtx_setParameter(cctx, param, value); ++} ++EXPORT_SYMBOL(zstd_cctx_set_param); ++ + zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size) + { +@@ -98,6 +105,52 @@ size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) + } + EXPORT_SYMBOL(zstd_cctx_workspace_bound); + ++// Used by zstd_cctx_workspace_bound_with_ext_seq_prod() ++static size_t dummy_external_sequence_producer( ++ void *sequenceProducerState, ++ ZSTD_Sequence *outSeqs, size_t outSeqsCapacity, ++ const void *src, size_t srcSize, ++ const void *dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize) ++{ ++ (void)sequenceProducerState; ++ (void)outSeqs; (void)outSeqsCapacity; ++ (void)src; (void)srcSize; ++ (void)dict; (void)dictSize; ++ (void)compressionLevel; ++ (void)windowSize; ++ return ZSTD_SEQUENCE_PRODUCER_ERROR; ++} ++ ++static void init_cctx_params_from_compress_params( ++ ZSTD_CCtx_params *cctx_params, ++ const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_parameters zstd_params; ++ memset(&zstd_params, 0, sizeof(zstd_params)); ++ zstd_params.cParams = *compress_params; ++ ZSTD_CCtxParams_init_advanced(cctx_params, zstd_params); ++} ++ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCCtxSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cctx_workspace_bound_with_ext_seq_prod); ++ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCStreamSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cstream_workspace_bound_with_ext_seq_prod); ++ + zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) + { + if (workspace == NULL) +@@ -209,5 +262,25 @@ size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output) + } + EXPORT_SYMBOL(zstd_end_stream); + ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++) { ++ ZSTD_registerSequenceProducer(cctx, sequence_producer_state, sequence_producer); ++} ++EXPORT_SYMBOL(zstd_register_sequence_producer); ++ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size) ++{ ++ return ZSTD_compressSequencesAndLiterals(cctx, dst, dst_capacity, in_seqs, ++ in_seqs_size, literals, lit_size, ++ lit_capacity, decompressed_size); ++} 
++EXPORT_SYMBOL(zstd_compress_sequences_and_literals); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Compressor"); +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index 469fc3059be0..0ae819f0c927 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.48.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.14/0001-bore.patch b/sys-kernel/gentoo-sources-6.14/0001-bore.patch new file mode 100644 index 0000000..82b3dac --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0001-bore.patch @@ -0,0 +1,1006 @@ +From 9c32e28fe484288e6ba87efd34914c1dcb3f3150 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Mon, 2 Jun 2025 20:44:49 +0200 +Subject: [PATCH] bore + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 18 ++ + include/linux/sched/bore.h | 40 ++++ + init/Kconfig | 17 ++ + kernel/Kconfig.hz | 17 ++ + kernel/fork.c | 6 + + kernel/sched/Makefile | 1 + + kernel/sched/bore.c | 443 +++++++++++++++++++++++++++++++++++++ + kernel/sched/core.c | 6 + + kernel/sched/debug.c | 61 ++++- + kernel/sched/fair.c | 73 +++++- + kernel/sched/sched.h | 9 + + 11 files changed, 686 insertions(+), 5 deletions(-) + create mode 100644 include/linux/sched/bore.h + create mode 100644 kernel/sched/bore.c + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 6e5c38718..77ac55985 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -550,6 +550,15 @@ struct sched_statistics { + #endif /* CONFIG_SCHEDSTATS */ + } ____cacheline_aligned; + ++#ifdef CONFIG_SCHED_BORE ++struct sched_burst_cache { ++ u8 score; ++ u32 count; ++ u64 timestamp; ++ spinlock_t lock; ++}; ++#endif // CONFIG_SCHED_BORE ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; +@@ -569,6 +578,15 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 burst_time; ++ u8 prev_burst_penalty; ++ u8 curr_burst_penalty; ++ u8 burst_penalty; ++ u8 burst_score; ++ struct sched_burst_cache child_burst; ++ struct sched_burst_cache group_burst; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; + +diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h +new file mode 100644 +index 000000000..a8faabc28 +--- /dev/null ++++ b/include/linux/sched/bore.h +@@ -0,0 +1,40 @@ ++ ++#include ++#include ++ ++#ifndef _LINUX_SCHED_BORE_H ++#define _LINUX_SCHED_BORE_H ++#define SCHED_BORE_VERSION "5.9.6" ++ ++#ifdef CONFIG_SCHED_BORE ++extern u8 __read_mostly sched_bore; ++extern u8 __read_mostly sched_burst_exclude_kthreads; ++extern u8 __read_mostly sched_burst_smoothness_long; ++extern u8 __read_mostly sched_burst_smoothness_short; ++extern u8 __read_mostly sched_burst_fork_atavistic; ++extern u8 __read_mostly sched_burst_parity_threshold; ++extern u8 __read_mostly sched_burst_penalty_offset; ++extern uint __read_mostly sched_burst_penalty_scale; ++extern uint __read_mostly sched_burst_cache_stop_count; ++extern uint 
__read_mostly sched_burst_cache_lifetime; ++extern uint __read_mostly sched_deadline_boost_mask; ++ ++extern void update_burst_score(struct sched_entity *se); ++extern void update_burst_penalty(struct sched_entity *se); ++ ++extern void restart_burst(struct sched_entity *se); ++extern void restart_burst_rescale_deadline(struct sched_entity *se); ++ ++extern int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++extern void sched_clone_bore( ++ struct task_struct *p, struct task_struct *parent, u64 clone_flags, u64 now); ++ ++extern void reset_task_bore(struct task_struct *p); ++extern void sched_bore_init(void); ++ ++extern void reweight_entity( ++ struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); ++#endif // CONFIG_SCHED_BORE ++#endif // _LINUX_SCHED_BORE_H +diff --git a/init/Kconfig b/init/Kconfig +index 522fac299..13a48166e 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1363,6 +1363,23 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d068..253c566b5 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,22 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = ++ 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. ++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/fork.c b/kernel/fork.c +index 5e640468b..235fe18fe 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -112,6 +112,8 @@ + #include + #include + ++#include ++ + #include + + #define CREATE_TRACE_POINTS +@@ -2529,6 +2531,10 @@ __latent_entropy struct task_struct *copy_process( + p->start_time = ktime_get_ns(); + p->start_boottime = ktime_get_boottime_ns(); + ++#ifdef CONFIG_SCHED_BORE ++ if (likely(p->pid)) ++ sched_clone_bore(p, current, clone_flags, p->start_time); ++#endif // CONFIG_SCHED_BORE + /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! 
+diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 8ae86371d..ab9ad886a 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -37,3 +37,4 @@ obj-y += core.o + obj-y += fair.o + obj-y += build_policy.o + obj-y += build_utility.o ++obj-y += bore.o +diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c +new file mode 100644 +index 000000000..23aeb5649 +--- /dev/null ++++ b/kernel/sched/bore.c +@@ -0,0 +1,443 @@ ++/* ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2024 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include "sched.h" ++ ++#ifdef CONFIG_SCHED_BORE ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_exclude_kthreads = 1; ++u8 __read_mostly sched_burst_smoothness_long = 1; ++u8 __read_mostly sched_burst_smoothness_short = 0; ++u8 __read_mostly sched_burst_fork_atavistic = 2; ++u8 __read_mostly sched_burst_parity_threshold = 2; ++u8 __read_mostly sched_burst_penalty_offset = 24; ++uint __read_mostly sched_burst_penalty_scale = 1280; ++uint __read_mostly sched_burst_cache_stop_count = 64; ++uint __read_mostly sched_burst_cache_lifetime = 75000000; ++uint __read_mostly sched_deadline_boost_mask = ENQUEUE_INITIAL ++ | ENQUEUE_WAKEUP; ++static int __maybe_unused sixty_four = 64; ++static int __maybe_unused maxval_u8 = 255; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define MAX_BURST_PENALTY (39U <<2) ++ ++static inline u32 log2plus1_u64_u32f8(u64 v) { ++ u32 integral = fls64(v); ++ u8 fractional = v << (64 - integral) >> 55; ++ return integral << 8 | fractional; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2plus1_u64_u32f8(burst_time); ++ tolerance = sched_burst_penalty_offset << 8; ++ penalty = max(0, (s32)(greed - tolerance)); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; ++ ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static inline u64 __scale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22);} ++ ++static inline u64 __unscale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10);} ++ ++static void reweight_task_by_prio(struct task_struct *p, int prio) { ++ struct sched_entity *se = &p->se; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ reweight_entity(cfs_rq_of(se), se, weight); ++ se->load.inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++static inline u8 effective_prio(struct task_struct *p) { ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ if (likely(sched_bore)) ++ prio += p->se.burst_score; ++ return min(39, prio); ++} ++ ++void update_burst_score(struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ ++ u8 burst_score = 0; ++ if (!((p->flags & PF_KTHREAD) && likely(sched_burst_exclude_kthreads))) ++ burst_score = se->burst_penalty >> 2; ++ se->burst_score = burst_score; ++ ++ u8 new_prio = effective_prio(p); ++ if (new_prio != prev_prio) ++ reweight_task_by_prio(p, new_prio); ++} ++ ++void update_burst_penalty(struct sched_entity *se) { ++ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); ++ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); ++ update_burst_score(se); ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old) { ++ int increment = new - old; ++ return (0 <= increment)? 
++ old + ( increment >> (int)sched_burst_smoothness_long): ++ old - (-increment >> (int)sched_burst_smoothness_short); ++} ++ ++static void revolve_burst_penalty(struct sched_entity *se) { ++ se->prev_burst_penalty = ++ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); ++ se->burst_time = 0; ++ se->curr_burst_penalty = 0; ++} ++ ++inline void restart_burst(struct sched_entity *se) { ++ revolve_burst_penalty(se); ++ se->burst_penalty = se->prev_burst_penalty; ++ update_burst_score(se); ++} ++ ++void restart_burst_rescale_deadline(struct sched_entity *se) { ++ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ restart_burst(se); ++ u8 new_prio = effective_prio(p); ++ if (prev_prio > new_prio) { ++ wremain = __unscale_slice(abs(vremain), prev_prio); ++ vscaled = __scale_slice(wremain, new_prio); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++ ++static inline bool task_is_bore_eligible(struct task_struct *p) ++{return p && p->sched_class == &fair_sched_class && !p->exit_state;} ++ ++static void reset_task_weights_bore(void) { ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ write_lock_irq(&tasklist_lock); ++ for_each_process(task) { ++ if (!task_is_bore_eligible(task)) continue; ++ rq = task_rq(task); ++ rq_pin_lock(rq, &rf); ++ update_rq_clock(rq); ++ reweight_task_by_prio(task, effective_prio(task)); ++ rq_unpin_lock(rq, &rf); ++ } ++ write_unlock_irq(&tasklist_lock); ++} ++ ++int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ reset_task_weights_bore(); ++ ++ return 0; ++} ++ ++#define for_each_child(p, t) \ ++ list_for_each_entry(t, &(p)->children, sibling) ++ ++static u32 count_entries_upto2(struct list_head *head) { ++ struct list_head *next = head->next; ++ return (next != head) + (next->next != head); ++} ++ ++static inline void init_task_burst_cache_lock(struct task_struct *p) { ++ spin_lock_init(&p->se.child_burst.lock); ++ spin_lock_init(&p->se.group_burst.lock); ++} ++ ++static inline bool burst_cache_expired(struct sched_burst_cache *bc, u64 now) ++{return (s64)(bc->timestamp + sched_burst_cache_lifetime - now) < 0;} ++ ++static void update_burst_cache(struct sched_burst_cache *bc, ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u8 avg = cnt ? 
sum / cnt : 0; ++ bc->score = max(avg, p->se.burst_penalty); ++ bc->count = cnt; ++ bc->timestamp = now; ++} ++ ++static inline void update_child_burst_direct(struct task_struct *p, u64 now) { ++ u32 cnt = 0, sum = 0; ++ struct task_struct *child; ++ ++ for_each_child(p, child) { ++ if (!task_is_bore_eligible(child)) continue; ++ cnt++; ++ sum += child->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++} ++ ++static inline u8 inherit_burst_direct( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *parent = p; ++ struct sched_burst_cache *bc; ++ ++ if (clone_flags & CLONE_PARENT) ++ parent = parent->real_parent; ++ ++ bc = &parent->se.child_burst; ++ guard(spinlock)(&bc->lock); ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_direct(parent, now); ++ ++ return bc->score; ++} ++ ++static void update_child_burst_topological( ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { ++ u32 cnt = 0, dcnt = 0, sum = 0; ++ struct task_struct *child, *dec; ++ struct sched_burst_cache *bc __maybe_unused; ++ ++ for_each_child(p, child) { ++ dec = child; ++ while ((dcnt = count_entries_upto2(&dec->children)) == 1) ++ dec = list_first_entry(&dec->children, struct task_struct, sibling); ++ ++ if (!dcnt || !depth) { ++ if (!task_is_bore_eligible(dec)) continue; ++ cnt++; ++ sum += dec->se.burst_penalty; ++ continue; ++ } ++ bc = &dec->se.child_burst; ++ spin_lock(&bc->lock); ++ if (!burst_cache_expired(bc, now)) { ++ cnt += bc->count; ++ sum += (u32)bc->score * bc->count; ++ if (sched_burst_cache_stop_count <= cnt) { ++ spin_unlock(&bc->lock); ++ break; ++ } ++ spin_unlock(&bc->lock); ++ continue; ++ } ++ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); ++ spin_unlock(&bc->lock); ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++ *acnt += cnt; ++ *asum += sum; ++} ++ ++static inline u8 inherit_burst_topological( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *anc = p; ++ struct sched_burst_cache *bc; ++ u32 cnt = 0, sum = 0; ++ u32 base_child_cnt = 0; ++ ++ if (clone_flags & CLONE_PARENT) { ++ anc = anc->real_parent; ++ base_child_cnt = 1; ++ } ++ ++ for (struct task_struct *next; ++ anc != (next = anc->real_parent) && ++ count_entries_upto2(&anc->children) <= base_child_cnt;) { ++ anc = next; ++ base_child_cnt = 1; ++ } ++ ++ bc = &anc->se.child_burst; ++ guard(spinlock)(&bc->lock); ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_topological( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ ++ return bc->score; ++} ++ ++static inline void update_tg_burst(struct task_struct *p, u64 now) { ++ struct task_struct *task; ++ u32 cnt = 0, sum = 0; ++ ++ for_each_thread(p, task) { ++ if (!task_is_bore_eligible(task)) continue; ++ cnt++; ++ sum += task->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.group_burst, p, cnt, sum, now); ++} ++ ++static inline u8 inherit_burst_tg(struct task_struct *p, u64 now) { ++ struct task_struct *parent = rcu_dereference(p->group_leader); ++ struct sched_burst_cache *bc = &parent->se.group_burst; ++ guard(spinlock)(&bc->lock); ++ if (burst_cache_expired(bc, now)) ++ update_tg_burst(parent, now); ++ ++ return bc->score; ++} ++ ++void sched_clone_bore(struct task_struct *p, ++ struct task_struct *parent, u64 clone_flags, u64 now) { ++ struct sched_entity *se = &p->se; ++ u8 penalty; ++ ++ init_task_burst_cache_lock(p); ++ ++ if (!task_is_bore_eligible(p)) return; ++ ++ if (clone_flags & CLONE_THREAD) 
{ ++ rcu_read_lock(); ++ penalty = inherit_burst_tg(parent, now); ++ rcu_read_unlock(); ++ } else { ++ read_lock(&tasklist_lock); ++ penalty = likely(sched_burst_fork_atavistic) ? ++ inherit_burst_topological(parent, now, clone_flags): ++ inherit_burst_direct(parent, now, clone_flags); ++ read_unlock(&tasklist_lock); ++ } ++ ++ revolve_burst_penalty(se); ++ se->burst_penalty = se->prev_burst_penalty = ++ max(se->prev_burst_penalty, penalty); ++ se->child_burst.timestamp = 0; ++ se->group_burst.timestamp = 0; ++} ++ ++void reset_task_bore(struct task_struct *p) { ++ p->se.burst_time = 0; ++ p->se.prev_burst_penalty = 0; ++ p->se.curr_burst_penalty = 0; ++ p->se.burst_penalty = 0; ++ p->se.burst_score = 0; ++ memset(&p->se.child_burst, 0, sizeof(struct sched_burst_cache)); ++ memset(&p->se.group_burst, 0, sizeof(struct sched_burst_cache)); ++} ++ ++void __init sched_bore_init(void) { ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification %s by Masahito Suzuki", SCHED_BORE_VERSION); ++ reset_task_bore(&init_task); ++ init_task_burst_cache_lock(&init_task); ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table sched_bore_sysctls[] = { ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_bore_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_exclude_kthreads", ++ .data = &sched_burst_exclude_kthreads, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_long", ++ .data = &sched_burst_smoothness_long, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_short", ++ .data = &sched_burst_smoothness_short, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_fork_atavistic", ++ .data = &sched_burst_fork_atavistic, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_parity_threshold", ++ .data = &sched_burst_parity_threshold, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_u8, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_stop_count", ++ .data = &sched_burst_cache_stop_count, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++}; ++ 
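To make the fixed-point arithmetic in log2plus1_u64_u32f8() and calc_burst_penalty() above concrete, here is a standalone rendering of the same math with the default tunables (penalty offset 24, scale 1280, MAX_BURST_PENALTY = 39 << 2). The 100 ms burst length is an arbitrary illustrative input, and fls64() is approximated with __builtin_clzll() for userspace:

#include <stdint.h>
#include <stdio.h>

/* Userspace stand-in for the kernel's fls64(); valid for v != 0. */
static uint32_t fls64_user(uint64_t v)
{
	return 64 - __builtin_clzll(v);
}

static uint32_t log2plus1_u64_u32f8(uint64_t v)
{
	uint32_t integral = fls64_user(v);
	uint8_t fractional = (uint8_t)(v << (64 - integral) >> 55);

	return integral << 8 | fractional;
}

int main(void)
{
	uint64_t burst_ns = 100ULL * 1000 * 1000;	/* ran ~100 ms without sleeping */
	uint32_t greed = log2plus1_u64_u32f8(burst_ns);	/* ~27.5 in u24.8 fixed point */
	uint32_t tolerance = 24u << 8;			/* sched_burst_penalty_offset = 24 */
	int32_t over = (int32_t)(greed - tolerance);
	uint32_t penalty = over > 0 ? (uint32_t)over * 1280 >> 16 : 0; /* scale = 1280 */

	if (penalty > (39u << 2))			/* MAX_BURST_PENALTY */
		penalty = 39u << 2;

	printf("greed=%u penalty=%u burst_score=%u\n", greed, penalty, penalty >> 2);
	/* greed=7037 penalty=17 burst_score=4 */
	return 0;
}

update_burst_score() then keeps penalty >> 2 as burst_score, which effective_prio() adds to the task's static priority, so a task that has run roughly 100 ms without sleeping is weighted about four priority steps below an otherwise identical task that sleeps frequently.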
++static int __init sched_bore_sysctl_init(void) { ++ register_sysctl_init("kernel", sched_bore_sysctls); ++ return 0; ++} ++late_initcall(sched_bore_sysctl_init); ++#endif // CONFIG_SYSCTL ++#endif // CONFIG_SCHED_BORE +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 3c7c942c7..f6a9189ff 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -97,6 +97,8 @@ + #include "../../io_uring/io-wq.h" + #include "../smpboot.h" + ++#include ++ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + +@@ -8490,6 +8492,10 @@ void __init sched_init(void) + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_bore_init(); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index a0893a483..1ee54165f 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { + }; + + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ ++static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int value; \ ++\ ++ if (cnt > 15) \ ++ cnt = 15; \ ++\ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++\ ++ if (kstrtouint(buf, 10, &value)) \ ++ return -EINVAL; \ ++\ ++ sysctl_sched_##name = value; \ ++ sched_update_##update_func(); \ ++\ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++\ ++static int sched_##name##_show(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_##name); \ ++ return 0; \ ++} \ ++\ ++static int sched_##name##_open(struct inode *inode, struct file *filp) \ ++{ \ ++ return single_open(filp, sched_##name##_show, NULL); \ ++} \ ++\ ++static const struct file_operations sched_##name##_fops = { \ ++ .open = sched_##name##_open, \ ++ .write = sched_##name##_write, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++}; ++ ++DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) + ++#undef DEFINE_SYSCTL_SCHED_FUNC ++#else // !CONFIG_SCHED_BORE + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; +- ++#endif // CONFIG_SCHED_BORE + #endif /* SMP */ + + #ifdef CONFIG_PREEMPT_DYNAMIC +@@ -505,13 +551,20 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); ++#else // !CONFIG_SCHED_BORE + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif // CONFIG_SCHED_BORE + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif // CONFIG_SCHED_BORE + debugfs_create_u32("migration_cost_ns", 0644, 
debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -756,6 +809,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score); ++#endif // CONFIG_SCHED_BORE + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1242,6 +1298,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index eb1165016..abecfa517 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -58,6 +58,8 @@ + #include "stats.h" + #include "autogroup.h" + ++#include ++ + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -67,17 +69,30 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant ++ * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif // CONFIG_SCHED_BORE + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice ++ * (default min_base_slice = 2000000 constant, units: nanoseconds) ++ * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + */ ++#ifdef CONFIG_SCHED_BORE ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif // CONFIG_SCHED_BORE + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -191,6 +206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = nsecs_per_tick * ++ max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else // !CONFIG_SCHED_BORE + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -221,6 +243,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif // CONFIG_SCHED_BORE + + void __init sched_init_granularity(void) + { +@@ -700,6 +723,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#ifdef CONFIG_SCHED_BORE ++ limit >>= 
!!sched_bore; ++#endif // CONFIG_SCHED_BORE + + se->vlag = clamp(vlag, -limit, limit); + } +@@ -940,6 +966,10 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) + curr = NULL; + + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) ++#ifdef CONFIG_SCHED_BORE ++ if (!(likely(sched_bore) && likely(sched_burst_parity_threshold) && ++ sched_burst_parity_threshold < cfs_rq->nr_queued)) ++#endif // CONFIG_SCHED_BORE + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -998,6 +1028,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + * Scheduling class statistics methods: + */ + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1009,6 +1040,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif // CONFIG_SCHED_BORE + #endif + #endif + +@@ -1239,6 +1271,10 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ update_burst_penalty(curr); ++#endif // CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); +@@ -3786,7 +3822,7 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + +-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ++void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { + bool curr = cfs_rq->curr == se; +@@ -5292,7 +5328,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->rel_deadline = 0; + return; + } +- ++#ifdef CONFIG_SCHED_BORE ++ else if (likely(sched_bore)) ++ vslice >>= !!(flags & sched_deadline_boost_mask); ++ else ++#endif // CONFIG_SCHED_BORE + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -7187,6 +7227,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ++#ifdef CONFIG_SCHED_BORE ++ struct cfs_rq *cfs_rq = &rq->cfs; ++ struct sched_entity *se = &p->se; ++ if (flags & DEQUEUE_SLEEP && entity_is_task(se)) { ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst(se); ++ } ++#endif // CONFIG_SCHED_BORE + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + +@@ -9007,16 +9056,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task in the tree? + */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. 
+ */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline(se); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -13130,6 +13188,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + static void task_fork_fair(struct task_struct *p) + { + set_task_max_allowed_capacity(p); ++#ifdef CONFIG_SCHED_BORE ++ update_burst_score(&p->se); ++#endif // CONFIG_SCHED_BORE + } + + /* +@@ -13240,6 +13301,10 @@ static void attach_task_cfs_rq(struct task_struct *p) + + static void switched_from_fair(struct rq *rq, struct task_struct *p) + { ++ p->se.rel_deadline = 0; ++#ifdef CONFIG_SCHED_BORE ++ reset_task_bore(p); ++#endif // CONFIG_SCHED_BORE + detach_task_cfs_rq(p); + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1aa65a0ac..fddf67b19 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2122,7 +2122,11 @@ static inline void update_sched_domain_debugfs(void) { } + static inline void dirty_sched_domain_sysctl(int cpu) { } + #endif + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else // !CONFIG_SCHED_BORE + extern int sched_update_scaling(void); ++#endif // CONFIG_SCHED_BORE + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -2846,7 +2850,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; ++#else // !CONFIG_SCHED_BORE + extern unsigned int sysctl_sched_base_slice; ++#endif // CONFIG_SCHED_BORE + + #ifdef CONFIG_SCHED_DEBUG + extern int sysctl_resched_latency_warn_ms; +-- +2.49.0 + diff --git a/sys-kernel/gentoo-sources-6.14/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.14/0004-bbr3.patch new file mode 100644 index 0000000..4b43326 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0004-bbr3.patch @@ -0,0 +1,3387 @@ +From d221b4b9939f83a4df2ca8d037a2b73d49041a40 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 22 May 2025 16:19:46 +0200 +Subject: [PATCH 4/9] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2231 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 16 files changed, 1937 insertions(+), 554 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index f88daaa76d83..e569fd1ed7e8 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -243,7 +243,8 @@ struct tcp_sock { + /* OOO segments go in this rbtree. Socket lock must be held. 
*/ + struct rb_root out_of_order_queue; + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -300,7 +301,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index c7f42844c79a..170250145598 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 2d08473a6dc0..aa80dd0abe5a 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -376,6 +376,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -796,6 +798,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -901,6 +912,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -990,9 +1006,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1105,6 +1126,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1127,7 
+1149,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1147,10 +1173,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1161,7 +1190,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1185,8 +1216,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1252,6 +1286,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1271,6 +1313,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1283,6 +1326,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2434,7 +2492,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 66c3903d29cf..dfdbc1c0b606 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -516,12 +516,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dbf896f3146c..92b6d6472951 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -178,6 +178,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
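Because the new TCPI_OPT_ECN_LOW bit is reported through tcp_info, userspace can tell whether a connection actually negotiated the low-latency ECN mode. A minimal sketch, assuming a kernel carrying this patch; the constant is re-defined locally since stock libc headers will not have it yet, and the helper name is made up for the example:

#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

#ifndef TCPI_OPT_ECN_LOW
#define TCPI_OPT_ECN_LOW 128	/* value this patch adds to uapi/linux/tcp.h */
#endif

/* Hypothetical helper: did this connected socket negotiate low-latency ECN? */
static int tcp_uses_low_ecn(int fd)
{
	struct tcp_info info;
	socklen_t len = sizeof(info);

	if (getsockopt(fd, IPPROTO_TCP, TCP_INFO, &info, &len) < 0)
		return 0;

	return !!(info.tcpi_options & TCPI_OPT_ECN_LOW);
}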
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index 554804774628..fb6ab6ca8440 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 57df7c1d2faa..47605d71f68b 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3398,6 +3398,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4124,6 +4125,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..516a5daac694 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
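The dual-bound model sketched in the header comment above is the central difference from the old single max-filter design: the long-term ceiling (bw_hi[], inflight_hi) tracks what the path has proven it can carry, while the short-term floor (bw_lo, inflight_lo) is pulled down by recent loss or ECN, and the sender runs at the more conservative of the two, as bbr_bw() later in this patch does. A toy illustration with made-up numbers:

#include <stdint.h>
#include <stdio.h>

static uint32_t max_u32(uint32_t a, uint32_t b) { return a > b ? a : b; }
static uint32_t min_u32(uint32_t a, uint32_t b) { return a < b ? a : b; }

int main(void)
{
	/* Made-up samples, in arbitrary bandwidth units. */
	uint32_t bw_hi[2] = { 1200, 1150 };	/* long-term max filter (two windows) */
	uint32_t bw_lo = 900;			/* short-term bound after loss/ECN */

	uint32_t max_bw = max_u32(bw_hi[0], bw_hi[1]);	/* like bbr_max_bw() */
	uint32_t bw = min_u32(max_bw, bw_lo);		/* like bbr_bw() */

	printf("max_bw=%u bw_lo=%u -> pacing bw=%u\n", max_bw, bw_lo, bw);
	/* max_bw=1200 bw_lo=900 -> pacing bw=900 */
	return 0;
}

When nothing recent has pulled bw_lo down, it stays at its unset all-ones value, so the min() falls through and BBR paces at the long-term maximum.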
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +456,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +473,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +534,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +547,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +579,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +599,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +670,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +681,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +710,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +739,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +795,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +803,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +849,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +858,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +886,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +923,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +946,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +971,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; + +- bbr_update_model(sk, rs); +- +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less than that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++	case BBR_BW_PROBE_DOWN: ++		if (bbr_check_time_to_probe_bw(sk, rs)) ++			return;		/* already decided state transition */ ++		if (bbr_check_time_to_cruise(sk, inflight, bw)) ++			bbr_start_bw_probe_cruise(sk); ++		break; ++ ++	default: ++		WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++	} ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++ ++	bbr_reset_lower_bounds(sk); ++	if (bbr_full_bw_reached(sk)) { ++		bbr->mode = BBR_PROBE_BW; ++		/* Raising inflight after PROBE_RTT may cause loss, so reset ++		 * the PROBE_BW clock and schedule the next bandwidth probe for ++		 * a friendly and randomized future point in time. ++		 */ ++		bbr_start_bw_probe_down(sk); ++		/* Since we are exiting PROBE_RTT, we know inflight is ++		 * below our estimated BDP, so it is reasonable to cruise. ++		 */ ++		bbr_start_bw_probe_cruise(sk); ++	} else { ++		bbr->mode = BBR_STARTUP; ++	} ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++				    const struct rate_sample *rs) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++ ++	if (bbr_full_bw_reached(sk)) ++		return; ++ ++	/* For STARTUP exit, check the loss rate at the end of each round trip ++	 * of Recovery episodes in STARTUP. We check the loss rate at the end ++	 * of the round trip to filter out noisy/low loss and have a better ++	 * sense of inflight (extent of loss), so we can drain more accurately. ++	 */ ++	if (rs->losses && bbr->loss_events_in_round < 0xf) ++		bbr->loss_events_in_round++;  /* update saturating counter */ ++	if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++	    inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++	    bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++	    bbr_is_inflight_too_high(sk, rs)) { ++		bbr_handle_queue_too_high_in_startup(sk); ++		return; ++	} ++	if (bbr->loss_round_start) ++		bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++				      const struct rate_sample *rs, ++				      struct bbr_context *ctx) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++	u32 bw_thresh, full_cnt, thresh; ++ ++	if (bbr->full_bw_now || rs->is_app_limited) ++		return; ++ ++	thresh = bbr_param(sk, full_bw_thresh); ++	full_cnt = bbr_param(sk, full_bw_cnt); ++	bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++	if (ctx->sample_bw >= bw_thresh) { ++		bbr_reset_full_bw(sk); ++		bbr->full_bw = ctx->sample_bw; ++		return; ++	} ++	if (!bbr->round_start) ++		return; ++	++bbr->full_bw_cnt; ++	bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++	bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++			    struct bbr_context *ctx) ++{ ++	struct bbr *bbr = inet_csk_ca(sk); ++ ++	if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++		bbr->mode = BBR_DRAIN;	/* drain queue we created */ ++		/* Set ssthresh to export purely for monitoring, to signal ++		 * completion of initial STARTUP by setting to a non- ++		 * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++		 */ ++		tcp_sk(sk)->snd_ssthresh = ++				bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++		bbr_reset_congestion_signals(sk); ++	}	/* fall through to check if in-flight is already small: */ ++	if (bbr->mode == BBR_DRAIN && ++	    bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++	    bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++		bbr->mode = BBR_PROBE_BW; ++		bbr_start_bw_probe_down(sk); ++	} ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++			     struct bbr_context *ctx) ++{ ++	bbr_update_congestion_signals(sk, rs, ctx); ++	bbr_update_ack_aggregation(sk, rs); ++	bbr_check_loss_too_high_in_startup(sk, rs); ++	bbr_check_full_bw_reached(sk, rs, ctx); ++	bbr_check_drain(sk, rs, ctx); ++	bbr_update_cycle_phase(sk, rs, ctx); ++	bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides significant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well as control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2360,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2397,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0cbf81bf3d45..7e8324f54563 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -376,7 +376,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -387,7 +387,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1126,7 +1126,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1507,6 +1512,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3832,7 +3848,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3849,6 +3866,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3859,6 +3877,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3967,6 +3990,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4041,7 +4065,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4065,6 +4089,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4084,7 +4109,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5764,13 +5789,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index dfdb7a4608a8..874e99902bba 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -471,6 +471,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index bc95d2a5924f..d4c45ca6fe06 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1606,7 +1609,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1681,6 +1684,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2038,13 +2065,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2770,6 +2796,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2982,6 +3009,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index b412ed88ccd9..d70f8b742b21 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -699,6 +699,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.49.0.634.g8613c2bb6c + diff --git a/sys-kernel/gentoo-sources-6.14/0006-crypto.patch b/sys-kernel/gentoo-sources-6.14/0006-crypto.patch new file mode 100644 index 0000000..ac617a2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0006-crypto.patch @@ -0,0 +1,2495 @@ +From 54fcd81865473d94e2174586621d03006f85c68d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 22 May 2025 16:35:34 +0200 +Subject: [PATCH 6/9] crypto + +Signed-off-by: Peter Jung +--- + MAINTAINERS | 1 + + arch/x86/Kconfig | 2 +- + arch/x86/crypto/aesni-intel_glue.c | 22 +- + arch/x86/include/asm/cpufeatures.h | 1 + + arch/x86/kernel/cpu/intel.c | 22 ++ + arch/x86/lib/Makefile | 2 +- + arch/x86/lib/crc-pclmul-consts.h | 99 +++++ + arch/x86/lib/crc-pclmul-template.S | 584 ++++++++++++++++++++++++++++ + arch/x86/lib/crc-pclmul-template.h | 81 ++++ + arch/x86/lib/crc-t10dif-glue.c | 23 +- + arch/x86/lib/crc16-msb-pclmul.S | 6 + + arch/x86/lib/crc32-glue.c | 51 +-- + arch/x86/lib/crc32-pclmul.S | 219 +---------- + arch/x86/lib/crct10dif-pcl-asm_64.S | 332 ---------------- + drivers/nvme/host/Kconfig | 3 +- + drivers/nvme/host/tcp.c | 122 ++---- + drivers/nvme/target/tcp.c | 90 ++--- + include/linux/skbuff.h | 7 +- + net/core/datagram.c | 46 +-- + scripts/gen-crc-consts.py | 238 ++++++++++++ + 20 files changed, 1143 insertions(+), 808 deletions(-) + create mode 100644 arch/x86/lib/crc-pclmul-consts.h + create mode 100644 arch/x86/lib/crc-pclmul-template.S + create mode 
100644 arch/x86/lib/crc-pclmul-template.h + create mode 100644 arch/x86/lib/crc16-msb-pclmul.S + delete mode 100644 arch/x86/lib/crct10dif-pcl-asm_64.S + create mode 100755 scripts/gen-crc-consts.py + +diff --git a/MAINTAINERS b/MAINTAINERS +index c0d5232a473b..ed22cbce79af 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -6140,6 +6140,7 @@ F: Documentation/staging/crc* + F: arch/*/lib/crc* + F: include/linux/crc* + F: lib/crc* ++F: scripts/gen-crc-consts.py + + CREATIVE SB0540 + M: Bastien Nocera +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 473364353bd9..500584609508 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -77,7 +77,7 @@ config X86 + select ARCH_HAS_CPU_FINALIZE_INIT + select ARCH_HAS_CPU_PASID if IOMMU_SVA + select ARCH_HAS_CRC32 +- select ARCH_HAS_CRC_T10DIF if X86_64 ++ select ARCH_HAS_CRC_T10DIF + select ARCH_HAS_CURRENT_STACK_POINTER + select ARCH_HAS_DEBUG_VIRTUAL + select ARCH_HAS_DEBUG_VM_PGTABLE if !X86_PAE +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index 11e95fc62636..3e9ab5cdade4 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -1536,26 +1536,6 @@ DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, + AES_GCM_KEY_AVX10_SIZE, 800); + #endif /* CONFIG_AS_VAES && CONFIG_AS_VPCLMULQDQ */ + +-/* +- * This is a list of CPU models that are known to suffer from downclocking when +- * zmm registers (512-bit vectors) are used. On these CPUs, the AES mode +- * implementations with zmm registers won't be used by default. Implementations +- * with ymm registers (256-bit vectors) will be used by default instead. +- */ +-static const struct x86_cpu_id zmm_exclusion_list[] = { +- X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_X, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_D, 0), +- X86_MATCH_VFM(INTEL_ICELAKE, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_L, 0), +- X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), +- X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), +- X86_MATCH_VFM(INTEL_TIGERLAKE, 0), +- /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ +- /* Also allow AMD CPUs (starting with Zen 4, the first with AVX-512). 
*/ +- {}, +-}; +- + static int __init register_avx_algs(void) + { + int err; +@@ -1600,7 +1580,7 @@ static int __init register_avx_algs(void) + if (err) + return err; + +- if (x86_match_cpu(zmm_exclusion_list)) { ++ if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { + int i; + + aes_xts_alg_vaes_avx10_512.base.cra_priority = 1; +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index 4c38c9b9c69d..97d7617cab1e 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -485,6 +485,7 @@ + #define X86_FEATURE_AMD_HETEROGENEOUS_CORES (21*32 + 6) /* Heterogeneous Core Topology */ + #define X86_FEATURE_AMD_WORKLOAD_CLASS (21*32 + 7) /* Workload Classification */ + #define X86_FEATURE_INDIRECT_THUNK_ITS (21*32 + 8) /* Use thunk for indirect branches in lower half of cacheline */ ++#define X86_FEATURE_PREFER_YMM (21*32 + 9) /* Avoid ZMM registers due to downclocking */ + + /* + * BUG word(s) +diff --git a/arch/x86/kernel/cpu/intel.c b/arch/x86/kernel/cpu/intel.c +index 134368a3f4b1..5fe563eeb17d 100644 +--- a/arch/x86/kernel/cpu/intel.c ++++ b/arch/x86/kernel/cpu/intel.c +@@ -521,6 +521,25 @@ static void init_intel_misc_features(struct cpuinfo_x86 *c) + wrmsrl(MSR_MISC_FEATURES_ENABLES, msr); + } + ++/* ++ * This is a list of Intel CPUs that are known to suffer from downclocking when ++ * ZMM registers (512-bit vectors) are used. On these CPUs, when the kernel ++ * executes SIMD-optimized code such as cryptography functions or CRCs, it ++ * should prefer 256-bit (YMM) code to 512-bit (ZMM) code. ++ */ ++static const struct x86_cpu_id zmm_exclusion_list[] = { ++ X86_MATCH_VFM(INTEL_SKYLAKE_X, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_X, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_D, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_L, 0), ++ X86_MATCH_VFM(INTEL_ICELAKE_NNPI, 0), ++ X86_MATCH_VFM(INTEL_TIGERLAKE_L, 0), ++ X86_MATCH_VFM(INTEL_TIGERLAKE, 0), ++ /* Allow Rocket Lake and later, and Sapphire Rapids and later. */ ++ {}, ++}; ++ + static void init_intel(struct cpuinfo_x86 *c) + { + early_init_intel(c); +@@ -601,6 +620,9 @@ static void init_intel(struct cpuinfo_x86 *c) + } + #endif + ++ if (x86_match_cpu(zmm_exclusion_list)) ++ set_cpu_cap(c, X86_FEATURE_PREFER_YMM); ++ + /* Work around errata */ + srat_detect_node(c); + +diff --git a/arch/x86/lib/Makefile b/arch/x86/lib/Makefile +index 8a59c61624c2..08496e221a7d 100644 +--- a/arch/x86/lib/Makefile ++++ b/arch/x86/lib/Makefile +@@ -43,7 +43,7 @@ crc32-x86-y := crc32-glue.o crc32-pclmul.o + crc32-x86-$(CONFIG_64BIT) += crc32c-3way.o + + obj-$(CONFIG_CRC_T10DIF_ARCH) += crc-t10dif-x86.o +-crc-t10dif-x86-y := crc-t10dif-glue.o crct10dif-pcl-asm_64.o ++crc-t10dif-x86-y := crc-t10dif-glue.o crc16-msb-pclmul.o + + obj-y += msr.o msr-reg.o msr-reg-export.o hweight.o + obj-y += iomem.o +diff --git a/arch/x86/lib/crc-pclmul-consts.h b/arch/x86/lib/crc-pclmul-consts.h +new file mode 100644 +index 000000000000..089954988f97 +--- /dev/null ++++ b/arch/x86/lib/crc-pclmul-consts.h +@@ -0,0 +1,99 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/* ++ * CRC constants generated by: ++ * ++ * ./scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320 ++ * ++ * Do not edit manually. 
++ */ ++ ++/* ++ * CRC folding constants generated for most-significant-bit-first CRC-16 using ++ * G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 ++ */ ++static const struct { ++ u8 bswap_mask[16]; ++ u64 fold_across_2048_bits_consts[2]; ++ u64 fold_across_1024_bits_consts[2]; ++ u64 fold_across_512_bits_consts[2]; ++ u64 fold_across_256_bits_consts[2]; ++ u64 fold_across_128_bits_consts[2]; ++ u8 shuf_table[48]; ++ u64 barrett_reduction_consts[2]; ++} crc16_msb_0x8bb7_consts ____cacheline_aligned __maybe_unused = { ++ .bswap_mask = {15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2, 1, 0}, ++ .fold_across_2048_bits_consts = { ++ 0xdccf000000000000, /* LO64_TERMS: (x^2000 mod G) * x^48 */ ++ 0x4b0b000000000000, /* HI64_TERMS: (x^2064 mod G) * x^48 */ ++ }, ++ .fold_across_1024_bits_consts = { ++ 0x9d9d000000000000, /* LO64_TERMS: (x^976 mod G) * x^48 */ ++ 0x7cf5000000000000, /* HI64_TERMS: (x^1040 mod G) * x^48 */ ++ }, ++ .fold_across_512_bits_consts = { ++ 0x044c000000000000, /* LO64_TERMS: (x^464 mod G) * x^48 */ ++ 0xe658000000000000, /* HI64_TERMS: (x^528 mod G) * x^48 */ ++ }, ++ .fold_across_256_bits_consts = { ++ 0x6ee3000000000000, /* LO64_TERMS: (x^208 mod G) * x^48 */ ++ 0xe7b5000000000000, /* HI64_TERMS: (x^272 mod G) * x^48 */ ++ }, ++ .fold_across_128_bits_consts = { ++ 0x2d56000000000000, /* LO64_TERMS: (x^80 mod G) * x^48 */ ++ 0x06df000000000000, /* HI64_TERMS: (x^144 mod G) * x^48 */ ++ }, ++ .shuf_table = { ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ }, ++ .barrett_reduction_consts = { ++ 0x8bb7000000000000, /* LO64_TERMS: (G - x^16) * x^48 */ ++ 0xf65a57f81d33a48a, /* HI64_TERMS: (floor(x^79 / G) * x) - x^64 */ ++ }, ++}; ++ ++/* ++ * CRC folding constants generated for least-significant-bit-first CRC-32 using ++ * G(x) = x^32 + x^26 + x^23 + x^22 + x^16 + x^12 + x^11 + x^10 + x^8 + x^7 + ++ * x^5 + x^4 + x^2 + x^1 + x^0 ++ */ ++static const struct { ++ u64 fold_across_2048_bits_consts[2]; ++ u64 fold_across_1024_bits_consts[2]; ++ u64 fold_across_512_bits_consts[2]; ++ u64 fold_across_256_bits_consts[2]; ++ u64 fold_across_128_bits_consts[2]; ++ u8 shuf_table[48]; ++ u64 barrett_reduction_consts[2]; ++} crc32_lsb_0xedb88320_consts ____cacheline_aligned __maybe_unused = { ++ .fold_across_2048_bits_consts = { ++ 0x00000000ce3371cb, /* HI64_TERMS: (x^2079 mod G) * x^32 */ ++ 0x00000000e95c1271, /* LO64_TERMS: (x^2015 mod G) * x^32 */ ++ }, ++ .fold_across_1024_bits_consts = { ++ 0x0000000033fff533, /* HI64_TERMS: (x^1055 mod G) * x^32 */ ++ 0x00000000910eeec1, /* LO64_TERMS: (x^991 mod G) * x^32 */ ++ }, ++ .fold_across_512_bits_consts = { ++ 0x000000008f352d95, /* HI64_TERMS: (x^543 mod G) * x^32 */ ++ 0x000000001d9513d7, /* LO64_TERMS: (x^479 mod G) * x^32 */ ++ }, ++ .fold_across_256_bits_consts = { ++ 0x00000000f1da05aa, /* HI64_TERMS: (x^287 mod G) * x^32 */ ++ 0x0000000081256527, /* LO64_TERMS: (x^223 mod G) * x^32 */ ++ }, ++ .fold_across_128_bits_consts = { ++ 0x00000000ae689191, /* HI64_TERMS: (x^159 mod G) * x^32 */ ++ 0x00000000ccaa009e, /* LO64_TERMS: (x^95 mod G) * x^32 */ ++ }, ++ .shuf_table = { ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, ++ -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, -1, ++ }, ++ .barrett_reduction_consts = { ++ 0xb4e5b025f7011641, /* HI64_TERMS: floor(x^95 / G) */ ++ 
0x00000001db710640, /* LO64_TERMS: (G - x^32) * x^31 */ ++ }, ++}; +diff --git a/arch/x86/lib/crc-pclmul-template.S b/arch/x86/lib/crc-pclmul-template.S +new file mode 100644 +index 000000000000..dc91cc074b30 +--- /dev/null ++++ b/arch/x86/lib/crc-pclmul-template.S +@@ -0,0 +1,584 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++// ++// Template to generate [V]PCLMULQDQ-based CRC functions for x86 ++// ++// Copyright 2025 Google LLC ++// ++// Author: Eric Biggers ++ ++#include ++ ++// Offsets within the generated constants table ++.set OFFSETOF_BSWAP_MASK, -5*16 // msb-first CRCs only ++.set OFFSETOF_FOLD_ACROSS_2048_BITS_CONSTS, -4*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_1024_BITS_CONSTS, -3*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_512_BITS_CONSTS, -2*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS, -1*16 // must precede next ++.set OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS, 0*16 // must be 0 ++.set OFFSETOF_SHUF_TABLE, 1*16 ++.set OFFSETOF_BARRETT_REDUCTION_CONSTS, 4*16 ++ ++// Emit a VEX (or EVEX) coded instruction if allowed, or emulate it using the ++// corresponding non-VEX instruction plus any needed moves. The supported ++// instruction formats are: ++// ++// - Two-arg [src, dst], where the non-VEX format is the same. ++// - Three-arg [src1, src2, dst] where the non-VEX format is ++// [src1, src2_and_dst]. If src2 != dst, then src1 must != dst too. ++// ++// \insn gives the instruction without a "v" prefix and including any immediate ++// argument if needed to make the instruction follow one of the above formats. ++// If \unaligned_mem_tmp is given, then the emitted non-VEX code moves \arg1 to ++// it first; this is needed when \arg1 is an unaligned mem operand. ++.macro _cond_vex insn:req, arg1:req, arg2:req, arg3, unaligned_mem_tmp ++.if AVX_LEVEL == 0 ++ // VEX not allowed. Emulate it. ++ .ifnb \arg3 // Three-arg [src1, src2, dst] ++ .ifc "\arg2", "\arg3" // src2 == dst? ++ .ifnb \unaligned_mem_tmp ++ movdqu \arg1, \unaligned_mem_tmp ++ \insn \unaligned_mem_tmp, \arg3 ++ .else ++ \insn \arg1, \arg3 ++ .endif ++ .else // src2 != dst ++ .ifc "\arg1", "\arg3" ++ .error "Can't have src1 == dst when src2 != dst" ++ .endif ++ .ifnb \unaligned_mem_tmp ++ movdqu \arg1, \unaligned_mem_tmp ++ movdqa \arg2, \arg3 ++ \insn \unaligned_mem_tmp, \arg3 ++ .else ++ movdqa \arg2, \arg3 ++ \insn \arg1, \arg3 ++ .endif ++ .endif ++ .else // Two-arg [src, dst] ++ .ifnb \unaligned_mem_tmp ++ movdqu \arg1, \unaligned_mem_tmp ++ \insn \unaligned_mem_tmp, \arg2 ++ .else ++ \insn \arg1, \arg2 ++ .endif ++ .endif ++.else ++ // VEX is allowed. Emit the desired instruction directly. ++ .ifnb \arg3 ++ v\insn \arg1, \arg2, \arg3 ++ .else ++ v\insn \arg1, \arg2 ++ .endif ++.endif ++.endm ++ ++// Broadcast an aligned 128-bit mem operand to all 128-bit lanes of a vector ++// register of length VL. ++.macro _vbroadcast src, dst ++.if VL == 16 ++ _cond_vex movdqa, \src, \dst ++.elseif VL == 32 ++ vbroadcasti128 \src, \dst ++.else ++ vbroadcasti32x4 \src, \dst ++.endif ++.endm ++ ++// Load \vl bytes from the unaligned mem operand \src into \dst, and if the CRC ++// is msb-first use \bswap_mask to reflect the bytes within each 128-bit lane. 
++.macro _load_data vl, src, bswap_mask, dst ++.if \vl < 64 ++ _cond_vex movdqu, "\src", \dst ++.else ++ vmovdqu8 \src, \dst ++.endif ++.if !LSB_CRC ++ _cond_vex pshufb, \bswap_mask, \dst, \dst ++.endif ++.endm ++ ++.macro _prepare_v0 vl, v0, v1, bswap_mask ++.if LSB_CRC ++ .if \vl < 64 ++ _cond_vex pxor, (BUF), \v0, \v0, unaligned_mem_tmp=\v1 ++ .else ++ vpxorq (BUF), \v0, \v0 ++ .endif ++.else ++ _load_data \vl, (BUF), \bswap_mask, \v1 ++ .if \vl < 64 ++ _cond_vex pxor, \v1, \v0, \v0 ++ .else ++ vpxorq \v1, \v0, \v0 ++ .endif ++.endif ++.endm ++ ++// The x^0..x^63 terms, i.e. poly128 mod x^64, i.e. the physically low qword for ++// msb-first order or the physically high qword for lsb-first order ++#define LO64_TERMS 0 ++ ++// The x^64..x^127 terms, i.e. floor(poly128 / x^64), i.e. the physically high ++// qword for msb-first order or the physically low qword for lsb-first order ++#define HI64_TERMS 1 ++ ++// Multiply the given \src1_terms of each 128-bit lane of \src1 by the given ++// \src2_terms of each 128-bit lane of \src2, and write the result(s) to \dst. ++.macro _pclmulqdq src1, src1_terms, src2, src2_terms, dst ++ _cond_vex "pclmulqdq $((\src1_terms ^ LSB_CRC) << 4) ^ (\src2_terms ^ LSB_CRC),", \ ++ \src1, \src2, \dst ++.endm ++ ++// Fold \acc into \data and store the result back into \acc. \data can be an ++// unaligned mem operand if using VEX is allowed and the CRC is lsb-first so no ++// byte-reflection is needed; otherwise it must be a vector register. \consts ++// is a vector register containing the needed fold constants, and \tmp is a ++// temporary vector register. All arguments must be the same length. ++.macro _fold_vec acc, data, consts, tmp ++ _pclmulqdq \consts, HI64_TERMS, \acc, HI64_TERMS, \tmp ++ _pclmulqdq \consts, LO64_TERMS, \acc, LO64_TERMS, \acc ++.if AVX_LEVEL < 10 ++ _cond_vex pxor, \data, \tmp, \tmp ++ _cond_vex pxor, \tmp, \acc, \acc ++.else ++ vpternlogq $0x96, \data, \tmp, \acc ++.endif ++.endm ++ ++// Fold \acc into \data and store the result back into \acc. \data is an ++// unaligned mem operand, \consts is a vector register containing the needed ++// fold constants, \bswap_mask is a vector register containing the ++// byte-reflection table if the CRC is msb-first, and \tmp1 and \tmp2 are ++// temporary vector registers. All arguments must have length \vl. ++.macro _fold_vec_mem vl, acc, data, consts, bswap_mask, tmp1, tmp2 ++.if AVX_LEVEL == 0 || !LSB_CRC ++ _load_data \vl, \data, \bswap_mask, \tmp1 ++ _fold_vec \acc, \tmp1, \consts, \tmp2 ++.else ++ _fold_vec \acc, \data, \consts, \tmp1 ++.endif ++.endm ++ ++// Load the constants for folding across 2**i vectors of length VL at a time ++// into all 128-bit lanes of the vector register CONSTS. ++.macro _load_vec_folding_consts i ++ _vbroadcast OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS+(4-LOG2_VL-\i)*16(CONSTS_PTR), \ ++ CONSTS ++.endm ++ ++// Given vector registers \v0 and \v1 of length \vl, fold \v0 into \v1 and store ++// the result back into \v0. If the remaining length mod \vl is nonzero, also ++// fold \vl data bytes from BUF. For both operations the fold distance is \vl. ++// \consts must be a register of length \vl containing the fold constants. 
++.macro _fold_vec_final vl, v0, v1, consts, bswap_mask, tmp1, tmp2 ++ _fold_vec \v0, \v1, \consts, \tmp1 ++ test $\vl, LEN8 ++ jz .Lfold_vec_final_done\@ ++ _fold_vec_mem \vl, \v0, (BUF), \consts, \bswap_mask, \tmp1, \tmp2 ++ add $\vl, BUF ++.Lfold_vec_final_done\@: ++.endm ++ ++// This macro generates the body of a CRC function with the following prototype: ++// ++// crc_t crc_func(crc_t crc, const u8 *buf, size_t len, const void *consts); ++// ++// |crc| is the initial CRC, and crc_t is a data type wide enough to hold it. ++// |buf| is the data to checksum. |len| is the data length in bytes, which must ++// be at least 16. |consts| is a pointer to the fold_across_128_bits_consts ++// field of the constants struct that was generated for the chosen CRC variant. ++// ++// Moving onto the macro parameters, \n is the number of bits in the CRC, e.g. ++// 32 for a CRC-32. Currently the supported values are 8, 16, 32, and 64. If ++// the file is compiled in i386 mode, then the maximum supported value is 32. ++// ++// \lsb_crc is 1 if the CRC processes the least significant bit of each byte ++// first, i.e. maps bit0 to x^7, bit1 to x^6, ..., bit7 to x^0. \lsb_crc is 0 ++// if the CRC processes the most significant bit of each byte first, i.e. maps ++// bit0 to x^0, bit1 to x^1, bit7 to x^7. ++// ++// \vl is the maximum length of vector register to use in bytes: 16, 32, or 64. ++// ++// \avx_level is the level of AVX support to use: 0 for SSE only, 2 for AVX2, or ++// 10 for AVX10 or AVX512. ++// ++// If \vl == 16 && \avx_level == 0, the generated code requires: ++// PCLMULQDQ && SSE4.1. (Note: all known CPUs with PCLMULQDQ also have SSE4.1.) ++// ++// If \vl == 32 && \avx_level == 2, the generated code requires: ++// VPCLMULQDQ && AVX2. ++// ++// If \vl == 32 && \avx_level == 10, the generated code requires: ++// VPCLMULQDQ && (AVX10/256 || (AVX512BW && AVX512VL)) ++// ++// If \vl == 64 && \avx_level == 10, the generated code requires: ++// VPCLMULQDQ && (AVX10/512 || (AVX512BW && AVX512VL)) ++// ++// Other \vl and \avx_level combinations are either not supported or not useful. ++.macro _crc_pclmul n, lsb_crc, vl, avx_level ++ .set LSB_CRC, \lsb_crc ++ .set VL, \vl ++ .set AVX_LEVEL, \avx_level ++ ++ // Define aliases for the xmm, ymm, or zmm registers according to VL. ++.irp i, 0,1,2,3,4,5,6,7 ++ .if VL == 16 ++ .set V\i, %xmm\i ++ .set LOG2_VL, 4 ++ .elseif VL == 32 ++ .set V\i, %ymm\i ++ .set LOG2_VL, 5 ++ .elseif VL == 64 ++ .set V\i, %zmm\i ++ .set LOG2_VL, 6 ++ .else ++ .error "Unsupported vector length" ++ .endif ++.endr ++ // Define aliases for the function parameters. ++ // Note: when crc_t is shorter than u32, zero-extension to 32 bits is ++ // guaranteed by the ABI. Zero-extension to 64 bits is *not* guaranteed ++ // when crc_t is shorter than u64. ++#ifdef __x86_64__ ++.if \n <= 32 ++ .set CRC, %edi ++.else ++ .set CRC, %rdi ++.endif ++ .set BUF, %rsi ++ .set LEN, %rdx ++ .set LEN32, %edx ++ .set LEN8, %dl ++ .set CONSTS_PTR, %rcx ++#else ++ // 32-bit support, assuming -mregparm=3 and not including support for ++ // CRC-64 (which would use both eax and edx to pass the crc parameter). ++ .set CRC, %eax ++ .set BUF, %edx ++ .set LEN, %ecx ++ .set LEN32, %ecx ++ .set LEN8, %cl ++ .set CONSTS_PTR, %ebx // Passed on stack ++#endif ++ ++ // Define aliases for some local variables. V0-V5 are used without ++ // aliases (for accumulators, data, temporary values, etc). 
Staying ++ // within the first 8 vector registers keeps the code 32-bit SSE ++ // compatible and reduces the size of 64-bit SSE code slightly. ++ .set BSWAP_MASK, V6 ++ .set BSWAP_MASK_YMM, %ymm6 ++ .set BSWAP_MASK_XMM, %xmm6 ++ .set CONSTS, V7 ++ .set CONSTS_YMM, %ymm7 ++ .set CONSTS_XMM, %xmm7 ++ ++#ifdef __i386__ ++ push CONSTS_PTR ++ mov 8(%esp), CONSTS_PTR ++#endif ++ ++ // Create a 128-bit vector that contains the initial CRC in the end ++ // representing the high-order polynomial coefficients, and the rest 0. ++ // If the CRC is msb-first, also load the byte-reflection table. ++.if \n <= 32 ++ _cond_vex movd, CRC, %xmm0 ++.else ++ _cond_vex movq, CRC, %xmm0 ++.endif ++.if !LSB_CRC ++ _cond_vex pslldq, $(128-\n)/8, %xmm0, %xmm0 ++ _vbroadcast OFFSETOF_BSWAP_MASK(CONSTS_PTR), BSWAP_MASK ++.endif ++ ++ // Load the first vector of data and XOR the initial CRC into the ++ // appropriate end of the first 128-bit lane of data. If LEN < VL, then ++ // use a short vector and jump ahead to the final reduction. (LEN >= 16 ++ // is guaranteed here but not necessarily LEN >= VL.) ++.if VL >= 32 ++ cmp $VL, LEN ++ jae .Lat_least_1vec\@ ++ .if VL == 64 ++ cmp $32, LEN32 ++ jb .Lless_than_32bytes\@ ++ _prepare_v0 32, %ymm0, %ymm1, BSWAP_MASK_YMM ++ add $32, BUF ++ jmp .Lreduce_256bits_to_128bits\@ ++.Lless_than_32bytes\@: ++ .endif ++ _prepare_v0 16, %xmm0, %xmm1, BSWAP_MASK_XMM ++ add $16, BUF ++ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM ++ jmp .Lcheck_for_partial_block\@ ++.Lat_least_1vec\@: ++.endif ++ _prepare_v0 VL, V0, V1, BSWAP_MASK ++ ++ // Handle VL <= LEN < 4*VL. ++ cmp $4*VL-1, LEN ++ ja .Lat_least_4vecs\@ ++ add $VL, BUF ++ // If VL <= LEN < 2*VL, then jump ahead to the reduction from 1 vector. ++ // If VL==16 then load fold_across_128_bits_consts first, as the final ++ // reduction depends on it and it won't be loaded anywhere else. ++ cmp $2*VL-1, LEN32 ++.if VL == 16 ++ _cond_vex movdqa, OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM ++.endif ++ jbe .Lreduce_1vec_to_128bits\@ ++ // Otherwise 2*VL <= LEN < 4*VL. Load one more vector and jump ahead to ++ // the reduction from 2 vectors. ++ _load_data VL, (BUF), BSWAP_MASK, V1 ++ add $VL, BUF ++ jmp .Lreduce_2vecs_to_1\@ ++ ++.Lat_least_4vecs\@: ++ // Load 3 more vectors of data. ++ _load_data VL, 1*VL(BUF), BSWAP_MASK, V1 ++ _load_data VL, 2*VL(BUF), BSWAP_MASK, V2 ++ _load_data VL, 3*VL(BUF), BSWAP_MASK, V3 ++ sub $-4*VL, BUF // Shorter than 'add 4*VL' when VL=32 ++ add $-4*VL, LEN // Shorter than 'sub 4*VL' when VL=32 ++ ++ // Main loop: while LEN >= 4*VL, fold the 4 vectors V0-V3 into the next ++ // 4 vectors of data and write the result back to V0-V3. ++ cmp $4*VL-1, LEN // Shorter than 'cmp 4*VL' when VL=32 ++ jbe .Lreduce_4vecs_to_2\@ ++ _load_vec_folding_consts 2 ++.Lfold_4vecs_loop\@: ++ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V2, 2*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V3, 3*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ sub $-4*VL, BUF ++ add $-4*VL, LEN ++ cmp $4*VL-1, LEN ++ ja .Lfold_4vecs_loop\@ ++ ++ // Fold V0,V1 into V2,V3 and write the result back to V0,V1. Then fold ++ // two more vectors of data from BUF, if at least that much remains. 
++.Lreduce_4vecs_to_2\@: ++ _load_vec_folding_consts 1 ++ _fold_vec V0, V2, CONSTS, V4 ++ _fold_vec V1, V3, CONSTS, V4 ++ test $2*VL, LEN8 ++ jz .Lreduce_2vecs_to_1\@ ++ _fold_vec_mem VL, V0, 0*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ _fold_vec_mem VL, V1, 1*VL(BUF), CONSTS, BSWAP_MASK, V4, V5 ++ sub $-2*VL, BUF ++ ++ // Fold V0 into V1 and write the result back to V0. Then fold one more ++ // vector of data from BUF, if at least that much remains. ++.Lreduce_2vecs_to_1\@: ++ _load_vec_folding_consts 0 ++ _fold_vec_final VL, V0, V1, CONSTS, BSWAP_MASK, V4, V5 ++ ++.Lreduce_1vec_to_128bits\@: ++.if VL == 64 ++ // Reduce 512-bit %zmm0 to 256-bit %ymm0. Then fold 256 more bits of ++ // data from BUF, if at least that much remains. ++ vbroadcasti128 OFFSETOF_FOLD_ACROSS_256_BITS_CONSTS(CONSTS_PTR), CONSTS_YMM ++ vextracti64x4 $1, %zmm0, %ymm1 ++ _fold_vec_final 32, %ymm0, %ymm1, CONSTS_YMM, BSWAP_MASK_YMM, %ymm4, %ymm5 ++.Lreduce_256bits_to_128bits\@: ++.endif ++.if VL >= 32 ++ // Reduce 256-bit %ymm0 to 128-bit %xmm0. Then fold 128 more bits of ++ // data from BUF, if at least that much remains. ++ vmovdqa OFFSETOF_FOLD_ACROSS_128_BITS_CONSTS(CONSTS_PTR), CONSTS_XMM ++ vextracti128 $1, %ymm0, %xmm1 ++ _fold_vec_final 16, %xmm0, %xmm1, CONSTS_XMM, BSWAP_MASK_XMM, %xmm4, %xmm5 ++.Lcheck_for_partial_block\@: ++.endif ++ and $15, LEN32 ++ jz .Lreduce_128bits_to_crc\@ ++ ++ // 1 <= LEN <= 15 data bytes remain in BUF. The polynomial is now ++ // A*(x^(8*LEN)) + B, where A is the 128-bit polynomial stored in %xmm0 ++ // and B is the polynomial of the remaining LEN data bytes. To reduce ++ // this to 128 bits without needing fold constants for each possible ++ // LEN, rearrange this expression into C1*(x^128) + C2, where ++ // C1 = floor(A / x^(128 - 8*LEN)) and C2 = A*x^(8*LEN) + B mod x^128. ++ // Then fold C1 into C2, which is just another fold across 128 bits. ++ ++.if !LSB_CRC || AVX_LEVEL == 0 ++ // Load the last 16 data bytes. Note that originally LEN was >= 16. ++ _load_data 16, "-16(BUF,LEN)", BSWAP_MASK_XMM, %xmm2 ++.endif // Else will use vpblendvb mem operand later. ++.if !LSB_CRC ++ neg LEN // Needed for indexing shuf_table ++.endif ++ ++ // tmp = A*x^(8*LEN) mod x^128 ++ // lsb: pshufb by [LEN, LEN+1, ..., 15, -1, -1, ..., -1] ++ // i.e. right-shift by LEN bytes. ++ // msb: pshufb by [-1, -1, ..., -1, 0, 1, ..., 15-LEN] ++ // i.e. left-shift by LEN bytes. ++ _cond_vex movdqu, "OFFSETOF_SHUF_TABLE+16(CONSTS_PTR,LEN)", %xmm3 ++ _cond_vex pshufb, %xmm3, %xmm0, %xmm1 ++ ++ // C1 = floor(A / x^(128 - 8*LEN)) ++ // lsb: pshufb by [-1, -1, ..., -1, 0, 1, ..., LEN-1] ++ // i.e. left-shift by 16-LEN bytes. ++ // msb: pshufb by [16-LEN, 16-LEN+1, ..., 15, -1, -1, ..., -1] ++ // i.e. right-shift by 16-LEN bytes. ++ _cond_vex pshufb, "OFFSETOF_SHUF_TABLE+32*!LSB_CRC(CONSTS_PTR,LEN)", \ ++ %xmm0, %xmm0, unaligned_mem_tmp=%xmm4 ++ ++ // C2 = tmp + B. This is just a blend of tmp with the last 16 data ++ // bytes (reflected if msb-first). The blend mask is the shuffle table ++ // that was used to create tmp. 0 selects tmp, and 1 last16databytes. ++.if AVX_LEVEL == 0 ++ movdqa %xmm0, %xmm4 ++ movdqa %xmm3, %xmm0 ++ pblendvb %xmm2, %xmm1 // uses %xmm0 as implicit operand ++ movdqa %xmm4, %xmm0 ++.elseif LSB_CRC ++ vpblendvb %xmm3, -16(BUF,LEN), %xmm1, %xmm1 ++.else ++ vpblendvb %xmm3, %xmm2, %xmm1, %xmm1 ++.endif ++ ++ // Fold C1 into C2 and store the 128-bit result in %xmm0. ++ _fold_vec %xmm0, %xmm1, CONSTS_XMM, %xmm4 ++ ++.Lreduce_128bits_to_crc\@: ++ // Compute the CRC as %xmm0 * x^n mod G. 
Here %xmm0 means the 128-bit ++ // polynomial stored in %xmm0 (using either lsb-first or msb-first bit ++ // order according to LSB_CRC), and G is the CRC's generator polynomial. ++ ++ // First, multiply %xmm0 by x^n and reduce the result to 64+n bits: ++ // ++ // t0 := (x^(64+n) mod G) * floor(%xmm0 / x^64) + ++ // x^n * (%xmm0 mod x^64) ++ // ++ // Store t0 * x^(64-n) in %xmm0. I.e., actually do: ++ // ++ // %xmm0 := ((x^(64+n) mod G) * x^(64-n)) * floor(%xmm0 / x^64) + ++ // x^64 * (%xmm0 mod x^64) ++ // ++ // The extra unreduced factor of x^(64-n) makes floor(t0 / x^n) aligned ++ // to the HI64_TERMS of %xmm0 so that the next pclmulqdq can easily ++ // select it. The 64-bit constant (x^(64+n) mod G) * x^(64-n) in the ++ // msb-first case, or (x^(63+n) mod G) * x^(64-n) in the lsb-first case ++ // (considering the extra factor of x that gets implicitly introduced by ++ // each pclmulqdq when using lsb-first order), is identical to the ++ // constant that was used earlier for folding the LO64_TERMS across 128 ++ // bits. Thus it's already available in LO64_TERMS of CONSTS_XMM. ++ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm0, HI64_TERMS, %xmm1 ++.if LSB_CRC ++ _cond_vex psrldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) ++.else ++ _cond_vex pslldq, $8, %xmm0, %xmm0 // x^64 * (%xmm0 mod x^64) ++.endif ++ _cond_vex pxor, %xmm1, %xmm0, %xmm0 ++ // The HI64_TERMS of %xmm0 now contain floor(t0 / x^n). ++ // The LO64_TERMS of %xmm0 now contain (t0 mod x^n) * x^(64-n). ++ ++ // First step of Barrett reduction: Compute floor(t0 / G). This is the ++ // polynomial by which G needs to be multiplied to cancel out the x^n ++ // and higher terms of t0, i.e. to reduce t0 mod G. First do: ++ // ++ // t1 := floor(x^(63+n) / G) * x * floor(t0 / x^n) ++ // ++ // Then the desired value floor(t0 / G) is floor(t1 / x^64). The 63 in ++ // x^(63+n) is the maximum degree of floor(t0 / x^n) and thus the lowest ++ // value that makes enough precision be carried through the calculation. ++ // ++ // The '* x' makes it so the result is floor(t1 / x^64) rather than ++ // floor(t1 / x^63), making it qword-aligned in HI64_TERMS so that it ++ // can be extracted much more easily in the next step. In the lsb-first ++ // case the '* x' happens implicitly. In the msb-first case it must be ++ // done explicitly; floor(x^(63+n) / G) * x is a 65-bit constant, so the ++ // constant passed to pclmulqdq is (floor(x^(63+n) / G) * x) - x^64, and ++ // the multiplication by the x^64 term is handled using a pxor. The ++ // pxor causes the low 64 terms of t1 to be wrong, but they are unused. ++ _cond_vex movdqa, OFFSETOF_BARRETT_REDUCTION_CONSTS(CONSTS_PTR), CONSTS_XMM ++ _pclmulqdq CONSTS_XMM, HI64_TERMS, %xmm0, HI64_TERMS, %xmm1 ++.if !LSB_CRC ++ _cond_vex pxor, %xmm0, %xmm1, %xmm1 // += x^64 * floor(t0 / x^n) ++.endif ++ // The HI64_TERMS of %xmm1 now contain floor(t1 / x^64) = floor(t0 / G). ++ ++ // Second step of Barrett reduction: Cancel out the x^n and higher terms ++ // of t0 by subtracting the needed multiple of G. This gives the CRC: ++ // ++ // crc := t0 - (G * floor(t0 / G)) ++ // ++ // But %xmm0 contains t0 * x^(64-n), so it's more convenient to do: ++ // ++ // crc := ((t0 * x^(64-n)) - ((G * x^(64-n)) * floor(t0 / G))) / x^(64-n) ++ // ++ // Furthermore, since the resulting CRC is n-bit, if mod x^n is ++ // explicitly applied to it then the x^n term of G makes no difference ++ // in the result and can be omitted. This helps keep the constant ++ // multiplier in 64 bits in most cases. 
This gives the following: ++ // ++ // %xmm0 := %xmm0 - (((G - x^n) * x^(64-n)) * floor(t0 / G)) ++ // crc := (%xmm0 / x^(64-n)) mod x^n ++ // ++ // In the lsb-first case, each pclmulqdq implicitly introduces ++ // an extra factor of x, so in that case the constant that needs to be ++ // passed to pclmulqdq is actually '(G - x^n) * x^(63-n)' when n <= 63. ++ // For lsb-first CRCs where n=64, the extra factor of x cannot be as ++ // easily avoided. In that case, instead pass '(G - x^n - x^0) / x' to ++ // pclmulqdq and handle the x^0 term (i.e. 1) separately. (All CRC ++ // polynomials have nonzero x^n and x^0 terms.) It works out as: the ++ // CRC has be XORed with the physically low qword of %xmm1, representing ++ // floor(t0 / G). The most efficient way to do that is to move it to ++ // the physically high qword and use a ternlog to combine the two XORs. ++.if LSB_CRC && \n == 64 ++ _cond_vex punpcklqdq, %xmm1, %xmm2, %xmm2 ++ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 ++ .if AVX_LEVEL < 10 ++ _cond_vex pxor, %xmm2, %xmm0, %xmm0 ++ _cond_vex pxor, %xmm1, %xmm0, %xmm0 ++ .else ++ vpternlogq $0x96, %xmm2, %xmm1, %xmm0 ++ .endif ++ _cond_vex "pextrq $1,", %xmm0, %rax // (%xmm0 / x^0) mod x^64 ++.else ++ _pclmulqdq CONSTS_XMM, LO64_TERMS, %xmm1, HI64_TERMS, %xmm1 ++ _cond_vex pxor, %xmm1, %xmm0, %xmm0 ++ .if \n == 8 ++ _cond_vex "pextrb $7 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^56) mod x^8 ++ .elseif \n == 16 ++ _cond_vex "pextrw $3 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^48) mod x^16 ++ .elseif \n == 32 ++ _cond_vex "pextrd $1 + LSB_CRC,", %xmm0, %eax // (%xmm0 / x^32) mod x^32 ++ .else // \n == 64 && !LSB_CRC ++ _cond_vex movq, %xmm0, %rax // (%xmm0 / x^0) mod x^64 ++ .endif ++.endif ++ ++.if VL > 16 ++ vzeroupper // Needed when ymm or zmm registers may have been used. 
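As a cross-check of the two-step Barrett reduction described in the comments above, here is a hypothetical Python walk-through of the msb-first case for n = 32 (not part of the patch). The 128-bit input is an arbitrary value, the polynomial helpers are reimplemented equivalents of those in scripts/gen-crc-consts.py, and the lsb-first variant would additionally have to compensate for the implicit factor of x per pclmulqdq, as explained above.

    def clmul(a, b):                         # carryless (GF(2)) multiplication
        r = 0
        while b:
            if b & 1:
                r ^= a
            a <<= 1
            b >>= 1
        return r

    def polymod(a, g):                       # a mod g over GF(2)
        while a.bit_length() >= g.bit_length():
            a ^= g << (a.bit_length() - g.bit_length())
        return a

    def polydiv(a, g):                       # floor(a / g) over GF(2)
        q = 0
        while a.bit_length() >= g.bit_length():
            sh = a.bit_length() - g.bit_length()
            q ^= 1 << sh
            a ^= g << sh
        return q

    n = 32
    G = 0x104C11DB7                          # CRC-32 generator polynomial
    A = 0x0123456789ABCDEF0F1E2D3C4B5A6978   # the 128-bit value in %xmm0 (arbitrary)

    # Multiply by x^n and reduce to at most 64+n bits (first pclmulqdq + shift + xor).
    t0 = clmul(polymod(1 << (64 + n), G), A >> 64) ^ ((A & (2**64 - 1)) << n)

    # Barrett step 1: q = floor(t0 / G), using the precomputed floor(x^(63+n) / G).
    q = clmul(t0 >> n, polydiv(1 << (63 + n), G)) >> 63

    # Barrett step 2: cancel the x^n and higher terms; the low n bits are the CRC.
    crc = (t0 ^ clmul(q, G)) & ((1 << n) - 1)

    assert crc == polymod(A << n, G)         # matches reducing A * x^n directly
    print(hex(crc))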
++.endif ++#ifdef __i386__ ++ pop CONSTS_PTR ++#endif ++ RET ++.endm ++ ++#ifdef CONFIG_AS_VPCLMULQDQ ++#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ ++SYM_FUNC_START(prefix##_pclmul_sse); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ ++SYM_FUNC_END(prefix##_pclmul_sse); \ ++ \ ++SYM_FUNC_START(prefix##_vpclmul_avx2); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=2; \ ++SYM_FUNC_END(prefix##_vpclmul_avx2); \ ++ \ ++SYM_FUNC_START(prefix##_vpclmul_avx10_256); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=32, avx_level=10; \ ++SYM_FUNC_END(prefix##_vpclmul_avx10_256); \ ++ \ ++SYM_FUNC_START(prefix##_vpclmul_avx10_512); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=64, avx_level=10; \ ++SYM_FUNC_END(prefix##_vpclmul_avx10_512); ++#else ++#define DEFINE_CRC_PCLMUL_FUNCS(prefix, bits, lsb) \ ++SYM_FUNC_START(prefix##_pclmul_sse); \ ++ _crc_pclmul n=bits, lsb_crc=lsb, vl=16, avx_level=0; \ ++SYM_FUNC_END(prefix##_pclmul_sse); ++#endif // !CONFIG_AS_VPCLMULQDQ +diff --git a/arch/x86/lib/crc-pclmul-template.h b/arch/x86/lib/crc-pclmul-template.h +new file mode 100644 +index 000000000000..7b89f0edbc17 +--- /dev/null ++++ b/arch/x86/lib/crc-pclmul-template.h +@@ -0,0 +1,81 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++/* ++ * Macros for accessing the [V]PCLMULQDQ-based CRC functions that are ++ * instantiated by crc-pclmul-template.S ++ * ++ * Copyright 2025 Google LLC ++ * ++ * Author: Eric Biggers ++ */ ++#ifndef _CRC_PCLMUL_TEMPLATE_H ++#define _CRC_PCLMUL_TEMPLATE_H ++ ++#include ++#include ++#include ++#include ++#include "crc-pclmul-consts.h" ++ ++#define DECLARE_CRC_PCLMUL_FUNCS(prefix, crc_t) \ ++crc_t prefix##_pclmul_sse(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++crc_t prefix##_vpclmul_avx2(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++crc_t prefix##_vpclmul_avx10_256(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++crc_t prefix##_vpclmul_avx10_512(crc_t crc, const u8 *p, size_t len, \ ++ const void *consts_ptr); \ ++DEFINE_STATIC_CALL(prefix##_pclmul, prefix##_pclmul_sse) ++ ++#define INIT_CRC_PCLMUL(prefix) \ ++do { \ ++ if (IS_ENABLED(CONFIG_AS_VPCLMULQDQ) && \ ++ boot_cpu_has(X86_FEATURE_VPCLMULQDQ) && \ ++ boot_cpu_has(X86_FEATURE_AVX2) && \ ++ cpu_has_xfeatures(XFEATURE_MASK_YMM, NULL)) { \ ++ if (boot_cpu_has(X86_FEATURE_AVX512BW) && \ ++ boot_cpu_has(X86_FEATURE_AVX512VL) && \ ++ cpu_has_xfeatures(XFEATURE_MASK_AVX512, NULL)) { \ ++ if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) \ ++ static_call_update(prefix##_pclmul, \ ++ prefix##_vpclmul_avx10_256); \ ++ else \ ++ static_call_update(prefix##_pclmul, \ ++ prefix##_vpclmul_avx10_512); \ ++ } else { \ ++ static_call_update(prefix##_pclmul, \ ++ prefix##_vpclmul_avx2); \ ++ } \ ++ } \ ++} while (0) ++ ++/* ++ * Call a [V]PCLMULQDQ optimized CRC function if the data length is at least 16 ++ * bytes, the CPU has PCLMULQDQ support, and the current context may use SIMD. ++ * ++ * 16 bytes is the minimum length supported by the [V]PCLMULQDQ functions. ++ * There is overhead associated with kernel_fpu_begin() and kernel_fpu_end(), ++ * varying by CPU and factors such as which parts of the "FPU" state userspace ++ * has touched, which could result in a larger cutoff being better. Indeed, a ++ * larger cutoff is usually better for a *single* message. However, the ++ * overhead of the FPU section gets amortized if multiple FPU sections get ++ * executed before returning to userspace, since the XSAVE and XRSTOR occur only ++ * once. 
Considering that and the fact that the [V]PCLMULQDQ code is lighter on ++ * the dcache than the table-based code is, a 16-byte cutoff seems to work well. ++ */ ++#define CRC_PCLMUL(crc, p, len, prefix, consts, have_pclmulqdq) \ ++do { \ ++ if ((len) >= 16 && static_branch_likely(&(have_pclmulqdq)) && \ ++ crypto_simd_usable()) { \ ++ const void *consts_ptr; \ ++ \ ++ consts_ptr = (consts).fold_across_128_bits_consts; \ ++ kernel_fpu_begin(); \ ++ crc = static_call(prefix##_pclmul)((crc), (p), (len), \ ++ consts_ptr); \ ++ kernel_fpu_end(); \ ++ return crc; \ ++ } \ ++} while (0) ++ ++#endif /* _CRC_PCLMUL_TEMPLATE_H */ +diff --git a/arch/x86/lib/crc-t10dif-glue.c b/arch/x86/lib/crc-t10dif-glue.c +index 13f07ddc9122..6b09374b8355 100644 +--- a/arch/x86/lib/crc-t10dif-glue.c ++++ b/arch/x86/lib/crc-t10dif-glue.c +@@ -1,37 +1,32 @@ + // SPDX-License-Identifier: GPL-2.0-or-later + /* +- * CRC-T10DIF using PCLMULQDQ instructions ++ * CRC-T10DIF using [V]PCLMULQDQ instructions + * + * Copyright 2024 Google LLC + */ + +-#include +-#include +-#include + #include + #include ++#include "crc-pclmul-template.h" + + static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +-asmlinkage u16 crc_t10dif_pcl(u16 init_crc, const u8 *buf, size_t len); ++DECLARE_CRC_PCLMUL_FUNCS(crc16_msb, u16); + + u16 crc_t10dif_arch(u16 crc, const u8 *p, size_t len) + { +- if (len >= 16 && +- static_key_enabled(&have_pclmulqdq) && crypto_simd_usable()) { +- kernel_fpu_begin(); +- crc = crc_t10dif_pcl(crc, p, len); +- kernel_fpu_end(); +- return crc; +- } ++ CRC_PCLMUL(crc, p, len, crc16_msb, crc16_msb_0x8bb7_consts, ++ have_pclmulqdq); + return crc_t10dif_generic(crc, p, len); + } + EXPORT_SYMBOL(crc_t10dif_arch); + + static int __init crc_t10dif_x86_init(void) + { +- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) ++ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); ++ INIT_CRC_PCLMUL(crc16_msb); ++ } + return 0; + } + arch_initcall(crc_t10dif_x86_init); +@@ -47,5 +42,5 @@ bool crc_t10dif_is_optimized(void) + } + EXPORT_SYMBOL(crc_t10dif_is_optimized); + +-MODULE_DESCRIPTION("CRC-T10DIF using PCLMULQDQ instructions"); ++MODULE_DESCRIPTION("CRC-T10DIF using [V]PCLMULQDQ instructions"); + MODULE_LICENSE("GPL"); +diff --git a/arch/x86/lib/crc16-msb-pclmul.S b/arch/x86/lib/crc16-msb-pclmul.S +new file mode 100644 +index 000000000000..e9fe248093a8 +--- /dev/null ++++ b/arch/x86/lib/crc16-msb-pclmul.S +@@ -0,0 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++// Copyright 2025 Google LLC ++ ++#include "crc-pclmul-template.S" ++ ++DEFINE_CRC_PCLMUL_FUNCS(crc16_msb, /* bits= */ 16, /* lsb= */ 0) +diff --git a/arch/x86/lib/crc32-glue.c b/arch/x86/lib/crc32-glue.c +index 2dd18a886ded..5b2878c2f793 100644 +--- a/arch/x86/lib/crc32-glue.c ++++ b/arch/x86/lib/crc32-glue.c +@@ -7,43 +7,20 @@ + * Copyright 2024 Google LLC + */ + +-#include +-#include +-#include + #include +-#include + #include +- +-/* minimum size of buffer for crc32_pclmul_le_16 */ +-#define CRC32_PCLMUL_MIN_LEN 64 ++#include "crc-pclmul-template.h" + + static DEFINE_STATIC_KEY_FALSE(have_crc32); + static DEFINE_STATIC_KEY_FALSE(have_pclmulqdq); + +-u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); ++DECLARE_CRC_PCLMUL_FUNCS(crc32_lsb, u32); + + u32 crc32_le_arch(u32 crc, const u8 *p, size_t len) + { +- if (len >= CRC32_PCLMUL_MIN_LEN + 15 && +- static_branch_likely(&have_pclmulqdq) && crypto_simd_usable()) { +- size_t n = -(uintptr_t)p & 15; +- +- /* align p to 16-byte boundary */ +- if (n) { +- crc = crc32_le_base(crc, p, n); 
+- p += n; +- len -= n; +- } +- n = round_down(len, 16); +- kernel_fpu_begin(); +- crc = crc32_pclmul_le_16(crc, p, n); +- kernel_fpu_end(); +- p += n; +- len -= n; +- } +- if (len) +- crc = crc32_le_base(crc, p, len); +- return crc; ++ CRC_PCLMUL(crc, p, len, crc32_lsb, crc32_lsb_0xedb88320_consts, ++ have_pclmulqdq); ++ return crc32_le_base(crc, p, len); + } + EXPORT_SYMBOL(crc32_le_arch); + +@@ -78,10 +55,18 @@ u32 crc32c_le_arch(u32 crc, const u8 *p, size_t len) + + for (num_longs = len / sizeof(unsigned long); + num_longs != 0; num_longs--, p += sizeof(unsigned long)) +- asm(CRC32_INST : "+r" (crc) : "rm" (*(unsigned long *)p)); ++ asm(CRC32_INST : "+r" (crc) : ASM_INPUT_RM (*(unsigned long *)p)); + +- for (len %= sizeof(unsigned long); len; len--, p++) +- asm("crc32b %1, %0" : "+r" (crc) : "rm" (*p)); ++ if (sizeof(unsigned long) > 4 && (len & 4)) { ++ asm("crc32l %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u32 *)p)); ++ p += 4; ++ } ++ if (len & 2) { ++ asm("crc32w %1, %0" : "+r" (crc) : ASM_INPUT_RM (*(u16 *)p)); ++ p += 2; ++ } ++ if (len & 1) ++ asm("crc32b %1, %0" : "+r" (crc) : ASM_INPUT_RM (*p)); + + return crc; + } +@@ -97,8 +82,10 @@ static int __init crc32_x86_init(void) + { + if (boot_cpu_has(X86_FEATURE_XMM4_2)) + static_branch_enable(&have_crc32); +- if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) ++ if (boot_cpu_has(X86_FEATURE_PCLMULQDQ)) { + static_branch_enable(&have_pclmulqdq); ++ INIT_CRC_PCLMUL(crc32_lsb); ++ } + return 0; + } + arch_initcall(crc32_x86_init); +diff --git a/arch/x86/lib/crc32-pclmul.S b/arch/x86/lib/crc32-pclmul.S +index f9637789cac1..f20f40fb0172 100644 +--- a/arch/x86/lib/crc32-pclmul.S ++++ b/arch/x86/lib/crc32-pclmul.S +@@ -1,217 +1,6 @@ +-/* SPDX-License-Identifier: GPL-2.0-only */ +-/* +- * Copyright 2012 Xyratex Technology Limited +- * +- * Using hardware provided PCLMULQDQ instruction to accelerate the CRC32 +- * calculation. 
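Aside (not from the patch): the reworked tail handling in crc32c_le_arch() above, which finishes the buffer with one 4-byte, one 2-byte and one 1-byte step instead of a per-byte loop, is valid because a CRC can be updated in arbitrarily sized chunks. A small bitwise reference model of CRC32C (Castagnoli polynomial, lsb-first) illustrates the property; the buffer contents and the chunk split are made up.

    CRC32C_POLY = 0x82F63B78          # reflected form of 0x1EDC6F41

    def crc32c_update(crc, data):     # one bit-at-a-time step per byte
        for byte in data:
            crc ^= byte
            for _ in range(8):
                crc = (crc >> 1) ^ (CRC32C_POLY if crc & 1 else 0)
        return crc

    def crc32c(data):                 # conventional init/final inversion
        return crc32c_update(0xFFFFFFFF, data) ^ 0xFFFFFFFF

    buf = bytes(range(1, 32))         # 31 bytes: three longs + 4 + 2 + 1 byte tail
    crc = 0xFFFFFFFF
    i = 0
    for size in (8, 8, 8, 4, 2, 1):   # mirrors the long/dword/word/byte split
        crc = crc32c_update(crc, buf[i:i + size])
        i += size
    assert (crc ^ 0xFFFFFFFF) == crc32c(buf)
    print(hex(crc ^ 0xFFFFFFFF))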
+- * CRC32 polynomial:0x04c11db7(BE)/0xEDB88320(LE) +- * PCLMULQDQ is a new instruction in Intel SSE4.2, the reference can be found +- * at: +- * http://www.intel.com/products/processor/manuals/ +- * Intel(R) 64 and IA-32 Architectures Software Developer's Manual +- * Volume 2B: Instruction Set Reference, N-Z +- * +- * Authors: Gregory Prestas +- * Alexander Boyko +- */ ++/* SPDX-License-Identifier: GPL-2.0-or-later */ ++// Copyright 2025 Google LLC + +-#include ++#include "crc-pclmul-template.S" + +- +-.section .rodata +-.align 16 +-/* +- * [x4*128+32 mod P(x) << 32)]' << 1 = 0x154442bd4 +- * #define CONSTANT_R1 0x154442bd4LL +- * +- * [(x4*128-32 mod P(x) << 32)]' << 1 = 0x1c6e41596 +- * #define CONSTANT_R2 0x1c6e41596LL +- */ +-.Lconstant_R2R1: +- .octa 0x00000001c6e415960000000154442bd4 +-/* +- * [(x128+32 mod P(x) << 32)]' << 1 = 0x1751997d0 +- * #define CONSTANT_R3 0x1751997d0LL +- * +- * [(x128-32 mod P(x) << 32)]' << 1 = 0x0ccaa009e +- * #define CONSTANT_R4 0x0ccaa009eLL +- */ +-.Lconstant_R4R3: +- .octa 0x00000000ccaa009e00000001751997d0 +-/* +- * [(x64 mod P(x) << 32)]' << 1 = 0x163cd6124 +- * #define CONSTANT_R5 0x163cd6124LL +- */ +-.Lconstant_R5: +- .octa 0x00000000000000000000000163cd6124 +-.Lconstant_mask32: +- .octa 0x000000000000000000000000FFFFFFFF +-/* +- * #define CRCPOLY_TRUE_LE_FULL 0x1DB710641LL +- * +- * Barrett Reduction constant (u64`) = u` = (x**64 / P(x))` = 0x1F7011641LL +- * #define CONSTANT_RU 0x1F7011641LL +- */ +-.Lconstant_RUpoly: +- .octa 0x00000001F701164100000001DB710641 +- +-#define CONSTANT %xmm0 +- +-#ifdef __x86_64__ +-#define CRC %edi +-#define BUF %rsi +-#define LEN %rdx +-#else +-#define CRC %eax +-#define BUF %edx +-#define LEN %ecx +-#endif +- +- +- +-.text +-/** +- * Calculate crc32 +- * CRC - initial crc32 +- * BUF - buffer (16 bytes aligned) +- * LEN - sizeof buffer (16 bytes aligned), LEN should be greater than 63 +- * return %eax crc32 +- * u32 crc32_pclmul_le_16(u32 crc, const u8 *buffer, size_t len); +- */ +- +-SYM_FUNC_START(crc32_pclmul_le_16) /* buffer and buffer size are 16 bytes aligned */ +- movdqa (BUF), %xmm1 +- movdqa 0x10(BUF), %xmm2 +- movdqa 0x20(BUF), %xmm3 +- movdqa 0x30(BUF), %xmm4 +- movd CRC, CONSTANT +- pxor CONSTANT, %xmm1 +- sub $0x40, LEN +- add $0x40, BUF +- cmp $0x40, LEN +- jb .Lless_64 +- +-#ifdef __x86_64__ +- movdqa .Lconstant_R2R1(%rip), CONSTANT +-#else +- movdqa .Lconstant_R2R1, CONSTANT +-#endif +- +-.Lloop_64:/* 64 bytes Full cache line folding */ +- prefetchnta 0x40(BUF) +- movdqa %xmm1, %xmm5 +- movdqa %xmm2, %xmm6 +- movdqa %xmm3, %xmm7 +-#ifdef __x86_64__ +- movdqa %xmm4, %xmm8 +-#endif +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x00, CONSTANT, %xmm2 +- pclmulqdq $0x00, CONSTANT, %xmm3 +-#ifdef __x86_64__ +- pclmulqdq $0x00, CONSTANT, %xmm4 +-#endif +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pclmulqdq $0x11, CONSTANT, %xmm6 +- pclmulqdq $0x11, CONSTANT, %xmm7 +-#ifdef __x86_64__ +- pclmulqdq $0x11, CONSTANT, %xmm8 +-#endif +- pxor %xmm5, %xmm1 +- pxor %xmm6, %xmm2 +- pxor %xmm7, %xmm3 +-#ifdef __x86_64__ +- pxor %xmm8, %xmm4 +-#else +- /* xmm8 unsupported for x32 */ +- movdqa %xmm4, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm4 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm4 +-#endif +- +- pxor (BUF), %xmm1 +- pxor 0x10(BUF), %xmm2 +- pxor 0x20(BUF), %xmm3 +- pxor 0x30(BUF), %xmm4 +- +- sub $0x40, LEN +- add $0x40, BUF +- cmp $0x40, LEN +- jge .Lloop_64 +-.Lless_64:/* Folding cache line into 128bit */ +-#ifdef __x86_64__ +- movdqa .Lconstant_R4R3(%rip), CONSTANT +-#else +- movdqa 
.Lconstant_R4R3, CONSTANT +-#endif +- prefetchnta (BUF) +- +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor %xmm2, %xmm1 +- +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor %xmm3, %xmm1 +- +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor %xmm4, %xmm1 +- +- cmp $0x10, LEN +- jb .Lfold_64 +-.Lloop_16:/* Folding rest buffer into 128bit */ +- movdqa %xmm1, %xmm5 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pclmulqdq $0x11, CONSTANT, %xmm5 +- pxor %xmm5, %xmm1 +- pxor (BUF), %xmm1 +- sub $0x10, LEN +- add $0x10, BUF +- cmp $0x10, LEN +- jge .Lloop_16 +- +-.Lfold_64: +- /* perform the last 64 bit fold, also adds 32 zeroes +- * to the input stream */ +- pclmulqdq $0x01, %xmm1, CONSTANT /* R4 * xmm1.low */ +- psrldq $0x08, %xmm1 +- pxor CONSTANT, %xmm1 +- +- /* final 32-bit fold */ +- movdqa %xmm1, %xmm2 +-#ifdef __x86_64__ +- movdqa .Lconstant_R5(%rip), CONSTANT +- movdqa .Lconstant_mask32(%rip), %xmm3 +-#else +- movdqa .Lconstant_R5, CONSTANT +- movdqa .Lconstant_mask32, %xmm3 +-#endif +- psrldq $0x04, %xmm2 +- pand %xmm3, %xmm1 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pxor %xmm2, %xmm1 +- +- /* Finish up with the bit-reversed barrett reduction 64 ==> 32 bits */ +-#ifdef __x86_64__ +- movdqa .Lconstant_RUpoly(%rip), CONSTANT +-#else +- movdqa .Lconstant_RUpoly, CONSTANT +-#endif +- movdqa %xmm1, %xmm2 +- pand %xmm3, %xmm1 +- pclmulqdq $0x10, CONSTANT, %xmm1 +- pand %xmm3, %xmm1 +- pclmulqdq $0x00, CONSTANT, %xmm1 +- pxor %xmm2, %xmm1 +- pextrd $0x01, %xmm1, %eax +- +- RET +-SYM_FUNC_END(crc32_pclmul_le_16) ++DEFINE_CRC_PCLMUL_FUNCS(crc32_lsb, /* bits= */ 32, /* lsb= */ 1) +diff --git a/arch/x86/lib/crct10dif-pcl-asm_64.S b/arch/x86/lib/crct10dif-pcl-asm_64.S +deleted file mode 100644 +index 5286db5b8165..000000000000 +--- a/arch/x86/lib/crct10dif-pcl-asm_64.S ++++ /dev/null +@@ -1,332 +0,0 @@ +-######################################################################## +-# Implement fast CRC-T10DIF computation with SSE and PCLMULQDQ instructions +-# +-# Copyright (c) 2013, Intel Corporation +-# +-# Authors: +-# Erdinc Ozturk +-# Vinodh Gopal +-# James Guilford +-# Tim Chen +-# +-# This software is available to you under a choice of one of two +-# licenses. You may choose to be licensed under the terms of the GNU +-# General Public License (GPL) Version 2, available from the file +-# COPYING in the main directory of this source tree, or the +-# OpenIB.org BSD license below: +-# +-# Redistribution and use in source and binary forms, with or without +-# modification, are permitted provided that the following conditions are +-# met: +-# +-# * Redistributions of source code must retain the above copyright +-# notice, this list of conditions and the following disclaimer. +-# +-# * Redistributions in binary form must reproduce the above copyright +-# notice, this list of conditions and the following disclaimer in the +-# documentation and/or other materials provided with the +-# distribution. +-# +-# * Neither the name of the Intel Corporation nor the names of its +-# contributors may be used to endorse or promote products derived from +-# this software without specific prior written permission. 
+-# +-# +-# THIS SOFTWARE IS PROVIDED BY INTEL CORPORATION ""AS IS"" AND ANY +-# EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE +-# IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR +-# PURPOSE ARE DISCLAIMED. IN NO EVENT SHALL INTEL CORPORATION OR +-# CONTRIBUTORS BE LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, +-# EXEMPLARY, OR CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, +-# PROCUREMENT OF SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR +-# PROFITS; OR BUSINESS INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF +-# LIABILITY, WHETHER IN CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING +-# NEGLIGENCE OR OTHERWISE) ARISING IN ANY WAY OUT OF THE USE OF THIS +-# SOFTWARE, EVEN IF ADVISED OF THE POSSIBILITY OF SUCH DAMAGE. +-# +-# Reference paper titled "Fast CRC Computation for Generic +-# Polynomials Using PCLMULQDQ Instruction" +-# URL: http://www.intel.com/content/dam/www/public/us/en/documents +-# /white-papers/fast-crc-computation-generic-polynomials-pclmulqdq-paper.pdf +-# +- +-#include +- +-.text +- +-#define init_crc %edi +-#define buf %rsi +-#define len %rdx +- +-#define FOLD_CONSTS %xmm10 +-#define BSWAP_MASK %xmm11 +- +-# Fold reg1, reg2 into the next 32 data bytes, storing the result back into +-# reg1, reg2. +-.macro fold_32_bytes offset, reg1, reg2 +- movdqu \offset(buf), %xmm9 +- movdqu \offset+16(buf), %xmm12 +- pshufb BSWAP_MASK, %xmm9 +- pshufb BSWAP_MASK, %xmm12 +- movdqa \reg1, %xmm8 +- movdqa \reg2, %xmm13 +- pclmulqdq $0x00, FOLD_CONSTS, \reg1 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm8 +- pclmulqdq $0x00, FOLD_CONSTS, \reg2 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm13 +- pxor %xmm9 , \reg1 +- xorps %xmm8 , \reg1 +- pxor %xmm12, \reg2 +- xorps %xmm13, \reg2 +-.endm +- +-# Fold src_reg into dst_reg. +-.macro fold_16_bytes src_reg, dst_reg +- movdqa \src_reg, %xmm8 +- pclmulqdq $0x11, FOLD_CONSTS, \src_reg +- pclmulqdq $0x00, FOLD_CONSTS, %xmm8 +- pxor %xmm8, \dst_reg +- xorps \src_reg, \dst_reg +-.endm +- +-# +-# u16 crc_t10dif_pcl(u16 init_crc, const *u8 buf, size_t len); +-# +-# Assumes len >= 16. +-# +-SYM_FUNC_START(crc_t10dif_pcl) +- +- movdqa .Lbswap_mask(%rip), BSWAP_MASK +- +- # For sizes less than 256 bytes, we can't fold 128 bytes at a time. +- cmp $256, len +- jl .Lless_than_256_bytes +- +- # Load the first 128 data bytes. Byte swapping is necessary to make the +- # bit order match the polynomial coefficient order. +- movdqu 16*0(buf), %xmm0 +- movdqu 16*1(buf), %xmm1 +- movdqu 16*2(buf), %xmm2 +- movdqu 16*3(buf), %xmm3 +- movdqu 16*4(buf), %xmm4 +- movdqu 16*5(buf), %xmm5 +- movdqu 16*6(buf), %xmm6 +- movdqu 16*7(buf), %xmm7 +- add $128, buf +- pshufb BSWAP_MASK, %xmm0 +- pshufb BSWAP_MASK, %xmm1 +- pshufb BSWAP_MASK, %xmm2 +- pshufb BSWAP_MASK, %xmm3 +- pshufb BSWAP_MASK, %xmm4 +- pshufb BSWAP_MASK, %xmm5 +- pshufb BSWAP_MASK, %xmm6 +- pshufb BSWAP_MASK, %xmm7 +- +- # XOR the first 16 data *bits* with the initial CRC value. +- pxor %xmm8, %xmm8 +- pinsrw $7, init_crc, %xmm8 +- pxor %xmm8, %xmm0 +- +- movdqa .Lfold_across_128_bytes_consts(%rip), FOLD_CONSTS +- +- # Subtract 128 for the 128 data bytes just consumed. Subtract another +- # 128 to simplify the termination condition of the following loop. +- sub $256, len +- +- # While >= 128 data bytes remain (not counting xmm0-7), fold the 128 +- # bytes xmm0-7 into them, storing the result back into xmm0-7. 
+-.Lfold_128_bytes_loop: +- fold_32_bytes 0, %xmm0, %xmm1 +- fold_32_bytes 32, %xmm2, %xmm3 +- fold_32_bytes 64, %xmm4, %xmm5 +- fold_32_bytes 96, %xmm6, %xmm7 +- add $128, buf +- sub $128, len +- jge .Lfold_128_bytes_loop +- +- # Now fold the 112 bytes in xmm0-xmm6 into the 16 bytes in xmm7. +- +- # Fold across 64 bytes. +- movdqa .Lfold_across_64_bytes_consts(%rip), FOLD_CONSTS +- fold_16_bytes %xmm0, %xmm4 +- fold_16_bytes %xmm1, %xmm5 +- fold_16_bytes %xmm2, %xmm6 +- fold_16_bytes %xmm3, %xmm7 +- # Fold across 32 bytes. +- movdqa .Lfold_across_32_bytes_consts(%rip), FOLD_CONSTS +- fold_16_bytes %xmm4, %xmm6 +- fold_16_bytes %xmm5, %xmm7 +- # Fold across 16 bytes. +- movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS +- fold_16_bytes %xmm6, %xmm7 +- +- # Add 128 to get the correct number of data bytes remaining in 0...127 +- # (not counting xmm7), following the previous extra subtraction by 128. +- # Then subtract 16 to simplify the termination condition of the +- # following loop. +- add $128-16, len +- +- # While >= 16 data bytes remain (not counting xmm7), fold the 16 bytes +- # xmm7 into them, storing the result back into xmm7. +- jl .Lfold_16_bytes_loop_done +-.Lfold_16_bytes_loop: +- movdqa %xmm7, %xmm8 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 +- pclmulqdq $0x00, FOLD_CONSTS, %xmm8 +- pxor %xmm8, %xmm7 +- movdqu (buf), %xmm0 +- pshufb BSWAP_MASK, %xmm0 +- pxor %xmm0 , %xmm7 +- add $16, buf +- sub $16, len +- jge .Lfold_16_bytes_loop +- +-.Lfold_16_bytes_loop_done: +- # Add 16 to get the correct number of data bytes remaining in 0...15 +- # (not counting xmm7), following the previous extra subtraction by 16. +- add $16, len +- je .Lreduce_final_16_bytes +- +-.Lhandle_partial_segment: +- # Reduce the last '16 + len' bytes where 1 <= len <= 15 and the first 16 +- # bytes are in xmm7 and the rest are the remaining data in 'buf'. To do +- # this without needing a fold constant for each possible 'len', redivide +- # the bytes into a first chunk of 'len' bytes and a second chunk of 16 +- # bytes, then fold the first chunk into the second. +- +- movdqa %xmm7, %xmm2 +- +- # xmm1 = last 16 original data bytes +- movdqu -16(buf, len), %xmm1 +- pshufb BSWAP_MASK, %xmm1 +- +- # xmm2 = high order part of second chunk: xmm7 left-shifted by 'len' bytes. +- lea .Lbyteshift_table+16(%rip), %rax +- sub len, %rax +- movdqu (%rax), %xmm0 +- pshufb %xmm0, %xmm2 +- +- # xmm7 = first chunk: xmm7 right-shifted by '16-len' bytes. +- pxor .Lmask1(%rip), %xmm0 +- pshufb %xmm0, %xmm7 +- +- # xmm1 = second chunk: 'len' bytes from xmm1 (low-order bytes), +- # then '16-len' bytes from xmm2 (high-order bytes). +- pblendvb %xmm2, %xmm1 #xmm0 is implicit +- +- # Fold the first chunk into the second chunk, storing the result in xmm7. +- movdqa %xmm7, %xmm8 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 +- pclmulqdq $0x00, FOLD_CONSTS, %xmm8 +- pxor %xmm8, %xmm7 +- pxor %xmm1, %xmm7 +- +-.Lreduce_final_16_bytes: +- # Reduce the 128-bit value M(x), stored in xmm7, to the final 16-bit CRC +- +- # Load 'x^48 * (x^48 mod G(x))' and 'x^48 * (x^80 mod G(x))'. +- movdqa .Lfinal_fold_consts(%rip), FOLD_CONSTS +- +- # Fold the high 64 bits into the low 64 bits, while also multiplying by +- # x^64. This produces a 128-bit value congruent to x^64 * M(x) and +- # whose low 48 bits are 0. +- movdqa %xmm7, %xmm0 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high bits * x^48 * (x^80 mod G(x)) +- pslldq $8, %xmm0 +- pxor %xmm0, %xmm7 # + low bits * x^64 +- +- # Fold the high 32 bits into the low 96 bits. 
This produces a 96-bit +- # value congruent to x^64 * M(x) and whose low 48 bits are 0. +- movdqa %xmm7, %xmm0 +- pand .Lmask2(%rip), %xmm0 # zero high 32 bits +- psrldq $12, %xmm7 # extract high 32 bits +- pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # high 32 bits * x^48 * (x^48 mod G(x)) +- pxor %xmm0, %xmm7 # + low bits +- +- # Load G(x) and floor(x^48 / G(x)). +- movdqa .Lbarrett_reduction_consts(%rip), FOLD_CONSTS +- +- # Use Barrett reduction to compute the final CRC value. +- movdqa %xmm7, %xmm0 +- pclmulqdq $0x11, FOLD_CONSTS, %xmm7 # high 32 bits * floor(x^48 / G(x)) +- psrlq $32, %xmm7 # /= x^32 +- pclmulqdq $0x00, FOLD_CONSTS, %xmm7 # *= G(x) +- psrlq $48, %xmm0 +- pxor %xmm7, %xmm0 # + low 16 nonzero bits +- # Final CRC value (x^16 * M(x)) mod G(x) is in low 16 bits of xmm0. +- +- pextrw $0, %xmm0, %eax +- RET +- +-.align 16 +-.Lless_than_256_bytes: +- # Checksumming a buffer of length 16...255 bytes +- +- # Load the first 16 data bytes. +- movdqu (buf), %xmm7 +- pshufb BSWAP_MASK, %xmm7 +- add $16, buf +- +- # XOR the first 16 data *bits* with the initial CRC value. +- pxor %xmm0, %xmm0 +- pinsrw $7, init_crc, %xmm0 +- pxor %xmm0, %xmm7 +- +- movdqa .Lfold_across_16_bytes_consts(%rip), FOLD_CONSTS +- cmp $16, len +- je .Lreduce_final_16_bytes # len == 16 +- sub $32, len +- jge .Lfold_16_bytes_loop # 32 <= len <= 255 +- add $16, len +- jmp .Lhandle_partial_segment # 17 <= len <= 31 +-SYM_FUNC_END(crc_t10dif_pcl) +- +-.section .rodata, "a", @progbits +-.align 16 +- +-# Fold constants precomputed from the polynomial 0x18bb7 +-# G(x) = x^16 + x^15 + x^11 + x^9 + x^8 + x^7 + x^5 + x^4 + x^2 + x^1 + x^0 +-.Lfold_across_128_bytes_consts: +- .quad 0x0000000000006123 # x^(8*128) mod G(x) +- .quad 0x0000000000002295 # x^(8*128+64) mod G(x) +-.Lfold_across_64_bytes_consts: +- .quad 0x0000000000001069 # x^(4*128) mod G(x) +- .quad 0x000000000000dd31 # x^(4*128+64) mod G(x) +-.Lfold_across_32_bytes_consts: +- .quad 0x000000000000857d # x^(2*128) mod G(x) +- .quad 0x0000000000007acc # x^(2*128+64) mod G(x) +-.Lfold_across_16_bytes_consts: +- .quad 0x000000000000a010 # x^(1*128) mod G(x) +- .quad 0x0000000000001faa # x^(1*128+64) mod G(x) +-.Lfinal_fold_consts: +- .quad 0x1368000000000000 # x^48 * (x^48 mod G(x)) +- .quad 0x2d56000000000000 # x^48 * (x^80 mod G(x)) +-.Lbarrett_reduction_consts: +- .quad 0x0000000000018bb7 # G(x) +- .quad 0x00000001f65a57f8 # floor(x^48 / G(x)) +- +-.section .rodata.cst16.mask1, "aM", @progbits, 16 +-.align 16 +-.Lmask1: +- .octa 0x80808080808080808080808080808080 +- +-.section .rodata.cst16.mask2, "aM", @progbits, 16 +-.align 16 +-.Lmask2: +- .octa 0x00000000FFFFFFFFFFFFFFFFFFFFFFFF +- +-.section .rodata.cst16.bswap_mask, "aM", @progbits, 16 +-.align 16 +-.Lbswap_mask: +- .octa 0x000102030405060708090A0B0C0D0E0F +- +-.section .rodata.cst32.byteshift_table, "aM", @progbits, 32 +-.align 16 +-# For 1 <= len <= 15, the 16-byte vector beginning at &byteshift_table[16 - len] +-# is the index vector to shift left by 'len' bytes, and is also {0x80, ..., +-# 0x80} XOR the index vector to shift right by '16 - len' bytes. 
+-.Lbyteshift_table: +- .byte 0x0, 0x81, 0x82, 0x83, 0x84, 0x85, 0x86, 0x87 +- .byte 0x88, 0x89, 0x8a, 0x8b, 0x8c, 0x8d, 0x8e, 0x8f +- .byte 0x0, 0x1, 0x2, 0x3, 0x4, 0x5, 0x6, 0x7 +- .byte 0x8, 0x9, 0xa, 0xb, 0xc, 0xd, 0xe , 0x0 +diff --git a/drivers/nvme/host/Kconfig b/drivers/nvme/host/Kconfig +index 09ed1f61c9a8..aa859464519e 100644 +--- a/drivers/nvme/host/Kconfig ++++ b/drivers/nvme/host/Kconfig +@@ -80,8 +80,7 @@ config NVME_TCP + depends on INET + depends on BLOCK + select NVME_FABRICS +- select CRYPTO +- select CRYPTO_CRC32C ++ select CRC32 + help + This provides support for the NVMe over Fabrics protocol using + the TCP transport. This allows you to use remote block devices +diff --git a/drivers/nvme/host/tcp.c b/drivers/nvme/host/tcp.c +index d991baa82a1c..2a17b535bba6 100644 +--- a/drivers/nvme/host/tcp.c ++++ b/drivers/nvme/host/tcp.c +@@ -8,6 +8,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -17,7 +18,6 @@ + #include + #include + #include +-#include + #include + #include + +@@ -169,8 +169,8 @@ struct nvme_tcp_queue { + bool hdr_digest; + bool data_digest; + bool tls_enabled; +- struct ahash_request *rcv_hash; +- struct ahash_request *snd_hash; ++ u32 rcv_crc; ++ u32 snd_crc; + __le32 exp_ddgst; + __le32 recv_ddgst; + struct completion tls_complete; +@@ -457,32 +457,29 @@ nvme_tcp_fetch_request(struct nvme_tcp_queue *queue) + return req; + } + +-static inline void nvme_tcp_ddgst_final(struct ahash_request *hash, +- __le32 *dgst) ++static inline void nvme_tcp_ddgst_init(u32 *crcp) + { +- ahash_request_set_crypt(hash, NULL, (u8 *)dgst, 0); +- crypto_ahash_final(hash); ++ *crcp = ~0; + } + +-static inline void nvme_tcp_ddgst_update(struct ahash_request *hash, ++static inline void nvme_tcp_ddgst_update(u32 *crcp, + struct page *page, off_t off, size_t len) + { +- struct scatterlist sg; ++ const void *virt = kmap_local_page(page + (off >> PAGE_SHIFT)); + +- sg_init_table(&sg, 1); +- sg_set_page(&sg, page, len, off); +- ahash_request_set_crypt(hash, &sg, NULL, len); +- crypto_ahash_update(hash); ++ *crcp = crc32c(*crcp, virt + (off & ~PAGE_MASK), len); ++ ++ kunmap_local(virt); + } + +-static inline void nvme_tcp_hdgst(struct ahash_request *hash, +- void *pdu, size_t len) ++static inline void nvme_tcp_ddgst_final(u32 *crcp, __le32 *dgst) + { +- struct scatterlist sg; ++ *dgst = cpu_to_le32(~*crcp); ++} + +- sg_init_one(&sg, pdu, len); +- ahash_request_set_crypt(hash, &sg, pdu + len, len); +- crypto_ahash_digest(hash); ++static inline void nvme_tcp_hdgst(void *pdu, size_t len) ++{ ++ put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); + } + + static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, +@@ -500,7 +497,7 @@ static int nvme_tcp_verify_hdgst(struct nvme_tcp_queue *queue, + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); +- nvme_tcp_hdgst(queue->rcv_hash, pdu, pdu_len); ++ nvme_tcp_hdgst(pdu, pdu_len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + dev_err(queue->ctrl->ctrl.device, +@@ -527,7 +524,7 @@ static int nvme_tcp_check_ddgst(struct nvme_tcp_queue *queue, void *pdu) + nvme_tcp_queue_id(queue)); + return -EPROTO; + } +- crypto_ahash_init(queue->rcv_hash); ++ nvme_tcp_ddgst_init(&queue->rcv_crc); + + return 0; + } +@@ -890,6 +887,17 @@ static inline void nvme_tcp_end_request(struct request *rq, u16 status) + nvme_complete_rq(rq); + } + ++static size_t crc_and_copy_to_iter(const void *addr, size_t bytes, void *crcp_, ++ struct iov_iter *i) ++{ ++ u32 *crcp = crcp_; ++ size_t copied; ++ ++ 
copied = copy_to_iter(addr, bytes, i); ++ *crcp = crc32c(*crcp, addr, copied); ++ return copied; ++} ++ + static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + unsigned int *offset, size_t *len) + { +@@ -927,8 +935,10 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + iov_iter_count(&req->iter)); + + if (queue->data_digest) +- ret = skb_copy_and_hash_datagram_iter(skb, *offset, +- &req->iter, recv_len, queue->rcv_hash); ++ ret = __skb_datagram_iter(skb, *offset, &req->iter, ++ recv_len, true, ++ crc_and_copy_to_iter, ++ &queue->rcv_crc); + else + ret = skb_copy_datagram_iter(skb, *offset, + &req->iter, recv_len); +@@ -946,7 +956,8 @@ static int nvme_tcp_recv_data(struct nvme_tcp_queue *queue, struct sk_buff *skb, + + if (!queue->data_remaining) { + if (queue->data_digest) { +- nvme_tcp_ddgst_final(queue->rcv_hash, &queue->exp_ddgst); ++ nvme_tcp_ddgst_final(&queue->rcv_crc, ++ &queue->exp_ddgst); + queue->ddgst_remaining = NVME_TCP_DIGEST_LENGTH; + } else { + if (pdu->hdr.flags & NVME_TCP_F_DATA_SUCCESS) { +@@ -1148,7 +1159,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) + return ret; + + if (queue->data_digest) +- nvme_tcp_ddgst_update(queue->snd_hash, page, ++ nvme_tcp_ddgst_update(&queue->snd_crc, page, + offset, ret); + + /* +@@ -1162,7 +1173,7 @@ static int nvme_tcp_try_send_data(struct nvme_tcp_request *req) + /* fully successful last send in current PDU */ + if (last && ret == len) { + if (queue->data_digest) { +- nvme_tcp_ddgst_final(queue->snd_hash, ++ nvme_tcp_ddgst_final(&queue->snd_crc, + &req->ddgst); + req->state = NVME_TCP_SEND_DDGST; + req->offset = 0; +@@ -1195,7 +1206,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) + msg.msg_flags |= MSG_EOR; + + if (queue->hdr_digest && !req->offset) +- nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvme_tcp_hdgst(pdu, sizeof(*pdu)); + + bvec_set_virt(&bvec, (void *)pdu + req->offset, len); + iov_iter_bvec(&msg.msg_iter, ITER_SOURCE, &bvec, 1, len); +@@ -1208,7 +1219,7 @@ static int nvme_tcp_try_send_cmd_pdu(struct nvme_tcp_request *req) + if (inline_data) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) +- crypto_ahash_init(queue->snd_hash); ++ nvme_tcp_ddgst_init(&queue->snd_crc); + } else { + nvme_tcp_done_send_req(queue); + } +@@ -1230,7 +1241,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) + int ret; + + if (queue->hdr_digest && !req->offset) +- nvme_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvme_tcp_hdgst(pdu, sizeof(*pdu)); + + if (!req->h2cdata_left) + msg.msg_flags |= MSG_SPLICE_PAGES; +@@ -1245,7 +1256,7 @@ static int nvme_tcp_try_send_data_pdu(struct nvme_tcp_request *req) + if (!len) { + req->state = NVME_TCP_SEND_DATA; + if (queue->data_digest) +- crypto_ahash_init(queue->snd_hash); ++ nvme_tcp_ddgst_init(&queue->snd_crc); + return 1; + } + req->offset += ret; +@@ -1385,41 +1396,6 @@ static void nvme_tcp_io_work(struct work_struct *w) + queue_work_on(queue->io_cpu, nvme_tcp_wq, &queue->io_work); + } + +-static void nvme_tcp_free_crypto(struct nvme_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); +- +- ahash_request_free(queue->rcv_hash); +- ahash_request_free(queue->snd_hash); +- crypto_free_ahash(tfm); +-} +- +-static int nvme_tcp_alloc_crypto(struct nvme_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm; +- +- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); +- if (IS_ERR(tfm)) +- return PTR_ERR(tfm); +- +- 
queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->snd_hash) +- goto free_tfm; +- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); +- +- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->rcv_hash) +- goto free_snd_hash; +- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); +- +- return 0; +-free_snd_hash: +- ahash_request_free(queue->snd_hash); +-free_tfm: +- crypto_free_ahash(tfm); +- return -ENOMEM; +-} +- + static void nvme_tcp_free_async_req(struct nvme_tcp_ctrl *ctrl) + { + struct nvme_tcp_request *async = &ctrl->async_req; +@@ -1452,9 +1428,6 @@ static void nvme_tcp_free_queue(struct nvme_ctrl *nctrl, int qid) + if (!test_and_clear_bit(NVME_TCP_Q_ALLOCATED, &queue->flags)) + return; + +- if (queue->hdr_digest || queue->data_digest) +- nvme_tcp_free_crypto(queue); +- + page_frag_cache_drain(&queue->pf_cache); + + noreclaim_flag = memalloc_noreclaim_save(); +@@ -1865,21 +1838,13 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + + queue->hdr_digest = nctrl->opts->hdr_digest; + queue->data_digest = nctrl->opts->data_digest; +- if (queue->hdr_digest || queue->data_digest) { +- ret = nvme_tcp_alloc_crypto(queue); +- if (ret) { +- dev_err(nctrl->device, +- "failed to allocate queue %d crypto\n", qid); +- goto err_sock; +- } +- } + + rcv_pdu_size = sizeof(struct nvme_tcp_rsp_pdu) + + nvme_tcp_hdgst_len(queue); + queue->pdu = kmalloc(rcv_pdu_size, GFP_KERNEL); + if (!queue->pdu) { + ret = -ENOMEM; +- goto err_crypto; ++ goto err_sock; + } + + dev_dbg(nctrl->device, "connecting queue %d\n", +@@ -1912,9 +1877,6 @@ static int nvme_tcp_alloc_queue(struct nvme_ctrl *nctrl, int qid, + kernel_sock_shutdown(queue->sock, SHUT_RDWR); + err_rcv_pdu: + kfree(queue->pdu); +-err_crypto: +- if (queue->hdr_digest || queue->data_digest) +- nvme_tcp_free_crypto(queue); + err_sock: + /* ->sock will be released by fput() */ + fput(queue->sock->file); +diff --git a/drivers/nvme/target/tcp.c b/drivers/nvme/target/tcp.c +index 4f9cac8a5abe..cbedf61c8d0a 100644 +--- a/drivers/nvme/target/tcp.c ++++ b/drivers/nvme/target/tcp.c +@@ -7,6 +7,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -18,7 +19,6 @@ + #include + #include + #include +-#include + #include + + #include "nvmet.h" +@@ -173,8 +173,8 @@ struct nvmet_tcp_queue { + /* digest state */ + bool hdr_digest; + bool data_digest; +- struct ahash_request *snd_hash; +- struct ahash_request *rcv_hash; ++ u32 snd_crc; ++ u32 rcv_crc; + + /* TLS state */ + key_serial_t tls_pskid; +@@ -295,14 +295,9 @@ static inline u8 nvmet_tcp_ddgst_len(struct nvmet_tcp_queue *queue) + return queue->data_digest ? 
NVME_TCP_DIGEST_LENGTH : 0; + } + +-static inline void nvmet_tcp_hdgst(struct ahash_request *hash, +- void *pdu, size_t len) ++static inline void nvmet_tcp_hdgst(void *pdu, size_t len) + { +- struct scatterlist sg; +- +- sg_init_one(&sg, pdu, len); +- ahash_request_set_crypt(hash, &sg, pdu + len, len); +- crypto_ahash_digest(hash); ++ put_unaligned_le32(~crc32c(~0, pdu, len), pdu + len); + } + + static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, +@@ -319,7 +314,7 @@ static int nvmet_tcp_verify_hdgst(struct nvmet_tcp_queue *queue, + } + + recv_digest = *(__le32 *)(pdu + hdr->hlen); +- nvmet_tcp_hdgst(queue->rcv_hash, pdu, len); ++ nvmet_tcp_hdgst(pdu, len); + exp_digest = *(__le32 *)(pdu + hdr->hlen); + if (recv_digest != exp_digest) { + pr_err("queue %d: header digest error: recv %#x expected %#x\n", +@@ -442,12 +437,20 @@ static int nvmet_tcp_map_data(struct nvmet_tcp_cmd *cmd) + return NVME_SC_INTERNAL; + } + +-static void nvmet_tcp_calc_ddgst(struct ahash_request *hash, +- struct nvmet_tcp_cmd *cmd) ++static void nvmet_tcp_calc_ddgst(struct nvmet_tcp_cmd *cmd) + { +- ahash_request_set_crypt(hash, cmd->req.sg, +- (void *)&cmd->exp_ddgst, cmd->req.transfer_len); +- crypto_ahash_digest(hash); ++ size_t total_len = cmd->req.transfer_len; ++ struct scatterlist *sg = cmd->req.sg; ++ u32 crc = ~0; ++ ++ while (total_len) { ++ size_t len = min_t(size_t, total_len, sg->length); ++ ++ crc = crc32c(crc, sg_virt(sg), len); ++ total_len -= len; ++ sg = sg_next(sg); ++ } ++ cmd->exp_ddgst = cpu_to_le32(~crc); + } + + static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) +@@ -474,19 +477,18 @@ static void nvmet_setup_c2h_data_pdu(struct nvmet_tcp_cmd *cmd) + + if (queue->data_digest) { + pdu->hdr.flags |= NVME_TCP_F_DDGST; +- nvmet_tcp_calc_ddgst(queue->snd_hash, cmd); ++ nvmet_tcp_calc_ddgst(cmd); + } + + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; +- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvmet_tcp_hdgst(pdu, sizeof(*pdu)); + } + } + + static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) + { + struct nvme_tcp_r2t_pdu *pdu = cmd->r2t_pdu; +- struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; +@@ -504,14 +506,13 @@ static void nvmet_setup_r2t_pdu(struct nvmet_tcp_cmd *cmd) + pdu->r2t_offset = cpu_to_le32(cmd->rbytes_done); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; +- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvmet_tcp_hdgst(pdu, sizeof(*pdu)); + } + } + + static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) + { + struct nvme_tcp_rsp_pdu *pdu = cmd->rsp_pdu; +- struct nvmet_tcp_queue *queue = cmd->queue; + u8 hdgst = nvmet_tcp_hdgst_len(cmd->queue); + + cmd->offset = 0; +@@ -524,7 +525,7 @@ static void nvmet_setup_response_pdu(struct nvmet_tcp_cmd *cmd) + pdu->hdr.plen = cpu_to_le32(pdu->hdr.hlen + hdgst); + if (cmd->queue->hdr_digest) { + pdu->hdr.flags |= NVME_TCP_F_HDGST; +- nvmet_tcp_hdgst(queue->snd_hash, pdu, sizeof(*pdu)); ++ nvmet_tcp_hdgst(pdu, sizeof(*pdu)); + } + } + +@@ -858,42 +859,6 @@ static void nvmet_prepare_receive_pdu(struct nvmet_tcp_queue *queue) + smp_store_release(&queue->rcv_state, NVMET_TCP_RECV_PDU); + } + +-static void nvmet_tcp_free_crypto(struct nvmet_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm = crypto_ahash_reqtfm(queue->rcv_hash); +- +- ahash_request_free(queue->rcv_hash); +- ahash_request_free(queue->snd_hash); +- crypto_free_ahash(tfm); +-} +- +-static int 
nvmet_tcp_alloc_crypto(struct nvmet_tcp_queue *queue) +-{ +- struct crypto_ahash *tfm; +- +- tfm = crypto_alloc_ahash("crc32c", 0, CRYPTO_ALG_ASYNC); +- if (IS_ERR(tfm)) +- return PTR_ERR(tfm); +- +- queue->snd_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->snd_hash) +- goto free_tfm; +- ahash_request_set_callback(queue->snd_hash, 0, NULL, NULL); +- +- queue->rcv_hash = ahash_request_alloc(tfm, GFP_KERNEL); +- if (!queue->rcv_hash) +- goto free_snd_hash; +- ahash_request_set_callback(queue->rcv_hash, 0, NULL, NULL); +- +- return 0; +-free_snd_hash: +- ahash_request_free(queue->snd_hash); +-free_tfm: +- crypto_free_ahash(tfm); +- return -ENOMEM; +-} +- +- + static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) + { + struct nvme_tcp_icreq_pdu *icreq = &queue->pdu.icreq; +@@ -922,11 +887,6 @@ static int nvmet_tcp_handle_icreq(struct nvmet_tcp_queue *queue) + + queue->hdr_digest = !!(icreq->digest & NVME_TCP_HDR_DIGEST_ENABLE); + queue->data_digest = !!(icreq->digest & NVME_TCP_DATA_DIGEST_ENABLE); +- if (queue->hdr_digest || queue->data_digest) { +- ret = nvmet_tcp_alloc_crypto(queue); +- if (ret) +- return ret; +- } + + memset(icresp, 0, sizeof(*icresp)); + icresp->hdr.type = nvme_tcp_icresp; +@@ -1247,7 +1207,7 @@ static void nvmet_tcp_prep_recv_ddgst(struct nvmet_tcp_cmd *cmd) + { + struct nvmet_tcp_queue *queue = cmd->queue; + +- nvmet_tcp_calc_ddgst(queue->rcv_hash, cmd); ++ nvmet_tcp_calc_ddgst(cmd); + queue->offset = 0; + queue->left = NVME_TCP_DIGEST_LENGTH; + queue->rcv_state = NVMET_TCP_RECV_DDGST; +@@ -1616,8 +1576,6 @@ static void nvmet_tcp_release_queue_work(struct work_struct *w) + /* ->sock will be released by fput() */ + fput(queue->sock->file); + nvmet_tcp_free_cmds(queue); +- if (queue->hdr_digest || queue->data_digest) +- nvmet_tcp_free_crypto(queue); + ida_free(&nvmet_tcp_queue_ida, queue->idx); + page_frag_cache_drain(&queue->pf_cache); + kfree(queue); +diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h +index bb2b751d274a..98804d51986c 100644 +--- a/include/linux/skbuff.h ++++ b/include/linux/skbuff.h +@@ -4145,9 +4145,10 @@ static inline int skb_copy_datagram_msg(const struct sk_buff *from, int offset, + } + int skb_copy_and_csum_datagram_msg(struct sk_buff *skb, int hlen, + struct msghdr *msg); +-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, +- struct iov_iter *to, int len, +- struct ahash_request *hash); ++int __skb_datagram_iter(const struct sk_buff *skb, int offset, ++ struct iov_iter *to, int len, bool fault_short, ++ size_t (*cb)(const void *, size_t, void *, ++ struct iov_iter *), void *data); + int skb_copy_datagram_from_iter(struct sk_buff *skb, int offset, + struct iov_iter *from, int len); + int zerocopy_sg_from_iter(struct sk_buff *skb, struct iov_iter *frm); +diff --git a/net/core/datagram.c b/net/core/datagram.c +index f0693707aece..19304c7ce7a3 100644 +--- a/net/core/datagram.c ++++ b/net/core/datagram.c +@@ -61,7 +61,6 @@ + #include + #include + #include +-#include + + /* + * Is a socket 'connection oriented' ? 
+@@ -385,10 +384,10 @@ INDIRECT_CALLABLE_DECLARE(static size_t simple_copy_to_iter(const void *addr, + void *data __always_unused, + struct iov_iter *i)); + +-static int __skb_datagram_iter(const struct sk_buff *skb, int offset, +- struct iov_iter *to, int len, bool fault_short, +- size_t (*cb)(const void *, size_t, void *, +- struct iov_iter *), void *data) ++int __skb_datagram_iter(const struct sk_buff *skb, int offset, ++ struct iov_iter *to, int len, bool fault_short, ++ size_t (*cb)(const void *, size_t, void *, ++ struct iov_iter *), void *data) + { + int start = skb_headlen(skb); + int i, copy = start - offset, start_off = offset, n; +@@ -481,42 +480,7 @@ static int __skb_datagram_iter(const struct sk_buff *skb, int offset, + + return 0; + } +- +-static size_t hash_and_copy_to_iter(const void *addr, size_t bytes, void *hashp, +- struct iov_iter *i) +-{ +-#ifdef CONFIG_CRYPTO_HASH +- struct ahash_request *hash = hashp; +- struct scatterlist sg; +- size_t copied; +- +- copied = copy_to_iter(addr, bytes, i); +- sg_init_one(&sg, addr, copied); +- ahash_request_set_crypt(hash, &sg, NULL, copied); +- crypto_ahash_update(hash); +- return copied; +-#else +- return 0; +-#endif +-} +- +-/** +- * skb_copy_and_hash_datagram_iter - Copy datagram to an iovec iterator +- * and update a hash. +- * @skb: buffer to copy +- * @offset: offset in the buffer to start copying from +- * @to: iovec iterator to copy to +- * @len: amount of data to copy from buffer to iovec +- * @hash: hash request to update +- */ +-int skb_copy_and_hash_datagram_iter(const struct sk_buff *skb, int offset, +- struct iov_iter *to, int len, +- struct ahash_request *hash) +-{ +- return __skb_datagram_iter(skb, offset, to, len, true, +- hash_and_copy_to_iter, hash); +-} +-EXPORT_SYMBOL(skb_copy_and_hash_datagram_iter); ++EXPORT_SYMBOL_GPL(__skb_datagram_iter); + + static size_t simple_copy_to_iter(const void *addr, size_t bytes, + void *data __always_unused, struct iov_iter *i) +diff --git a/scripts/gen-crc-consts.py b/scripts/gen-crc-consts.py +new file mode 100755 +index 000000000000..aa678a50897d +--- /dev/null ++++ b/scripts/gen-crc-consts.py +@@ -0,0 +1,238 @@ ++#!/usr/bin/env python3 ++# SPDX-License-Identifier: GPL-2.0-or-later ++# ++# Script that generates constants for computing the given CRC variant(s). ++# ++# Copyright 2025 Google LLC ++# ++# Author: Eric Biggers ++ ++import sys ++ ++# XOR (add) an iterable of polynomials. ++def xor(iterable): ++ res = 0 ++ for val in iterable: ++ res ^= val ++ return res ++ ++# Multiply two polynomials. ++def clmul(a, b): ++ return xor(a << i for i in range(b.bit_length()) if (b & (1 << i)) != 0) ++ ++# Polynomial division floor(a / b). ++def div(a, b): ++ q = 0 ++ while a.bit_length() >= b.bit_length(): ++ q ^= 1 << (a.bit_length() - b.bit_length()) ++ a ^= b << (a.bit_length() - b.bit_length()) ++ return q ++ ++# Reduce the polynomial 'a' modulo the polynomial 'b'. ++def reduce(a, b): ++ return a ^ clmul(div(a, b), b) ++ ++# Reflect the bits of a polynomial. ++def bitreflect(poly, num_bits): ++ assert poly.bit_length() <= num_bits ++ return xor(((poly >> i) & 1) << (num_bits - 1 - i) for i in range(num_bits)) ++ ++# Format a polynomial as hex. Bit-reflect it if the CRC is lsb-first. ++def fmt_poly(variant, poly, num_bits): ++ if variant.lsb: ++ poly = bitreflect(poly, num_bits) ++ return f'0x{poly:0{2*num_bits//8}x}' ++ ++# Print a pair of 64-bit polynomial multipliers. They are always passed in the ++# order [HI64_TERMS, LO64_TERMS] but will be printed in the appropriate order. 
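The script works internally in msb-first form: bitreflect() above is used both to normalize lsb-first generators (in the CrcVariant constructor further down) and to emit lsb-first constants via fmt_poly(). A tiny standalone sanity check of that normalization (not part of the script; the reimplemented bitreflect is assumed equivalent): crc32_lsb_0xedb88320 ends up with the same internal polynomial as the conventional msb-first CRC-32 generator 0x04c11db7.

    def bitreflect(poly, num_bits):           # same semantics as the helper above
        assert poly.bit_length() <= num_bits
        res = 0
        for i in range(num_bits):
            res |= ((poly >> i) & 1) << (num_bits - 1 - i)
        return res

    lsb_generator = 0xEDB88320                # as given on the command line
    G = bitreflect(lsb_generator, 32) | (1 << 32)   # internal form incl. the x^32 term
    assert G == 0x104C11DB7                   # x^32 + x^26 + x^23 + ... + x + 1
    print(hex(G))

The constants header consumed by the glue code is presumably regenerated with an invocation along the lines of "scripts/gen-crc-consts.py x86_pclmul crc16_msb_0x8bb7,crc32_lsb_0xedb88320", matching the crc16_msb_0x8bb7_consts and crc32_lsb_0xedb88320_consts structs referenced from crc-t10dif-glue.c and crc32-glue.c; the exact output path is not shown in this hunk.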
++def print_mult_pair(variant, mults): ++ mults = list(mults if variant.lsb else reversed(mults)) ++ terms = ['HI64_TERMS', 'LO64_TERMS'] if variant.lsb else ['LO64_TERMS', 'HI64_TERMS'] ++ for i in range(2): ++ print(f'\t\t{fmt_poly(variant, mults[i]["val"], 64)},\t/* {terms[i]}: {mults[i]["desc"]} */') ++ ++# Pretty-print a polynomial. ++def pprint_poly(prefix, poly): ++ terms = [f'x^{i}' for i in reversed(range(poly.bit_length())) ++ if (poly & (1 << i)) != 0] ++ j = 0 ++ while j < len(terms): ++ s = prefix + terms[j] + (' +' if j < len(terms) - 1 else '') ++ j += 1 ++ while j < len(terms) and len(s) < 73: ++ s += ' ' + terms[j] + (' +' if j < len(terms) - 1 else '') ++ j += 1 ++ print(s) ++ prefix = ' * ' + (' ' * (len(prefix) - 3)) ++ ++# Print a comment describing constants generated for the given CRC variant. ++def print_header(variant, what): ++ print('/*') ++ s = f'{"least" if variant.lsb else "most"}-significant-bit-first CRC-{variant.bits}' ++ print(f' * {what} generated for {s} using') ++ pprint_poly(' * G(x) = ', variant.G) ++ print(' */') ++ ++class CrcVariant: ++ def __init__(self, bits, generator_poly, bit_order): ++ self.bits = bits ++ if bit_order not in ['lsb', 'msb']: ++ raise ValueError('Invalid value for bit_order') ++ self.lsb = bit_order == 'lsb' ++ self.name = f'crc{bits}_{bit_order}_0x{generator_poly:0{(2*bits+7)//8}x}' ++ if self.lsb: ++ generator_poly = bitreflect(generator_poly, bits) ++ self.G = generator_poly ^ (1 << bits) ++ ++# Generate tables for CRC computation using the "slice-by-N" method. ++# N=1 corresponds to the traditional byte-at-a-time table. ++def gen_slicebyN_tables(variants, n): ++ for v in variants: ++ print('') ++ print_header(v, f'Slice-by-{n} CRC table') ++ print(f'static const u{v.bits} __maybe_unused {v.name}_table[{256*n}] = {{') ++ s = '' ++ for i in range(256 * n): ++ # The i'th table entry is the CRC of the message consisting of byte ++ # i % 256 followed by i // 256 zero bytes. ++ poly = (bitreflect(i % 256, 8) if v.lsb else i % 256) << (v.bits + 8*(i//256)) ++ next_entry = fmt_poly(v, reduce(poly, v.G), v.bits) + ',' ++ if len(s + next_entry) > 71: ++ print(f'\t{s}') ++ s = '' ++ s += (' ' if s else '') + next_entry ++ if s: ++ print(f'\t{s}') ++ print('};') ++ ++# Generate constants for carryless multiplication based CRC computation. ++def gen_x86_pclmul_consts(variants): ++ # These are the distances, in bits, to generate folding constants for. ++ FOLD_DISTANCES = [2048, 1024, 512, 256, 128] ++ ++ for v in variants: ++ (G, n, lsb) = (v.G, v.bits, v.lsb) ++ print('') ++ print_header(v, 'CRC folding constants') ++ print('static const struct {') ++ if not lsb: ++ print('\tu8 bswap_mask[16];') ++ for i in FOLD_DISTANCES: ++ print(f'\tu64 fold_across_{i}_bits_consts[2];') ++ print('\tu8 shuf_table[48];') ++ print('\tu64 barrett_reduction_consts[2];') ++ print(f'}} {v.name}_consts ____cacheline_aligned __maybe_unused = {{') ++ ++ # Byte-reflection mask, needed for msb-first CRCs ++ if not lsb: ++ print('\t.bswap_mask = {' + ', '.join(str(i) for i in reversed(range(16))) + '},') ++ ++ # Fold constants for all distances down to 128 bits ++ for i in FOLD_DISTANCES: ++ print(f'\t.fold_across_{i}_bits_consts = {{') ++ # Given 64x64 => 128 bit carryless multiplication instructions, two ++ # 64-bit fold constants are needed per "fold distance" i: one for ++ # HI64_TERMS that is basically x^(i+64) mod G and one for LO64_TERMS ++ # that is basically x^i mod G. The exact values however undergo a ++ # couple adjustments, described below. 
++ mults = [] ++ for j in [64, 0]: ++ pow_of_x = i + j ++ if lsb: ++ # Each 64x64 => 128 bit carryless multiplication instruction ++ # actually generates a 127-bit product in physical bits 0 ++ # through 126, which in the lsb-first case represent the ++ # coefficients of x^1 through x^127, not x^0 through x^126. ++ # Thus in the lsb-first case, each such instruction ++ # implicitly adds an extra factor of x. The below removes a ++ # factor of x from each constant to compensate for this. ++ # For n < 64 the x could be removed from either the reduced ++ # part or unreduced part, but for n == 64 the reduced part ++ # is the only option. Just always use the reduced part. ++ pow_of_x -= 1 ++ # Make a factor of x^(64-n) be applied unreduced rather than ++ # reduced, to cause the product to use only the x^(64-n) and ++ # higher terms and always be zero in the lower terms. Usually ++ # this makes no difference as it does not affect the product's ++ # congruence class mod G and the constant remains 64-bit, but ++ # part of the final reduction from 128 bits does rely on this ++ # property when it reuses one of the constants. ++ pow_of_x -= 64 - n ++ mults.append({ 'val': reduce(1 << pow_of_x, G) << (64 - n), ++ 'desc': f'(x^{pow_of_x} mod G) * x^{64-n}' }) ++ print_mult_pair(v, mults) ++ print('\t},') ++ ++ # Shuffle table for handling 1..15 bytes at end ++ print('\t.shuf_table = {') ++ print('\t\t' + (16*'-1, ').rstrip()) ++ print('\t\t' + ''.join(f'{i:2}, ' for i in range(16)).rstrip()) ++ print('\t\t' + (16*'-1, ').rstrip()) ++ print('\t},') ++ ++ # Barrett reduction constants for reducing 128 bits to the final CRC ++ print('\t.barrett_reduction_consts = {') ++ mults = [] ++ ++ val = div(1 << (63+n), G) ++ desc = f'floor(x^{63+n} / G)' ++ if not lsb: ++ val = (val << 1) - (1 << 64) ++ desc = f'({desc} * x) - x^64' ++ mults.append({ 'val': val, 'desc': desc }) ++ ++ val = G - (1 << n) ++ desc = f'G - x^{n}' ++ if lsb and n == 64: ++ assert (val & 1) != 0 # The x^0 term should always be nonzero. ++ val >>= 1 ++ desc = f'({desc} - x^0) / x' ++ else: ++ pow_of_x = 64 - n - (1 if lsb else 0) ++ val <<= pow_of_x ++ desc = f'({desc}) * x^{pow_of_x}' ++ mults.append({ 'val': val, 'desc': desc }) ++ ++ print_mult_pair(v, mults) ++ print('\t},') ++ ++ print('};') ++ ++def parse_crc_variants(vars_string): ++ variants = [] ++ for var_string in vars_string.split(','): ++ bits, bit_order, generator_poly = var_string.split('_') ++ assert bits.startswith('crc') ++ bits = int(bits.removeprefix('crc')) ++ assert generator_poly.startswith('0x') ++ generator_poly = generator_poly.removeprefix('0x') ++ assert len(generator_poly) % 2 == 0 ++ generator_poly = int(generator_poly, 16) ++ variants.append(CrcVariant(bits, generator_poly, bit_order)) ++ return variants ++ ++if len(sys.argv) != 3: ++ sys.stderr.write(f'Usage: {sys.argv[0]} CONSTS_TYPE[,CONSTS_TYPE]... CRC_VARIANT[,CRC_VARIANT]...\n') ++ sys.stderr.write(' CONSTS_TYPE can be sliceby[1-8] or x86_pclmul\n') ++ sys.stderr.write(' CRC_VARIANT is crc${num_bits}_${bit_order}_${generator_poly_as_hex}\n') ++ sys.stderr.write(' E.g. 
crc16_msb_0x8bb7 or crc32_lsb_0xedb88320\n') ++ sys.stderr.write(' Polynomial must use the given bit_order and exclude x^{num_bits}\n') ++ sys.exit(1) ++ ++print('/* SPDX-License-Identifier: GPL-2.0-or-later */') ++print('/*') ++print(' * CRC constants generated by:') ++print(' *') ++print(f' *\t{sys.argv[0]} {" ".join(sys.argv[1:])}') ++print(' *') ++print(' * Do not edit manually.') ++print(' */') ++consts_types = sys.argv[1].split(',') ++variants = parse_crc_variants(sys.argv[2]) ++for consts_type in consts_types: ++ if consts_type.startswith('sliceby'): ++ gen_slicebyN_tables(variants, int(consts_type.removeprefix('sliceby'))) ++ elif consts_type == 'x86_pclmul': ++ gen_x86_pclmul_consts(variants) ++ else: ++ raise ValueError(f'Unknown consts_type: {consts_type}') +-- +2.49.0.634.g8613c2bb6c + diff --git a/sys-kernel/gentoo-sources-6.14/0009-zstd.patch b/sys-kernel/gentoo-sources-6.14/0009-zstd.patch new file mode 100644 index 0000000..b47cba6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/0009-zstd.patch @@ -0,0 +1,23554 @@ +From 2e674186d8b03209dca74867e2d4c885190243fe Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 22 May 2025 16:36:17 +0200 +Subject: [PATCH 9/9] zstd + +Signed-off-by: Peter Jung +--- + MAINTAINERS | 1 + + include/linux/zstd.h | 87 +- + include/linux/zstd_errors.h | 30 +- + include/linux/zstd_lib.h | 1123 ++++-- + lib/zstd/Makefile | 3 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 150 + + lib/zstd/common/bitstream.h | 155 +- + lib/zstd/common/compiler.h | 151 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 9 +- + lib/zstd/common/debug.h | 37 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 13 +- + lib/zstd/common/error_private.h | 88 +- + lib/zstd/common/fse.h | 103 +- + lib/zstd/common/fse_decompress.c | 132 +- + lib/zstd/common/huf.h | 240 +- + lib/zstd/common/mem.h | 3 +- + lib/zstd/common/portability_macros.h | 51 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 153 +- + lib/zstd/compress/clevels.h | 3 +- + lib/zstd/compress/fse_compress.c | 74 +- + lib/zstd/compress/hist.c | 13 +- + lib/zstd/compress/hist.h | 10 +- + lib/zstd/compress/huf_compress.c | 441 ++- + lib/zstd/compress/zstd_compress.c | 3293 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 621 +++- + lib/zstd/compress/zstd_compress_literals.c | 157 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 21 +- + lib/zstd/compress/zstd_compress_sequences.h | 16 +- + lib/zstd/compress/zstd_compress_superblock.c | 394 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 222 +- + lib/zstd/compress/zstd_double_fast.c | 245 +- + lib/zstd/compress/zstd_double_fast.h | 27 +- + lib/zstd/compress/zstd_fast.c | 703 +++- + lib/zstd/compress/zstd_fast.h | 16 +- + lib/zstd/compress/zstd_lazy.c | 840 +++-- + lib/zstd/compress/zstd_lazy.h | 195 +- + lib/zstd/compress/zstd_ldm.c | 102 +- + lib/zstd/compress/zstd_ldm.h | 17 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 571 +-- + lib/zstd/compress/zstd_opt.h | 55 +- + lib/zstd/compress/zstd_preSplit.c | 239 ++ + lib/zstd/compress/zstd_preSplit.h | 34 + + lib/zstd/decompress/huf_decompress.c | 887 +++-- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 377 +- + lib/zstd/decompress/zstd_decompress_block.c | 724 ++-- 
+ lib/zstd/decompress/zstd_decompress_block.h | 10 +- + .../decompress/zstd_decompress_internal.h | 19 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 75 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 61 files changed, 8755 insertions(+), 4384 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + create mode 100644 lib/zstd/compress/zstd_preSplit.c + create mode 100644 lib/zstd/compress/zstd_preSplit.h + +diff --git a/MAINTAINERS b/MAINTAINERS +index 161dd28ca25b..de1f3f463548 100644 +--- a/MAINTAINERS ++++ b/MAINTAINERS +@@ -26310,6 +26310,7 @@ F: mm/zsmalloc.c + + ZSTD + M: Nick Terrell ++M: David Sterba + S: Maintained + B: https://github.com/facebook/zstd/issues + T: git https://github.com/terrelln/linux.git +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index b2c7cf310c8f..2f2a3c8b8a33 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -160,7 +160,6 @@ typedef ZSTD_parameters zstd_parameters; + zstd_parameters zstd_get_params(int level, + unsigned long long estimated_src_size); + +- + /** + * zstd_get_cparams() - returns zstd_compression_parameters for selected level + * @level: The compression level +@@ -173,9 +172,20 @@ zstd_parameters zstd_get_params(int level, + zstd_compression_parameters zstd_get_cparams(int level, + unsigned long long estimated_src_size, size_t dict_size); + +-/* ====== Single-pass Compression ====== */ +- + typedef ZSTD_CCtx zstd_cctx; ++typedef ZSTD_cParameter zstd_cparameter; ++ ++/** ++ * zstd_cctx_set_param() - sets a compression parameter ++ * @cctx: The context. Must have been initialized with zstd_init_cctx(). ++ * @param: The parameter to set. ++ * @value: The value to set the parameter to. ++ * ++ * Return: Zero or an error, which can be checked using zstd_is_error(). ++ */ ++size_t zstd_cctx_set_param(zstd_cctx *cctx, zstd_cparameter param, int value); ++ ++/* ====== Single-pass Compression ====== */ + + /** + * zstd_cctx_workspace_bound() - max memory needed to initialize a zstd_cctx +@@ -190,6 +200,20 @@ typedef ZSTD_CCtx zstd_cctx; + */ + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *parameters); + ++/** ++ * zstd_cctx_workspace_bound_with_ext_seq_prod() - max memory needed to ++ * initialize a zstd_cctx when using the block-level external sequence ++ * producer API. ++ * @parameters: The compression parameters to be used. ++ * ++ * If multiple compression parameters might be used, the caller must call ++ * this function for each set of parameters and use the maximum size. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cctx(). ++ */ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *parameters); ++ + /** + * zstd_init_cctx() - initialize a zstd compression context + * @workspace: The workspace to emplace the context into. 
It must outlive +@@ -424,6 +448,16 @@ typedef ZSTD_CStream zstd_cstream; + */ + size_t zstd_cstream_workspace_bound(const zstd_compression_parameters *cparams); + ++/** ++ * zstd_cstream_workspace_bound_with_ext_seq_prod() - memory needed to initialize ++ * a zstd_cstream when using the block-level external sequence producer API. ++ * @cparams: The compression parameters to be used for compression. ++ * ++ * Return: A lower bound on the size of the workspace that is passed to ++ * zstd_init_cstream(). ++ */ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *cparams); ++ + /** + * zstd_init_cstream() - initialize a zstd streaming compression context + * @parameters The zstd parameters to use for compression. +@@ -583,6 +617,18 @@ size_t zstd_decompress_stream(zstd_dstream *dstream, zstd_out_buffer *output, + */ + size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + ++/** ++ * zstd_register_sequence_producer() - exposes the zstd library function ++ * ZSTD_registerSequenceProducer(). This is used for the block-level external ++ * sequence producer API. See upstream zstd.h for detailed documentation. ++ */ ++typedef ZSTD_sequenceProducer_F zstd_sequence_producer_f; ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++); ++ + /** + * struct zstd_frame_params - zstd frame parameters stored in the frame header + * @frameContentSize: The frame content size, or ZSTD_CONTENTSIZE_UNKNOWN if not +@@ -596,7 +642,7 @@ size_t zstd_find_frame_compressed_size(const void *src, size_t src_size); + * + * See zstd_lib.h. + */ +-typedef ZSTD_frameHeader zstd_frame_header; ++typedef ZSTD_FrameHeader zstd_frame_header; + + /** + * zstd_get_frame_header() - extracts parameters from a zstd or skippable frame +@@ -611,4 +657,35 @@ typedef ZSTD_frameHeader zstd_frame_header; + size_t zstd_get_frame_header(zstd_frame_header *params, const void *src, + size_t src_size); + ++/** ++ * struct zstd_sequence - a sequence of literals or a match ++ * ++ * @offset: The offset of the match ++ * @litLength: The literal length of the sequence ++ * @matchLength: The match length of the sequence ++ * @rep: Represents which repeat offset is used ++ */ ++typedef ZSTD_Sequence zstd_sequence; ++ ++/** ++ * zstd_compress_sequences_and_literals() - compress an array of zstd_sequence and literals ++ * ++ * @cctx: The zstd compression context. ++ * @dst: The buffer to compress the data into. ++ * @dst_capacity: The size of the destination buffer. ++ * @in_seqs: The array of zstd_sequence to compress. ++ * @in_seqs_size: The number of sequences in in_seqs. ++ * @literals: The literals associated to the sequences to be compressed. ++ * @lit_size: The size of the literals in the literals buffer. ++ * @lit_capacity: The size of the literals buffer. ++ * @decompressed_size: The size of the input data ++ * ++ * Return: The compressed size or an error, which can be checked using ++ * zstd_is_error(). 
++ */ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size); ++ + #endif /* LINUX_ZSTD_H */ +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..c307fb011132 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,13 +13,18 @@ + #define ZSTD_ERRORS_H_398273423 + + +-/*===== dependency =====*/ +-#include /* size_t */ ++/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ ++#define ZSTDERRORLIB_VISIBLE + ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif + +-/* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +49,18 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_cannotProduce_uncompressedBlock = 49, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,18 +68,18 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +-/*! 
ZSTD_getErrorCode() : +- convert a `size_t` function result into a `ZSTD_ErrorCode` enum type, +- which can be used to compare with enum list published above */ +-ZSTDERRORLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); + ZSTDERRORLIB_API const char* ZSTD_getErrorString(ZSTD_ErrorCode code); /*< Same as ZSTD_getErrorName, but using a `ZSTD_ErrorCode` enum argument */ + + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..e295d4125dde 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,47 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ +-#include /* INT_MAX */ ++ ++/* ====== Dependencies ======*/ + #include /* size_t */ + ++#include /* list of errors */ ++#if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) ++#include /* INT_MAX */ ++#endif /* ZSTD_STATIC_LINKING_ONLY */ ++ + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. ++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) || defined(__IAR_SYSTEMS_ICC__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +90,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 7 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -103,11 +128,12 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + + + /* ************************************* +-* Simple API ++* Simple Core API + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -115,47 +141,55 @@ ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, + int compressionLevel); + + /*! ZSTD_decompress() : +- * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. +- * `dstCapacity` is an upper bound of originalSize to regenerate. +- * If user cannot imply a maximum upper bound, it's better to use streaming mode to decompress data. +- * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), +- * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ ++ * `compressedSize` : must be the _exact_ size of some number of compressed and/or skippable frames. ++ * Multiple compressed frames can be decompressed at once with this method. ++ * The result will be the concatenation of all decompressed frames, back to back. ++ * `dstCapacity` is an upper bound of originalSize to regenerate. ++ * First frame's decompressed size can be extracted using ZSTD_getFrameContentSize(). ++ * If maximum upper bound isn't known, prefer using streaming mode to decompress data. ++ * @return : the number of bytes decompressed into `dst` (<= `dstCapacity`), ++ * or an errorCode if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_decompress( void* dst, size_t dstCapacity, + const void* src, size_t compressedSize); + ++ ++/*====== Decompression helper functions ======*/ ++ + /*! ZSTD_getFrameContentSize() : requires v1.3.0+ +- * `src` should point to the start of a ZSTD encoded frame. +- * `srcSize` must be at least as large as the frame header. +- * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. +- * @return : - decompressed size of `src` frame content, if known +- * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined +- * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) +- * note 1 : a 0 return value means the frame is valid but "empty". +- * note 2 : decompressed size is an optional field, it may not be present, typically in streaming mode. +- * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. +- * In which case, it's necessary to use streaming mode to decompress data. +- * Optionally, application can rely on some implicit limit, +- * as ZSTD_decompress() only needs an upper bound of decompressed size. +- * (For example, data could be necessarily cut into blocks <= 16 KB). +- * note 3 : decompressed size is always present when compression is completed using single-pass functions, +- * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). +- * note 4 : decompressed size can be very large (64-bits value), +- * potentially larger than what local system can handle as a single memory segment. +- * In which case, it's necessary to use streaming mode to decompress data. +- * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. +- * Always ensure return value fits within application's authorized limits. +- * Each application can set its own limits. +- * note 6 : This function replaces ZSTD_getDecompressedSize() */ ++ * `src` should point to the start of a ZSTD encoded frame. ++ * `srcSize` must be at least as large as the frame header. 
++ * hint : any size >= `ZSTD_frameHeaderSize_max` is large enough. ++ * @return : - decompressed size of `src` frame content, if known ++ * - ZSTD_CONTENTSIZE_UNKNOWN if the size cannot be determined ++ * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. invalid magic number, srcSize too small) ++ * note 1 : a 0 return value means the frame is valid but "empty". ++ * When invoking this method on a skippable frame, it will return 0. ++ * note 2 : decompressed size is an optional field, it may not be present (typically in streaming mode). ++ * When `return==ZSTD_CONTENTSIZE_UNKNOWN`, data to decompress could be any size. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * Optionally, application can rely on some implicit limit, ++ * as ZSTD_decompress() only needs an upper bound of decompressed size. ++ * (For example, data could be necessarily cut into blocks <= 16 KB). ++ * note 3 : decompressed size is always present when compression is completed using single-pass functions, ++ * such as ZSTD_compress(), ZSTD_compressCCtx() ZSTD_compress_usingDict() or ZSTD_compress_usingCDict(). ++ * note 4 : decompressed size can be very large (64-bits value), ++ * potentially larger than what local system can handle as a single memory segment. ++ * In which case, it's necessary to use streaming mode to decompress data. ++ * note 5 : If source is untrusted, decompressed size could be wrong or intentionally modified. ++ * Always ensure return value fits within application's authorized limits. ++ * Each application can set its own limits. ++ * note 6 : This function replaces ZSTD_getDecompressedSize() */ + #define ZSTD_CONTENTSIZE_UNKNOWN (0ULL - 1) + #define ZSTD_CONTENTSIZE_ERROR (0ULL - 2) + ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize); + +-/*! ZSTD_getDecompressedSize() : +- * NOTE: This function is now obsolete, in favor of ZSTD_getFrameContentSize(). ++/*! ZSTD_getDecompressedSize() (obsolete): ++ * This function is now obsolete, in favor of ZSTD_getFrameContentSize(). + * Both functions work the same way, but ZSTD_getDecompressedSize() blends + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") + ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ +@@ -163,18 +197,50 @@ ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t + * `srcSize` must be >= first frame size + * @return : the compressed size of the first frame starting at `src`, + * suitable to pass as `srcSize` to `ZSTD_decompress` or similar, +- * or an error code if input is invalid */ ++ * or an error code if input is invalid ++ * Note 1: this method is called _find*() because it's not enough to read the header, ++ * it may have to scan through the frame's content, to reach its end. ++ * Note 2: this method also works with Skippable Frames. In which case, ++ * it returns the size of the complete skippable frame, ++ * which is always equal to its content size + 8 bytes for headers. */ + ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize); + + +-/*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? 
(((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ +-ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ +-ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ +-ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +-ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ +-ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ ++/*====== Compression helper functions ======*/ ++ ++/*! ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()`, or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize >= ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). ++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize is too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00ULL : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++ ++ ++/*====== Error helper functions ======*/ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ ++ZSTDLIB_API unsigned ZSTD_isError(size_t result); /*!< tells if a `size_t` function result is an error code */ ++ZSTDLIB_API ZSTD_ErrorCode ZSTD_getErrorCode(size_t functionResult); /* convert a result into an error code, which can be compared to error enum list */ ++ZSTDLIB_API const char* ZSTD_getErrorName(size_t result); /*!< provides readable string from a function result */ ++ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ ++ZSTDLIB_API int ZSTD_maxCLevel(void); /*!< maximum compression level available */ ++ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compression level, specified by ZSTD_CLEVEL_DEFAULT, requires v1.5.0+ */ + + + /* ************************************* +@@ -182,25 +248,25 @@ ZSTDLIB_API int ZSTD_defaultCLevel(void); /*!< default compres + ***************************************/ + /*= Compression context + * When compressing many times, +- * it is recommended to allocate a context just once, +- * and re-use it for each successive compression operation. 
+- * This will make workload friendlier for system's memory. ++ * it is recommended to allocate a compression context just once, ++ * and reuse it for each successive compression operation. ++ * This will make the workload easier for system's memory. + * Note : re-using context is just a speed / resource optimization. + * It doesn't change the compression ratio, which remains identical. +- * Note 2 : In multi-threaded environments, +- * use one different context per thread for parallel execution. ++ * Note 2: For parallel execution in multi-threaded environments, ++ * use one different context per thread . + */ + typedef struct ZSTD_CCtx_s ZSTD_CCtx; + ZSTDLIB_API ZSTD_CCtx* ZSTD_createCCtx(void); +-ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* accept NULL pointer */ ++ZSTDLIB_API size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx); /* compatible with NULL pointer */ + + /*! ZSTD_compressCCtx() : + * Same as ZSTD_compress(), using an explicit ZSTD_CCtx. +- * Important : in order to behave similarly to `ZSTD_compress()`, +- * this function compresses at requested compression level, +- * __ignoring any other parameter__ . ++ * Important : in order to mirror `ZSTD_compress()` behavior, ++ * this function compresses at the requested compression level, ++ * __ignoring any other advanced parameter__ . + * If any advanced parameter was set using the advanced API, +- * they will all be reset. Only `compressionLevel` remains. ++ * they will all be reset. Only @compressionLevel remains. + */ + ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -210,7 +276,7 @@ ZSTDLIB_API size_t ZSTD_compressCCtx(ZSTD_CCtx* cctx, + /*= Decompression context + * When decompressing many times, + * it is recommended to allocate a context only once, +- * and re-use it for each successive compression operation. ++ * and reuse it for each successive compression operation. + * This will make workload friendlier for system's memory. + * Use one context per thread for parallel execution. */ + typedef struct ZSTD_DCtx_s ZSTD_DCtx; +@@ -220,7 +286,7 @@ ZSTDLIB_API size_t ZSTD_freeDCtx(ZSTD_DCtx* dctx); /* accept NULL pointer * + /*! ZSTD_decompressDCtx() : + * Same as ZSTD_decompress(), + * requires an allocated ZSTD_DCtx. +- * Compatible with sticky parameters. ++ * Compatible with sticky parameters (see below). + */ + ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +@@ -236,12 +302,12 @@ ZSTDLIB_API size_t ZSTD_decompressDCtx(ZSTD_DCtx* dctx, + * using ZSTD_CCtx_set*() functions. + * Pushed parameters are sticky : they are valid for next compressed frame, and any subsequent frame. + * "sticky" parameters are applicable to `ZSTD_compress2()` and `ZSTD_compressStream*()` ! +- * __They do not apply to "simple" one-shot variants such as ZSTD_compressCCtx()__ . ++ * __They do not apply to one-shot variants such as ZSTD_compressCCtx()__ . + * + * It's possible to reset all parameters to "default" using ZSTD_CCtx_reset(). + * + * This API supersedes all other "advanced" API entry points in the experimental section. +- * In the future, we expect to remove from experimental API entry points which are redundant with this API. ++ * In the future, we expect to remove API entry points from experimental which are redundant with this API. + */ + + +@@ -324,6 +390,19 @@ typedef enum { + * The higher the value of selected strategy, the more complex it is, + * resulting in stronger and slower compression. + * Special: value 0 means "use default strategy". 
*/ ++ ++ ZSTD_c_targetCBlockSize=130, /* v1.5.6+ ++ * Attempts to fit compressed block size into approximately targetCBlockSize. ++ * Bound by ZSTD_TARGETCBLOCKSIZE_MIN and ZSTD_TARGETCBLOCKSIZE_MAX. ++ * Note that it's not a guarantee, just a convergence target (default:0). ++ * No target when targetCBlockSize == 0. ++ * This is helpful in low bandwidth streaming environments to improve end-to-end latency, ++ * when a client can make use of partial documents (a prominent example being Chrome). ++ * Note: this parameter is stable since v1.5.6. ++ * It was present as an experimental parameter in earlier versions, ++ * but it's not recommended using it with earlier library versions ++ * due to massive performance regressions. ++ */ + /* LDM mode parameters */ + ZSTD_c_enableLongDistanceMatching=160, /* Enable long distance matching. + * This parameter is designed to improve compression ratio +@@ -403,15 +482,18 @@ typedef enum { + * ZSTD_c_forceMaxWindow + * ZSTD_c_forceAttachDict + * ZSTD_c_literalCompressionMode +- * ZSTD_c_targetCBlockSize + * ZSTD_c_srcSizeHint + * ZSTD_c_enableDedicatedDictSearch + * ZSTD_c_stableInBuffer + * ZSTD_c_stableOutBuffer + * ZSTD_c_blockDelimiters + * ZSTD_c_validateSequences +- * ZSTD_c_useBlockSplitter ++ * ZSTD_c_blockSplitterLevel ++ * ZSTD_c_splitAfterSequences + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -421,7 +503,7 @@ typedef enum { + ZSTD_c_experimentalParam3=1000, + ZSTD_c_experimentalParam4=1001, + ZSTD_c_experimentalParam5=1002, +- ZSTD_c_experimentalParam6=1003, ++ /* was ZSTD_c_experimentalParam6=1003; is now ZSTD_c_targetCBlockSize */ + ZSTD_c_experimentalParam7=1004, + ZSTD_c_experimentalParam8=1005, + ZSTD_c_experimentalParam9=1006, +@@ -430,7 +512,12 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016, ++ ZSTD_c_experimentalParam20=1017 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +580,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -502,11 +589,13 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + + /*! ZSTD_compress2() : + * Behave the same as ZSTD_compressCCtx(), but compression parameters are set using the advanced API. ++ * (note that this entry point doesn't even expose a compression level parameter). + * ZSTD_compress2() always starts a new frame. + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. 
+ * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +632,17 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly ++ * ZSTD_d_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004, ++ ZSTD_d_experimentalParam6=1005 + + } ZSTD_dParameter; + +@@ -604,14 +697,14 @@ typedef struct ZSTD_outBuffer_s { + * A ZSTD_CStream object is required to track streaming operation. + * Use ZSTD_createCStream() and ZSTD_freeCStream() to create/release resources. + * ZSTD_CStream objects can be reused multiple times on consecutive compression operations. +-* It is recommended to re-use ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. ++* It is recommended to reuse ZSTD_CStream since it will play nicer with system's memory, by re-using already allocated memory. + * + * For parallel execution, use one separate ZSTD_CStream per thread. + * + * note : since v1.3.0, ZSTD_CStream and ZSTD_CCtx are the same thing. + * + * Parameters are sticky : when starting a new compression on the same context, +-* it will re-use the same sticky parameters as previous compression session. ++* it will reuse the same sticky parameters as previous compression session. + * When in doubt, it's recommended to fully initialize the context before usage. + * Use ZSTD_CCtx_reset() to reset the context and ZSTD_CCtx_setParameter(), + * ZSTD_CCtx_setPledgedSrcSize(), or ZSTD_CCtx_loadDictionary() and friends to +@@ -700,6 +793,11 @@ typedef enum { + * only ZSTD_e_end or ZSTD_e_flush operations are allowed. + * Before starting a new compression job, or changing compression parameters, + * it is required to fully flush internal buffers. ++ * - note: if an operation ends with an error, it may leave @cctx in an undefined state. ++ * Therefore, it's UB to invoke ZSTD_compressStream2() of ZSTD_compressStream() on such a state. ++ * In order to be re-employed after an error, a state must be reset, ++ * which can be done explicitly (ZSTD_CCtx_reset()), ++ * or is sometimes implied by methods starting a new compression job (ZSTD_initCStream(), ZSTD_compressCCtx()) + */ + ZSTDLIB_API size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, +@@ -728,8 +826,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. 
+- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +834,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -758,7 +857,7 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * + * A ZSTD_DStream object is required to track streaming operations. + * Use ZSTD_createDStream() and ZSTD_freeDStream() to create/release resources. +-* ZSTD_DStream objects can be re-used multiple times. ++* ZSTD_DStream objects can be re-employed multiple times. + * + * Use ZSTD_initDStream() to start a new decompression operation. + * @return : recommended first input size +@@ -768,16 +867,21 @@ ZSTDLIB_API size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output); + * The function will update both `pos` fields. + * If `input.pos < input.size`, some input has not been consumed. + * It's up to the caller to present again remaining data. ++* + * The function tries to flush all data decoded immediately, respecting output buffer size. + * If `output.pos < output.size`, decoder has flushed everything it could. +-* But if `output.pos == output.size`, there might be some data left within internal buffers., ++* ++* However, when `output.pos == output.size`, it's more difficult to know. ++* If @return > 0, the frame is not complete, meaning ++* either there is still some data left to flush within internal buffers, ++* or there is more input to read to complete the frame (or both). + * In which case, call ZSTD_decompressStream() again to flush whatever remains in the buffer. + * Note : with no additional input provided, amount of data flushed is necessarily <= ZSTD_BLOCKSIZE_MAX. + * @return : 0 when a frame is completely decoded and fully flushed, + * or an error code, which can be tested using ZSTD_isError(), + * or any other value > 0, which means there is still some decoding or flushing to do to complete current frame : + * the return value is a suggested next input size (just a hint for better latency) +-* that will never request more than the remaining frame size. ++* that will never request more than the remaining content of the compressed frame. + * *******************************************************************************/ + + typedef ZSTD_DCtx ZSTD_DStream; /*< DCtx and DStream are now effectively same object (>= v1.3.0) */ +@@ -788,13 +892,38 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! 
ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder flushed internal output buffer. ++ * - `output.pos == output.size`, unflushed data potentially present in the internal buffers, ++ * check ZSTD_decompressStream() @return value, ++ * if > 0, invoke it again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. ++ * ++ * Note: when an operation returns with an error code, the @zds state may be left in undefined state. ++ * It's UB to invoke `ZSTD_decompressStream()` on such a state. ++ * In order to re-use such a state, it must be first reset, ++ * which can be done explicitly (`ZSTD_DCtx_reset()`), ++ * or is implied for operations starting some new decompression job (`ZSTD_initDStream`, `ZSTD_decompressDCtx()`, `ZSTD_decompress_usingDict()`) ++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +1042,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +1054,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is reused, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1068,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". 
+- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1079,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1106,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1123,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1149,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). 
+ * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1051,6 +1189,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DStream(const ZSTD_DStream* zds); + ZSTDLIB_API size_t ZSTD_sizeof_CDict(const ZSTD_CDict* cdict); + ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + ++ + #endif /* ZSTD_H_235446 */ + + +@@ -1066,29 +1205,12 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #if !defined(ZSTD_H_ZSTD_STATIC_LINKING_ONLY) + #define ZSTD_H_ZSTD_STATIC_LINKING_ONLY + ++ + /* This can be overridden externally to hide static symbols. */ + #ifndef ZSTDLIB_STATIC_API + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. +- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1245,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1146,7 +1269,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_LDM_HASHRATELOG_MAX (ZSTD_WINDOWLOG_MAX - ZSTD_HASHLOG_MIN) + + /* Advanced parameter bounds */ +-#define ZSTD_TARGETCBLOCKSIZE_MIN 64 ++#define ZSTD_TARGETCBLOCKSIZE_MIN 1340 /* suitable to fit into an ethernet / wifi / 4G transport frame */ + #define ZSTD_TARGETCBLOCKSIZE_MAX ZSTD_BLOCKSIZE_MAX + #define ZSTD_SRCSIZEHINT_MIN 0 + #define ZSTD_SRCSIZEHINT_MAX INT_MAX +@@ -1188,7 +1311,7 @@ typedef struct { + * + * Note: This field is optional. ZSTD_generateSequences() will calculate the value of + * 'rep', but repeat offsets do not necessarily need to be calculated from an external +- * sequence provider's perspective. 
For example, ZSTD_compressSequences() does not ++ * sequence provider perspective. For example, ZSTD_compressSequences() does not + * use this 'rep' field at all (as of now). + */ + } ZSTD_Sequence; +@@ -1293,17 +1416,18 @@ typedef enum { + } ZSTD_literalCompressionMode_e; + + typedef enum { +- /* Note: This enum controls features which are conditionally beneficial. Zstd typically will make a final +- * decision on whether or not to enable the feature (ZSTD_ps_auto), but setting the switch to ZSTD_ps_enable +- * or ZSTD_ps_disable allow for a force enable/disable the feature. ++ /* Note: This enum controls features which are conditionally beneficial. ++ * Zstd can take a decision on whether or not to enable the feature (ZSTD_ps_auto), ++ * but setting the switch to ZSTD_ps_enable or ZSTD_ps_disable force enable/disable the feature. + */ + ZSTD_ps_auto = 0, /* Let the library automatically determine whether the feature shall be enabled */ + ZSTD_ps_enable = 1, /* Force-enable the feature */ + ZSTD_ps_disable = 2 /* Do not use the feature */ +-} ZSTD_paramSwitch_e; ++} ZSTD_ParamSwitch_e; ++#define ZSTD_paramSwitch_e ZSTD_ParamSwitch_e /* old name */ + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1345,34 +1469,130 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_findDecompressedSize(const void* src, + ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize); + + /*! ZSTD_frameHeaderSize() : +- * srcSize must be >= ZSTD_FRAMEHEADERSIZE_PREFIX. ++ * srcSize must be large enough, aka >= ZSTD_FRAMEHEADERSIZE_PREFIX. + * @return : size of the Frame Header, + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_FrameType_e; ++#define ZSTD_frameType_e ZSTD_FrameType_e /* old name */ ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_FrameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; /* for ZSTD_skippableFrame, contains the skippable magic variant [0-15] */ ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_FrameHeader; ++#define ZSTD_frameHeader ZSTD_FrameHeader /* old name */ ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header into `zfhPtr`, or requires larger `srcSize`. ++ * @return : 0 => header is complete, `zfhPtr` is correctly filled, ++ * >0 => `srcSize` is too small, @return value is the wanted `srcSize` amount, `zfhPtr` is not filled, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize); ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. 
++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. ++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { +- ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ +- ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ +-} ZSTD_sequenceFormat_e; ++ ZSTD_sf_noBlockDelimiters = 0, /* ZSTD_Sequence[] has no block delimiters, just sequences */ ++ ZSTD_sf_explicitBlockDelimiters = 1 /* ZSTD_Sequence[] contains explicit block delimiters */ ++} ZSTD_SequenceFormat_e; ++#define ZSTD_sequenceFormat_e ZSTD_SequenceFormat_e /* old name */ ++ ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); + + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * WARNING: This function is meant for debugging and informational purposes ONLY! ++ * Its implementation is flawed, and it will be deleted in a future version. ++ * It is not guaranteed to succeed, as there are several cases where it will give ++ * up and fail. You should NOT use this function in production code. ++ * ++ * This function is deprecated, and will be removed in a future version. ++ * ++ * Generate sequences using ZSTD_compress2(), given a source buffer. ++ * ++ * @param zc The compression context to be used for ZSTD_compress2(). 
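The in-place layout sketched in the comment above can be exercised from user space roughly as follows. This is an illustrative sketch, not part of the patch: decompress_in_place() and its (size_t)-1 "allocation failed" convention are invented names, and it assumes an application built with ZSTD_STATIC_LINKING_ONLY so that ZSTD_decompressionMargin() is visible.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>
    #include <string.h>

    /* Decompress `src` (complete zstd frame(s), total decompressed size `dstSize`)
     * in place, using one allocation sized per the diagram above
     * (Output_Size + Margin), with the compressed input copied to its tail. */
    static size_t decompress_in_place(const void* src, size_t srcSize, size_t dstSize)
    {
        size_t const margin = ZSTD_decompressionMargin(src, srcSize);
        if (ZSTD_isError(margin)) return margin;

        size_t const bufSize = dstSize + margin;
        char* const buf = malloc(bufSize);
        ZSTD_DCtx* const dctx = ZSTD_createDCtx();
        size_t ret = (size_t)-1;                 /* illustrative "allocation failed" value */
        if (buf != NULL && dctx != NULL) {
            /* The compressed input must sit at the very end of the output buffer. */
            memcpy(buf + bufSize - srcSize, src, srcSize);
            ret = ZSTD_decompressDCtx(dctx, buf, bufSize,
                                      buf + bufSize - srcSize, srcSize);
            /* On success the first `ret` bytes of `buf` hold the regenerated data;
             * consume them here, before the buffer is released. */
        }
        ZSTD_freeDCtx(dctx);
        free(buf);
        return ret;
    }

When the decompressed size is not known up front, one way to obtain dstSize before sizing the buffer is ZSTD_getFrameContentSize() on the compressed input.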
Set any ++ * compression parameters you need on this context. ++ * @param outSeqs The output sequences buffer of size @p outSeqsSize ++ * @param outSeqsCapacity The size of the output sequences buffer. ++ * ZSTD_sequenceBound(srcSize) is an upper bound on the number ++ * of sequences that can be generated. ++ * @param src The source buffer to generate sequences from of size @p srcSize. ++ * @param srcSize The size of the source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 +- * +- * The output of this function can be fed into ZSTD_compressSequences() with CCtx +- * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters +- * @return : number of sequences generated ++ * @returns The number of sequences generated, necessarily less than ++ * ZSTD_sequenceBound(srcSize), or an error code that can be checked ++ * with ZSTD_isError(). + */ +- +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("For debugging only, will be replaced by ZSTD_extractSequences()") ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences(ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,8 +1608,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. +- * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals ++ * If a dictionary is included, then the cctx should reference the dict (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.). + * The entire source is compressed into a single frame. + * + * The compression behavior changes based on cctx params. In particular: +@@ -1398,11 +1620,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * the block size derived from the cctx, and sequences may be split. This is the default setting. + * + * If ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, the array of ZSTD_Sequence is expected to contain +- * block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * valid block delimiters (defined in ZSTD_Sequence). Behavior is undefined if no block delimiters are provided. ++ * ++ * When ZSTD_c_blockDelimiters == ZSTD_sf_explicitBlockDelimiters, it's possible to decide generating repcodes ++ * using the advanced parameter ZSTD_c_repcodeResolution. Repcodes will improve compression ratio, though the benefit ++ * can vary greatly depending on Sequences. 
On the other hand, repcode resolution is an expensive operation. ++ * By default, it's disabled at low (<10) compression levels, and enabled above the threshold (>=10). ++ * ZSTD_c_repcodeResolution makes it possible to directly manage this processing in either direction. + * +- * If ZSTD_c_validateSequences == 0, this function will blindly accept the sequences provided. Invalid sequences cause undefined +- * behavior. If ZSTD_c_validateSequences == 1, then if sequence is invalid (see doc/zstd_compression_format.md for +- * specifics regarding offset/matchlength requirements) then the function will bail out and return an error. ++ * If ZSTD_c_validateSequences == 0, this function blindly accepts the Sequences provided. Invalid Sequences cause undefined ++ * behavior. If ZSTD_c_validateSequences == 1, then the function will detect invalid Sequences (see doc/zstd_compression_format.md for ++ * specifics regarding offset/matchlength requirements) and then bail out and return an error. + * + * In addition to the two adjustable experimental params, there are other important cctx params. + * - ZSTD_c_minMatch MUST be set as less than or equal to the smallest match generated by the match finder. It has a minimum value of ZSTD_MINMATCH_MIN. +@@ -1410,14 +1638,42 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * - ZSTD_c_windowLog affects offset validation: this function will return an error at higher debug levels if a provided offset + * is larger than what the spec allows for a given window log and dictionary (if present). See: doc/zstd_compression_format.md + * +- * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. +- * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, +- * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. +- */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ * Note: Repcodes are, as of now, always re-calculated within this function, ZSTD_Sequence.rep is effectively unused. ++ * Dev Note: Once ability to ingest repcodes become available, the explicit block delims mode must respect those repcodes exactly, ++ * and cannot emit an RLE block that disagrees with the repcode history. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); ++ ++ ++/*! ZSTD_compressSequencesAndLiterals() : ++ * This is a variant of ZSTD_compressSequences() which, ++ * instead of receiving (src,srcSize) as input parameter, receives (literals,litSize), ++ * aka all the literals, already extracted and laid out into a single continuous buffer. ++ * This can be useful if the process generating the sequences also happens to generate the buffer of literals, ++ * thus skipping an extraction + caching stage. 
++ * It's a speed optimization, useful when the right conditions are met, ++ * but it also features the following limitations: ++ * - Only supports explicit delimiter mode ++ * - Currently does not support Sequences validation (so input Sequences are trusted) ++ * - Not compatible with frame checksum, which must be disabled ++ * - If any block is incompressible, will fail and return an error ++ * - @litSize must be == sum of all @.litLength fields in @inSeqs. Any discrepancy will generate an error. ++ * - @litBufCapacity is the size of the underlying buffer into which literals are written, starting at address @literals. ++ * @litBufCapacity must be at least 8 bytes larger than @litSize. ++ * - @decompressedSize must be correct, and correspond to the sum of all Sequences. Any discrepancy will generate an error. ++ * @return : final compressed size, or a ZSTD error code. ++ */ ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t litBufCapacity, ++ size_t decompressedSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1425,8 +1681,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * + * Skippable frames begin with a 4-byte magic number. There are 16 possible choices of magic number, + * ranging from ZSTD_MAGIC_SKIPPABLE_START to ZSTD_MAGIC_SKIPPABLE_START+15. +- * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, so +- * the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. ++ * As such, the parameter magicVariant controls the exact skippable frame magic number variant used, ++ * so the magic number used will be ZSTD_MAGIC_SKIPPABLE_START + magicVariant. + * + * Returns an error if destination buffer is not large enough, if the source size is not representable + * with a 4-byte unsigned int, or if the parameter magicVariant is greater than 15 (and therefore invalid). +@@ -1434,26 +1690,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* ds + * @return : number of bytes written or a ZSTD error. + */ + ZSTDLIB_STATIC_API size_t ZSTD_writeSkippableFrame(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, unsigned magicVariant); ++ const void* src, size_t srcSize, ++ unsigned magicVariant); + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves the content of a zstd skippable frame starting at @src, and writes it to @dst buffer. + * +- * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, +- * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested +- * in the magicVariant. ++ * The parameter @magicVariant will receive the magicVariant that was supplied when the frame was written, ++ * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. ++ * This can be NULL if the caller is not interested in the magicVariant. + * + * Returns an error if destination buffer is not large enough, or if the frame is not skippable. + * + * @return : number of bytes written or a ZSTD error. 
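A minimal round trip through the sequence-level entry points documented above might look like the sketch below. It is illustrative only: roundtrip_sequences() is an invented helper, and it deliberately uses the deprecated, debug-only ZSTD_generateSequences() as its sequence source, which the documentation above warns against doing in production.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <stdlib.h>

    /* Extract sequences from `src`, then feed them back through
     * ZSTD_compressSequences() in explicit block-delimiter mode,
     * with validation turned on for safety. */
    static size_t roundtrip_sequences(void* dst, size_t dstCapacity,
                                      const void* src, size_t srcSize)
    {
        ZSTD_CCtx* const cctx = ZSTD_createCCtx();
        size_t const seqCapacity = ZSTD_sequenceBound(srcSize);
        ZSTD_Sequence* const seqs = malloc(seqCapacity * sizeof(ZSTD_Sequence));
        size_t result = (size_t)-1;              /* illustrative "allocation failed" value */

        if (cctx != NULL && seqs != NULL) {
            size_t const nbSeqs = ZSTD_generateSequences(cctx, seqs, seqCapacity, src, srcSize);
            if (ZSTD_isError(nbSeqs)) {
                result = nbSeqs;                 /* propagate the error code */
            } else {
                /* The generated array contains block delimiters, so select that mode. */
                ZSTD_CCtx_setParameter(cctx, ZSTD_c_blockDelimiters,
                                       ZSTD_sf_explicitBlockDelimiters);
                ZSTD_CCtx_setParameter(cctx, ZSTD_c_validateSequences, 1);
                result = ZSTD_compressSequences(cctx, dst, dstCapacity,
                                                seqs, nbSeqs, src, srcSize);
            }
        }
        free(seqs);
        ZSTD_freeCCtx(cctx);
        return result;                           /* compressed size, or a ZSTD error code */
    }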
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, ++ const void* src, size_t srcSize); + + /*! ZSTD_isSkippableFrame() : + * Tells if the content of `buffer` starts with a valid Frame Identifier for a skippable frame. + */ +-ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); ++ZSTDLIB_STATIC_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + + + +@@ -1464,48 +1722,59 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + /*! ZSTD_estimate*() : + * These functions make it possible to estimate memory usage + * of a future {D,C}Ctx, before its creation. ++ * This is useful in combination with ZSTD_initStatic(), ++ * which makes it possible to employ a static buffer for ZSTD_CCtx* state. + * + * ZSTD_estimateCCtxSize() will provide a memory budget large enough +- * for any compression level up to selected one. +- * Note : Unlike ZSTD_estimateCStreamSize*(), this estimate +- * does not include space for a window buffer. +- * Therefore, the estimation is only guaranteed for single-shot compressions, not streaming. ++ * to compress data of any size using one-shot compression ZSTD_compressCCtx() or ZSTD_compress2() ++ * associated with any compression level up to max specified one. + * The estimate will assume the input may be arbitrarily large, + * which is the worst case. + * ++ * Note that the size estimation is specific for one-shot compression, ++ * it is not valid for streaming (see ZSTD_estimateCStreamSize*()) ++ * nor other potential ways of using a ZSTD_CCtx* state. ++ * + * When srcSize can be bound by a known and rather "small" value, +- * this fact can be used to provide a tighter estimation +- * because the CCtx compression context will need less memory. +- * This tighter estimation can be provided by more advanced functions ++ * this knowledge can be used to provide a tighter budget estimation ++ * because the ZSTD_CCtx* state will need less memory for small inputs. ++ * This tighter estimation can be provided by employing more advanced functions + * ZSTD_estimateCCtxSize_usingCParams(), which can be used in tandem with ZSTD_getCParams(), + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + + /*! ZSTD_estimateCStreamSize() : +- * ZSTD_estimateCStreamSize() will provide a budget large enough for any compression level up to selected one. +- * It will also consider src size to be arbitrarily "large", which is worst case. 
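A small usage sketch for the skippable-frame helpers declared above. Names and the fixed 1 KB buffers are illustrative; larger payloads would need appropriately sized buffers.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <string.h>

    /* Store a small metadata blob as a skippable frame (magic variant 3 is an
     * arbitrary choice in [0,15]) and read it back.
     * A skippable frame adds an 8-byte header (4-byte magic + 4-byte size). */
    static int skippable_roundtrip(const void* meta, size_t metaSize)
    {
        char frame[1024];
        char out[1024];
        unsigned variant = 0;

        size_t const fSize = ZSTD_writeSkippableFrame(frame, sizeof(frame),
                                                      meta, metaSize, 3);
        if (ZSTD_isError(fSize)) return -1;

        if (!ZSTD_isSkippableFrame(frame, fSize)) return -1;

        size_t const rSize = ZSTD_readSkippableFrame(out, sizeof(out), &variant,
                                                     frame, fSize);
        if (ZSTD_isError(rSize)) return -1;

        return (rSize == metaSize && variant == 3
                && memcmp(out, meta, metaSize) == 0) ? 0 : -1;
    }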
++ * ZSTD_estimateCStreamSize() will provide a memory budget large enough for streaming compression ++ * using any compression level up to the max specified one. ++ * It will also consider src size to be arbitrarily "large", which is a worst case scenario. + * If srcSize is known to always be small, ZSTD_estimateCStreamSize_usingCParams() can provide a tighter estimation. + * ZSTD_estimateCStreamSize_usingCParams() can be used in tandem with ZSTD_getCParams() to create cParams from compressionLevel. + * ZSTD_estimateCStreamSize_usingCCtxParams() can be used in tandem with ZSTD_CCtxParams_setParameter(). Only single-threaded compression is supported. This function will return an error code if ZSTD_c_nbWorkers is >= 1. + * Note : CStream size estimation is only correct for single-threaded compression. +- * ZSTD_DStream memory budget depends on window Size. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 2 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ * ++ * ZSTD_DStream memory budget depends on frame's window Size. + * This information can be passed manually, using ZSTD_estimateDStreamSize, + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); ++ * Any frame requesting a window size larger than max specified one will be rejected. + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ +-ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int maxCompressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +-ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t windowSize); ++ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize(size_t maxWindowSize); + ZSTDLIB_STATIC_API size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize); + + /*! ZSTD_estimate?DictSize() : +@@ -1568,7 +1837,15 @@ typedef void (*ZSTD_freeFunction) (void* opaque, void* address); + typedef struct { ZSTD_allocFunction customAlloc; ZSTD_freeFunction customFree; void* opaque; } ZSTD_customMem; + static + __attribute__((__unused__)) ++ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic push ++#pragma clang diagnostic ignored "-Wzero-as-null-pointer-constant" ++#endif + ZSTD_customMem const ZSTD_defaultCMem = { NULL, NULL, NULL }; /*< this constant defers to stdlib's functions */ ++#if defined(__clang__) && __clang_major__ >= 5 ++#pragma clang diagnostic pop ++#endif + + ZSTDLIB_STATIC_API ZSTD_CCtx* ZSTD_createCCtx_advanced(ZSTD_customMem customMem); + ZSTDLIB_STATIC_API ZSTD_CStream* ZSTD_createCStream_advanced(ZSTD_customMem customMem); +@@ -1649,22 +1926,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! 
ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1725,7 +2025,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * See the comments on that enum for an explanation of the feature. */ + #define ZSTD_c_forceAttachDict ZSTD_c_experimentalParam4 + +-/* Controlled with ZSTD_paramSwitch_e enum. ++/* Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never compress literals. + * Set to ZSTD_ps_enable to always compress literals. (Note: uncompressed literals +@@ -1737,11 +2037,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_literalCompressionMode ZSTD_c_experimentalParam5 + +-/* Tries to fit compressed block size to be around targetCBlockSize. +- * No target when targetCBlockSize == 0. +- * There is no guarantee on compressed block size (default:0) */ +-#define ZSTD_c_targetCBlockSize ZSTD_c_experimentalParam6 +- + /* User's best guess of source size. + * Hint is not valid when srcSizeHint == 0. + * There is no guarantee that hint is close to actual source size, +@@ -1808,13 +2103,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. 
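The new bulk setters make the migration away from the deprecated ZSTD_compress_advanced() fairly mechanical. A hedged sketch follows: compress_with_cparams() is an invented wrapper, and a complete replacement would also forward the frame parameters via ZSTD_CCtx_setFParams() or ZSTD_CCtx_setParams().

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Derive compression parameters, push them into the context with the new
     * bulk setter, load the dictionary, then compress with ZSTD_compress2(). */
    static size_t compress_with_cparams(ZSTD_CCtx* cctx,
                                        void* dst, size_t dstCapacity,
                                        const void* src, size_t srcSize,
                                        const void* dict, size_t dictSize,
                                        int level)
    {
        ZSTD_compressionParameters const cparams = ZSTD_getCParams(level, srcSize, dictSize);
        size_t ret = ZSTD_CCtx_setCParams(cctx, cparams);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_loadDictionary(cctx, dict, dictSize);
        if (ZSTD_isError(ret)) return ret;
        return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    }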
+ * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2120,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1871,22 +2166,46 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + /* ZSTD_c_validateSequences + * Default is 0 == disabled. Set to 1 to enable sequence validation. + * +- * For use with sequence compression API: ZSTD_compressSequences(). +- * Designates whether or not we validate sequences provided to ZSTD_compressSequences() ++ * For use with sequence compression API: ZSTD_compressSequences*(). ++ * Designates whether or not provided sequences are validated within ZSTD_compressSequences*() + * during function execution. + * +- * Without validation, providing a sequence that does not conform to the zstd spec will cause +- * undefined behavior, and may produce a corrupted block. 
++ * When Sequence validation is disabled (default), Sequences are compressed as-is, ++ * so they must correct, otherwise it would result in a corruption error. + * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * Sequence validation adds some protection, by ensuring that all values respect boundary conditions. ++ * If a Sequence is detected invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. +- * + */ + #define ZSTD_c_validateSequences ZSTD_c_experimentalParam12 + +-/* ZSTD_c_useBlockSplitter +- * Controlled with ZSTD_paramSwitch_e enum. ++/* ZSTD_c_blockSplitterLevel ++ * note: this parameter only influences the first splitter stage, ++ * which is active before producing the sequences. ++ * ZSTD_c_splitAfterSequences controls the next splitter stage, ++ * which is active after sequence production. ++ * Note that both can be combined. ++ * Allowed values are between 0 and ZSTD_BLOCKSPLITTER_LEVEL_MAX included. ++ * 0 means "auto", which will select a value depending on current ZSTD_c_strategy. ++ * 1 means no splitting. ++ * Then, values from 2 to 6 are sorted in increasing cpu load order. ++ * ++ * Note that currently the first block is never split, ++ * to ensure expansion guarantees in presence of incompressible data. ++ */ ++#define ZSTD_BLOCKSPLITTER_LEVEL_MAX 6 ++#define ZSTD_c_blockSplitterLevel ZSTD_c_experimentalParam20 ++ ++/* ZSTD_c_splitAfterSequences ++ * This is a stronger splitter algorithm, ++ * based on actual sequences previously produced by the selected parser. ++ * It's also slower, and as a consequence, mostly used for high compression levels. ++ * While the post-splitter does overlap with the pre-splitter, ++ * both can nonetheless be combined, ++ * notably with ZSTD_c_blockSplitterLevel at ZSTD_BLOCKSPLITTER_LEVEL_MAX, ++ * resulting in higher compression ratio than just one of them. ++ * + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use block splitter. + * Set to ZSTD_ps_enable to always use block splitter. +@@ -1894,10 +2213,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * By default, in ZSTD_ps_auto, the library will decide at runtime whether to use + * block splitting based on the compression parameters. + */ +-#define ZSTD_c_useBlockSplitter ZSTD_c_experimentalParam13 ++#define ZSTD_c_splitAfterSequences ZSTD_c_experimentalParam13 + + /* ZSTD_c_useRowMatchFinder +- * Controlled with ZSTD_paramSwitch_e enum. ++ * Controlled with ZSTD_ParamSwitch_e enum. + * Default is ZSTD_ps_auto. + * Set to ZSTD_ps_disable to never use row-based matchfinder. + * Set to ZSTD_ps_enable to force usage of row-based matchfinder. +@@ -1928,6 +2247,80 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_ParamSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. 
++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_repcodeResolution ++ * This parameter only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters (may change in the future). ++ * ++ * This parameter affects how zstd parses external sequences, ++ * provided via the ZSTD_compressSequences*() API ++ * or from an external block-level sequence producer. ++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets within ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences*() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level (currently: level<10 disables, level>=10 enables). 
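Plumbing-wise, all of the experimental switches documented above go through the ordinary ZSTD_CCtx_setParameter() path. The following sketch shows that plumbing with illustrative values only; the ZSTD_ps_auto defaults are usually the right choice.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Force a few of the switches documented above on one context. */
    static size_t tune_experimental_params(ZSTD_CCtx* cctx)
    {
        size_t ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_splitAfterSequences, ZSTD_ps_enable);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_prefetchCDictTables, ZSTD_ps_enable);
        if (ZSTD_isError(ret)) return ret;
        /* Cap block size at 64 KB; legal values are 1 KB .. ZSTD_BLOCKSIZE_MAX. */
        ret = ZSTD_CCtx_setParameter(cctx, ZSTD_c_maxBlockSize, 64 * 1024);
        if (ZSTD_isError(ret)) return ret;
        /* Only meaningful together with explicit block delimiters (see above). */
        return ZSTD_CCtx_setParameter(cctx, ZSTD_c_repcodeResolution, ZSTD_ps_enable);
    }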
++ */ ++#define ZSTD_c_repcodeResolution ZSTD_c_experimentalParam19 ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 /* older name */ ++ ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2477,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2530,33 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ ++/* ZSTD_d_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * Forces the decompressor to reject blocks whose content size is ++ * larger than the configured maxBlockSize. When maxBlockSize is ++ * larger than the windowSize, the windowSize is used instead. ++ * This saves memory on the decoder when you know all blocks are small. ++ * ++ * This option is typically used in conjunction with ZSTD_c_maxBlockSize. ++ * ++ * WARNING: This causes the decoder to reject otherwise valid frames ++ * that have block sizes larger than the configured maxBlockSize. ++ */ ++#define ZSTD_d_maxBlockSize ZSTD_d_experimentalParam6 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2565,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2602,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2620,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! 
ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2638,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2653,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2669,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2264,7 +2684,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * explicitly specified. + * + * start a new frame, using same parameters from previous frame. +- * This is typically useful to skip dictionary loading stage, since it will re-use it in-place. ++ * This is typically useful to skip dictionary loading stage, since it will reuse it in-place. + * Note that zcs must be init at least once before using ZSTD_resetCStream(). + * If pledgedSrcSize is not known at reset time, use macro ZSTD_CONTENTSIZE_UNKNOWN. + * If pledgedSrcSize > 0, its value must be correct, as it will be written in header, and controlled at the end. +@@ -2274,6 +2694,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
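Spelled out as real code, the substitution listed above for ZSTD_initCStream_advanced() becomes the following sketch (init_cstream_modern() is an invented name; ZSTD_CStream is a ZSTD_CCtx, so the CCtx setters apply directly):

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>

    /* Concrete form of the documented ZSTD_initCStream_advanced() replacement. */
    static size_t init_cstream_modern(ZSTD_CStream* zcs,
                                      const void* dict, size_t dictSize,
                                      ZSTD_parameters params,
                                      unsigned long long pledgedSrcSize)
    {
        size_t ret = ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_setParams(zcs, params);
        if (ZSTD_isError(ret)) return ret;
        ret = ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize);
        if (ZSTD_isError(ret)) return ret;
        return ZSTD_CCtx_loadDictionary(zcs, dict, dictSize);
    }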
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2740,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! +@@ -2330,8 +2751,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2339,18 +2760,202 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * +- * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x ++ * reuse decompression parameters from previous init; saves dictionary loading + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). 
The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. ++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. 
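As a sanity check of the contract above, the smallest conforming producer simply declares the whole block to be literals, which always satisfies the validity rules (sum of lengths == srcSize, final sequence with matchLength == 0 and offset == 0). It uses the ZSTD_sequenceProducer_F signature and registration entry point declared a little further down; allLiteralsProducer and register_all_literals_producer are illustrative names.

    #define ZSTD_STATIC_LINKING_ONLY
    #include <zstd.h>
    #include <string.h>

    /* Degenerate producer: propose no matches, emit the whole block as literals. */
    static size_t allLiteralsProducer(void* state,
                                      ZSTD_Sequence* outSeqs, size_t outSeqsCapacity,
                                      const void* src, size_t srcSize,
                                      const void* dict, size_t dictSize,
                                      int compressionLevel, size_t windowSize)
    {
        (void)state; (void)src; (void)dict; (void)dictSize;
        (void)compressionLevel; (void)windowSize;
        if (outSeqsCapacity < 1) return ZSTD_SEQUENCE_PRODUCER_ERROR;
        memset(&outSeqs[0], 0, sizeof(outSeqs[0]));      /* offset == matchLength == 0 */
        outSeqs[0].litLength = (unsigned)srcSize;        /* blocks are <= ZSTD_BLOCKSIZE_MAX */
        return 1;
    }

    /* Registration is sticky until the next parameter reset of the CCtx. */
    static void register_all_literals_producer(ZSTD_CCtx* cctx)
    {
        ZSTD_registerSequenceProducer(cctx, NULL /* no producer state needed */,
                                      allLiteralsProducer);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }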
++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). ++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t (*ZSTD_sequenceProducer_F) ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++/*! 
ZSTD_CCtxParams_registerSequenceProducer() : ++ * Same as ZSTD_registerSequenceProducer(), but operates on ZSTD_CCtx_params. ++ * This is used for accurate size estimation with ZSTD_estimateCCtxSize_usingCCtxParams(), ++ * which is needed when creating a ZSTD_CCtx with ZSTD_initStaticCCtx(). ++ * ++ * If you are using the external sequence producer API in a scenario where ZSTD_initStaticCCtx() ++ * is required, then this function is for you. Otherwise, you probably don't need it. ++ * ++ * See tests/zstreamtest.c for example usage. */ ++ZSTDLIB_STATIC_API void ++ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. + ********************************************************************* */ + + /* +@@ -2358,11 +2963,10 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + A ZSTD_CCtx object is required to track streaming operations. + Use ZSTD_createCCtx() / ZSTD_freeCCtx() to manage resource. +- ZSTD_CCtx object can be re-used multiple times within successive compression operations. ++ ZSTD_CCtx object can be reused multiple times within successive compression operations. + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2380,39 +2984,49 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + It's possible to use srcSize==0, in which case, it will write a final empty block to end the frame. + Without last block mark, frames are considered unfinished (hence corrupted) by compliant decoders. + +- `ZSTD_CCtx` object can be re-used (ZSTD_compressBegin()) to compress again. ++ `ZSTD_CCtx` object can be reused (ZSTD_compressBegin()) to compress again. + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) + + A ZSTD_DCtx object is required to track streaming operations. + Use ZSTD_createDCtx() / ZSTD_freeDCtx() to manage it. +- A ZSTD_DCtx object can be re-used multiple times. ++ A ZSTD_DCtx object can be reused multiple times. + + First typical operation is to retrieve frame parameters, using ZSTD_getFrameHeader(). + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + +- It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, ++ It fills a ZSTD_FrameHeader structure with important information to correctly decode the frame, + such as the dictionary ID, content size, or maximum back-reference distance (`windowSize`). + Note that these values could be wrong, either because of data corruption, or because a 3rd party deliberately spoofs false information. 
+ As a consequence, check that values remain within valid application range. +@@ -2428,7 +3042,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +3062,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +3085,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. +- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! 
ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +3096,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +3104,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +3131,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. +@@ -2541,11 +3147,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. 
See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + + + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ +- +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..be218b5e0ed5 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +@@ -26,6 +26,7 @@ zstd_compress-y := \ + compress/zstd_lazy.o \ + compress/zstd_ldm.o \ + compress/zstd_opt.o \ ++ compress/zstd_preSplit.o \ + + zstd_decompress-y := \ + zstd_decompress_module.o \ +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..16c3d08e8d1a +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++/* This file provides custom allocation primitives ++ */ ++ ++#define ZSTD_DEPS_NEED_MALLOC ++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ ++ ++#include "compiler.h" /* MEM_STATIC */ ++#define ZSTD_STATIC_LINKING_ONLY ++#include /* ZSTD_customMem */ ++ ++#ifndef ZSTD_ALLOCATIONS_H ++#define ZSTD_ALLOCATIONS_H ++ ++/* custom memory allocation functions */ ++ ++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) ++ return customMem.customAlloc(customMem.opaque, size); ++ return ZSTD_malloc(size); ++} ++ ++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) ++{ ++ if (customMem.customAlloc) { ++ /* calloc implemented as malloc+memset; ++ * not as efficient as calloc, but next best guess for custom malloc */ ++ void* const ptr = customMem.customAlloc(customMem.opaque, size); ++ ZSTD_memset(ptr, 0, size); ++ return ptr; ++ } ++ return ZSTD_calloc(1, size); ++} ++ ++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) ++{ ++ if (ptr!=NULL) { ++ if (customMem.customFree) ++ customMem.customFree(customMem.opaque, ptr); ++ else ++ ZSTD_free(ptr); ++ } ++} ++ ++#endif /* ZSTD_ALLOCATIONS_H */ +diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h +new file mode 100644 +index 000000000000..c5faaa3d7b08 +--- /dev/null ++++ b/lib/zstd/common/bits.h +@@ -0,0 +1,150 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++#else ++ return ZSTD_countTrailingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++#else ++ return ZSTD_countLeadingZeros32_fallback(val); ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++#if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++#else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++#endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..86439da0eea7 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,7 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ +- ++#include "bits.h" /* ZSTD_highbit32 */ + + /*========================================= + * Target specific +@@ -41,12 +42,13 @@ + /*-****************************************** + * bitStream encoding API (write forward) + ********************************************/ ++typedef size_t BitContainerType; + /* bitStream can mix input from multiple sources. + * A critical property of these streams is that they encode and decode in **reverse** direction. + * So the first bit sequence you add will be the last to be read, like a LIFO stack. + */ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitPos; + char* startPtr; + char* ptr; +@@ -54,7 +56,7 @@ typedef struct { + } BIT_CStream_t; + + MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, void* dstBuffer, size_t dstCapacity); +-MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + MEM_STATIC void BIT_flushBits(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + +@@ -63,7 +65,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * `dstCapacity` must be >= sizeof(bitD->bitContainer), otherwise @return will be an error code. + * + * bits are first added to a local register. +-* Local register is size_t, hence 64-bits on 64-bits systems, or 32-bits on 32-bits systems. ++* Local register is BitContainerType, 64-bits on 64-bits systems, or 32-bits on 32-bits systems. + * Writing data into memory is an explicit operation, performed by the flushBits function. + * Hence keep track how many bits are potentially stored into local register to avoid register overflow. + * After a flushBits, a maximum of 7 bits might still be stored into local register. 
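For reviewers unfamiliar with this internal API, here is a minimal usage sketch of the write-forward encoder described in the comments above. It is illustration only, not part of the applied diff: encode5bit is a hypothetical helper, it assumes bitstream.h can be included on its own, and it only exercises the declarations visible in this hunk (BIT_initCStream, BIT_addBits, BIT_flushBits, BIT_closeCStream).

    #include "bitstream.h"  /* BIT_CStream_t, BIT_addBits, BIT_flushBits */

    /* Pack an array of 5-bit symbols into dst. Bits accumulate in the local
     * register (now BitContainerType) and reach memory only on
     * BIT_flushBits() / BIT_closeCStream(). */
    static size_t encode5bit(void* dst, size_t dstCapacity,
                             const unsigned* symbols, size_t count)
    {
        BIT_CStream_t cs;
        size_t i;
        if (BIT_initCStream(&cs, dst, dstCapacity) != 0)
            return 0;                        /* dstCapacity too small: error code returned */
        for (i = 0; i < count; i++) {
            BIT_addBits(&cs, symbols[i], 5); /* up to 31 bits per call */
            BIT_flushBits(&cs);              /* at most 7 bits stay in the register */
        }
        return BIT_closeCStream(&cs);        /* 0 on overflow, else bytes written */
    }

Flushing after every symbol is the conservative pattern; callers that know their symbol widths may batch several BIT_addBits() calls between flushes, as long as the local register cannot overflow.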
+@@ -80,28 +82,28 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC); + * bitStream decoding API (read backward) + **********************************************/ + typedef struct { +- size_t bitContainer; ++ BitContainerType bitContainer; + unsigned bitsConsumed; + const char* ptr; + const char* start; + const char* limitPtr; + } BIT_DStream_t; + +-typedef enum { BIT_DStream_unfinished = 0, +- BIT_DStream_endOfBuffer = 1, +- BIT_DStream_completed = 2, +- BIT_DStream_overflow = 3 } BIT_DStream_status; /* result of BIT_reloadDStream() */ +- /* 1,2,4,8 would be better for bitmap combinations, but slows down performance a bit ... :( */ ++typedef enum { BIT_DStream_unfinished = 0, /* fully refilled */ ++ BIT_DStream_endOfBuffer = 1, /* still some bits left in bitstream */ ++ BIT_DStream_completed = 2, /* bitstream entirely consumed, bit-exact */ ++ BIT_DStream_overflow = 3 /* user requested more bits than present in bitstream */ ++ } BIT_DStream_status; /* result of BIT_reloadDStream() */ + + MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, size_t srcSize); +-MEM_STATIC size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); ++MEM_STATIC BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits); + MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD); + MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + + + /* Start by invoking BIT_initDStream(). + * A chunk of the bitStream is then stored into a local register. +-* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (size_t). ++* Local register size is 64-bits on 64-bits systems, 32-bits on 32-bits systems (BitContainerType). + * You can then retrieve bitFields stored into the local register, **in reverse order**. + * Local register is explicitly reloaded from memory by the BIT_reloadDStream() method. + * A reload guarantee a minimum of ((8*sizeof(bitD->bitContainer))-7) bits when its result is BIT_DStream_unfinished. 
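The matching read-backward loop, again as an editor-added sketch rather than patch content: decode5bit is hypothetical, assumes the data was produced by a writer like the one above, and relies on the caller knowing how many fields were encoded (the buffer-less bitstream carries no such metadata). It uses only the declarations visible in these hunks.

    #include "bitstream.h"  /* BIT_DStream_t, BIT_readBits, BIT_reloadDStream */

    /* Read back `count` 5-bit symbols. Fields come out in reverse order of
     * writing (LIFO), as the API comments above state. */
    static int decode5bit(unsigned* symbols, size_t count,
                          const void* src, size_t srcSize)
    {
        BIT_DStream_t ds;
        size_t n;
        if (BIT_initDStream(&ds, src, srcSize) != srcSize)
            return -1;                       /* error code returned instead of srcSize */
        for (n = 0; n < count; n++) {
            symbols[n] = (unsigned)BIT_readBits(&ds, 5);
            if (BIT_reloadDStream(&ds) == BIT_DStream_overflow)
                return -1;                   /* asked for more bits than the stream holds */
        }
        /* BIT_endOfDStream(&ds) may be checked here to confirm bit-exact consumption. */
        return 0;
    }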
+@@ -113,7 +115,7 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* bitD); + /*-**************************************** + * unsafe API + ******************************************/ +-MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, size_t value, unsigned nbBits); ++MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, BitContainerType value, unsigned nbBits); + /* faster, but works only if value is "clean", meaning all high bits above nbBits are 0 */ + + MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,16 +153,22 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getLowerBits(BitContainerType bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! */ + MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -195,7 +176,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + * works only if `value` is _clean_, + * meaning all high bits above nbBits are 0 */ + MEM_STATIC void BIT_addBitsFast(BIT_CStream_t* bitC, +- size_t value, unsigned nbBits) ++ BitContainerType value, unsigned nbBits) + { + assert((value>>nbBits) == 0); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +@@ -242,7 +223,7 @@ MEM_STATIC size_t BIT_closeCStream(BIT_CStream_t* bitC) + BIT_addBitsFast(bitC, 1, 1); /* endMark */ + BIT_flushBits(bitC); + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (bitC->bitPos > 0); + } + + +@@ -266,35 +247,35 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 
8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; + bitD->bitContainer = *(const BYTE*)(bitD->start); + switch(srcSize) + { +- case 7: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); ++ case 7: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[6]) << (sizeof(bitD->bitContainer)*8 - 16); + ZSTD_FALLTHROUGH; + +- case 6: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); ++ case 6: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[5]) << (sizeof(bitD->bitContainer)*8 - 24); + ZSTD_FALLTHROUGH; + +- case 5: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); ++ case 5: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[4]) << (sizeof(bitD->bitContainer)*8 - 32); + ZSTD_FALLTHROUGH; + +- case 4: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[3]) << 24; ++ case 4: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[3]) << 24; + ZSTD_FALLTHROUGH; + +- case 3: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[2]) << 16; ++ case 3: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[2]) << 16; + ZSTD_FALLTHROUGH; + +- case 2: bitD->bitContainer += (size_t)(((const BYTE*)(srcBuffer))[1]) << 8; ++ case 2: bitD->bitContainer += (BitContainerType)(((const BYTE*)(srcBuffer))[1]) << 8; + ZSTD_FALLTHROUGH; + + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -303,12 +284,12 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + return srcSize; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getUpperBits(size_t bitContainer, U32 const start) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getUpperBits(BitContainerType bitContainer, U32 const start) + { + return bitContainer >> start; + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 const start, U32 const nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_getMiddleBits(BitContainerType bitContainer, U32 const start, U32 const nbBits) + { + U32 const regMask = sizeof(bitContainer)*8 - 1; + /* if start > regMask, bitstream is corrupted, and result is undefined */ +@@ -318,26 +299,20 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + * such cpus old (pre-Haswell, 2013) and their performance is not of that + * importance. + */ +-#if defined(__x86_64__) || defined(_M_X86) ++#if defined(__x86_64__) || defined(_M_X64) + return (bitContainer >> (start & regMask)) & ((((U64)1) << nbBits) - 1); + #else + return (bitContainer >> (start & regMask)) & BIT_mask[nbBits]; + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. 
+ * local register is not modified. + * On 32-bits, maxNbBits==24. + * On 64-bits, maxNbBits==56. + * @return : value extracted */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_lookBits(const BIT_DStream_t* bitD, U32 nbBits) + { + /* arbitrate between double-shift and shift+mask */ + #if 1 +@@ -353,14 +328,14 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_lookBits(const BIT_DStream_t* bitD, U3 + + /*! BIT_lookBitsFast() : + * unsafe version; only works if nbBits >= 1 */ +-MEM_STATIC size_t BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) ++MEM_STATIC BitContainerType BIT_lookBitsFast(const BIT_DStream_t* bitD, U32 nbBits) + { + U32 const regMask = sizeof(bitD->bitContainer)*8 - 1; + assert(nbBits >= 1); + return (bitD->bitContainer << (bitD->bitsConsumed & regMask)) >> (((regMask+1)-nbBits) & regMask); + } + +-MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) ++FORCE_INLINE_TEMPLATE void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + { + bitD->bitsConsumed += nbBits; + } +@@ -369,23 +344,38 @@ MEM_STATIC FORCE_INLINE_ATTR void BIT_skipBits(BIT_DStream_t* bitD, U32 nbBits) + * Read (consume) next n bits from local register and update. + * Pay attention to not read more than nbBits contained into local register. + * @return : extracted value. */ +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) ++FORCE_INLINE_TEMPLATE BitContainerType BIT_readBits(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBits(bitD, nbBits); ++ BitContainerType const value = BIT_lookBits(bitD, nbBits); + BIT_skipBits(bitD, nbBits); + return value; + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ +-MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) ++ * unsafe version; only works if nbBits >= 1 */ ++MEM_STATIC BitContainerType BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { +- size_t const value = BIT_lookBitsFast(bitD, nbBits); ++ BitContainerType const value = BIT_lookBitsFast(bitD, nbBits); + assert(nbBits >= 1); + BIT_skipBits(bitD, nbBits); + return value; + } + ++/*! BIT_reloadDStream_internal() : ++ * Simple variant of BIT_reloadDStream(), with two conditions: ++ * 1. bitstream is valid : bitsConsumed <= sizeof(bitD->bitContainer)*8 ++ * 2. look window is valid after shifted down : bitD->ptr >= bitD->start ++ */ ++MEM_STATIC BIT_DStream_status BIT_reloadDStream_internal(BIT_DStream_t* bitD) ++{ ++ assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); ++ bitD->ptr -= bitD->bitsConsumed >> 3; ++ assert(bitD->ptr >= bitD->start); ++ bitD->bitsConsumed &= 7; ++ bitD->bitContainer = MEM_readLEST(bitD->ptr); ++ return BIT_DStream_unfinished; ++} ++ + /*! BIT_reloadDStreamFast() : + * Similar to BIT_reloadDStream(), but with two differences: + * 1. bitsConsumed <= sizeof(bitD->bitContainer)*8 must hold! +@@ -396,31 +386,35 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + { + if (UNLIKELY(bitD->ptr < bitD->limitPtr)) + return BIT_DStream_overflow; +- assert(bitD->bitsConsumed <= sizeof(bitD->bitContainer)*8); +- bitD->ptr -= bitD->bitsConsumed >> 3; +- bitD->bitsConsumed &= 7; +- bitD->bitContainer = MEM_readLEST(bitD->ptr); +- return BIT_DStream_unfinished; ++ return BIT_reloadDStream_internal(bitD); + } + + /*! BIT_reloadDStream() : + * Refill `bitD` from buffer previously set in BIT_initDStream() . 
+- * This function is safe, it guarantees it will not read beyond src buffer. ++ * This function is safe, it guarantees it will not never beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++FORCE_INLINE_TEMPLATE BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { +- if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ ++ /* note : once in overflow mode, a bitstream remains in this mode until it's reset */ ++ if (UNLIKELY(bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8))) { ++ static const BitContainerType zeroFilled = 0; ++ bitD->ptr = (const char*)&zeroFilled; /* aliasing is allowed for char */ ++ /* overflow detected, erroneous scenario or end of stream: no update */ + return BIT_DStream_overflow; ++ } ++ ++ assert(bitD->ptr >= bitD->start); + + if (bitD->ptr >= bitD->limitPtr) { +- return BIT_reloadDStreamFast(bitD); ++ return BIT_reloadDStream_internal(bitD); + } + if (bitD->ptr == bitD->start) { ++ /* reached end of bitStream => no update */ + if (bitD->bitsConsumed < sizeof(bitD->bitContainer)*8) return BIT_DStream_endOfBuffer; + return BIT_DStream_completed; + } +- /* start < ptr < limitPtr */ ++ /* start < ptr < limitPtr => cautious update */ + { U32 nbBytes = bitD->bitsConsumed >> 3; + BIT_DStream_status result = BIT_DStream_unfinished; + if (bitD->ptr - nbBytes < bitD->start) { +@@ -442,5 +436,4 @@ MEM_STATIC unsigned BIT_endOfDStream(const BIT_DStream_t* DStream) + return ((DStream->ptr == DStream->start) && (DStream->bitsConsumed == sizeof(DStream->bitContainer)*8)); + } + +- + #endif /* BITSTREAM_H_MODULE */ +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..dc9bd15e174e 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,6 +12,8 @@ + #ifndef ZSTD_COMPILER_H + #define ZSTD_COMPILER_H + ++#include ++ + #include "portability_macros.h" + + /*-******************************************************* +@@ -41,12 +44,15 @@ + */ + #define WIN_CDECL + ++/* UNUSED_ATTR tells the compiler it is okay if the function is unused. */ ++#define UNUSED_ATTR __attribute__((unused)) ++ + /* + * FORCE_INLINE_TEMPLATE is used to define C "templates", which take constant + * parameters. They must be inlined for the compiler to eliminate the constant + * branches. + */ +-#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR ++#define FORCE_INLINE_TEMPLATE static INLINE_KEYWORD FORCE_INLINE_ATTR UNUSED_ATTR + /* + * HINT_INLINE is used to help the compiler generate better code. It is *not* + * used for "templates", so it can be tweaked based on the compilers +@@ -61,11 +67,21 @@ + #if !defined(__clang__) && defined(__GNUC__) && __GNUC__ >= 4 && __GNUC_MINOR__ >= 8 && __GNUC__ < 5 + # define HINT_INLINE static INLINE_KEYWORD + #else +-# define HINT_INLINE static INLINE_KEYWORD FORCE_INLINE_ATTR ++# define HINT_INLINE FORCE_INLINE_TEMPLATE + #endif + +-/* UNUSED_ATTR tells the compiler it is okay if the function is unused. 
*/ +-#define UNUSED_ATTR __attribute__((unused)) ++/* "soft" inline : ++ * The compiler is free to select if it's a good idea to inline or not. ++ * The main objective is to silence compiler warnings ++ * when a defined function in included but not used. ++ * ++ * Note : this macro is prefixed `MEM_` because it used to be provided by `mem.h` unit. ++ * Updating the prefix is probably preferable, but requires a fairly large codemod, ++ * since this name is used everywhere. ++ */ ++#ifndef MEM_STATIC /* already defined in Linux Kernel mem.h */ ++#define MEM_STATIC static __inline UNUSED_ATTR ++#endif + + /* force no inlining */ + #define FORCE_NOINLINE static __attribute__((__noinline__)) +@@ -86,23 +102,24 @@ + # define PREFETCH_L1(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 3 /* locality */) + # define PREFETCH_L2(ptr) __builtin_prefetch((ptr), 0 /* rw==read */, 2 /* locality */) + #elif defined(__aarch64__) +-# define PREFETCH_L1(ptr) __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))) +-# define PREFETCH_L2(ptr) __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))) ++# define PREFETCH_L1(ptr) do { __asm__ __volatile__("prfm pldl1keep, %0" ::"Q"(*(ptr))); } while (0) ++# define PREFETCH_L2(ptr) do { __asm__ __volatile__("prfm pldl2keep, %0" ::"Q"(*(ptr))); } while (0) + #else +-# define PREFETCH_L1(ptr) (void)(ptr) /* disabled */ +-# define PREFETCH_L2(ptr) (void)(ptr) /* disabled */ ++# define PREFETCH_L1(ptr) do { (void)(ptr); } while (0) /* disabled */ ++# define PREFETCH_L2(ptr) do { (void)(ptr); } while (0) /* disabled */ + #endif /* NO_PREFETCH */ + + #define CACHELINE_SIZE 64 + +-#define PREFETCH_AREA(p, s) { \ +- const char* const _ptr = (const char*)(p); \ +- size_t const _size = (size_t)(s); \ +- size_t _pos; \ +- for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ +- PREFETCH_L2(_ptr + _pos); \ +- } \ +-} ++#define PREFETCH_AREA(p, s) \ ++ do { \ ++ const char* const _ptr = (const char*)(p); \ ++ size_t const _size = (size_t)(s); \ ++ size_t _pos; \ ++ for (_pos=0; _pos<_size; _pos+=CACHELINE_SIZE) { \ ++ PREFETCH_L2(_ptr + _pos); \ ++ } \ ++ } while (0) + + /* vectorization + * older GCC (pre gcc-4.3 picked as the cutoff) uses a different syntax, +@@ -126,16 +143,13 @@ + #define UNLIKELY(x) (__builtin_expect((x), 0)) + + #if __has_builtin(__builtin_unreachable) || (defined(__GNUC__) && (__GNUC__ > 4 || (__GNUC__ == 4 && __GNUC_MINOR__ >= 5))) +-# define ZSTD_UNREACHABLE { assert(0), __builtin_unreachable(); } ++# define ZSTD_UNREACHABLE do { assert(0), __builtin_unreachable(); } while (0) + #else +-# define ZSTD_UNREACHABLE { assert(0); } ++# define ZSTD_UNREACHABLE do { assert(0); } while (0) + #endif + + /* disable warnings */ + +-/*Like DYNAMIC_BMI2 but for compile time determination of BMI2 support*/ +- +- + /* compile time determination of SIMD support */ + + /* C-language Attributes are added in C23. 
*/ +@@ -158,9 +172,15 @@ + #define ZSTD_FALLTHROUGH fallthrough + + /*-************************************************************** +-* Alignment check ++* Alignment + *****************************************************************/ + ++/* @return 1 if @u is a 2^n value, 0 otherwise ++ * useful to check a value is valid for alignment restrictions */ ++MEM_STATIC int ZSTD_isPower2(size_t u) { ++ return (u & (u-1)) == 0; ++} ++ + /* this test was initially positioned in mem.h, + * but this file is removed (or replaced) for linux kernel + * so it's now hosted in compiler.h, +@@ -175,10 +195,95 @@ + + #endif /* ZSTD_ALIGNOF */ + ++#ifndef ZSTD_ALIGNED ++/* C90-compatible alignment macro (GCC/Clang). Adjust for other compilers if needed. */ ++#define ZSTD_ALIGNED(a) __attribute__((aligned(a))) ++#endif /* ZSTD_ALIGNED */ ++ ++ + /*-************************************************************** + * Sanitizer + *****************************************************************/ + ++/* ++ * Zstd relies on pointer overflow in its decompressor. ++ * We add this attribute to functions that rely on pointer overflow. ++ */ ++#ifndef ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# if __has_attribute(no_sanitize) ++# if !defined(__clang__) && defined(__GNUC__) && __GNUC__ < 8 ++ /* gcc < 8 only has signed-integer-overlow which triggers on pointer overflow */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("signed-integer-overflow"))) ++# else ++ /* older versions of clang [3.7, 5.0) will warn that pointer-overflow is ignored. */ ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR __attribute__((no_sanitize("pointer-overflow"))) ++# endif ++# else ++# define ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++# endif ++#endif ++ ++/* ++ * Helper function to perform a wrapped pointer difference without triggering ++ * UBSAN. ++ * ++ * @returns lhs - rhs with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++ptrdiff_t ZSTD_wrappedPtrDiff(unsigned char const* lhs, unsigned char const* rhs) ++{ ++ return lhs - rhs; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer add without triggering UBSAN. ++ * ++ * @return ptr + add with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrAdd(unsigned char const* ptr, ptrdiff_t add) ++{ ++ return ptr + add; ++} ++ ++/* ++ * Helper function to perform a wrapped pointer subtraction without triggering ++ * UBSAN. ++ * ++ * @return ptr - sub with wrapping ++ */ ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++unsigned char const* ZSTD_wrappedPtrSub(unsigned char const* ptr, ptrdiff_t sub) ++{ ++ return ptr - sub; ++} ++ ++/* ++ * Helper function to add to a pointer that works around C's undefined behavior ++ * of adding 0 to NULL. ++ * ++ * @returns `ptr + add` except it defines `NULL + 0 == NULL`. ++ */ ++MEM_STATIC ++unsigned char* ZSTD_maybeNullPtrAdd(unsigned char* ptr, ptrdiff_t add) ++{ ++ return add > 0 ? ptr + add : ptr; ++} ++ ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. Out of an ++ * abundance of caution, disable our custom poisoning on mingw. 
*/ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..8eb6aa9a3b20 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,10 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) ++/* We only use this when DEBUGLEVEL>=2, but we get -Werror=pedantic errors if a ++ * translation unit is empty. So remove this from Linux kernel builds, but ++ * otherwise just leave it in. ++ */ + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..c8a10281f112 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -33,7 +34,6 @@ + #define DEBUG_H_12987983217 + + +- + /* static assert is triggered at compile time, leaving no runtime artefact. + * static assert only works with compile-time constants. + * Also, this variant can only be used inside a function. */ +@@ -82,20 +82,27 @@ extern int g_debuglevel; /* the variable is only declared, + It's useful when enabling very verbose levels + on selective conditions (such as position in src) */ + +-# define RAWLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__VA_ARGS__); \ +- } } +-# define DEBUGLOG(l, ...) { \ +- if (l<=g_debuglevel) { \ +- ZSTD_DEBUG_PRINT(__FILE__ ": " __VA_ARGS__); \ +- ZSTD_DEBUG_PRINT(" \n"); \ +- } } ++# define RAWLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__VA_ARGS__); \ ++ } \ ++ } while (0) ++ ++#define STRINGIFY(x) #x ++#define TOSTRING(x) STRINGIFY(x) ++#define LINE_AS_STRING TOSTRING(__LINE__) ++ ++# define DEBUGLOG(l, ...) \ ++ do { \ ++ if (l<=g_debuglevel) { \ ++ ZSTD_DEBUG_PRINT(__FILE__ ":" LINE_AS_STRING ": " __VA_ARGS__); \ ++ ZSTD_DEBUG_PRINT(" \n"); \ ++ } \ ++ } while (0) + #else +-# define RAWLOG(l, ...) {} /* disabled */ +-# define DEBUGLOG(l, ...) {} /* disabled */ ++# define RAWLOG(l, ...) do { } while (0) /* disabled */ ++# define DEBUGLOG(l, ...) 
do { } while (0) /* disabled */ + #endif + +- +- + #endif /* DEBUG_H_12987983217 */ +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..6c3dbad838b6 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,23 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(cannotProduce_uncompressedBlock): return "This mode cannot generate an uncompressed block"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..08ee87b68cca 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,8 +14,6 @@ + #ifndef ERROR_H_MODULE + #define ERROR_H_MODULE + +- +- + /* **************************************** + * Dependencies + ******************************************/ +@@ -23,7 +22,6 @@ + #include "debug.h" + #include "zstd_deps.h" /* size_t */ + +- + /* **************************************** + * Compiler-specific + ******************************************/ +@@ -49,8 +47,13 @@ ERR_STATIC unsigned ERR_isError(size_t code) { return (code > ERROR(maxCode)); } + ERR_STATIC ERR_enum ERR_getErrorCode(size_t code) { if (!ERR_isError(code)) return (ERR_enum)0; return (ERR_enum) (0-code); } + + /* check and forward error code */ +-#define CHECK_V_F(e, f) size_t const e = f; if (ERR_isError(e)) return e +-#define CHECK_F(f) { CHECK_V_F(_var_err__, f); } ++#define CHECK_V_F(e, f) \ ++ size_t const e = f; \ ++ do { \ ++ if (ERR_isError(e)) \ ++ return e; \ ++ } while (0) ++#define CHECK_F(f) do { CHECK_V_F(_var_err__, f); } while (0) + + + /*-**************************************** +@@ -84,10 +87,12 @@ void _force_has_format_string(const char *format, ...) { + * We want to force this function invocation to be syntactically correct, but + * we don't want to force runtime evaluation of its arguments. + */ +-#define _FORCE_HAS_FORMAT_STRING(...) \ +- if (0) { \ +- _force_has_format_string(__VA_ARGS__); \ +- } ++#define _FORCE_HAS_FORMAT_STRING(...) \ ++ do { \ ++ if (0) { \ ++ _force_has_format_string(__VA_ARGS__); \ ++ } \ ++ } while (0) + + #define ERR_QUOTE(str) #str + +@@ -98,48 +103,49 @@ void _force_has_format_string(const char *format, ...) { + * In order to do that (particularly, printing the conditional that failed), + * this can't just wrap RETURN_ERROR(). + */ +-#define RETURN_ERROR_IF(cond, err, ...) \ +- if (cond) { \ +- RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } ++#define RETURN_ERROR_IF(cond, err, ...) \ ++ do { \ ++ if (cond) { \ ++ RAWLOG(3, "%s:%d: ERROR!: check %s failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(cond), ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } \ ++ } while (0) + + /* + * Unconditionally return the specified error. + * + * In debug modes, prints additional information. + */ +-#define RETURN_ERROR(err, ...) \ +- do { \ +- RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ +- __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return ERROR(err); \ +- } while(0); ++#define RETURN_ERROR(err, ...) \ ++ do { \ ++ RAWLOG(3, "%s:%d: ERROR!: unconditional check failed, returning %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(ERROR(err))); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return ERROR(err); \ ++ } while(0) + + /* + * If the provided expression evaluates to an error code, returns that error code. + * + * In debug modes, prints additional information. + */ +-#define FORWARD_IF_ERROR(err, ...) 
\ +- do { \ +- size_t const err_code = (err); \ +- if (ERR_isError(err_code)) { \ +- RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ +- __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ +- _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ +- RAWLOG(3, ": " __VA_ARGS__); \ +- RAWLOG(3, "\n"); \ +- return err_code; \ +- } \ +- } while(0); +- ++#define FORWARD_IF_ERROR(err, ...) \ ++ do { \ ++ size_t const err_code = (err); \ ++ if (ERR_isError(err_code)) { \ ++ RAWLOG(3, "%s:%d: ERROR!: forwarding error in %s: %s", \ ++ __FILE__, __LINE__, ERR_QUOTE(err), ERR_getErrorName(err_code)); \ ++ _FORCE_HAS_FORMAT_STRING(__VA_ARGS__); \ ++ RAWLOG(3, ": " __VA_ARGS__); \ ++ RAWLOG(3, "\n"); \ ++ return err_code; \ ++ } \ ++ } while(0) + + #endif /* ERROR_H_MODULE */ +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..b36ce7a2a8c3 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -11,8 +12,6 @@ + * in the COPYING file in the root directory of this source tree). + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ +- +- + #ifndef FSE_H + #define FSE_H + +@@ -22,7 +21,6 @@ + ******************************************/ + #include "zstd_deps.h" /* size_t, ptrdiff_t */ + +- + /*-***************************************** + * FSE_PUBLIC_API : control library symbols visibility + ******************************************/ +@@ -50,34 +48,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. 
+-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +58,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +117,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +192,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! 
+ Tutorial : +@@ -286,13 +224,11 @@ If there is an error, the function will return an error code, which can be teste + + #endif /* FSE_H */ + ++ + #if !defined(FSE_H_FSE_STATIC_LINKING_ONLY) + #define FSE_H_FSE_STATIC_LINKING_ONLY +- +-/* *** Dependency *** */ + #include "bitstream.h" + +- + /* ***************************************** + * Static allocation + *******************************************/ +@@ -317,16 +253,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? (1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +270,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. 
*/ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -539,20 +457,20 @@ MEM_STATIC void FSE_encodeSymbol(BIT_CStream_t* bitC, FSE_CState_t* statePtr, un + FSE_symbolCompressionTransform const symbolTT = ((const FSE_symbolCompressionTransform*)(statePtr->symbolTT))[symbol]; + const U16* const stateTable = (const U16*)(statePtr->stateTable); + U32 const nbBitsOut = (U32)((statePtr->value + symbolTT.deltaNbBits) >> 16); +- BIT_addBits(bitC, statePtr->value, nbBitsOut); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, nbBitsOut); + statePtr->value = stateTable[ (statePtr->value >> nbBitsOut) + symbolTT.deltaFindState]; + } + + MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePtr) + { +- BIT_addBits(bitC, statePtr->value, statePtr->stateLog); ++ BIT_addBits(bitC, (BitContainerType)statePtr->value, statePtr->stateLog); + BIT_flushBits(bitC); + } + + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +@@ -705,7 +623,4 @@ MEM_STATIC unsigned FSE_endOfDState(const FSE_DState_t* DStatePtr) + + #define FSE_TABLESTEP(tableSize) (((tableSize)>>1) + ((tableSize)>>3) + 3) + +- + #endif /* FSE_STATIC_LINKING_ONLY */ +- +- +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..15081d8dc607 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -22,8 +23,8 @@ + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" + #include "error_private.h" +-#define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" ++#include "zstd_deps.h" /* ZSTD_memcpy */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +56,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -96,7 +84,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + symbolNext[s] = 1; + } else { + if (normalizedCounter[s] >= largeLimit) DTableH.fastMode=0; +- symbolNext[s] = normalizedCounter[s]; ++ symbolNext[s] = (U16)normalizedCounter[s]; + } } } + ZSTD_memcpy(dt, &DTableH, sizeof(DTableH)); + } +@@ -111,8 +99,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + * all symbols have counts <= 8. We ensure we have 8 bytes at the end of + * our buffer to handle the over-write. + */ +- { +- U64 const add = 0x0101010101010101ull; ++ { U64 const add = 0x0101010101010101ull; + size_t pos = 0; + U64 sv = 0; + U32 s; +@@ -123,14 +110,13 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; +- } +- } ++ pos += (size_t)n; ++ } } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +152,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); ++ assert(op >= ostart); ++ return (size_t)(op-ostart); + } + + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; +- FSE_DTable dtable[]; /* Dynamically sized */ + } FSE_DecompressWksp; + + +@@ -327,13 +252,18 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + unsigned tableLog; + unsigned maxSymbolValue = FSE_MAX_SYMBOL_VALUE; + FSE_DecompressWksp* const wksp = (FSE_DecompressWksp*)workSpace; ++ size_t const dtablePos = sizeof(FSE_DecompressWksp) / sizeof(FSE_DTable); ++ FSE_DTable* const dtable = (FSE_DTable*)workSpace + dtablePos; + +- DEBUG_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); ++ FSE_STATIC_ASSERT((FSE_MAX_SYMBOL_VALUE + 1) % 2 == 0); + if (wkspSize < sizeof(*wksp)) return ERROR(GENERIC); + ++ /* correct offset to dtable depends on this property */ ++ FSE_STATIC_ASSERT(sizeof(FSE_DecompressWksp) % sizeof(FSE_DTable) == 0); ++ + /* normal FSE decoding mode */ +- { +- size_t const NCountLength = FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); ++ { size_t const NCountLength = ++ FSE_readNCount_bmi2(wksp->ncount, &maxSymbolValue, &tableLog, istart, cSrcSize, bmi2); + if (FSE_isError(NCountLength)) return NCountLength; + if (tableLog > maxLog) return ERROR(tableLog_tooLarge); + assert(NCountLength <= cSrcSize); +@@ -342,19 +272,20 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + +- CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); ++ CHECK_F( FSE_buildDTable_internal(dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); + + { +- const void* ptr = wksp->dtable; ++ const void* ptr = dtable; + const FSE_DTableHeader* DTableH = (const FSE_DTableHeader*)ptr; + const U32 fastMode = DTableH->fastMode; + + /* select fast mode (static) */ +- if 
(fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 1); +- return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, wksp->dtable, 0); ++ if (fastMode) return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 1); ++ return FSE_decompress_usingDTable_generic(dst, dstCapacity, ip, cSrcSize, dtable, 0); + } + } + +@@ -382,9 +313,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..49736dcd8f49 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -12,105 +13,26 @@ + * You may select, at your option, one of the above-listed licenses. + ****************************************************************** */ + +- + #ifndef HUF_H_298734234 + #define HUF_H_298734234 + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). +- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. 
+- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); +- ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. +- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. 
+- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +73,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. ++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. 
++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +128,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +142,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -206,13 +153,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. 
+ */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +185,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -246,9 +193,22 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + /* HUF_getNbBitsFromCTable() : + * Read nbBits from CTable symbolTable, for symbol `symbolValue` presumed <= HUF_SYMBOLVALUE_MAX +- * Note 1 : is not inlined, as HUF_CElt definition is private */ ++ * Note 1 : If symbolValue > HUF_readCTableHeader(symbolTable).maxSymbolValue, returns 0 ++ * Note 2 : is not inlined, as HUF_CElt definition is private ++ */ + U32 HUF_getNbBitsFromCTable(const HUF_CElt* symbolTable, U32 symbolValue); + ++typedef struct { ++ BYTE tableLog; ++ BYTE maxSymbolValue; ++ BYTE unused[sizeof(size_t) - 2]; ++} HUF_CTableHeader; ++ ++/* HUF_readCTableHeader() : ++ * @returns The header from the CTable specifying the tableLog and the maxSymbolValue. ++ */ ++HUF_CTableHeader HUF_readCTableHeader(HUF_CElt const* ctable); ++ + /* + * HUF_decompress() does the following: + * 1. select the decompression algorithm (X1, X2) based on pre-computed heuristics +@@ -276,32 +236,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, 
const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +252,27 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ +- ++#endif /* HUF_H_298734234 */ +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index c22a2e69bf46..d9bd752fe17b 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,6 +24,7 @@ + /*-**************************************** + * Compiler specifics + ******************************************/ ++#undef MEM_STATIC /* may be already defined from common/compiler.h */ + #define MEM_STATIC static inline + + /*-************************************************************** +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0dde8bf56595..efae9465d57d 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. 
It MUST not contain any C code. + * +@@ -45,30 +46,37 @@ + /* Mark the internal assembly functions as hidden */ + #ifdef __ELF__ + # define ZSTD_HIDE_ASM_FUNCTION(func) .hidden func ++#elif defined(__APPLE__) ++# define ZSTD_HIDE_ASM_FUNCTION(func) .private_extern func + #else + # define ZSTD_HIDE_ASM_FUNCTION(func) + #endif + ++/* Compile time determination of BMI2 support */ ++ ++ + /* Enable runtime BMI2 dispatch based on the CPU. +- * Enabled for clang & gcc >=4.8 on x86 when BMI2 isn't enabled by default. ++ * Enabled for clang & gcc >= 11.4 on x86 when BMI2 isn't enabled by default. ++ * Disabled for gcc < 11.4 because of a segfault while compiling ++ * HUF_compress1X_usingCTable_internal_body(). + */ + #ifndef DYNAMIC_BMI2 +- #if ((defined(__clang__) && __has_attribute(__target__)) \ ++# if ((defined(__clang__) && __has_attribute(__target__)) \ + || (defined(__GNUC__) \ +- && (__GNUC__ >= 11))) \ +- && (defined(__x86_64__) || defined(_M_X64)) \ ++ && (__GNUC__ >= 12 || (__GNUC__ == 11 && __GNUC_MINOR__ >= 4)))) \ ++ && (defined(__i386__) || defined(__x86_64__) || defined(_M_IX86) || defined(_M_X64)) \ + && !defined(__BMI2__) +- # define DYNAMIC_BMI2 1 +- #else +- # define DYNAMIC_BMI2 0 +- #endif ++# define DYNAMIC_BMI2 1 ++# else ++# define DYNAMIC_BMI2 0 ++# endif + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNU C compatible compilers, + * because other platforms may not support GAS assembly syntax. + * +- * Only enable assembly for Linux / MacOS, other platforms may ++ * Only enable assembly for Linux / MacOS / Win32, other platforms may + * work, but they haven't been tested. This could likely be + * extended to BSD systems. + * +@@ -90,4 +98,23 @@ + */ + #define ZSTD_ENABLE_ASM_X86_64_BMI2 0 + ++/* ++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in ++ * assembly sources when CET is enabled. ++ * ++ * Additionally, any function that may be called indirectly must begin ++ * with ZSTD_CET_ENDBRANCH. ++ */ ++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \ ++ && defined(__has_include) ++# if __has_include() ++# include ++# define ZSTD_CET_ENDBRANCH _CET_ENDBR ++# endif ++#endif ++ ++#ifndef ZSTD_CET_ENDBRANCH ++# define ZSTD_CET_ENDBRANCH ++#endif ++ + #endif /* ZSTD_PORTABILITY_MACROS_H */ +diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c +index 3d7e35b309b5..44b95b25344a 100644 +--- a/lib/zstd/common/zstd_common.c ++++ b/lib/zstd/common/zstd_common.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,6 @@ + * Dependencies + ***************************************/ + #define ZSTD_DEPS_NEED_MALLOC +-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */ + #include "error_private.h" + #include "zstd_internal.h" + +@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); } + /*! 
ZSTD_getErrorString() : + * provides error code string from enum */ + const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); } +- +- +- +-/*=************************************************************** +-* Custom allocator +-****************************************************************/ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) +- return customMem.customAlloc(customMem.opaque, size); +- return ZSTD_malloc(size); +-} +- +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem) +-{ +- if (customMem.customAlloc) { +- /* calloc implemented as malloc+memset; +- * not as efficient as calloc, but next best guess for custom malloc */ +- void* const ptr = customMem.customAlloc(customMem.opaque, size); +- ZSTD_memset(ptr, 0, size); +- return ptr; +- } +- return ZSTD_calloc(1, size); +-} +- +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem) +-{ +- if (ptr!=NULL) { +- if (customMem.customFree) +- customMem.customFree(customMem.opaque, ptr); +- else +- ZSTD_free(ptr); +- } +-} +diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h +index 2c34e8a33a1c..f931f7d0e294 100644 +--- a/lib/zstd/common/zstd_deps.h ++++ b/lib/zstd/common/zstd_deps.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) { + + #endif /* ZSTD_DEPS_IO */ + #endif /* ZSTD_DEPS_NEED_IO */ ++ ++/* ++ * Only requested when MSAN is enabled. ++ * Need: ++ * intptr_t ++ */ ++#ifdef ZSTD_DEPS_NEED_STDINT ++#ifndef ZSTD_DEPS_STDINT ++#define ZSTD_DEPS_STDINT ++ ++/* intptr_t already provided by ZSTD_DEPS_COMMON */ ++ ++#endif /* ZSTD_DEPS_STDINT */ ++#endif /* ZSTD_DEPS_NEED_STDINT */ +diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h +index 93305d9b41bb..52a79435caf6 100644 +--- a/lib/zstd/common/zstd_internal.h ++++ b/lib/zstd/common/zstd_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -28,12 +29,10 @@ + #include + #define FSE_STATIC_LINKING_ONLY + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "huf.h" + #include /* XXH_reset, update, digest */ + #define ZSTD_TRACE 0 + +- + /* ---- static assert (debug) --- */ + #define ZSTD_STATIC_ASSERT(c) DEBUG_STATIC_ASSERT(c) + #define ZSTD_isError ERR_isError /* for inlining */ +@@ -83,16 +82,17 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e; + #define ZSTD_FRAMECHECKSUMSIZE 4 + + #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */ +-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */ ++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */ ++#define MIN_LITERALS_FOR_4_STREAMS 6 + +-#define HufLog 12 +-typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e; ++typedef enum { set_basic, set_rle, set_compressed, set_repeat } SymbolEncodingType_e; + + #define LONGNBSEQ 0x7F00 + + #define MINMATCH 3 + + #define Litbits 8 ++#define LitHufLog 11 + #define MaxLit ((1<= WILDCOPY_VECLEN || diff <= -WILDCOPY_VECLEN); +@@ -225,12 +227,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + * one COPY16() in the first call. Then, do two calls per loop since + * at that point it is more likely to have a high trip count. + */ +-#ifdef __aarch64__ +- do { +- COPY16(op, ip); +- } +- while (op < oend); +-#else + ZSTD_copy16(op, ip); + if (16 >= length) return; + op += 16; +@@ -240,7 +236,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e + COPY16(op, ip); + } + while (op < oend); +-#endif + } + } + +@@ -273,62 +268,6 @@ typedef enum { + /*-******************************************* + * Private declarations + *********************************************/ +-typedef struct seqDef_s { +- U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ +- U16 litLength; +- U16 mlBase; /* mlBase == matchLength - MINMATCH */ +-} seqDef; +- +-/* Controls whether seqStore has a single "long" litLength or matchLength. See seqStore_t. */ +-typedef enum { +- ZSTD_llt_none = 0, /* no longLengthType */ +- ZSTD_llt_literalLength = 1, /* represents a long literal */ +- ZSTD_llt_matchLength = 2 /* represents a long match */ +-} ZSTD_longLengthType_e; +- +-typedef struct { +- seqDef* sequencesStart; +- seqDef* sequences; /* ptr to end of sequences */ +- BYTE* litStart; +- BYTE* lit; /* ptr to end of literals */ +- BYTE* llCode; +- BYTE* mlCode; +- BYTE* ofCode; +- size_t maxNbSeq; +- size_t maxNbLit; +- +- /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength +- * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment +- * the existing value of the litLength or matchLength by 0x10000. +- */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ +-} seqStore_t; +- +-typedef struct { +- U32 litLength; +- U32 matchLength; +-} ZSTD_sequenceLength; +- +-/* +- * Returns the ZSTD_sequenceLength for the given sequences. It handles the decoding of long sequences +- * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. 
+- */ +-MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore, seqDef const* seq) +-{ +- ZSTD_sequenceLength seqLen; +- seqLen.litLength = seq->litLength; +- seqLen.matchLength = seq->mlBase + MINMATCH; +- if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { +- if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; +- } +- if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; +- } +- } +- return seqLen; +-} + + /* + * Contains the compressed frame size and an upper-bound for the decompressed frame size. +@@ -337,74 +276,11 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. +- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} +- +- + /* ZSTD_invalidateRepCodes() : + * ensures next compression will not use repcodes from previous block. + * Note : only works with regular variant; +@@ -420,13 +296,13 @@ typedef struct { + + /*! 
ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: decompress, fullbench */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + blockProperties_t* bpPtr); + + /*! ZSTD_decodeSeqHeaders() : + * decode sequence header from src */ +-/* Used by: decompress, fullbench (does not get its definition from here) */ ++/* Used by: zstd_decompress_block, fullbench */ + size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + const void* src, size_t srcSize); + +@@ -439,5 +315,4 @@ MEM_STATIC int ZSTD_cpuSupportsBmi2(void) + return ZSTD_cpuid_bmi1(cpuid) && ZSTD_cpuid_bmi2(cpuid); + } + +- + #endif /* ZSTD_CCOMMON_H_MODULE */ +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..44a3c10becf2 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -25,7 +26,8 @@ + #include "../common/error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 +-#include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -224,8 +226,8 @@ size_t FSE_NCountWriteBound(unsigned maxSymbolValue, unsigned tableLog) + size_t const maxHeaderSize = (((maxSymbolValue+1) * tableLog + + 4 /* bitCount initialized at 4 */ + + 2 /* first two symbols may use one additional bit each */) / 8) +- + 1 /* 
round up to whole nb bytes */ +- + 2 /* additional two bytes for bitstream flush */; ++ + 1 /* round up to whole nb bytes */ ++ + 2 /* additional two bytes for bitstream flush */; + return maxSymbolValue ? maxHeaderSize : FSE_NCOUNTBOUND; /* maxSymbolValue==0 ? use default */ + } + +@@ -254,7 +256,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + /* Init */ + remaining = tableSize+1; /* +1 for extra accuracy */ + threshold = tableSize; +- nbBits = tableLog+1; ++ nbBits = (int)tableLog+1; + + while ((symbol < alphabetSize) && (remaining>1)) { /* stops at 1 */ + if (previousIs0) { +@@ -273,7 +275,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + } + while (symbol >= start+3) { + start+=3; +- bitStream += 3 << bitCount; ++ bitStream += 3U << bitCount; + bitCount += 2; + } + bitStream += (symbol-start) << bitCount; +@@ -293,7 +295,7 @@ FSE_writeNCount_generic (void* header, size_t headerBufferSize, + count++; /* +1 for extra accuracy */ + if (count>=threshold) + count += max; /* [0..max[ [max..threshold[ (...) [threshold+max 2*threshold[ */ +- bitStream += count << bitCount; ++ bitStream += (U32)count << bitCount; + bitCount += nbBits; + bitCount -= (count>8); + out+= (bitCount+7) /8; + +- return (out-ostart); ++ assert(out >= ostart); ++ return (size_t)(out-ostart); + } + + +@@ -342,21 +345,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +357,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +525,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) + { +- return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ ++ assert(HUF_readCTableHeader(CTable).maxSymbolValue == maxSymbolValue); ++ assert(HUF_readCTableHeader(CTable).tableLog == huffLog); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +286,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. 
+- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -231,7 +303,9 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + if (tableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + if (nbSymbols > *maxSymbolValuePtr+1) return ERROR(maxSymbolValue_tooSmall); + +- CTable[0] = tableLog; ++ *maxSymbolValuePtr = nbSymbols - 1; ++ ++ HUF_writeCTableHeader(CTable, tableLog, *maxSymbolValuePtr); + + /* Prepare base value per rank */ + { U32 n, nextRankStart = 0; +@@ -263,74 +337,71 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + { U32 n; for (n=0; n HUF_readCTableHeader(CTable).maxSymbolValue) ++ return 0; + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. + * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. 
*/ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +410,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +462,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +474,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). 
++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +492,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +500,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +519,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +528,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +651,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +662,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). 
+@@ -611,6 +683,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +710,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -671,31 +746,40 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + HUF_setNbBits(ct + huffNode[n].byte, huffNode[n].nbBits); /* push nbBits per symbol, symbol order */ + for (n=0; nhuffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -716,13 +800,20 @@ size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, + } + + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue) { +- HUF_CElt const* ct = CTable + 1; +- int bad = 0; +- int s; +- for (s = 0; s <= (int)maxSymbolValue; ++s) { +- bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); +- } +- return !bad; ++ HUF_CTableHeader header = HUF_readCTableHeader(CTable); ++ HUF_CElt const* ct = CTable + 1; ++ int bad = 0; ++ int s; ++ ++ assert(header.tableLog <= HUF_TABLELOG_ABSOLUTEMAX); ++ ++ if (header.maxSymbolValue < maxSymbolValue) ++ return 0; ++ ++ for (s = 0; s <= (int)maxSymbolValue; ++s) { ++ bad |= (count[s] != 0) & (HUF_getNbBits(ct[s]) == 0); ++ } ++ return !bad; + } + + size_t HUF_compressBound(size_t size) { return HUF_COMPRESSBOUND(size); } +@@ -804,7 +895,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. 
*/ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +975,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -964,17 +1055,17 @@ HUF_compress1X_usingCTable_internal_body(void* dst, size_t dstSize, + const void* src, size_t srcSize, + const HUF_CElt* CTable) + { +- U32 const tableLog = (U32)CTable[0]; ++ U32 const tableLog = HUF_readCTableHeader(CTable).tableLog; + HUF_CElt const* ct = CTable + 1; + const BYTE* ip = (const BYTE*) src; + BYTE* const ostart = (BYTE*)dst; + BYTE* const oend = ostart + dstSize; +- BYTE* op = ostart; + HUF_CStream_t bitC; + + /* init */ + if (dstSize < 8) return 0; /* not enough space to compress */ +- { size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); ++ { BYTE* op = ostart; ++ size_t const initErr = HUF_initCStream(&bitC, op, (size_t)(oend-op)); + if (HUF_isError(initErr)) return 0; } + + if (dstSize < HUF_tightCompressBound(srcSize, (size_t)tableLog) || tableLog > 11) +@@ -1045,9 +1136,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1149,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1179,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) 
return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1187,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1195,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1204,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1212,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1222,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1249,81 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ ++ { size_t maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ } ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1333,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart 
= (BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1354,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1381,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,25 +1390,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; +- } +- /* Zero unused symbols in CTable, so we can check it for validity */ +- { +- size_t const ctableSize = HUF_CTABLE_SIZE_ST(maxSymbolValue); +- size_t const unusedSize = sizeof(table->CTable) - ctableSize * sizeof(HUF_CElt); +- ZSTD_memset(table->CTable + ctableSize, 0, unusedSize); ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + + /* Write table description header */ +@@ -1263,7 +1416,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new 
huffman table */ +@@ -1275,61 +1428,35 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): + * compress input using 4 streams. + * consider skipping quickly +- * re-use an existing huffman compression table */ ++ * reuse an existing huffman compression table */ + size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index 16bb995bc6c4..c41a747413e0 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,13 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" ++#include "../common/error_private.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +29,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -44,7 +47,7 @@ + * in log format, aka 17 => 1 << 17 == 128Ki positions. + * This structure is only used in zstd_opt. + * Since allocation is centralized for all strategies, it has to be known here. +- * The actual (selected) size of the hash table is then stored in ZSTD_matchState_t.hashLog3, ++ * The actual (selected) size of the hash table is then stored in ZSTD_MatchState_t.hashLog3, + * so that zstd_opt.c doesn't need to know about this constant. + */ + #ifndef ZSTD_HASHLOG3_MAX +@@ -55,14 +58,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -75,12 +81,12 @@ struct ZSTD_CDict_s { + ZSTD_dictContentType_e dictContentType; /* The dictContentType the CDict was created with */ + U32* entropyWorkspace; /* entropy workspace of HUF_WORKSPACE_SIZE bytes */ + ZSTD_cwksp workspace; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + ZSTD_compressedBlockState_t cBlockState; + ZSTD_customMem customMem; + U32 dictID; + int compressionLevel; /* 0 indicates that advanced API was used to select CDict params */ +- ZSTD_paramSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use ++ ZSTD_ParamSwitch_e useRowMatchFinder; /* Indicates whether the CDict was created with params that would use + * row-based matchfinder. Unless the cdict is reloaded, we will use + * the same greedy/lazy matchfinder at compression time. + */ +@@ -130,11 +136,12 @@ ZSTD_CCtx* ZSTD_initStaticCCtx(void* workspace, size_t workspaceSize) + ZSTD_cwksp_move(&cctx->workspace, &ws); + cctx->staticSize = workspaceSize; + +- /* statically sized space. 
entropyWorkspace never moves (but prev/next block swap places) */ +- if (!ZSTD_cwksp_check_available(&cctx->workspace, ENTROPY_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; ++ /* statically sized space. tmpWorkspace never moves (but prev/next block swap places) */ ++ if (!ZSTD_cwksp_check_available(&cctx->workspace, TMP_WORKSPACE_SIZE + 2 * sizeof(ZSTD_compressedBlockState_t))) return NULL; + cctx->blockState.prevCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); + cctx->blockState.nextCBlock = (ZSTD_compressedBlockState_t*)ZSTD_cwksp_reserve_object(&cctx->workspace, sizeof(ZSTD_compressedBlockState_t)); +- cctx->entropyWorkspace = (U32*)ZSTD_cwksp_reserve_object(&cctx->workspace, ENTROPY_WORKSPACE_SIZE); ++ cctx->tmpWorkspace = ZSTD_cwksp_reserve_object(&cctx->workspace, TMP_WORKSPACE_SIZE); ++ cctx->tmpWkspSize = TMP_WORKSPACE_SIZE; + cctx->bmi2 = ZSTD_cpuid_bmi2(ZSTD_cpuid()); + return cctx; + } +@@ -168,15 +175,13 @@ static void ZSTD_freeCCtxContent(ZSTD_CCtx* cctx) + + size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + { ++ DEBUGLOG(3, "ZSTD_freeCCtx (address: %p)", (void*)cctx); + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -205,7 +210,7 @@ size_t ZSTD_sizeof_CStream(const ZSTD_CStream* zcs) + } + + /* private API call, for dictBuilder only */ +-const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx) { return &(ctx->seqStore); } + + /* Returns true if the strategy supports using a row based matchfinder */ + static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { +@@ -215,32 +220,27 @@ static int ZSTD_rowMatchFinderSupported(const ZSTD_strategy strategy) { + /* Returns true if the strategy and useRowMatchFinder mode indicate that we will use the row based matchfinder + * for this compression. + */ +-static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_paramSwitch_e mode) { ++static int ZSTD_rowMatchFinderUsed(const ZSTD_strategy strategy, const ZSTD_ParamSwitch_e mode) { + assert(mode != ZSTD_ps_auto); + return ZSTD_rowMatchFinderSupported(strategy) && (mode == ZSTD_ps_enable); + } + + /* Returns row matchfinder usage given an initial mode and cParams */ +-static ZSTD_paramSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveRowMatchFinderMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +-#if defined(ZSTD_ARCH_X86_SSE2) || defined(ZSTD_ARCH_ARM_NEON) +- int const kHasSIMD128 = 1; +-#else +- int const kHasSIMD128 = 0; +-#endif ++ /* The Linux Kernel does not use SIMD, and 128KB is a very common size, e.g. in BtrFS. ++ * The row match finder is slower for this size without SIMD, so disable it. 
++ */ ++ const unsigned kWindowLogLowerBound = 17; + if (mode != ZSTD_ps_auto) return mode; /* if requested enabled, but no SIMD, we still will use row matchfinder */ + mode = ZSTD_ps_disable; + if (!ZSTD_rowMatchFinderSupported(cParams->strategy)) return mode; +- if (kHasSIMD128) { +- if (cParams->windowLog > 14) mode = ZSTD_ps_enable; +- } else { +- if (cParams->windowLog > 17) mode = ZSTD_ps_enable; +- } ++ if (cParams->windowLog > kWindowLogLowerBound) mode = ZSTD_ps_enable; + return mode; + } + + /* Returns block splitter usage (generally speaking, when using slower/stronger compression modes) */ +-static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 17) ? ZSTD_ps_enable : ZSTD_ps_disable; +@@ -248,7 +248,7 @@ static ZSTD_paramSwitch_e ZSTD_resolveBlockSplitterMode(ZSTD_paramSwitch_e mode, + + /* Returns 1 if the arguments indicate that we should allocate a chainTable, 0 otherwise */ + static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const U32 forDDSDict) { + assert(useRowMatchFinder != ZSTD_ps_auto); + /* We always should allocate a chaintable if we are allocating a matchstate for a DDS dictionary matchstate. +@@ -257,16 +257,44 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ +-static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, ++static ZSTD_ParamSwitch_e ZSTD_resolveEnableLdm(ZSTD_ParamSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { + if (mode != ZSTD_ps_auto) return mode; + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. */ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_ParamSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_ParamSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. 
*/ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -282,8 +310,12 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + assert(cctxParams.ldmParams.hashLog >= cctxParams.ldmParams.bucketSizeLog); + assert(cctxParams.ldmParams.hashRateLog < 32); + } +- cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); ++ cctxParams.postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.postBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +361,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -343,10 +378,13 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + */ + cctxParams->compressionLevel = compressionLevel; + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); +- cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); ++ cctxParams->postBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->postBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", +- cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); ++ cctxParams->useRowMatchFinder, cctxParams->postBlockSplitter, cctxParams->ldmParams.enableLdm); + } + + size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_parameters params) +@@ -359,7 +397,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +493,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -534,11 +572,16 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + ++ case ZSTD_c_blockSplitterLevel: ++ bounds.lowerBound = 0; ++ bounds.upperBound = ZSTD_BLOCKSPLITTER_LEVEL_MAX; ++ return bounds; ++ + case ZSTD_c_useRowMatchFinder: + bounds.lowerBound = (int)ZSTD_ps_auto; + bounds.upperBound = (int)ZSTD_ps_disable; +@@ -549,6 +592,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_repcodeResolution: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -567,10 +630,11 @@ static size_t ZSTD_cParam_clampBounds(ZSTD_cParameter cParam, int* value) + return 0; + } + +-#define BOUNDCHECK(cParam, val) { \ +- RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ +- parameter_outOfBound, "Param out of bounds"); \ +-} ++#define BOUNDCHECK(cParam, val) \ ++ do { \ ++ RETURN_ERROR_IF(!ZSTD_cParam_withinBounds(cParam,val), \ ++ parameter_outOfBound, "Param out of bounds"); \ ++ } while (0) + + + static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) +@@ -584,6 +648,7 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_minMatch: + case ZSTD_c_targetLength: + case ZSTD_c_strategy: ++ case ZSTD_c_blockSplitterLevel: + return 1; + + case ZSTD_c_format: +@@ -610,9 +675,13 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + default: + return 0; + } +@@ -625,7 +694,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -665,9 +734,14 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_stableOutBuffer: + case ZSTD_c_blockDelimiters: + case ZSTD_c_validateSequences: +- case 
ZSTD_c_useBlockSplitter: ++ case ZSTD_c_splitAfterSequences: ++ case ZSTD_c_blockSplitterLevel: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_repcodeResolution: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +797,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +815,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return (size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +829,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { +- const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ const ZSTD_ParamSwitch_e lcm = (ZSTD_ParamSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +863,50 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : +- CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); ++ CCtxParams->ldmParams.enableLdm = (ZSTD_ParamSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- 
CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : +- if (value!=0) /* 0 ==> default */ ++ if (value!=0) { /* 0 ==> default */ ++ value = MAX(value, ZSTD_TARGETCBLOCKSIZE_MIN); + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ } ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -843,28 +920,55 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_blockDelimiters: + BOUNDCHECK(ZSTD_c_blockDelimiters, value); +- CCtxParams->blockDelimiters = (ZSTD_sequenceFormat_e)value; ++ CCtxParams->blockDelimiters = (ZSTD_SequenceFormat_e)value; + return CCtxParams->blockDelimiters; + + case ZSTD_c_validateSequences: + BOUNDCHECK(ZSTD_c_validateSequences, value); + CCtxParams->validateSequences = value; +- return CCtxParams->validateSequences; ++ return (size_t)CCtxParams->validateSequences; ++ ++ case ZSTD_c_splitAfterSequences: ++ BOUNDCHECK(ZSTD_c_splitAfterSequences, value); ++ CCtxParams->postBlockSplitter = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->postBlockSplitter; + +- case ZSTD_c_useBlockSplitter: +- BOUNDCHECK(ZSTD_c_useBlockSplitter, value); +- CCtxParams->useBlockSplitter = (ZSTD_paramSwitch_e)value; +- return CCtxParams->useBlockSplitter; ++ case ZSTD_c_blockSplitterLevel: ++ BOUNDCHECK(ZSTD_c_blockSplitterLevel, value); ++ CCtxParams->preBlockSplitter_level = value; ++ return (size_t)CCtxParams->preBlockSplitter_level; + + case ZSTD_c_useRowMatchFinder: + BOUNDCHECK(ZSTD_c_useRowMatchFinder, value); +- CCtxParams->useRowMatchFinder = (ZSTD_paramSwitch_e)value; ++ CCtxParams->useRowMatchFinder = (ZSTD_ParamSwitch_e)value; + return CCtxParams->useRowMatchFinder; + + case ZSTD_c_deterministicRefPrefix: + BOUNDCHECK(ZSTD_c_deterministicRefPrefix, value); + CCtxParams->deterministicRefPrefix = !!value; +- return CCtxParams->deterministicRefPrefix; ++ return (size_t)CCtxParams->deterministicRefPrefix; ++ ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ CCtxParams->enableMatchFinderFallback = value; ++ return (size_t)CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ assert(value>=0); ++ CCtxParams->maxBlockSize = (size_t)value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_repcodeResolution: ++ 
BOUNDCHECK(ZSTD_c_repcodeResolution, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_ParamSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } +@@ -881,7 +985,7 @@ size_t ZSTD_CCtxParams_getParameter( + switch(param) + { + case ZSTD_c_format : +- *value = CCtxParams->format; ++ *value = (int)CCtxParams->format; + break; + case ZSTD_c_compressionLevel : + *value = CCtxParams->compressionLevel; +@@ -896,16 +1000,16 @@ size_t ZSTD_CCtxParams_getParameter( + *value = (int)CCtxParams->cParams.chainLog; + break; + case ZSTD_c_searchLog : +- *value = CCtxParams->cParams.searchLog; ++ *value = (int)CCtxParams->cParams.searchLog; + break; + case ZSTD_c_minMatch : +- *value = CCtxParams->cParams.minMatch; ++ *value = (int)CCtxParams->cParams.minMatch; + break; + case ZSTD_c_targetLength : +- *value = CCtxParams->cParams.targetLength; ++ *value = (int)CCtxParams->cParams.targetLength; + break; + case ZSTD_c_strategy : +- *value = (unsigned)CCtxParams->cParams.strategy; ++ *value = (int)CCtxParams->cParams.strategy; + break; + case ZSTD_c_contentSizeFlag : + *value = CCtxParams->fParams.contentSizeFlag; +@@ -920,10 +1024,10 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->forceWindow; + break; + case ZSTD_c_forceAttachDict : +- *value = CCtxParams->attachDictPref; ++ *value = (int)CCtxParams->attachDictPref; + break; + case ZSTD_c_literalCompressionMode : +- *value = CCtxParams->literalCompressionMode; ++ *value = (int)CCtxParams->literalCompressionMode; + break; + case ZSTD_c_nbWorkers : + assert(CCtxParams->nbWorkers == 0); +@@ -939,19 +1043,19 @@ size_t ZSTD_CCtxParams_getParameter( + *value = CCtxParams->enableDedicatedDictSearch; + break; + case ZSTD_c_enableLongDistanceMatching : +- *value = CCtxParams->ldmParams.enableLdm; ++ *value = (int)CCtxParams->ldmParams.enableLdm; + break; + case ZSTD_c_ldmHashLog : +- *value = CCtxParams->ldmParams.hashLog; ++ *value = (int)CCtxParams->ldmParams.hashLog; + break; + case ZSTD_c_ldmMinMatch : +- *value = CCtxParams->ldmParams.minMatchLength; ++ *value = (int)CCtxParams->ldmParams.minMatchLength; + break; + case ZSTD_c_ldmBucketSizeLog : +- *value = CCtxParams->ldmParams.bucketSizeLog; ++ *value = (int)CCtxParams->ldmParams.bucketSizeLog; + break; + case ZSTD_c_ldmHashRateLog : +- *value = CCtxParams->ldmParams.hashRateLog; ++ *value = (int)CCtxParams->ldmParams.hashRateLog; + break; + case ZSTD_c_targetCBlockSize : + *value = (int)CCtxParams->targetCBlockSize; +@@ -971,8 +1075,11 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_validateSequences : + *value = (int)CCtxParams->validateSequences; + break; +- case ZSTD_c_useBlockSplitter : +- *value = (int)CCtxParams->useBlockSplitter; ++ case ZSTD_c_splitAfterSequences : ++ *value = (int)CCtxParams->postBlockSplitter; ++ break; ++ case ZSTD_c_blockSplitterLevel : ++ *value = CCtxParams->preBlockSplitter_level; + break; + case ZSTD_c_useRowMatchFinder : + *value = (int)CCtxParams->useRowMatchFinder; +@@ -980,6 +1087,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_repcodeResolution: ++ *value = 
(int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1125,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, (int)cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, (int)cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, (int)cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, (int)cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, (int)cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, (int)cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, (int)cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1181,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1196,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. 
*/ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1217,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,7 +1310,7 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } +@@ -1168,7 +1329,7 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) + BOUNDCHECK(ZSTD_c_searchLog, (int)cParams.searchLog); + BOUNDCHECK(ZSTD_c_minMatch, (int)cParams.minMatch); + BOUNDCHECK(ZSTD_c_targetLength,(int)cParams.targetLength); +- BOUNDCHECK(ZSTD_c_strategy, cParams.strategy); ++ BOUNDCHECK(ZSTD_c_strategy, (int)cParams.strategy); + return 0; + } + +@@ -1178,11 +1339,12 @@ size_t ZSTD_checkCParams(ZSTD_compressionParameters cParams) + static ZSTD_compressionParameters + ZSTD_clampCParams(ZSTD_compressionParameters cParams) + { +-# define CLAMP_TYPE(cParam, val, type) { \ +- ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ +- if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ +- else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ +- } ++# define CLAMP_TYPE(cParam, val, type) \ ++ do { \ ++ ZSTD_bounds const bounds = ZSTD_cParam_getBounds(cParam); \ ++ if ((int)val<bounds.lowerBound) val=(type)bounds.lowerBound; \ ++ else if ((int)val>bounds.upperBound) val=(type)bounds.upperBound; \ ++ } while (0) + # define CLAMP(cParam, val) CLAMP_TYPE(cParam, val, unsigned) + CLAMP(ZSTD_c_windowLog, cParams.windowLog); + CLAMP(ZSTD_c_chainLog, cParams.chainLog); +@@
-1240,19 +1402,62 @@ static U32 ZSTD_dictAndWindowLog(U32 windowLog, U64 srcSize, U64 dictSize) + * optimize `cPar` for a specified input (`srcSize` and `dictSize`). + * mostly downsize to reduce memory consumption and initialization latency. + * `srcSize` can be ZSTD_CONTENTSIZE_UNKNOWN when not known. +- * `mode` is the mode for parameter adjustment. See docs for `ZSTD_cParamMode_e`. ++ * `mode` is the mode for parameter adjustment. See docs for `ZSTD_CParamMode_e`. + * note : `srcSize==0` means 0! + * condition : cPar is presumed validated (can be checked using ZSTD_checkCParams()). */ + static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_CParamMode_e mode, ++ ZSTD_ParamSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); + assert(ZSTD_checkCParams(cPar)==0); + ++ /* Cascade the selected strategy down to the next-highest one built into ++ * this binary. */ ++#ifdef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btultra2) { ++ cPar.strategy = ZSTD_btultra; ++ } ++ if (cPar.strategy == ZSTD_btultra) { ++ cPar.strategy = ZSTD_btopt; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btopt) { ++ cPar.strategy = ZSTD_btlazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_btlazy2) { ++ cPar.strategy = ZSTD_lazy2; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy2) { ++ cPar.strategy = ZSTD_lazy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_lazy) { ++ cPar.strategy = ZSTD_greedy; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_greedy) { ++ cPar.strategy = ZSTD_dfast; ++ } ++#endif ++#ifdef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ if (cPar.strategy == ZSTD_dfast) { ++ cPar.strategy = ZSTD_fast; ++ cPar.targetLength = 0; ++ } ++#endif ++ + switch (mode) { + case ZSTD_cpm_unknown: + case ZSTD_cpm_noAttachDict: +@@ -1281,8 +1486,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1505,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. 
++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,11 +1551,11 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); ++static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + static void ZSTD_overrideCParams( + ZSTD_compressionParameters* cParams, +@@ -1330,24 +1571,25 @@ static void ZSTD_overrideCParams( + } + + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + ZSTD_compressionParameters cParams; + if (srcSizeHint == ZSTD_CONTENTSIZE_UNKNOWN && CCtxParams->srcSizeHint > 0) { +- srcSizeHint = CCtxParams->srcSizeHint; ++ assert(CCtxParams->srcSizeHint>=0); ++ srcSizeHint = (U64)CCtxParams->srcSizeHint; + } + cParams = ZSTD_getCParams_internal(CCtxParams->compressionLevel, srcSizeHint, dictSize, mode); + if (CCtxParams->ldmParams.enableLdm == ZSTD_ps_enable) cParams.windowLog = ZSTD_LDM_DEFAULT_WINDOW_LOG; + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t + ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, +- const U32 enableDedicatedDictSearch, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, ++ const int enableDedicatedDictSearch, + const U32 forCCtx) + { + /* chain table size should be 0 for fast or row-hash strategies */ +@@ -1363,14 +1605,14 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + hSize * sizeof(U32) + + h3Size * sizeof(U32); + size_t const optPotentialSpace = +- 
ZSTD_cwksp_aligned_alloc_size((MaxML+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((MaxLL+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((MaxOff+1) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((1<<Litbits) * sizeof(U32)) +- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) +- + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); ++ ZSTD_cwksp_aligned64_alloc_size((MaxML+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((MaxLL+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((MaxOff+1) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size((1<<Litbits) * sizeof(U32)) ++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)) ++ + ZSTD_cwksp_aligned64_alloc_size(ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned64_alloc_size(hSize) : 0; size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) ? optPotentialSpace +@@ -1386,30 +1628,38 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, + const int isStatic, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) +- + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) ++ + ZSTD_cwksp_aligned64_alloc_size(maxNbSeq * sizeof(SeqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +- size_t const entropySpace = ZSTD_cwksp_alloc_size(ENTROPY_WORKSPACE_SIZE); ++ size_t const tmpWorkSpace = ZSTD_cwksp_alloc_size(TMP_WORKSPACE_SIZE); + size_t const blockStateSpace = 2 * ZSTD_cwksp_alloc_size(sizeof(ZSTD_compressedBlockState_t)); + size_t const matchStateSize = ZSTD_sizeof_matchState(cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 0, /* forCCtx */ 1); + + size_t const ldmSpace = ZSTD_ldm_getTableSize(*ldmParams); + size_t const maxNbLdmSeq = ZSTD_ldm_getMaxNbSeq(*ldmParams, blockSize); + size_t const ldmSeqSpace = ldmParams->enableLdm == ZSTD_ps_enable ? +- ZSTD_cwksp_aligned_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; ++ ZSTD_cwksp_aligned64_alloc_size(maxNbLdmSeq * sizeof(rawSeq)) : 0; + + + size_t const bufferSpace = ZSTD_cwksp_alloc_size(buffInSize) +@@ -1417,15 +1667,21 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ?
ZSTD_cwksp_aligned64_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + +- entropySpace + ++ tmpWorkSpace + + blockStateSpace + + ldmSpace + + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1435,7 +1691,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + { + ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, + &cParams); + + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); +@@ -1443,7 +1699,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, &params->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,18 +1749,18 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? ((size_t)1 << cParams.windowLog) + blockSize + : 0; + size_t const outBuffSize = (params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params->useRowMatchFinder, &params->cParams); + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, &params->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + } + } + +@@ -1600,7 +1856,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs) + * Invalidate all the matches in the match finder tables. + * Requires nextSrc and base to be set (can be NULL).
+ */ +-static void ZSTD_invalidateMatchState(ZSTD_matchState_t* ms) ++static void ZSTD_invalidateMatchState(ZSTD_MatchState_t* ms) + { + ZSTD_window_clear(&ms->window); + +@@ -1637,12 +1893,25 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_MatchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t +-ZSTD_reset_matchState(ZSTD_matchState_t* ms, ++ZSTD_reset_matchState(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + const ZSTD_compressionParameters* cParams, +- const ZSTD_paramSwitch_e useRowMatchFinder, ++ const ZSTD_ParamSwitch_e useRowMatchFinder, + const ZSTD_compResetPolicy_e crp, + const ZSTD_indexResetPolicy_e forceResetIndex, + const ZSTD_resetTarget_e forWho) +@@ -1664,6 +1933,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,22 +1955,19 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + +- /* opt parser space */ +- if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { +- DEBUGLOG(4, "reserving optimal parser space"); +- ms->opt.litFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (1<<Litbits) * sizeof(unsigned)); +- ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxLL+1) * sizeof(unsigned)); +- ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxML+1) * sizeof(unsigned)); +- ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned(ws, (MaxOff+1) * sizeof(unsigned)); +- ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)); +- ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); +- } +- + if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned64(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; + } + { /* Switch to 32-entry rows if searchLog is 5 (or more) */ + U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +@@ -1709,6 +1976,17 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + } + ++ /* opt parser space */ ++ if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { ++ DEBUGLOG(4, "reserving optimal parser space"); ++ ms->opt.litFreq =
(unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (1<<Litbits) * sizeof(unsigned)); ++ ms->opt.litLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxLL+1) * sizeof(unsigned)); ++ ms->opt.matchLengthFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxML+1) * sizeof(unsigned)); ++ ms->opt.offCodeFreq = (unsigned*)ZSTD_cwksp_reserve_aligned64(ws, (MaxOff+1) * sizeof(unsigned)); ++ ms->opt.matchTable = (ZSTD_match_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_match_t)); ++ ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned64(ws, ZSTD_OPT_SIZE * sizeof(ZSTD_optimal_t)); ++ } ++ + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1754,7 +2032,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { + ZSTD_cwksp* const ws = &zc->workspace; + DEBUGLOG(4, "ZSTD_resetCCtx_internal: pledgedSrcSize=%u, wlog=%u, useRowMatchFinder=%d useBlockSplitter=%d", +- (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->useBlockSplitter); ++ (U32)pledgedSrcSize, params->cParams.windowLog, (int)params->useRowMatchFinder, (int)params->postBlockSplitter); + assert(!ZSTD_isError(ZSTD_checkCParams(params->cParams))); + + zc->isFirstBlock = 1; +@@ -1766,8 +2044,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + params = &zc->appliedParams; + + assert(params->useRowMatchFinder != ZSTD_ps_auto); +- assert(params->useBlockSplitter != ZSTD_ps_auto); ++ assert(params->postBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if (params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, &params->cParams); +@@ -1776,9 +2055,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, ZSTD_hasExtSeqProd(params)); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ?
ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,8 +2073,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &params->cParams, &params->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); +- int resizeWorkspace; ++ buffInSize, buffOutSize, pledgedSrcSize, ZSTD_hasExtSeqProd(params), params->maxBlockSize); + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); + +@@ -1805,7 +2082,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + { /* Check if workspace is large enough, alloc a new one if needed */ + int const workspaceTooSmall = ZSTD_cwksp_sizeof(ws) < neededSpace; + int const workspaceWasteful = ZSTD_cwksp_check_wasteful(ws, neededSpace); +- resizeWorkspace = workspaceTooSmall || workspaceWasteful; ++ int resizeWorkspace = workspaceTooSmall || workspaceWasteful; + DEBUGLOG(4, "Need %zu B workspace", neededSpace); + DEBUGLOG(4, "windowSize: %zu - blockSize: %zu", windowSize, blockSize); + +@@ -1823,21 +2100,23 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + DEBUGLOG(5, "reserving object space"); + /* Statically sized space. +- * entropyWorkspace never moves, ++ * tmpWorkspace never moves, + * though prev/next block swap places */ + assert(ZSTD_cwksp_check_available(ws, 2 * sizeof(ZSTD_compressedBlockState_t))); + zc->blockState.prevCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.prevCBlock == NULL, memory_allocation, "couldn't allocate prevCBlock"); + zc->blockState.nextCBlock = (ZSTD_compressedBlockState_t*) ZSTD_cwksp_reserve_object(ws, sizeof(ZSTD_compressedBlockState_t)); + RETURN_ERROR_IF(zc->blockState.nextCBlock == NULL, memory_allocation, "couldn't allocate nextCBlock"); +- zc->entropyWorkspace = (U32*) ZSTD_cwksp_reserve_object(ws, ENTROPY_WORKSPACE_SIZE); +- RETURN_ERROR_IF(zc->entropyWorkspace == NULL, memory_allocation, "couldn't allocate entropyWorkspace"); ++ zc->tmpWorkspace = ZSTD_cwksp_reserve_object(ws, TMP_WORKSPACE_SIZE); ++ RETURN_ERROR_IF(zc->tmpWorkspace == NULL, memory_allocation, "couldn't allocate tmpWorkspace"); ++ zc->tmpWkspSize = TMP_WORKSPACE_SIZE; + } } + + ZSTD_cwksp_clear(ws); + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1845,7 +2124,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->appliedParams.fParams.contentSizeFlag = 0; + DEBUGLOG(4, "pledged content size : %u ; flag : %u", + (unsigned)pledgedSrcSize, zc->appliedParams.fParams.contentSizeFlag); +- zc->blockSize = blockSize; ++ zc->blockSizeMax = blockSize; + + xxh64_reset(&zc->xxhState, 0); + zc->stage = ZSTDcs_init; +@@ -1854,13 +2133,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ &params->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (SeqDef*)ZSTD_cwksp_reserve_aligned64(ws, maxNbSeq * sizeof(SeqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset?
*/ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned64(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned64(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (ZSTD_hasExtSeqProd(params)) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->extSeqBufCapacity = maxNbExternalSeq; ++ zc->extSeqBuf = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned64(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. + */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2195,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- &params->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2269,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, &params, pledgedSrcSize, +@@ -2019,6 +2309,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present.
++ * See docs on "short cache" in zstd_compress_internal.h for context. */ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,26 +2360,29 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + + /* Zero the hashTable3, since the cdict never fills it */ +- { int const h3log = cctx->blockState.matchState.hashLog3; ++ assert(cctx->blockState.matchState.hashLog3 <= 31); ++ { U32 const h3log = cctx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + assert(cdict->matchState.hashLog3 == 0); + ZSTD_memset(cctx->blockState.matchState.hashTable3, 0, h3Size * sizeof(U32)); +@@ -2082,8 +2391,8 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + ZSTD_cwksp_mark_tables_clean(&cctx->workspace); + + /* copy dictionary offsets */ +- { ZSTD_matchState_t const* srcMatchState = &cdict->matchState; +- ZSTD_matchState_t* dstMatchState = &cctx->blockState.matchState; ++ { ZSTD_MatchState_t const* srcMatchState = &cdict->matchState; ++ ZSTD_MatchState_t* dstMatchState = &cctx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2141,12 +2450,13 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + /* Copy only compression parameters related to tables. 
*/ + params.cParams = srcCCtx->appliedParams.cParams; + assert(srcCCtx->appliedParams.useRowMatchFinder != ZSTD_ps_auto); +- assert(srcCCtx->appliedParams.useBlockSplitter != ZSTD_ps_auto); ++ assert(srcCCtx->appliedParams.postBlockSplitter != ZSTD_ps_auto); + assert(srcCCtx->appliedParams.ldmParams.enableLdm != ZSTD_ps_auto); + params.useRowMatchFinder = srcCCtx->appliedParams.useRowMatchFinder; +- params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; ++ params.postBlockSplitter = srcCCtx->appliedParams.postBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, &params, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2166,7 +2476,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + ? ((size_t)1 << srcCCtx->appliedParams.cParams.chainLog) + : 0; + size_t const hSize = (size_t)1 << srcCCtx->appliedParams.cParams.hashLog; +- int const h3log = srcCCtx->blockState.matchState.hashLog3; ++ U32 const h3log = srcCCtx->blockState.matchState.hashLog3; + size_t const h3Size = h3log ? ((size_t)1 << h3log) : 0; + + ZSTD_memcpy(dstCCtx->blockState.matchState.hashTable, +@@ -2184,8 +2494,8 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + + /* copy dictionary offsets */ + { +- const ZSTD_matchState_t* srcMatchState = &srcCCtx->blockState.matchState; +- ZSTD_matchState_t* dstMatchState = &dstCCtx->blockState.matchState; ++ const ZSTD_MatchState_t* srcMatchState = &srcCCtx->blockState.matchState; ++ ZSTD_MatchState_t* dstMatchState = &dstCCtx->blockState.matchState; + dstMatchState->window = srcMatchState->window; + dstMatchState->nextToUpdate = srcMatchState->nextToUpdate; + dstMatchState->loadedDictEnd= srcMatchState->loadedDictEnd; +@@ -2234,7 +2544,7 @@ ZSTD_reduceTable_internal (U32* const table, U32 const size, U32 const reducerVa + /* Protect special index values < ZSTD_WINDOW_START_INDEX. */ + U32 const reducerThreshold = reducerValue + ZSTD_WINDOW_START_INDEX; + assert((size & (ZSTD_ROWSIZE-1)) == 0); /* multiple of ZSTD_ROWSIZE */ +- assert(size < (1U<<31)); /* can be casted to int */ ++ assert(size < (1U<<31)); /* can be cast to int */ + + + for (rowNb=0 ; rowNb < nbRows ; rowNb++) { +@@ -2267,7 +2577,7 @@ static void ZSTD_reduceTable_btlazy2(U32* const table, U32 const size, U32 const + + /*!
ZSTD_reduceIndex() : + * rescale all indexes to avoid future overflow (indexes are U32) */ +-static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) ++static void ZSTD_reduceIndex (ZSTD_MatchState_t* ms, ZSTD_CCtx_params const* params, const U32 reducerValue) + { + { U32 const hSize = (U32)1 << params->cParams.hashLog; + ZSTD_reduceTable(ms->hashTable, hSize, reducerValue); +@@ -2294,26 +2604,32 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr) + { +- const seqDef* const sequences = seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; + BYTE* const ofCodeTable = seqStorePtr->ofCode; + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2333,9 +2649,9 @@ static int ZSTD_useTargetCBlockSize(const ZSTD_CCtx_params* cctxParams) + * Returns 1 if true, 0 otherwise. */ + static int ZSTD_blockSplitterEnabled(ZSTD_CCtx_params* cctxParams) + { +- DEBUGLOG(5, "ZSTD_blockSplitterEnabled (useBlockSplitter=%d)", cctxParams->useBlockSplitter); +- assert(cctxParams->useBlockSplitter != ZSTD_ps_auto); +- return (cctxParams->useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_blockSplitterEnabled (postBlockSplitter=%d)", cctxParams->postBlockSplitter); ++ assert(cctxParams->postBlockSplitter != ZSTD_ps_auto); ++ return (cctxParams->postBlockSplitter == ZSTD_ps_enable); + } + + /* Type returned by ZSTD_buildSequencesStatistics containing finalized symbol encoding types +@@ -2347,6 +2663,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2674,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const SeqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2694,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2392,7 +2711,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.LLtype < set_compressed && nextEntropy->litlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_LitLength, LLFSELog, (symbolEncodingType_e)stats.LLtype, ++ CTable_LitLength, LLFSELog, (SymbolEncodingType_e)stats.LLtype, + countWorkspace, max, llCodeTable, nbSeq, + LL_defaultNorm, LL_defaultNormLog, MaxLL, + prevEntropy->litlengthCTable, +@@ -2413,7 +2732,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + size_t const mostFrequent = HIST_countFast_wksp( + countWorkspace, &max, ofCodeTable, nbSeq, entropyWorkspace, entropyWkspSize); /* can't fail */ + /* We can only use the basic table if max <= DefaultMaxOff, otherwise the offsets are too large */ +- ZSTD_defaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? ZSTD_defaultAllowed : ZSTD_defaultDisallowed; ++ ZSTD_DefaultPolicy_e const defaultPolicy = (max <= DefaultMaxOff) ? 
ZSTD_defaultAllowed : ZSTD_defaultDisallowed; + DEBUGLOG(5, "Building OF table"); + nextEntropy->offcode_repeatMode = prevEntropy->offcode_repeatMode; + stats.Offtype = ZSTD_selectEncodingType(&nextEntropy->offcode_repeatMode, +@@ -2424,7 +2743,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.Offtype < set_compressed && nextEntropy->offcode_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_OffsetBits, OffFSELog, (symbolEncodingType_e)stats.Offtype, ++ CTable_OffsetBits, OffFSELog, (SymbolEncodingType_e)stats.Offtype, + countWorkspace, max, ofCodeTable, nbSeq, + OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, + prevEntropy->offcodeCTable, +@@ -2454,7 +2773,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + assert(!(stats.MLtype < set_compressed && nextEntropy->matchlength_repeatMode != FSE_repeat_none)); /* We don't copy tables */ + { size_t const countSize = ZSTD_buildCTable( + op, (size_t)(oend - op), +- CTable_MatchLength, MLFSELog, (symbolEncodingType_e)stats.MLtype, ++ CTable_MatchLength, MLFSELog, (SymbolEncodingType_e)stats.MLtype, + countWorkspace, max, mlCodeTable, nbSeq, + ML_defaultNorm, ML_defaultNormLog, MaxML, + prevEntropy->matchlengthCTable, +@@ -2480,22 +2799,23 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; +- const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const SeqDef* const sequences = seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2823,28 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE);
+ + /* Compress literals */ +- { const BYTE* const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ { size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ +- unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); +- size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ int const suspectUncompressible = (numSequences == 0) || (litSize / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2870,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2882,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2597,104 +2916,146 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + return (size_t)(op - ostart); + } + +-MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++static size_t ++ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ void* dst, size_t dstCapacity, ++ const void* literals, size_t litSize, ++ size_t blockSize, ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( +- seqStorePtr, prevEntropy, nextEntropy, cctxParams, + dst, dstCapacity, ++ literals, litSize, ++ seqStorePtr, prevEntropy, nextEntropy, cctxParams, + entropyWorkspace, entropyWkspSize, bmi2); + if (cSize == 0) return 0; + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (blockSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ +- { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); ++ { size_t const maxCSize = blockSize - ZSTD_minGain(blockSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + ++static size_t ++ZSTD_entropyCompressSeqStore( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) ++{ ++ return ZSTD_entropyCompressSeqStore_wExtLitBuffer( ++ dst, dstCapacity, ++ seqStorePtr->litStart, (size_t)(seqStorePtr->lit - seqStorePtr->litStart), ++ srcSize, ++ seqStorePtr, ++ prevEntropy, nextEntropy, ++ cctxParams, ++ entropyWorkspace, entropyWkspSize, ++ bmi2); ++} ++ + /* ZSTD_selectBlockCompressor() : + * Not static, but internal use only (used by long distance matcher) + * assumption : strat is a valid strategy */ +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e useRowMatchFinder, ZSTD_dictMode_e dictMode) + { +- static const ZSTD_blockCompressor blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { ++ static const ZSTD_BlockCompressor_f blockCompressor[4][ZSTD_STRATEGY_MAX+1] = { + { ZSTD_compressBlock_fast /* default for 0 */, + ZSTD_compressBlock_fast, +- ZSTD_compressBlock_doubleFast, +- ZSTD_compressBlock_greedy, +- ZSTD_compressBlock_lazy, +- ZSTD_compressBlock_lazy2, +- ZSTD_compressBlock_btlazy2, +- ZSTD_compressBlock_btopt, +- ZSTD_compressBlock_btultra, +- ZSTD_compressBlock_btultra2 }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST, ++ ZSTD_COMPRESSBLOCK_GREEDY, ++ ZSTD_COMPRESSBLOCK_LAZY, ++ ZSTD_COMPRESSBLOCK_LAZY2, ++ ZSTD_COMPRESSBLOCK_BTLAZY2, ++ ZSTD_COMPRESSBLOCK_BTOPT, ++ ZSTD_COMPRESSBLOCK_BTULTRA, ++ ZSTD_COMPRESSBLOCK_BTULTRA2 ++ }, + { ZSTD_compressBlock_fast_extDict /* default for 0 */, + ZSTD_compressBlock_fast_extDict, +- ZSTD_compressBlock_doubleFast_extDict, +- ZSTD_compressBlock_greedy_extDict, +- ZSTD_compressBlock_lazy_extDict, +- ZSTD_compressBlock_lazy2_extDict, +- ZSTD_compressBlock_btlazy2_extDict, +- ZSTD_compressBlock_btopt_extDict, +- ZSTD_compressBlock_btultra_extDict, +- ZSTD_compressBlock_btultra_extDict }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT, ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT, ++ ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT, ++ 
ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ++ }, + { ZSTD_compressBlock_fast_dictMatchState /* default for 0 */, + ZSTD_compressBlock_fast_dictMatchState, +- ZSTD_compressBlock_doubleFast_dictMatchState, +- ZSTD_compressBlock_greedy_dictMatchState, +- ZSTD_compressBlock_lazy_dictMatchState, +- ZSTD_compressBlock_lazy2_dictMatchState, +- ZSTD_compressBlock_btlazy2_dictMatchState, +- ZSTD_compressBlock_btopt_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState, +- ZSTD_compressBlock_btultra_dictMatchState }, ++ ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE, ++ ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ++ }, + { NULL /* default for 0 */, + NULL, + NULL, +- ZSTD_compressBlock_greedy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy_dedicatedDictSearch, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch, ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH, + NULL, + NULL, + NULL, + NULL } + }; +- ZSTD_blockCompressor selectedCompressor; ++ ZSTD_BlockCompressor_f selectedCompressor; + ZSTD_STATIC_ASSERT((unsigned)ZSTD_fast == 1); + +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); +- DEBUGLOG(4, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); ++ DEBUGLOG(5, "Selected block compressor: dictMode=%d strat=%d rowMatchfinder=%d", (int)dictMode, (int)strat, (int)useRowMatchFinder); + if (ZSTD_rowMatchFinderUsed(strat, useRowMatchFinder)) { +- static const ZSTD_blockCompressor rowBasedBlockCompressors[4][3] = { +- { ZSTD_compressBlock_greedy_row, +- ZSTD_compressBlock_lazy_row, +- ZSTD_compressBlock_lazy2_row }, +- { ZSTD_compressBlock_greedy_extDict_row, +- ZSTD_compressBlock_lazy_extDict_row, +- ZSTD_compressBlock_lazy2_extDict_row }, +- { ZSTD_compressBlock_greedy_dictMatchState_row, +- ZSTD_compressBlock_lazy_dictMatchState_row, +- ZSTD_compressBlock_lazy2_dictMatchState_row }, +- { ZSTD_compressBlock_greedy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy_dedicatedDictSearch_row, +- ZSTD_compressBlock_lazy2_dedicatedDictSearch_row } ++ static const ZSTD_BlockCompressor_f rowBasedBlockCompressors[4][3] = { ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ++ }, ++ { ++ ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW, ++ ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ++ } + }; +- DEBUGLOG(4, "Selecting a row-based matchfinder"); ++ DEBUGLOG(5, "Selecting a row-based matchfinder"); + assert(useRowMatchFinder != ZSTD_ps_auto); + selectedCompressor = rowBasedBlockCompressors[(int)dictMode][(int)strat - (int)ZSTD_greedy]; + } else { +@@ -2704,30 +3065,126 @@ ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramS + return selectedCompressor; + } + +-static void 
ZSTD_storeLastLiterals(seqStore_t* seqStorePtr, ++static void ZSTD_storeLastLiterals(SeqStore_t* seqStorePtr, + const BYTE* anchor, size_t lastLLSize) + { + ZSTD_memcpy(seqStorePtr->lit, anchor, lastLLSize); + seqStorePtr->lit += lastLLSize; + } + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr) ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr) + { + ssPtr->lit = ssPtr->litStart; + ssPtr->sequences = ssPtr->sequencesStart; + ssPtr->longLengthType = ZSTD_llt_none; + } + +-typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. */ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ ++/* ++ * Function to validate sequences produced by a block compressor. ++ */ ++static void ZSTD_validateSeqStore(const SeqStore_t* seqStore, const ZSTD_compressionParameters* cParams) ++{ ++#if DEBUGLEVEL >= 1 ++ const SeqDef* seq = seqStore->sequencesStart; ++ const SeqDef* const seqEnd = seqStore->sequences; ++ size_t const matchLenLowerBound = cParams->minMatch == 3 ? 
3 : 4; ++ for (; seq < seqEnd; ++seq) { ++ const ZSTD_SequenceLength seqLength = ZSTD_getSequenceLength(seqStore, seq); ++ assert(seqLength.matchLength >= matchLenLowerBound); ++ (void)seqLength; ++ (void)matchLenLowerBound; ++ } ++#else ++ (void)seqStore; ++ (void)cParams; ++#endif ++} ++ ++static size_t ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_BuildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + { +- ZSTD_matchState_t* const ms = &zc->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &zc->blockState.matchState; + DEBUGLOG(5, "ZSTD_buildSeqStore (srcSize=%zu)", srcSize); + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3220,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2772,7 +3238,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + src, srcSize); + assert(zc->externSeqStore.pos <= zc->externSeqStore.size); + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { +- rawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ RawSeqStore_t ldmSeqStore = kNullRawSeqStore; ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ ZSTD_hasExtSeqProd(&zc->appliedParams), ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); + + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; +@@ -2788,42 +3262,116 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ +- ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, +- zc->appliedParams.useRowMatchFinder, +- dictMode); ++ } else if (ZSTD_hasExtSeqProd(&zc->appliedParams)) { ++ assert( ++ zc->extSeqBufCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->appliedParams.extSeqProdFunc != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->appliedParams.extSeqProdFunc)( ++ zc->appliedParams.extSeqProdState, ++ zc->extSeqBuf, ++ zc->extSeqBufCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->extSeqBuf, ++ nbExternalSeqs, ++ zc->extSeqBufCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_SequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->extSeqBuf, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_transferSequences_wBlockDelim( ++ zc, &seqPos, ++ zc->extSeqBuf, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_BlockCompressor_f const blockCompressor = ++ ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ ++ ZSTD_BlockCompressor_f const blockCompressor = ZSTD_selectBlockCompressor( ++ zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); + ms->ldmSeqStore = NULL; + lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); + } + { const BYTE* const lastLiterals = (const BYTE*)src + srcSize - lastLLSize; + ZSTD_storeLastLiterals(&zc->seqStore, lastLiterals, lastLLSize); + } } ++ ZSTD_validateSeqStore(&zc->seqStore, &zc->appliedParams.cParams); + return ZSTDbss_compress; + } + +-static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) ++static size_t ZSTD_copyBlockSequences(SeqCollector* seqCollector, const SeqStore_t* seqStore, const U32 prevRepcodes[ZSTD_REP_NUM]) + { +- const seqStore_t* seqStore = ZSTD_getSeqStore(zc); +- const seqDef* seqStoreSeqs = seqStore->sequencesStart; +- size_t seqStoreSeqSize = seqStore->sequences - seqStoreSeqs; +- size_t seqStoreLiteralsSize = (size_t)(seqStore->lit - seqStore->litStart); +- size_t literalsRead = 0; +- size_t lastLLSize; ++ const SeqDef* inSeqs = seqStore->sequencesStart; ++ const size_t nbInSequences = (size_t)(seqStore->sequences - inSeqs); ++ const size_t nbInLiterals = (size_t)(seqStore->lit - seqStore->litStart); + +- ZSTD_Sequence* outSeqs = &zc->seqCollector.seqStart[zc->seqCollector.seqIndex]; ++ ZSTD_Sequence* outSeqs = seqCollector->seqIndex == 0 ? seqCollector->seqStart : seqCollector->seqStart + seqCollector->seqIndex; ++ const size_t nbOutSequences = nbInSequences + 1; ++ size_t nbOutLiterals = 0; ++ Repcodes_t repcodes; + size_t i; +- repcodes_t updatedRepcodes; +- +- assert(zc->seqCollector.seqIndex + 1 < zc->seqCollector.maxSequences); +- /* Ensure we have enough space for last literals "sequence" */ +- assert(zc->seqCollector.maxSequences >= seqStoreSeqSize + 1); +- ZSTD_memcpy(updatedRepcodes.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (i = 0; i < seqStoreSeqSize; ++i) { +- U32 rawOffset = seqStoreSeqs[i].offBase - ZSTD_REP_NUM; +- outSeqs[i].litLength = seqStoreSeqs[i].litLength; +- outSeqs[i].matchLength = seqStoreSeqs[i].mlBase + MINMATCH; ++ ++ /* Bounds check that we have enough space for every input sequence ++ * and the block delimiter ++ */ ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ RETURN_ERROR_IF( ++ nbOutSequences > (size_t)(seqCollector->maxSequences - seqCollector->seqIndex), ++ dstSize_tooSmall, ++ "Not enough space to copy sequences"); ++ ++ ZSTD_memcpy(&repcodes, prevRepcodes, sizeof(repcodes)); ++ for (i = 0; i < nbInSequences; ++i) { ++ U32 rawOffset; ++ outSeqs[i].litLength = inSeqs[i].litLength; ++ outSeqs[i].matchLength = inSeqs[i].mlBase + MINMATCH; + outSeqs[i].rep = 0; + ++ /* Handle the possible single length >= 64K ++ * There can only be one because we add MINMATCH to every match length, ++ * and blocks are at most 128K. 
++ */ + if (i == seqStore->longLengthPos) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { + outSeqs[i].litLength += 0x10000; +@@ -2832,46 +3380,75 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + } + } + +- if (seqStoreSeqs[i].offBase <= ZSTD_REP_NUM) { +- /* Derive the correct offset corresponding to a repcode */ +- outSeqs[i].rep = seqStoreSeqs[i].offBase; ++ /* Determine the raw offset given the offBase, which may be a repcode. */ ++ if (OFFBASE_IS_REPCODE(inSeqs[i].offBase)) { ++ const U32 repcode = OFFBASE_TO_REPCODE(inSeqs[i].offBase); ++ assert(repcode > 0); ++ outSeqs[i].rep = repcode; + if (outSeqs[i].litLength != 0) { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep - 1]; ++ rawOffset = repcodes.rep[repcode - 1]; + } else { +- if (outSeqs[i].rep == 3) { +- rawOffset = updatedRepcodes.rep[0] - 1; ++ if (repcode == 3) { ++ assert(repcodes.rep[0] > 1); ++ rawOffset = repcodes.rep[0] - 1; + } else { +- rawOffset = updatedRepcodes.rep[outSeqs[i].rep]; ++ rawOffset = repcodes.rep[repcode]; + } + } ++ } else { ++ rawOffset = OFFBASE_TO_OFFSET(inSeqs[i].offBase); + } + outSeqs[i].offset = rawOffset; +- /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode +- so we provide seqStoreSeqs[i].offset - 1 */ +- ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, +- seqStoreSeqs[i].litLength == 0); +- literalsRead += outSeqs[i].litLength; ++ ++ /* Update repcode history for the sequence */ ++ ZSTD_updateRep(repcodes.rep, ++ inSeqs[i].offBase, ++ inSeqs[i].litLength == 0); ++ ++ nbOutLiterals += outSeqs[i].litLength; + } + /* Insert last literals (if any exist) in the block as a sequence with ml == off == 0. + * If there are no last literals, then we'll emit (of: 0, ml: 0, ll: 0), which is a marker + * for the block boundary, according to the API. + */ +- assert(seqStoreLiteralsSize >= literalsRead); +- lastLLSize = seqStoreLiteralsSize - literalsRead; +- outSeqs[i].litLength = (U32)lastLLSize; +- outSeqs[i].matchLength = outSeqs[i].offset = outSeqs[i].rep = 0; +- seqStoreSeqSize++; +- zc->seqCollector.seqIndex += seqStoreSeqSize; ++ assert(nbInLiterals >= nbOutLiterals); ++ { ++ const size_t lastLLSize = nbInLiterals - nbOutLiterals; ++ outSeqs[nbInSequences].litLength = (U32)lastLLSize; ++ outSeqs[nbInSequences].matchLength = 0; ++ outSeqs[nbInSequences].offset = 0; ++ assert(nbOutSequences == nbInSequences + 1); ++ } ++ seqCollector->seqIndex += nbOutSequences; ++ assert(seqCollector->seqIndex <= seqCollector->maxSequences); ++ ++ return 0; ++} ++ ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ const size_t maxNbSeq = (srcSize / ZSTD_MINMATCH_MIN) + 1; ++ const size_t maxNbDelims = (srcSize / ZSTD_BLOCKSIZE_MAX_MIN) + 1; ++ return maxNbSeq + maxNbDelims; + } + + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { + const size_t dstCapacity = ZSTD_compressBound(srcSize); +- void* dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); ++ void* dst; /* Make C90 happy. 
*/ + SeqCollector seqCollector; ++ { ++ int targetCBlockSize; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_targetCBlockSize, &targetCBlockSize), ""); ++ RETURN_ERROR_IF(targetCBlockSize != 0, parameter_unsupported, "targetCBlockSize != 0"); ++ } ++ { ++ int nbWorkers; ++ FORWARD_IF_ERROR(ZSTD_CCtx_getParameter(zc, ZSTD_c_nbWorkers, &nbWorkers), ""); ++ RETURN_ERROR_IF(nbWorkers != 0, parameter_unsupported, "nbWorkers != 0"); ++ } + ++ dst = ZSTD_customMalloc(dstCapacity, ZSTD_defaultCMem); + RETURN_ERROR_IF(dst == NULL, memory_allocation, "NULL pointer!"); + + seqCollector.collectSequences = 1; +@@ -2880,8 +3457,12 @@ size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + seqCollector.maxSequences = outSeqsSize; + zc->seqCollector = seqCollector; + +- ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); +- ZSTD_customFree(dst, ZSTD_defaultCMem); ++ { ++ const size_t ret = ZSTD_compress2(zc, dst, dstCapacity, src, srcSize); ++ ZSTD_customFree(dst, ZSTD_defaultCMem); ++ FORWARD_IF_ERROR(ret, "ZSTD_compress2 failed"); ++ } ++ assert(zc->seqCollector.seqIndex <= ZSTD_sequenceBound(srcSize)); + return zc->seqCollector.seqIndex; + } + +@@ -2910,19 +3491,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2930,7 +3509,7 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + * This is just a heuristic based on the compressibility. + * It may return both false positives and false negatives. + */ +-static int ZSTD_maybeRLE(seqStore_t const* seqStore) ++static int ZSTD_maybeRLE(SeqStore_t const* seqStore) + { + size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t const nbLits = (size_t)(seqStore->lit - seqStore->litStart); +@@ -2938,7 +3517,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,12 +3526,14 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? 
+ lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(op, cBlockHeader); +- DEBUGLOG(3, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); ++ DEBUGLOG(5, "writeBlockHeader: cSize: %zu blockSize: %zu lastBlock: %u", cSize, blockSize, lastBlock); + } + + /* ZSTD_buildBlockEntropyStats_literals() : +@@ -2959,13 +3541,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. + * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3558,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3575,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3655,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3668,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3103,9 +3695,9 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + entropyWorkspace, entropyWorkspaceSize) + : ZSTD_buildDummySequencesStatistics(nextEntropy); + FORWARD_IF_ERROR(stats.size, "ZSTD_buildSequencesStatistics failed!"); +- fseMetadata->llType = (symbolEncodingType_e) stats.LLtype; +- fseMetadata->ofType = (symbolEncodingType_e) stats.Offtype; +- fseMetadata->mlType = (symbolEncodingType_e) stats.MLtype; ++ fseMetadata->llType = (SymbolEncodingType_e) stats.LLtype; ++ fseMetadata->ofType = (SymbolEncodingType_e) stats.Offtype; ++ fseMetadata->mlType = (SymbolEncodingType_e) stats.MLtype; + fseMetadata->lastCountSize = stats.lastCountSize; + return stats.size; + } +@@ -3114,23 +3706,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3740,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3767,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(SymbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,116 +3805,121 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const 
ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(SeqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->tmpWorkspace, zc->tmpWkspSize), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->tmpWorkspace, zc->tmpWkspSize, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const SeqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const SeqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ SeqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + + /* Derives the seqStore that is a chunk of the originalSeqStore from [startIdx, endIdx). + * Stores the result in resultSeqStore. 
+ */ +-static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, +- const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++static void ZSTD_deriveSeqStoreChunk(SeqStore_t* resultSeqStore, ++ const SeqStore_t* originalSeqStore, ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move longLengthPos into the correct position if necessary */ +@@ -3328,13 +3932,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3945,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). 
++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3980,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(Repcodes_t* const dRepcodes, Repcodes_t* const cRepcodes, ++ const SeqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { +- seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ SeqDef* const seq = seqStore->sequencesStart + idx; ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +4016,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. 
+ */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, +- repcodes_t* const dRep, repcodes_t* const cRep, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const SeqStore_t* const seqStore, ++ Repcodes_t* const dRep, Repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3417,7 +4030,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + size_t cSeqsSize; + + /* In case of an RLE or raw block, the simulated decompression repcode history must be reset */ +- repcodes_t const dRepOriginal = *dRep; ++ Repcodes_t const dRepOriginal = *dRep; + DEBUGLOG(5, "ZSTD_compressSeqStore_singleBlock"); + if (isPartition) + ZSTD_seqStore_resolveOffCodes(dRep, cRep, seqStore, (U32)(seqStore->sequences - seqStore->sequencesStart)); +@@ -3428,7 +4041,7 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + &zc->appliedParams, + op + ZSTD_blockHeaderSize, dstCapacity - ZSTD_blockHeaderSize, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + FORWARD_IF_ERROR(cSeqsSize, "ZSTD_entropyCompressSeqStore failed!"); + +@@ -3442,8 +4055,9 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + cSeqsSize = 1; + } + ++ /* Sequence collection not supported when block splitting */ + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, seqStore, dRepOriginal.rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3451,18 +4065,18 @@ ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, + if (cSeqsSize == 0) { + cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else if (cSeqsSize == 1) { + cSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cSize); + *dRep = dRepOriginal; /* reset simulated decompression repcode history */ + } else { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + writeBlockHeader(op, cSeqsSize, srcSize, lastBlock); + cSize = ZSTD_blockHeaderSize + cSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cSize); + } + + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) +@@ -3481,45 +4095,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. 
++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. + */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, +- ZSTD_CCtx* zc, const seqStore_t* origSeqStore) ++ ZSTD_CCtx* zc, const SeqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ SeqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ SeqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ SeqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +4145,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end 
+ } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). + */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4172,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ SeqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ SeqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3577,36 +4201,37 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + * + * See ZSTD_seqStore_resolveOffCodes() for more details. 
+ */ +- repcodes_t dRep; +- repcodes_t cRep; +- ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); ++ Repcodes_t dRep; ++ Repcodes_t cRep; ++ ZSTD_memcpy(dRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ZSTD_memset(nextSeqStore, 0, sizeof(SeqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSizeMax + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4246,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,12 +4255,12 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSizeMax + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ +- ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(Repcodes_t)); + return cSize; + } + +@@ -3643,21 +4269,20 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +- assert(zc->appliedParams.useBlockSplitter == ZSTD_ps_enable); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock"); ++ assert(zc->appliedParams.postBlockSplitter == ZSTD_ps_enable); + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; + } + nbSeq = (U32)(zc->seqStore.sequences - zc->seqStore.sequencesStart); +@@ -3673,9 +4298,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3687,11 +4312,15 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + + { const size_t bss = ZSTD_buildSeqStore(zc, src, srcSize); + FORWARD_IF_ERROR(bss, "ZSTD_buildSeqStore failed"); +- if (bss == ZSTDbss_noCompress) { cSize = 0; goto out; } ++ if (bss == ZSTDbss_noCompress) { ++ RETURN_ERROR_IF(zc->seqCollector.collectSequences, sequenceProducer_failed, "Uncompressible block"); ++ cSize = 0; ++ goto out; ++ } + } + + if (zc->seqCollector.collectSequences) { +- ZSTD_copyBlockSequences(zc); ++ FORWARD_IF_ERROR(ZSTD_copyBlockSequences(&zc->seqCollector, ZSTD_getSeqStore(zc), zc->blockState.prevCBlock->rep), "copyBlockSequences failed"); + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); + return 0; + } +@@ -3702,7 +4331,7 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + &zc->appliedParams, + dst, dstCapacity, + srcSize, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */, + zc->bmi2); + + if (frame && +@@ -3767,10 +4396,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. 
+ */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4408,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3807,7 +4437,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize(ZSTD_CCtx* zc, + return cSize; + } + +-static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, ++static void ZSTD_overflowCorrectIfNeeded(ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + void const* ip, +@@ -3831,39 +4461,82 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + } + } + ++#include "zstd_preSplit.h" ++ ++static size_t ZSTD_optimalBlockSize(ZSTD_CCtx* cctx, const void* src, size_t srcSize, size_t blockSizeMax, int splitLevel, ZSTD_strategy strat, S64 savings) ++{ ++ /* split level based on compression strategy, from `fast` to `btultra2` */ ++ static const int splitLevels[] = { 0, 0, 1, 2, 2, 3, 3, 4, 4, 4 }; ++ /* note: conservatively only split full blocks (128 KB) currently. ++ * While it's possible to go lower, let's keep it simple for a first implementation. ++ * Besides, benefits of splitting are reduced when blocks are already small. ++ */ ++ if (srcSize < 128 KB || blockSizeMax < 128 KB) ++ return MIN(srcSize, blockSizeMax); ++ /* do not split incompressible data though: ++ * require verified savings to allow pre-splitting. ++ * Note: as a consequence, the first full block is not split. ++ */ ++ if (savings < 3) { ++ DEBUGLOG(6, "don't attempt splitting: savings (%i) too low", (int)savings); ++ return 128 KB; ++ } ++ /* apply @splitLevel, or use default value (which depends on @strat). ++ * note that splitting heuristic is still conditioned by @savings >= 3, ++ * so the first block will not reach this code path */ ++ if (splitLevel == 1) return 128 KB; ++ if (splitLevel == 0) { ++ assert(ZSTD_fast <= strat && strat <= ZSTD_btultra2); ++ splitLevel = splitLevels[strat]; ++ } else { ++ assert(2 <= splitLevel && splitLevel <= 6); ++ splitLevel -= 2; ++ } ++ return ZSTD_splitBlock(src, blockSizeMax, splitLevel, cctx->tmpWorkspace, cctx->tmpWkspSize); ++} ++ + /*! ZSTD_compress_frameChunk() : + * Compress a chunk of data into one or multiple blocks. + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. 
+ * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + U32 lastFrameChunk) + { +- size_t blockSize = cctx->blockSize; ++ size_t blockSizeMax = cctx->blockSizeMax; + size_t remaining = srcSize; + const BYTE* ip = (const BYTE*)src; + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; + U32 const maxDist = (U32)1 << cctx->appliedParams.cParams.windowLog; ++ S64 savings = (S64)cctx->consumedSrcSize - (S64)cctx->producedCSize; + + assert(cctx->appliedParams.cParams.windowLog <= ZSTD_WINDOWLOG_MAX); + +- DEBUGLOG(4, "ZSTD_compress_frameChunk (blockSize=%u)", (unsigned)blockSize); ++ DEBUGLOG(5, "ZSTD_compress_frameChunk (srcSize=%u, blockSizeMax=%u)", (unsigned)srcSize, (unsigned)blockSizeMax); + if (cctx->appliedParams.fParams.checksumFlag && srcSize) + xxh64_update(&cctx->xxhState, src, srcSize); + + while (remaining) { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; +- U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); +- +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; ++ size_t const blockSize = ZSTD_optimalBlockSize(cctx, ++ ip, remaining, ++ blockSizeMax, ++ cctx->appliedParams.preBlockSplitter_level, ++ cctx->appliedParams.cParams.strategy, ++ savings); ++ U32 const lastBlock = lastFrameChunk & (blockSize == remaining); ++ assert(blockSize <= remaining); ++ ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); +- if (remaining < blockSize) blockSize = remaining; + + ZSTD_overflowCorrectIfNeeded( + ms, &cctx->workspace, &cctx->appliedParams, ip, ip + blockSize); +@@ -3899,8 +4572,23 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } +- ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ ++ ++ /* @savings is employed to ensure that splitting doesn't worsen expansion of incompressible data. ++ * Without splitting, the maximum expansion is 3 bytes per full block. ++ * An adversarial input could attempt to fudge the split detector, ++ * and make it split incompressible data, resulting in more block headers. ++ * Note that, since ZSTD_COMPRESSBOUND() assumes a worst case scenario of 1KB per block, ++ * and the splitter never creates blocks that small (current lower limit is 8 KB), ++ * there is already no risk to expand beyond ZSTD_COMPRESSBOUND() limit. ++ * But if the goal is to not expand by more than 3-bytes per 128 KB full block, ++ * then yes, it becomes possible to make the block splitter oversplit incompressible data. ++ * Using @savings, we enforce an even more conservative condition, ++ * requiring the presence of enough savings (at least 3 bytes) to authorize splitting, ++ * otherwise only full blocks are used. 
++ * But being conservative is fine, ++ * since splitting barely compressible blocks is not fruitful anyway */ ++ savings += (S64)blockSize - (S64)cSize; + + ip += blockSize; + assert(remaining >= blockSize); +@@ -3919,8 +4607,10 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + + + static size_t ZSTD_writeFrameHeader(void* dst, size_t dstCapacity, +- const ZSTD_CCtx_params* params, U64 pledgedSrcSize, U32 dictID) +-{ BYTE* const op = (BYTE*)dst; ++ const ZSTD_CCtx_params* params, ++ U64 pledgedSrcSize, U32 dictID) ++{ ++ BYTE* const op = (BYTE*)dst; + U32 const dictIDSizeCodeLength = (dictID>0) + (dictID>=256) + (dictID>=65536); /* 0-3 */ + U32 const dictIDSizeCode = params->fParams.noDictIDFlag ? 0 : dictIDSizeCodeLength; /* 0-3 */ + U32 const checksumFlag = params->fParams.checksumFlag>0; +@@ -4001,19 +4691,15 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity) + } + } + +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq) + { +- RETURN_ERROR_IF(cctx->stage != ZSTDcs_init, stage_wrong, +- "wrong cctx stage"); +- RETURN_ERROR_IF(cctx->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable, +- parameter_unsupported, +- "incompatible with ldm"); ++ assert(cctx->stage == ZSTDcs_init); ++ assert(nbSeq == 0 || cctx->appliedParams.ldmParams.enableLdm != ZSTD_ps_enable); + cctx->externSeqStore.seq = seq; + cctx->externSeqStore.size = nbSeq; + cctx->externSeqStore.capacity = nbSeq; + cctx->externSeqStore.pos = 0; + cctx->externSeqStore.posInSequence = 0; +- return 0; + } + + +@@ -4022,7 +4708,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + const void* src, size_t srcSize, + U32 frame, U32 lastFrameChunk) + { +- ZSTD_matchState_t* const ms = &cctx->blockState.matchState; ++ ZSTD_MatchState_t* const ms = &cctx->blockState.matchState; + size_t fhSize = 0; + + DEBUGLOG(5, "ZSTD_compressContinue_internal, stage: %u, srcSize: %u", +@@ -4057,7 +4743,7 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + src, (BYTE const*)src + srcSize); + } + +- DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSize); ++ DEBUGLOG(5, "ZSTD_compressContinue_internal (blockSize=%u)", (unsigned)cctx->blockSizeMax); + { size_t const cSize = frame ? 
+ ZSTD_compress_frameChunk (cctx, dst, dstCapacity, src, srcSize, lastFrameChunk) : + ZSTD_compressBlock_internal (cctx, dst, dstCapacity, src, srcSize, 0 /* frame */); +@@ -4078,58 +4764,90 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +-static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, +- ldmState_t* ls, +- ZSTD_cwksp* ws, +- ZSTD_CCtx_params const* params, +- const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++static size_t ++ZSTD_loadDictionaryContent(ZSTD_MatchState_t* ms, ++ ldmState_t* ls, ++ ZSTD_cwksp* ws, ++ ZSTD_CCtx_params const* params, ++ const void* src, size_t srcSize, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,35 +4856,59 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: Trigger loadLdmDict"); + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 
0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent: ZSTD_ldm_fillHashTable completes"); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. */ ++ { U32 maxDictSize = 1U << MIN(MAX(params->cParams.hashLog + 3, params->cParams.chainLog + 1), 31); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } + } + ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: + case ZSTD_lazy: + case ZSTD_lazy2: ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); + if (ms->dedicatedDictSearch) { + assert(ms->chainTable != NULL); +@@ -4174,7 +4916,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4183,14 +4925,24 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + DEBUGLOG(4, "Using chain-based hash table for lazy dict"); + } + } ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_btlazy2: /* we want the dictionary table fully sorted */ + case ZSTD_btopt: + case ZSTD_btultra: + case ZSTD_btultra2: ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + assert(srcSize >= HASH_READ_SIZE); ++ DEBUGLOG(4, "Fill %u bytes into the Binary Tree", (unsigned)srcSize); + ZSTD_updateTree(ms, iend-HASH_READ_SIZE, iend); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + default: +@@ -4233,20 +4985,19 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + { unsigned maxSymbolValue = 255; + unsigned hasZeroWeights = 1; + size_t const hufHeaderSize = HUF_readCTable((HUF_CElt*)bs->entropy.huf.CTable, &maxSymbolValue, dictPtr, +- dictEnd-dictPtr, &hasZeroWeights); ++ (size_t)(dictEnd-dictPtr), &hasZeroWeights); + + /* We only set the loaded table as valid if it contains all non-zero + * weights. 
Otherwise, we set it to check */ +- if (!hasZeroWeights) ++ if (!hasZeroWeights && maxSymbolValue == 255) + bs->entropy.huf.repeatMode = HUF_repeat_valid; + + RETURN_ERROR_IF(HUF_isError(hufHeaderSize), dictionary_corrupted, ""); +- RETURN_ERROR_IF(maxSymbolValue < 255, dictionary_corrupted, ""); + dictPtr += hufHeaderSize; + } + + { unsigned offcodeLog; +- size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, dictEnd-dictPtr); ++ size_t const offcodeHeaderSize = FSE_readNCount(offcodeNCount, &offcodeMaxValue, &offcodeLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(offcodeHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(offcodeLog > OffFSELog, dictionary_corrupted, ""); + /* fill all offset symbols to avoid garbage at end of table */ +@@ -4261,7 +5012,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short matchlengthNCount[MaxML+1]; + unsigned matchlengthMaxValue = MaxML, matchlengthLog; +- size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const matchlengthHeaderSize = FSE_readNCount(matchlengthNCount, &matchlengthMaxValue, &matchlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(matchlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(matchlengthLog > MLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4275,7 +5026,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + { short litlengthNCount[MaxLL+1]; + unsigned litlengthMaxValue = MaxLL, litlengthLog; +- size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, dictEnd-dictPtr); ++ size_t const litlengthHeaderSize = FSE_readNCount(litlengthNCount, &litlengthMaxValue, &litlengthLog, dictPtr, (size_t)(dictEnd-dictPtr)); + RETURN_ERROR_IF(FSE_isError(litlengthHeaderSize), dictionary_corrupted, ""); + RETURN_ERROR_IF(litlengthLog > LLFSELog, dictionary_corrupted, ""); + RETURN_ERROR_IF(FSE_isError(FSE_buildCTable_wksp( +@@ -4309,7 +5060,7 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + RETURN_ERROR_IF(bs->rep[u] > dictContentSize, dictionary_corrupted, ""); + } } } + +- return dictPtr - (const BYTE*)dict; ++ return (size_t)(dictPtr - (const BYTE*)dict); + } + + /* Dictionary format : +@@ -4322,11 +5073,12 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + * dictSize supposed >= 8 + */ + static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +5097,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4354,13 +5106,14 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + * @return : dictID, or an error code */ + static size_t + ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, +- 
ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + ldmState_t* ls, + ZSTD_cwksp* ws, + const ZSTD_CCtx_params* params, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +5126,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +5140,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER (6ULL) + + /*! ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +5180,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->tmpWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->tmpWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +5225,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +5237,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4496,14 +5256,13 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + { + BYTE* const ostart = (BYTE*)dst; + BYTE* op = ostart; +- size_t fhSize = 0; + + DEBUGLOG(4, "ZSTD_writeEpilogue"); + RETURN_ERROR_IF(cctx->stage == ZSTDcs_created, stage_wrong, "init missing"); + + /* special case : empty frame */ + if (cctx->stage == ZSTDcs_init) { +- fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); ++ size_t fhSize = ZSTD_writeFrameHeader(dst, dstCapacity, &cctx->appliedParams, 0, 0); + FORWARD_IF_ERROR(fhSize, "ZSTD_writeFrameHeader failed"); + dstCapacity -= fhSize; + op += fhSize; +@@ -4513,8 +5272,9 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + if (cctx->stage != ZSTDcs_ending) { + /* write one last empty block, make it the "last" block */ + U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1) + 0; +- RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for epilogue"); +- MEM_writeLE32(op, cBlockHeader24); ++ ZSTD_STATIC_ASSERT(ZSTD_BLOCKHEADERSIZE == 3); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "no room for epilogue"); ++ MEM_writeLE24(op, cBlockHeader24); + op += ZSTD_blockHeaderSize; + dstCapacity -= ZSTD_blockHeaderSize; + } +@@ -4528,7 +5288,7 @@ static size_t ZSTD_writeEpilogue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity) + } + + cctx->stage = ZSTDcs_created; /* return to "created but no init" status */ +- return op-ostart; ++ return (size_t)(op-ostart); + } + + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) +@@ -4537,9 +5297,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5323,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5359,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, 
dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5477,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4719,14 +5487,16 @@ static size_t ZSTD_initCDict_internal( + return 0; + } + +-static ZSTD_CDict* ZSTD_createCDict_advanced_internal(size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, +- ZSTD_compressionParameters cParams, +- ZSTD_paramSwitch_e useRowMatchFinder, +- U32 enableDedicatedDictSearch, +- ZSTD_customMem customMem) ++static ZSTD_CDict* ++ZSTD_createCDict_advanced_internal(size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_compressionParameters cParams, ++ ZSTD_ParamSwitch_e useRowMatchFinder, ++ int enableDedicatedDictSearch, ++ ZSTD_customMem customMem) + { + if ((!customMem.customAlloc) ^ (!customMem.customFree)) return NULL; ++ DEBUGLOG(3, "ZSTD_createCDict_advanced_internal (dictSize=%u)", (unsigned)dictSize); + + { size_t const workspaceSize = + ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) + +@@ -4763,6 +5533,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced(const void* dictBuffer, size_t dictSize, + { + ZSTD_CCtx_params cctxParams; + ZSTD_memset(&cctxParams, 0, sizeof(cctxParams)); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + ZSTD_CCtxParams_init(&cctxParams, 0); + cctxParams.cParams = cParams; + cctxParams.customMem = customMem; +@@ -4783,7 +5554,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + ZSTD_compressionParameters cParams; + ZSTD_CDict* cdict; + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2, mode %u", (unsigned)dictContentType); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2, dictSize=%u, mode=%u", (unsigned)dictSize, (unsigned)dictContentType); + if (!customMem.customAlloc ^ !customMem.customFree) return NULL; + + if (cctxParams.enableDedicatedDictSearch) { +@@ -4802,7 +5573,7 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_createCDict); + } + +- DEBUGLOG(3, "ZSTD_createCDict_advanced2: DDS: %u", cctxParams.enableDedicatedDictSearch); ++ DEBUGLOG(3, "ZSTD_createCDict_advanced2: DedicatedDictSearch=%u", cctxParams.enableDedicatedDictSearch); + cctxParams.cParams = cParams; + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); + +@@ -4810,10 +5581,8 @@ ZSTD_CDict* ZSTD_createCDict_advanced2( + dictLoadMethod, cctxParams.cParams, + cctxParams.useRowMatchFinder, cctxParams.enableDedicatedDictSearch, + customMem); +- if (!cdict) +- return NULL; + +- if (ZSTD_isError( ZSTD_initCDict_internal(cdict, ++ if (!cdict || ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, + dictLoadMethod, dictContentType, + cctxParams) )) { +@@ -4867,7 +5636,7 @@ size_t ZSTD_freeCDict(ZSTD_CDict* cdict) + * workspaceSize: Use ZSTD_estimateCDictSize() + * to determine how large workspace must be. + * cParams : use ZSTD_getCParams() to transform a compression level +- * into its relevants cParams. ++ * into its relevant cParams. 
+ * @return : pointer to ZSTD_CDict*, or NULL if error (size too small) + * Note : there is no corresponding "free" function. + * Since workspace was allocated externally, it must be freed externally. +@@ -4879,7 +5648,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_dictContentType_e dictContentType, + ZSTD_compressionParameters cParams) + { +- ZSTD_paramSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); ++ ZSTD_ParamSwitch_e const useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(ZSTD_ps_auto, &cParams); + /* enableDedicatedDictSearch == 1 ensures matchstate is not too small in case this CDict will be used for DDS + row hash */ + size_t const matchStateSize = ZSTD_sizeof_matchState(&cParams, useRowMatchFinder, /* enableDedicatedDictSearch */ 1, /* forCCtx */ 0); + size_t const neededSize = ZSTD_cwksp_alloc_size(sizeof(ZSTD_CDict)) +@@ -4890,6 +5659,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_CDict* cdict; + ZSTD_CCtx_params params; + ++ DEBUGLOG(4, "ZSTD_initStaticCDict (dictSize==%u)", (unsigned)dictSize); + if ((size_t)workspace & 7) return NULL; /* 8-aligned */ + + { +@@ -4900,14 +5670,13 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + ZSTD_cwksp_move(&cdict->workspace, &ws); + } + +- DEBUGLOG(4, "(workspaceSize < neededSize) : (%u < %u) => %u", +- (unsigned)workspaceSize, (unsigned)neededSize, (unsigned)(workspaceSize < neededSize)); + if (workspaceSize < neededSize) return NULL; + + ZSTD_CCtxParams_init(¶ms, 0); + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4987,12 +5756,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5002,7 +5776,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! 
ZSTD_compress_usingCDict_advanced(): +@@ -5068,7 +5842,7 @@ size_t ZSTD_CStreamOutSize(void) + return ZSTD_compressBound(ZSTD_BLOCKSIZE_MAX) + ZSTD_blockHeaderSize + 4 /* 32-bits hash */ ; + } + +-static ZSTD_cParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) ++static ZSTD_CParamMode_e ZSTD_getCParamMode(ZSTD_CDict const* cdict, ZSTD_CCtx_params const* params, U64 pledgedSrcSize) + { + if (cdict != NULL && ZSTD_shouldAttachDict(cdict, params, pledgedSrcSize)) + return ZSTD_cpm_attachDict; +@@ -5199,30 +5973,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSizeMax - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSizeMax; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ if (ip) ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5231,8 +6016,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5243,12 +6030,13 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + + case zcss_load: + if ( (flushMode == ZSTD_e_end) +- && ( (size_t)(oend-op) >= ZSTD_compressBound(iend-ip) /* Enough output space */ ++ && ( (size_t)(oend-op) >= ZSTD_compressBound((size_t)(iend-ip)) /* Enough output space */ + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, +- op, oend-op, ip, iend-ip); ++ size_t const cSize = ZSTD_compressEnd_public(zcs, ++ op, (size_t)(oend-op), ++ ip, (size_t)(iend-ip)); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); + ip = iend; +@@ -5262,10 +6050,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + size_t const toLoad = zcs->inBuffTarget - zcs->inBuffPos; + size_t const loaded = ZSTD_limitCopy( + zcs->inBuff + zcs->inBuffPos, toLoad, +- ip, iend-ip); ++ ip, (size_t)(iend-ip)); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5276,16 +6063,29 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSizeMax) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); + { int const inputBuffered = (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered); + void* cDst; + size_t cSize; +- size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t oSize = (size_t)(oend-op); ++ size_t const iSize = inputBuffered ? 
zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSizeMax); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5293,34 +6093,31 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; + /* prepare next block */ +- zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSize; ++ zcs->inBuffTarget = zcs->inBuffPos + zcs->blockSizeMax; + if (zcs->inBuffTarget > zcs->inBuffSize) +- zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSize; ++ zcs->inBuffPos = 0, zcs->inBuffTarget = zcs->blockSizeMax; + DEBUGLOG(5, "inBuffTarget:%u / inBuffSize:%u", + (unsigned)zcs->inBuffTarget, (unsigned)zcs->inBuffSize); + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5369,8 +6166,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + } + } + +- input->pos = ip - istart; +- output->pos = op - ostart; ++ input->pos = (size_t)(ip - istart); ++ output->pos = (size_t)(op - ostart); + if (zcs->frameEnded) return 0; + return ZSTD_nextInputSizeHint(zcs); + } +@@ -5390,8 +6187,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. 
+ */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5410,22 +6209,27 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + ++/* ++ * If @endOp == ZSTD_e_end, @inSize becomes pledgedSrcSize. ++ * Otherwise, it's ignored. ++ * @return: 0 on success, or a ZSTD_error code otherwise. ++ */ + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the local dict if present. */ +@@ -5438,21 +6242,24 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + */ + params.compressionLevel = cctx->cdict->compressionLevel; + } +- DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ DEBUGLOG(4, "ZSTD_CCtx_init_compressStream2 : transparent init stage"); ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? 
cctx->cdict->dictContentSize : 0); +- ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); ++ ZSTD_CParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); + params.cParams = ZSTD_getCParamsFromCCtxParams( + ¶ms, cctx->pledgedSrcSizePlusOne-1, + dictSize, mode); + } + +- params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); ++ params.postBlockSplitter = ZSTD_resolveBlockSplitterMode(params.postBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5468,7 +6275,7 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + /* for small input: avoid automatic flush on reaching end of block, since + * it would require to add a 3-bytes null block to end frame + */ +- cctx->inBuffTarget = cctx->blockSize + (cctx->blockSize == pledgedSrcSize); ++ cctx->inBuffTarget = cctx->blockSizeMax + (cctx->blockSizeMax == pledgedSrcSize); + } else { + cctx->inBuffTarget = 0; + } +@@ -5479,6 +6286,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5493,8 +6302,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait 
for the first block of flush() order, for better parameters adaptation */ ++ return ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5512,13 +6340,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5541,6 +6376,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5551,64 +6387,67 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : +- * @offCode : is presumed to follow format required by ZSTD_storeSeq() ++ * @offBase : must use the format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offBase, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offBase > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++/* This function scans through an array of ZSTD_Sequence, ++ * storing the sequences it reads, until it reaches a block delimiter. ++ * Note that the block delimiter includes the last literals of the block. ++ * @blockSize must be == sum(sequence_lengths). ++ * @returns @blockSize on success, and a ZSTD_error otherwise. + */ + static size_t +-ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, +- ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_wBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; +- repcodes_t updatedRepcodes; ++ Repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_transferSequences_wBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5616,27 +6455,60 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + } else { + dictSize = 0; + } +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; ++ ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = 
ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, ++ seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ++ ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ RETURN_ERROR_IF(idx == inSeqsSize, externalSequences_invalid, "Block delimiter not found."); ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + if (inSeqs[idx].litLength) { + DEBUGLOG(6, "Storing last literals of size: %u", inSeqs[idx].litLength); +@@ -5644,37 +6516,43 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; +- return 0; ++ return blockSize; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. ++/* ++ * This function attempts to scan through @blockSize bytes in @src ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. + * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. ++ * Occasionally, we may want to reduce the actual number of bytes consumed from @src ++ * to avoid splitting a match, notably if it would produce a match smaller than MINMATCH. 
+ * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ * @returns the number of bytes consumed from @src, necessarily <= @blockSize. ++ * Otherwise, it may return a ZSTD error if something went wrong. + */ + static size_t +-ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ZSTD_transferSequences_noDelim(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; + U32 endPosInSequence = seqPos->posInSequence + (U32)blockSize; + size_t dictSize; +- BYTE const* ip = (BYTE const*)(src); +- BYTE const* iend = ip + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ +- repcodes_t updatedRepcodes; ++ const BYTE* const istart = (const BYTE*)(src); ++ const BYTE* ip = istart; ++ const BYTE* iend = istart + blockSize; /* May be adjusted if we decide to process fewer than blockSize bytes */ ++ Repcodes_t updatedRepcodes; + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5682,15 +6560,15 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_transferSequences_noDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); +- ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { + const ZSTD_Sequence currSeq = inSeqs[idx]; + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5704,7 +6582,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. 
So, we have to split the sequence */ +@@ -5744,58 +6621,113 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, ZSTD_hasExtSeqProd(&cctx->appliedParams)), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); + seqPos->idx = idx; + seqPos->posInSequence = endPosInSequence; +- ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(Repcodes_t)); + + iend -= bytesAdjustment; + if (ip != iend) { + /* Store any last literals */ +- U32 lastLLSize = (U32)(iend - ip); ++ U32 const lastLLSize = (U32)(iend - ip); + assert(ip <= iend); + DEBUGLOG(6, "Storing last literals of size: %u", lastLLSize); + ZSTD_storeLastLiterals(&cctx->seqStore, ip, lastLLSize); + seqPos->posInSrc += lastLLSize; + } + +- return bytesAdjustment; ++ return (size_t)(iend-istart); + } + +-typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, +- const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); +-static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) ++/* @seqPos represents a position within @inSeqs, ++ * it is read and updated by this function, ++ * once the goal to produce a block of size @blockSize is reached. ++ * @return: nb of bytes consumed from @src, necessarily <= @blockSize. 
++ */ ++typedef size_t (*ZSTD_SequenceCopier_f)(ZSTD_CCtx* cctx, ++ ZSTD_SequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ++ ZSTD_ParamSwitch_e externalRepSearch); ++ ++static ZSTD_SequenceCopier_f ZSTD_selectSequenceCopier(ZSTD_SequenceFormat_e mode) + { +- ZSTD_sequenceCopier sequenceCopier = NULL; +- assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, mode)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_blockDelimiters, (int)mode)); + if (mode == ZSTD_sf_explicitBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreExplicitBlockDelim; +- } else if (mode == ZSTD_sf_noBlockDelimiters) { +- return ZSTD_copySequencesToSeqStoreNoBlockDelim; ++ return ZSTD_transferSequences_wBlockDelim; ++ } ++ assert(mode == ZSTD_sf_noBlockDelimiters); ++ return ZSTD_transferSequences_noDelim; ++} ++ ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_SequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; + } +- assert(sequenceCopier != NULL); +- return sequenceCopier; ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; + } + +-/* Compress, block-by-block, all of the sequences given. ++static size_t determine_blockSize(ZSTD_SequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ ZSTD_SequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) { ++ /* Note: more a "target" block size */ ++ return MIN(remaining, blockSize); ++ } ++ assert(mode == ZSTD_sf_explicitBlockDelimiters); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ ++/* Compress all provided sequences, block-by-block. + * + * Returns the cumulative size of all compressed blocks (including their headers), + * otherwise a ZSTD error. 
+@@ -5807,15 +6739,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; +- ZSTD_sequencePosition seqPos = {0, 0, 0}; ++ ZSTD_SequencePosition seqPos = {0, 0, 0}; + +- BYTE const* ip = (BYTE const*)src; ++ const BYTE* ip = (BYTE const*)src; + BYTE* op = (BYTE*)dst; +- ZSTD_sequenceCopier const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); ++ ZSTD_SequenceCopier_f const sequenceCopier = ZSTD_selectSequenceCopier(cctx->appliedParams.blockDelimiters); + + DEBUGLOG(4, "ZSTD_compressSequences_internal srcSize: %zu, inSeqsSize: %zu", srcSize, inSeqsSize); + /* Special case: empty frame */ +@@ -5829,22 +6758,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; +- size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSizeMax, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); +- FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); +- blockSize -= additionalByteAdjustment; ++ blockSize = sequenceCopier(cctx, ++ &seqPos, inSeqs, inSeqsSize, ++ ip, blockSize, ++ cctx->appliedParams.searchForExternalRepcodes); ++ FORWARD_IF_ERROR(blockSize, "Bad sequence copy"); + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small (%zu): data remains uncompressed: cSize=%zu", blockSize, cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5853,35 +6789,36 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, + op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, + blockSize, +- cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { +- /* We don't want to emit our first block as a RLE even if it qualifies because +- * doing so will cause the decoder (cli only) to throw a "should consume all input error." +- * This is only an issue for zstd <= v1.4.3 +- */ ++ ZSTD_isRLE(ip, blockSize)) { ++ /* Note: don't emit the first block as RLE even if it qualifies because ++ * doing so will cause the decoder (cli <= v1.4.3 only) to throw an (invalid) error ++ * "should consume all input error." 
++ */ + compressedSeqsSize = 1; + } + + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5893,11 +6830,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5908,41 +6844,50 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { + BYTE* op = (BYTE*)dst; + size_t cSize = 0; +- size_t compressedBlocksSize = 0; +- size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (nbSeqs=%zu,dstCapacity=%zu)", inSeqsSize, dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); ++ + /* Begin writing output, starting with frame header */ +- frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, &cctx->appliedParams, srcSize, cctx->dictID); +- op += frameHeaderSize; +- dstCapacity -= frameHeaderSize; +- cSize += frameHeaderSize; ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, srcSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } + if (cctx->appliedParams.fParams.checksumFlag && srcSize) { + xxh64_update(&cctx->xxhState, src, srcSize); + } +- /* cSize includes block header size and compressed sequences size */ +- compressedBlocksSize = ZSTD_compressSequences_internal(cctx, ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequences_internal(cctx, + op, dstCapacity, + inSeqs, inSeqsSize, + src, srcSize); +- FORWARD_IF_ERROR(compressedBlocksSize, "Compressing blocks failed!"); +- cSize += compressedBlocksSize; +- dstCapacity -= compressedBlocksSize; ++ 
FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ ++ /* Complete with frame checksum, if needed */ + if (cctx->appliedParams.fParams.checksumFlag) { + U32 const checksum = (U32) xxh64_digest(&cctx->xxhState); + RETURN_ERROR_IF(dstCapacity<4, dstSize_tooSmall, "no room for checksum"); +@@ -5951,26 +6896,557 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); ++ return cSize; ++} ++ ++ ++#if defined(__AVX2__) ++ ++#include <immintrin.h> /* AVX2 intrinsics */ ++ ++/* ++ * Convert 2 sequences per iteration, using AVX2 intrinsics: ++ * - offset -> offBase = offset + 2 ++ * - litLength -> (U16) litLength ++ * - matchLength -> (U16)(matchLength - 3) ++ * - rep is ignored ++ * Store only 8 bytes per SeqDef (offBase[4], litLength[2], mlBase[2]). ++ * ++ * At the end, instead of extracting two __m128i, ++ * we use _mm256_permute4x64_epi64(..., 0xE8) to move lane2 into lane1, ++ * then store the lower 16 bytes in one go. ++ * ++ * @returns 0 on success, with no long length detected ++ * @returns > 0 if there is one long length (> 65535), ++ * indicating the position, and type. ++ */ ++static size_t convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ /* ++ * addition: ++ * For each 128-bit half: (offset+2, litLength+0, matchLength-3, rep+0) ++ */ ++ const __m256i addition = _mm256_setr_epi32( ++ ZSTD_REP_NUM, 0, -MINMATCH, 0, /* for sequence i */ ++ ZSTD_REP_NUM, 0, -MINMATCH, 0 /* for sequence i+1 */ ++ ); ++ ++ /* limit: check if there is a long length */ ++ const __m256i limit = _mm256_set1_epi32(65535); ++ ++ /* ++ * shuffle mask for byte-level rearrangement in each 128-bit half: ++ * ++ * Input layout (after addition) per 128-bit half: ++ * [ offset+2 (4 bytes) | litLength (4 bytes) | matchLength (4 bytes) | rep (4 bytes) ] ++ * We only need: ++ * offBase (4 bytes) = offset+2 ++ * litLength (2 bytes) = low 2 bytes of litLength ++ * mlBase (2 bytes) = low 2 bytes of (matchLength) ++ * => Bytes [0..3, 4..5, 8..9], zero the rest. ++ */ ++ const __m256i mask = _mm256_setr_epi8( ++ /* For the lower 128 bits => sequence i */ ++ 0, 1, 2, 3, /* offset+2 */ ++ 4, 5, /* litLength (16 bits) */ ++ 8, 9, /* matchLength (16 bits) */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ ++ /* For the upper 128 bits => sequence i+1 */ ++ 16,17,18,19, /* offset+2 */ ++ 20,21, /* litLength */ ++ 24,25, /* matchLength */ ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, ++ (BYTE)0x80, (BYTE)0x80, (BYTE)0x80, (BYTE)0x80 ++ ); ++ ++ /* ++ * Next, we'll use _mm256_permute4x64_epi64(vshf, 0xE8). ++ * Explanation of 0xE8 = 11101000b => [lane0, lane2, lane2, lane3]. ++ * So the lower 128 bits become [lane0, lane2] => combining seq0 and seq1.
++ */ ++#define PERM_LANE_0X_E8 0xE8 /* [0,2,2,3] in lane indices */ ++ ++ size_t longLen = 0, i = 0; ++ ++ /* AVX permutation depends on the specific definition of target structures */ ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, offset) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ ZSTD_STATIC_ASSERT(sizeof(SeqDef) == 8); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, offBase) == 0); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, litLength) == 4); ++ ZSTD_STATIC_ASSERT(offsetof(SeqDef, mlBase) == 6); ++ ++ /* Process 2 sequences per loop iteration */ ++ for (; i + 1 < nbSequences; i += 2) { ++ /* Load 2 ZSTD_Sequence (32 bytes) */ ++ __m256i vin = _mm256_loadu_si256((const __m256i*)(const void*)&inSeqs[i]); ++ ++ /* Add {2, 0, -3, 0} in each 128-bit half */ ++ __m256i vadd = _mm256_add_epi32(vin, addition); ++ ++ /* Check for long length */ ++ __m256i ll_cmp = _mm256_cmpgt_epi32(vadd, limit); /* 0xFFFFFFFF for element > 65535 */ ++ int ll_res = _mm256_movemask_epi8(ll_cmp); ++ ++ /* Shuffle bytes so each half gives us the 8 bytes we need */ ++ __m256i vshf = _mm256_shuffle_epi8(vadd, mask); ++ /* ++ * Now: ++ * Lane0 = seq0's 8 bytes ++ * Lane1 = 0 ++ * Lane2 = seq1's 8 bytes ++ * Lane3 = 0 ++ */ ++ ++ /* Permute 64-bit lanes => move Lane2 down into Lane1. */ ++ __m256i vperm = _mm256_permute4x64_epi64(vshf, PERM_LANE_0X_E8); ++ /* ++ * Now the lower 16 bytes (Lane0+Lane1) = [seq0, seq1]. ++ * The upper 16 bytes are [Lane2, Lane3] = [seq1, 0], but we won't use them. ++ */ ++ ++ /* Store only the lower 16 bytes => 2 SeqDef (8 bytes each) */ ++ _mm_storeu_si128((__m128i *)(void*)&dstSeqs[i], _mm256_castsi256_si128(vperm)); ++ /* ++ * This writes out 16 bytes total: ++ * - offset 0..7 => seq0 (offBase, litLength, mlBase) ++ * - offset 8..15 => seq1 (offBase, litLength, mlBase) ++ */ ++ ++ /* check (unlikely) long lengths > 65535 ++ * indices for lengths correspond to bits [4..7], [8..11], [20..23], [24..27] ++ * => combined mask = 0x0FF00FF0 ++ */ ++ if (UNLIKELY((ll_res & 0x0FF00FF0) != 0)) { ++ /* long length detected: let's figure out which one*/ ++ if (inSeqs[i].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (inSeqs[i].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ if (inSeqs[i+1].matchLength > 65535+MINMATCH) { ++ assert(longLen == 0); ++ longLen = i + 1 + 1; ++ } ++ if (inSeqs[i+1].litLength > 65535) { ++ assert(longLen == 0); ++ longLen = i + 1 + nbSequences + 1; ++ } ++ } ++ } ++ ++ /* Handle leftover if @nbSequences is odd */ ++ if (i < nbSequences) { ++ /* process last sequence */ ++ assert(i == nbSequences - 1); ++ dstSeqs[i].offBase = OFFSET_TO_OFFBASE(inSeqs[i].offset); ++ dstSeqs[i].litLength = (U16)inSeqs[i].litLength; ++ dstSeqs[i].mlBase = (U16)(inSeqs[i].matchLength - MINMATCH); ++ /* check (unlikely) long lengths > 65535 */ ++ if (UNLIKELY(inSeqs[i].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = i + 1; ++ } ++ if (UNLIKELY(inSeqs[i].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = i + nbSequences + 1; ++ } ++ } ++ ++ return longLen; ++} ++ ++/* the vector implementation could also be ported to SSSE3, ++ * but since this implementation is targeting modern systems (>= Sapphire Rapid), ++ * it's not useful to develop and maintain code for older pre-AVX2 platforms */ ++ ++#else /* no AVX2 */ ++ ++static size_t 
convertSequences_noRepcodes( ++ SeqDef* dstSeqs, ++ const ZSTD_Sequence* inSeqs, ++ size_t nbSequences) ++{ ++ size_t longLen = 0; ++ size_t n; ++ for (n=0; n<nbSequences; n++) { ++ dstSeqs[n].offBase = OFFSET_TO_OFFBASE(inSeqs[n].offset); ++ dstSeqs[n].litLength = (U16)inSeqs[n].litLength; ++ dstSeqs[n].mlBase = (U16)(inSeqs[n].matchLength - MINMATCH); ++ /* check for long length > 65535 */ ++ if (UNLIKELY(inSeqs[n].matchLength > 65535+MINMATCH)) { ++ assert(longLen == 0); ++ longLen = n + 1; ++ } ++ if (UNLIKELY(inSeqs[n].litLength > 65535)) { ++ assert(longLen == 0); ++ longLen = n + nbSequences + 1; ++ } ++ } ++ return longLen; ++} ++ ++#endif ++ ++/* ++ * Precondition: Sequences must end on an explicit Block Delimiter ++ * @return: 0 on success, or an error code. ++ * Note: Sequence validation functionality has been disabled (removed). ++ * This is helpful to generate a lean main pipeline, improving performance. ++ * It may be re-inserted later. ++ */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int repcodeResolution) ++{ ++ Repcodes_t updatedRepcodes; ++ size_t seqNb = 0; ++ ++ DEBUGLOG(5, "ZSTD_convertBlockSequences (nbSequences = %zu)", nbSequences); ++ ++ RETURN_ERROR_IF(nbSequences >= cctx->seqStore.maxNbSeq, externalSequences_invalid, ++ "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); ++ ++ ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(Repcodes_t)); ++ ++ /* check end condition */ ++ assert(nbSequences >= 1); ++ assert(inSeqs[nbSequences-1].matchLength == 0); ++ assert(inSeqs[nbSequences-1].offset == 0); ++ ++ /* Convert Sequences from public format to internal format */ ++ if (!repcodeResolution) { ++ size_t const longl = convertSequences_noRepcodes(cctx->seqStore.sequencesStart, inSeqs, nbSequences-1); ++ cctx->seqStore.sequences = cctx->seqStore.sequencesStart + nbSequences-1; ++ if (longl) { ++ DEBUGLOG(5, "long length"); ++ assert(cctx->seqStore.longLengthType == ZSTD_llt_none); ++ if (longl <= nbSequences-1) { ++ DEBUGLOG(5, "long match length detected at pos %zu", longl-1); ++ cctx->seqStore.longLengthType = ZSTD_llt_matchLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-1); ++ } else { ++ DEBUGLOG(5, "long literals length detected at pos %zu", longl-nbSequences); ++ assert(longl <= 2* (nbSequences-1)); ++ cctx->seqStore.longLengthType = ZSTD_llt_literalLength; ++ cctx->seqStore.longLengthPos = (U32)(longl-(nbSequences-1)-1); ++ } ++ } ++ } else { ++ for (seqNb = 0; seqNb < nbSequences - 1 ; seqNb++) { ++ U32 const litLength = inSeqs[seqNb].litLength; ++ U32 const matchLength = inSeqs[seqNb].matchLength; ++ U32 const ll0 = (litLength == 0); ++ U32 const offBase = ZSTD_finalizeOffBase(inSeqs[seqNb].offset, updatedRepcodes.rep, ll0); ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ ZSTD_storeSeqOnly(&cctx->seqStore, litLength, offBase, matchLength); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ if (!repcodeResolution && nbSequences > 1) { ++ U32* const rep = updatedRepcodes.rep; ++ ++ if (nbSequences >= 4) { ++ U32 lastSeqIdx = (U32)nbSequences - 2; /* index of last full sequence */ ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (nbSequences == 3) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[0].offset; ++ rep[0] = inSeqs[1].offset; ++ } else { ++ assert(nbSequences == 2); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[0].offset; ++ } ++ } ++ ++ ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep,
sizeof(Repcodes_t)); ++ ++ return 0; ++} ++ ++#if defined(ZSTD_ARCH_X86_AVX2) ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t i; ++ __m256i const zeroVec = _mm256_setzero_si256(); ++ __m256i sumVec = zeroVec; /* accumulates match+lit in 32-bit lanes */ ++ ZSTD_ALIGNED(32) U32 tmp[8]; /* temporary buffer for reduction */ ++ size_t mSum = 0, lSum = 0; ++ ZSTD_STATIC_ASSERT(sizeof(ZSTD_Sequence) == 16); ++ ++ /* Process 2 structs (32 bytes) at a time */ ++ for (i = 0; i + 2 <= nbSeqs; i += 2) { ++ /* Load two consecutive ZSTD_Sequence (8×4 = 32 bytes) */ ++ __m256i data = _mm256_loadu_si256((const __m256i*)(const void*)&seqs[i]); ++ /* check end of block signal */ ++ __m256i cmp = _mm256_cmpeq_epi32(data, zeroVec); ++ int cmp_res = _mm256_movemask_epi8(cmp); ++ /* indices for match lengths correspond to bits [8..11], [24..27] ++ * => combined mask = 0x0F000F00 */ ++ ZSTD_STATIC_ASSERT(offsetof(ZSTD_Sequence, matchLength) == 8); ++ if (cmp_res & 0x0F000F00) break; ++ /* Accumulate in sumVec */ ++ sumVec = _mm256_add_epi32(sumVec, data); ++ } ++ ++ /* Horizontal reduction */ ++ _mm256_store_si256((__m256i*)tmp, sumVec); ++ lSum = tmp[1] + tmp[5]; ++ mSum = tmp[2] + tmp[6]; ++ ++ /* Handle the leftover */ ++ for (; i < nbSeqs; i++) { ++ lSum += seqs[i].litLength; ++ mSum += seqs[i].matchLength; ++ if (seqs[i].matchLength == 0) break; /* end of block */ ++ } ++ ++ if (i==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = i+1; ++ bs.blockSize = lSum + mSum; ++ bs.litSize = lSum; ++ return bs; ++ } ++} ++ ++#else ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs) ++{ ++ size_t totalMatchSize = 0; ++ size_t litSize = 0; ++ size_t n; ++ assert(seqs); ++ for (n=0; n<nbSeqs; n++) { ++ totalMatchSize += seqs[n].matchLength; ++ litSize += seqs[n].litLength; ++ if (seqs[n].matchLength == 0) { ++ assert(seqs[n].offset == 0); ++ break; ++ } ++ } ++ if (n==nbSeqs) { ++ /* reaching end of sequences: end of block signal was not present */ ++ BlockSummary bs; ++ bs.nbSequences = ERROR(externalSequences_invalid); ++ return bs; ++ } ++ { BlockSummary bs; ++ bs.nbSequences = n+1; ++ bs.blockSize = litSize + totalMatchSize; ++ bs.litSize = litSize; ++ return bs; ++ } ++} ++ ++#endif ++ ++static size_t ++ZSTD_compressSequencesAndLiterals_internal(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t nbSequences, ++ const void* literals, size_t litSize, size_t srcSize) ++{ ++ size_t remaining = srcSize; ++ size_t cSize = 0; ++ BYTE* op = (BYTE*)dst; ++ int const repcodeResolution = (cctx->appliedParams.searchForExternalRepcodes == ZSTD_ps_enable); ++ assert(cctx->appliedParams.searchForExternalRepcodes != ZSTD_ps_auto); ++ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals_internal: nbSeqs=%zu, litSize=%zu", nbSequences, litSize); ++ RETURN_ERROR_IF(nbSequences == 0, externalSequences_invalid, "Requires at least 1 end-of-block"); ++ ++ /* Special case: empty frame */ ++ if ((nbSequences == 1) && (inSeqs[0].litLength == 0)) { ++ U32 const cBlockHeader24 = 1 /* last block */ + (((U32)bt_raw)<<1); ++ RETURN_ERROR_IF(dstCapacity<3, dstSize_tooSmall, "No room for empty frame block header"); ++ MEM_writeLE24(op, cBlockHeader24); ++ op += ZSTD_blockHeaderSize; ++ dstCapacity -= ZSTD_blockHeaderSize; ++ cSize += ZSTD_blockHeaderSize; ++ } ++ ++ while (nbSequences) { ++ size_t compressedSeqsSize, cBlockSize, conversionStatus; ++ BlockSummary const block = ZSTD_get1BlockSummary(inSeqs, nbSequences); ++ U32 const lastBlock = (block.nbSequences == nbSequences); ++ FORWARD_IF_ERROR(block.nbSequences, "Error while trying to determine nb of sequences for a block"); ++ assert(block.nbSequences <= nbSequences); ++ RETURN_ERROR_IF(block.litSize > litSize, externalSequences_invalid, "discrepancy: Sequences require more literals than present in buffer"); ++ ZSTD_resetSeqStore(&cctx->seqStore); ++ ++ conversionStatus = ZSTD_convertBlockSequences(cctx, ++ inSeqs, block.nbSequences, ++ repcodeResolution); ++ FORWARD_IF_ERROR(conversionStatus, "Bad sequence conversion"); ++ inSeqs += block.nbSequences; ++ nbSequences -= block.nbSequences; ++ remaining -= block.blockSize; ++ ++ /* Note: when blockSize is very
small, other variant send it uncompressed. ++ * Here, we still send the sequences, because we don't have the original source to send it uncompressed. ++ * One could imagine in theory reproducing the source from the sequences, ++ * but that's complex and costly memory intensive, and goes against the objectives of this variant. */ ++ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); ++ ++ compressedSeqsSize = ZSTD_entropyCompressSeqStore_internal( ++ op + ZSTD_blockHeaderSize /* Leave space for block header */, dstCapacity - ZSTD_blockHeaderSize, ++ literals, block.litSize, ++ &cctx->seqStore, ++ &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, ++ &cctx->appliedParams, ++ cctx->tmpWorkspace, cctx->tmpWkspSize /* statically allocated in resetCCtx */, ++ cctx->bmi2); ++ FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); ++ /* note: the spec forbids for any compressed block to be larger than maximum block size */ ++ if (compressedSeqsSize > cctx->blockSizeMax) compressedSeqsSize = 0; ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); ++ litSize -= block.litSize; ++ literals = (const char*)literals + block.litSize; ++ ++ /* Note: difficult to check source for RLE block when only Literals are provided, ++ * but it could be considered from analyzing the sequence directly */ ++ ++ if (compressedSeqsSize == 0) { ++ /* Sending uncompressed blocks is out of reach, because the source is not provided. ++ * In theory, one could use the sequences to regenerate the source, like a decompressor, ++ * but it's complex, and memory hungry, killing the purpose of this variant. ++ * Current outcome: generate an error code. 
++ */ ++ RETURN_ERROR(cannotProduce_uncompressedBlock, "ZSTD_compressSequencesAndLiterals cannot generate an uncompressed block"); ++ } else { ++ U32 cBlockHeader; ++ assert(compressedSeqsSize > 1); /* no RLE */ ++ /* Error checking and repcodes update */ ++ ZSTD_blockState_confirmRepcodesAndEntropyTables(&cctx->blockState); ++ if (cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) ++ cctx->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; ++ ++ /* Write block header into beginning of block*/ ++ cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); ++ MEM_writeLE24(op, cBlockHeader); ++ cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); ++ } ++ ++ cSize += cBlockSize; ++ op += cBlockSize; ++ dstCapacity -= cBlockSize; ++ cctx->isFirstBlock = 0; ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); ++ ++ if (lastBlock) { ++ assert(nbSequences == 0); ++ break; ++ } ++ } ++ ++ RETURN_ERROR_IF(litSize != 0, externalSequences_invalid, "literals must be entirely and exactly consumed"); ++ RETURN_ERROR_IF(remaining != 0, externalSequences_invalid, "Sequences must represent a total of exactly srcSize=%zu", srcSize); ++ DEBUGLOG(4, "cSize final total: %zu", cSize); ++ return cSize; ++} ++ ++size_t ++ZSTD_compressSequencesAndLiterals(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* literals, size_t litSize, size_t litCapacity, ++ size_t decompressedSize) ++{ ++ BYTE* op = (BYTE*)dst; ++ size_t cSize = 0; ++ ++ /* Transparent initialization stage, same as compressStream2() */ ++ DEBUGLOG(4, "ZSTD_compressSequencesAndLiterals (dstCapacity=%zu)", dstCapacity); ++ assert(cctx != NULL); ++ if (litCapacity < litSize) { ++ RETURN_ERROR(workSpace_tooSmall, "literals buffer is not large enough: must be at least 8 bytes larger than litSize (risk of read out-of-bound)"); ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, decompressedSize), "CCtx initialization failed"); ++ ++ if (cctx->appliedParams.blockDelimiters == ZSTD_sf_noBlockDelimiters) { ++ RETURN_ERROR(frameParameter_unsupported, "This mode is only compatible with explicit delimiters"); ++ } ++ if (cctx->appliedParams.validateSequences) { ++ RETURN_ERROR(parameter_unsupported, "This mode is not compatible with Sequence validation"); ++ } ++ if (cctx->appliedParams.fParams.checksumFlag) { ++ RETURN_ERROR(frameParameter_unsupported, "this mode is not compatible with frame checksum"); ++ } ++ ++ /* Begin writing output, starting with frame header */ ++ { size_t const frameHeaderSize = ZSTD_writeFrameHeader(op, dstCapacity, ++ &cctx->appliedParams, decompressedSize, cctx->dictID); ++ op += frameHeaderSize; ++ assert(frameHeaderSize <= dstCapacity); ++ dstCapacity -= frameHeaderSize; ++ cSize += frameHeaderSize; ++ } ++ ++ /* Now generate compressed blocks */ ++ { size_t const cBlocksSize = ZSTD_compressSequencesAndLiterals_internal(cctx, ++ op, dstCapacity, ++ inSeqs, inSeqsSize, ++ literals, litSize, decompressedSize); ++ FORWARD_IF_ERROR(cBlocksSize, "Compressing blocks failed!"); ++ cSize += cBlocksSize; ++ assert(cBlocksSize <= dstCapacity); ++ dstCapacity -= cBlocksSize; ++ } ++ ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const 
ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + +- + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6046,7 +7522,7 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + } + } + +-static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + switch (mode) { + case ZSTD_cpm_unknown: +@@ -6070,8 +7546,8 @@ static U64 ZSTD_getCParamRowSize(U64 srcSizeHint, size_t dictSize, ZSTD_cParamMo + * @return ZSTD_compressionParameters structure for a selected compression level, srcSize and dictSize. + * Note: srcSizeHint 0 means 0, use ZSTD_CONTENTSIZE_UNKNOWN for unknown. + * Use dictSize == 0 for unknown or unused. +- * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_cParamMode_e`. */ +-static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) ++ * Note: `mode` controls how we treat the `dictSize`. See docs for `ZSTD_CParamMode_e`. */ ++static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) + { + U64 const rSize = ZSTD_getCParamRowSize(srcSizeHint, dictSize, mode); + U32 const tableID = (rSize <= 256 KB) + (rSize <= 128 KB) + (rSize <= 16 KB); +@@ -6092,7 +7568,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6109,7 +7585,9 @@ ZSTD_compressionParameters ZSTD_getCParams(int compressionLevel, unsigned long l + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). 
+ * Fields of `ZSTD_frameParameters` are set to default values */ +-static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode) { ++static ZSTD_parameters ++ZSTD_getParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode) ++{ + ZSTD_parameters params; + ZSTD_compressionParameters const cParams = ZSTD_getCParams_internal(compressionLevel, srcSizeHint, dictSize, mode); + DEBUGLOG(5, "ZSTD_getParams (cLevel=%i)", compressionLevel); +@@ -6123,7 +7601,34 @@ static ZSTD_parameters ZSTD_getParams_internal(int compressionLevel, unsigned lo + * same idea as ZSTD_getCParams() + * @return a `ZSTD_parameters` structure (instead of `ZSTD_compressionParameters`). + * Fields of `ZSTD_frameParameters` are set to default values */ +-ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) { ++ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize) ++{ + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(zc != NULL); ++ ZSTD_CCtxParams_registerSequenceProducer( ++ &zc->requestedParams, extSeqProdState, extSeqProdFunc ++ ); ++} ++ ++void ZSTD_CCtxParams_registerSequenceProducer( ++ ZSTD_CCtx_params* params, ++ void* extSeqProdState, ++ ZSTD_sequenceProducer_F extSeqProdFunc) ++{ ++ assert(params != NULL); ++ if (extSeqProdFunc != NULL) { ++ params->extSeqProdFunc = extSeqProdFunc; ++ params->extSeqProdState = extSeqProdState; ++ } else { ++ params->extSeqProdFunc = NULL; ++ params->extSeqProdState = NULL; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..b10978385876 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,7 +21,8 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" +- ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ ++#include "zstd_preSplit.h" /* ZSTD_SLIPBLOCK_WORKSPACESIZE */ + + /*-************************************* + * Constants +@@ -32,7 +34,7 @@ + It's not a big deal though : candidate will just be sorted again. + Additionally, candidate position 1 will be lost. + But candidate 1 cannot hide a large tree of candidates, so it's a minimal loss. +- The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table re-use with a different strategy. ++ The benefit is that ZSTD_DUBT_UNSORTED_MARK cannot be mishandled after table reuse with a different strategy. 
+ This constant is required by ZSTD_compressBlock_btlazy2() and ZSTD_reduceTable_internal() */ + + +@@ -75,6 +77,70 @@ typedef struct { + ZSTD_fseCTables_t fse; + } ZSTD_entropyCTables_t; + ++/* ********************************************* ++* Sequences * ++***********************************************/ ++typedef struct SeqDef_s { ++ U32 offBase; /* offBase == Offset + ZSTD_REP_NUM, or repcode 1,2,3 */ ++ U16 litLength; ++ U16 mlBase; /* mlBase == matchLength - MINMATCH */ ++} SeqDef; ++ ++/* Controls whether seqStore has a single "long" litLength or matchLength. See SeqStore_t. */ ++typedef enum { ++ ZSTD_llt_none = 0, /* no longLengthType */ ++ ZSTD_llt_literalLength = 1, /* represents a long literal */ ++ ZSTD_llt_matchLength = 2 /* represents a long match */ ++} ZSTD_longLengthType_e; ++ ++typedef struct { ++ SeqDef* sequencesStart; ++ SeqDef* sequences; /* ptr to end of sequences */ ++ BYTE* litStart; ++ BYTE* lit; /* ptr to end of literals */ ++ BYTE* llCode; ++ BYTE* mlCode; ++ BYTE* ofCode; ++ size_t maxNbSeq; ++ size_t maxNbLit; ++ ++ /* longLengthPos and longLengthType to allow us to represent either a single litLength or matchLength ++ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment ++ * the existing value of the litLength or matchLength by 0x10000. ++ */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++} SeqStore_t; ++ ++typedef struct { ++ U32 litLength; ++ U32 matchLength; ++} ZSTD_SequenceLength; ++ ++/* ++ * Returns the ZSTD_SequenceLength for the given sequences. It handles the decoding of long sequences ++ * indicated by longLengthPos and longLengthType, and adds MINMATCH back to matchLength. ++ */ ++MEM_STATIC ZSTD_SequenceLength ZSTD_getSequenceLength(SeqStore_t const* seqStore, SeqDef const* seq) ++{ ++ ZSTD_SequenceLength seqLen; ++ seqLen.litLength = seq->litLength; ++ seqLen.matchLength = seq->mlBase + MINMATCH; ++ if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { ++ if (seqStore->longLengthType == ZSTD_llt_literalLength) { ++ seqLen.litLength += 0x10000; ++ } ++ if (seqStore->longLengthType == ZSTD_llt_matchLength) { ++ seqLen.matchLength += 0x10000; ++ } ++ } ++ return seqLen; ++} ++ ++const SeqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ ++int ZSTD_seqToCodes(const SeqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ ++ ++ + /* ********************************************* + * Entropy buffer statistics structs and funcs * + ***********************************************/ +@@ -84,7 +150,7 @@ typedef struct { + * hufDesSize refers to the size of huffman tree description in bytes. + * This metadata is populated in ZSTD_buildBlockEntropyStats_literals() */ + typedef struct { +- symbolEncodingType_e hType; ++ SymbolEncodingType_e hType; + BYTE hufDesBuffer[ZSTD_MAX_HUF_HEADER_SIZE]; + size_t hufDesSize; + } ZSTD_hufCTablesMetadata_t; +@@ -95,9 +161,9 @@ typedef struct { + * fseTablesSize refers to the size of fse tables in bytes. 
+ * This metadata is populated in ZSTD_buildBlockEntropyStats_sequences() */ + typedef struct { +- symbolEncodingType_e llType; +- symbolEncodingType_e ofType; +- symbolEncodingType_e mlType; ++ SymbolEncodingType_e llType; ++ SymbolEncodingType_e ofType; ++ SymbolEncodingType_e mlType; + BYTE fseTablesBuffer[ZSTD_MAX_FSE_HEADERS_SIZE]; + size_t fseTablesSize; + size_t lastCountSize; /* This is to account for bug in 1.3.4. More detail in ZSTD_entropyCompressSeqStore_internal() */ +@@ -111,12 +177,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const SeqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -140,28 +207,29 @@ typedef struct { + stopped. posInSequence <= seq[pos].litLength + seq[pos].matchLength */ + size_t size; /* The number of sequences. <= capacity. */ + size_t capacity; /* The capacity starting from `seq` pointer */ +-} rawSeqStore_t; ++} RawSeqStore_t; + +-UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; ++UNUSED_ATTR static const RawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +- int price; +- U32 off; +- U32 mlen; +- U32 litlen; +- U32 rep[ZSTD_REP_NUM]; ++ int price; /* price from beginning of segment to this position */ ++ U32 off; /* offset of previous match */ ++ U32 mlen; /* length of previous match */ ++ U32 litlen; /* nb of literals since previous match */ ++ U32 rep[ZSTD_REP_NUM]; /* offset history after previous match */ + } ZSTD_optimal_t; + + typedef enum { zop_dynamic=0, zop_predef } ZSTD_OptPrice_e; + ++#define ZSTD_OPT_SIZE (ZSTD_OPT_NUM+3) + typedef struct { + /* All tables are allocated inside cctx->workspace by ZSTD_resetCCtx_internal() */ + unsigned* litFreq; /* table of literals statistics, of size 256 */ + unsigned* litLengthFreq; /* table of litLength statistics, of size (MaxLL+1) */ + unsigned* matchLengthFreq; /* table of matchLength statistics, of size (MaxML+1) */ + unsigned* offCodeFreq; /* table of offCode statistics, of size (MaxOff+1) */ +- ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_NUM+1 */ +- ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_NUM+1 */ ++ ZSTD_match_t* matchTable; /* list of found matches, of size ZSTD_OPT_SIZE */ ++ ZSTD_optimal_t* priceTable; /* All positions tracked by optimal parser, of size ZSTD_OPT_SIZE */ + + U32 litSum; /* nb of literals */ + U32 litLengthSum; /* nb of litLength codes */ +@@ -173,7 +241,7 @@ typedef struct { + U32 offCodeSumBasePrice; /* to compare to log2(offreq) */ + ZSTD_OptPrice_e priceType; /* prices can be determined dynamically, or follow a pre-defined cost structure */ + const ZSTD_entropyCTables_t* symbolCosts; /* pre-calculated dictionary statistics */ +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + } optState_t; + + typedef struct { +@@ -195,11 +263,11 @@ typedef struct { + 
+ #define ZSTD_WINDOW_START_INDEX 2 + +-typedef struct ZSTD_matchState_t ZSTD_matchState_t; ++typedef struct ZSTD_MatchState_t ZSTD_MatchState_t; + + #define ZSTD_ROW_HASH_CACHE_SIZE 8 /* Size of prefetching hash cache for row-based matchfinder */ + +-struct ZSTD_matchState_t { ++struct ZSTD_MatchState_t { + ZSTD_window_t window; /* State for window round buffer management */ + U32 loadedDictEnd; /* index of end of dictionary, within context's referential. + * When loadedDictEnd != 0, a dictionary is in use, and still valid. +@@ -212,28 +280,42 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for reuse of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; + U32* chainTable; + +- U32 forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ ++ int forceNonContiguous; /* Non-zero if we should force non-contiguous load for the next window update. */ + + int dedicatedDictSearch; /* Indicates whether this matchState is using the + * dedicated dictionary search structure. + */ + optState_t opt; /* optimal parser state */ +- const ZSTD_matchState_t* dictMatchState; ++ const ZSTD_MatchState_t* dictMatchState; + ZSTD_compressionParameters cParams; +- const rawSeqStore_t* ldmSeqStore; ++ const RawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { + ZSTD_compressedBlockState_t* prevCBlock; + ZSTD_compressedBlockState_t* nextCBlock; +- ZSTD_matchState_t matchState; ++ ZSTD_MatchState_t matchState; + } ZSTD_blockState_t; + + typedef struct { +@@ -260,7 +342,7 @@ typedef struct { + } ldmState_t; + + typedef struct { +- ZSTD_paramSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. ZSTD_ps_auto by default */ ++ ZSTD_ParamSwitch_e enableLdm; /* ZSTD_ps_enable to enable LDM. 
ZSTD_ps_auto by default */ + U32 hashLog; /* Log size of hashTable */ + U32 bucketSizeLog; /* Log bucket size for collision resolution, at most 8 */ + U32 minMatchLength; /* Minimum match length */ +@@ -291,7 +373,7 @@ struct ZSTD_CCtx_params_s { + * There is no guarantee that hint is close to actual source size */ + + ZSTD_dictAttachPref_e attachDictPref; +- ZSTD_paramSwitch_e literalCompressionMode; ++ ZSTD_ParamSwitch_e literalCompressionMode; + + /* Multithreading: used to pass parameters to mtctx */ + int nbWorkers; +@@ -310,24 +392,54 @@ struct ZSTD_CCtx_params_s { + ZSTD_bufferMode_e outBufferMode; + + /* Sequence compression API */ +- ZSTD_sequenceFormat_e blockDelimiters; ++ ZSTD_SequenceFormat_e blockDelimiters; + int validateSequences; + +- /* Block splitting */ +- ZSTD_paramSwitch_e useBlockSplitter; ++ /* Block splitting ++ * @postBlockSplitter executes split analysis after sequences are produced, ++ * it's more accurate but consumes more resources. ++ * @preBlockSplitter_level splits before knowing sequences, ++ * it's more approximative but also cheaper. ++ * Valid @preBlockSplitter_level values range from 0 to 6 (included). ++ * 0 means auto, 1 means do not split, ++ * then levels are sorted in increasing cpu budget, from 2 (fastest) to 6 (slowest). ++ * Highest @preBlockSplitter_level combines well with @postBlockSplitter. ++ */ ++ ZSTD_ParamSwitch_e postBlockSplitter; ++ int preBlockSplitter_level; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; + + /* Param for deciding whether to use row-based matchfinder */ +- ZSTD_paramSwitch_e useRowMatchFinder; ++ ZSTD_ParamSwitch_e useRowMatchFinder; + + /* Always load a dictionary in ext-dict mode (not prefix mode)? */ + int deterministicRefPrefix; + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_ParamSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Parameters for the external sequence producer API. ++ * Users set these parameters through ZSTD_registerSequenceProducer(). ++ * It is not possible to set these parameters individually through the public API. 
*/ ++ void* extSeqProdState; ++ ZSTD_sequenceProducer_F extSeqProdFunc; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_ParamSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) + #define ENTROPY_WORKSPACE_SIZE (HUF_WORKSPACE_SIZE + COMPRESS_SEQUENCES_WORKSPACE_SIZE) ++#define TMP_WORKSPACE_SIZE (MAX(ENTROPY_WORKSPACE_SIZE, ZSTD_SLIPBLOCK_WORKSPACESIZE)) + + /* + * Indicates whether this compression proceeds directly from user-provided +@@ -345,11 +457,11 @@ typedef enum { + */ + #define ZSTD_MAX_NB_BLOCK_SPLITS 196 + typedef struct { +- seqStore_t fullSeqStoreChunk; +- seqStore_t firstHalfSeqStore; +- seqStore_t secondHalfSeqStore; +- seqStore_t currSeqStore; +- seqStore_t nextSeqStore; ++ SeqStore_t fullSeqStoreChunk; ++ SeqStore_t firstHalfSeqStore; ++ SeqStore_t secondHalfSeqStore; ++ SeqStore_t currSeqStore; ++ SeqStore_t nextSeqStore; + + U32 partitions[ZSTD_MAX_NB_BLOCK_SPLITS]; + ZSTD_entropyCTablesMetadata_t entropyMetadata; +@@ -366,7 +478,7 @@ struct ZSTD_CCtx_s { + size_t dictContentSize; + + ZSTD_cwksp workspace; /* manages buffer for dynamic allocations */ +- size_t blockSize; ++ size_t blockSizeMax; + unsigned long long pledgedSrcSizePlusOne; /* this way, 0 (default) == unknown */ + unsigned long long consumedSrcSize; + unsigned long long producedCSize; +@@ -378,13 +490,14 @@ struct ZSTD_CCtx_s { + int isFirstBlock; + int initialized; + +- seqStore_t seqStore; /* sequences storage ptrs */ ++ SeqStore_t seqStore; /* sequences storage ptrs */ + ldmState_t ldmState; /* long distance matching state */ + rawSeq* ldmSequences; /* Storage for the ldm output sequences */ + size_t maxNbLdmSequences; +- rawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ ++ RawSeqStore_t externSeqStore; /* Mutable reference to external sequences */ + ZSTD_blockState_t blockState; +- U32* entropyWorkspace; /* entropy workspace of ENTROPY_WORKSPACE_SIZE bytes */ ++ void* tmpWorkspace; /* used as substitute of stack space - must be aligned for S64 type */ ++ size_t tmpWkspSize; + + /* Whether we are streaming or not */ + ZSTD_buffered_policy_e bufferedPolicy; +@@ -404,6 +517,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +531,14 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Buffer for output from external sequence producer */ ++ ZSTD_Sequence* extSeqBuf; ++ size_t extSeqBufCapacity; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,17 +560,17 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. 
+ */ +-} ZSTD_cParamMode_e; ++} ZSTD_CParamMode_e; + +-typedef size_t (*ZSTD_blockCompressor) ( +- ZSTD_matchState_t* bs, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++typedef size_t (*ZSTD_BlockCompressor_f) ( ++ ZSTD_MatchState_t* bs, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-ZSTD_blockCompressor ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_paramSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); ++ZSTD_BlockCompressor_f ZSTD_selectBlockCompressor(ZSTD_strategy strat, ZSTD_ParamSwitch_e rowMatchfinderMode, ZSTD_dictMode_e dictMode); + + + MEM_STATIC U32 ZSTD_LLcode(U32 litLength) +@@ -497,12 +616,33 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + return 1; + } + ++/* ZSTD_selectAddr: ++ * @return index >= lowLimit ? candidate : backup, ++ * tries to force branchless codegen. */ ++MEM_STATIC const BYTE* ++ZSTD_selectAddr(U32 index, U32 lowLimit, const BYTE* candidate, const BYTE* backup) ++{ ++#if defined(__x86_64__) ++ __asm__ ( ++ "cmp %1, %2\n" ++ "cmova %3, %0\n" ++ : "+r"(candidate) ++ : "r"(index), "r"(lowLimit), "r"(backup) ++ ); ++ return candidate; ++#else ++ return index >= lowLimit ? candidate : backup; ++#endif ++} ++ + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. + * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +650,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +670,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? 
(U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +706,68 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ ++ ++/*! ZSTD_storeSeqOnly() : ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. ++ * Literals themselves are not copied, but @litPtr is updated. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). ++ * @matchLength : must be >= MINMATCH ++*/ ++HINT_INLINE UNUSED_ATTR void ++ZSTD_storeSeqOnly(SeqStore_t* seqStorePtr, ++ size_t litLength, ++ U32 offBase, ++ size_t matchLength) ++{ ++ assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); ++ ++ /* literal Length */ ++ assert(litLength <= ZSTD_BLOCKSIZE_MAX); ++ if (UNLIKELY(litLength>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_literalLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].litLength = (U16)litLength; ++ ++ /* match offset */ ++ seqStorePtr->sequences[0].offBase = offBase; ++ ++ /* match Length */ ++ assert(matchLength <= ZSTD_BLOCKSIZE_MAX); ++ assert(matchLength >= MINMATCH); ++ { size_t const mlBase = matchLength - MINMATCH; ++ if (UNLIKELY(mlBase>0xFFFF)) { ++ assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ ++ seqStorePtr->longLengthType = ZSTD_llt_matchLength; ++ seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ } ++ seqStorePtr->sequences[0].mlBase = (U16)mlBase; ++ } ++ ++ seqStorePtr->sequences++; ++} + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into SeqStore_t. 
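
A short aside on the offBase encoding: the OFFBASE macros just above fold two cases into a single U32, where values 1..ZSTD_REP_NUM stand for repcodes and anything larger is a real offset shifted up by ZSTD_REP_NUM. The standalone C sketch below merely restates that encoding outside the patch; every *_sketch name and the sample values are invented for illustration and are not part of zstd.

    #include <assert.h>
    #include <stdio.h>

    #define ZSTD_REP_NUM_SKETCH 3  /* mirrors ZSTD_REP_NUM */

    /* repcodes 1..3 are stored as-is; real offsets are stored as offset + ZSTD_REP_NUM */
    static unsigned repcodeToOffBase_sketch(unsigned r)  { assert(r >= 1 && r <= ZSTD_REP_NUM_SKETCH); return r; }
    static unsigned offsetToOffBase_sketch(unsigned o)   { assert(o > 0); return o + ZSTD_REP_NUM_SKETCH; }
    static int      offBaseIsRepcode_sketch(unsigned ob) { return 1 <= ob && ob <= ZSTD_REP_NUM_SKETCH; }
    static unsigned offBaseToOffset_sketch(unsigned ob)  { assert(!offBaseIsRepcode_sketch(ob)); return ob - ZSTD_REP_NUM_SKETCH; }

    int main(void)
    {
        unsigned const obRep = repcodeToOffBase_sketch(2);  /* repcode 2 -> offBase 2  */
        unsigned const obOff = offsetToOffBase_sketch(90);  /* offset 90 -> offBase 93 */
        printf("is repcode? %d %d\n", offBaseIsRepcode_sketch(obRep), offBaseIsRepcode_sketch(obOff));
        printf("offset back: %u\n", offBaseToOffset_sketch(obOff));
        return 0;
    }
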
++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. + */ + HINT_INLINE UNUSED_ATTR void +-ZSTD_storeSeq(seqStore_t* seqStorePtr, ++ZSTD_storeSeq(SeqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +776,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +787,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -619,44 +799,22 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + } + seqStorePtr->lit += litLength; + +- /* literal Length */ +- if (litLength>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_literalLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].litLength = (U16)litLength; +- +- /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); +- +- /* match Length */ +- assert(matchLength >= MINMATCH); +- { size_t const mlBase = matchLength - MINMATCH; +- if (mlBase>0xFFFF) { +- assert(seqStorePtr->longLengthType == ZSTD_llt_none); /* there can only be a single long length */ +- seqStorePtr->longLengthType = ZSTD_llt_matchLength; +- seqStorePtr->longLengthPos = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); +- } +- seqStorePtr->sequences[0].mlBase = (U16)mlBase; +- } +- +- seqStorePtr->sequences++; ++ ZSTD_storeSeqOnly(seqStorePtr, litLength, offBase, matchLength); + } + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 
const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? rep[1] : rep[2]; +@@ -670,14 +828,14 @@ ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) + + typedef struct repcodes_s { + U32 rep[3]; +-} repcodes_t; ++} Repcodes_t; + +-MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++MEM_STATIC Repcodes_t ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- repcodes_t newReps; ++ Repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +843,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -771,8 +876,8 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + size_t const matchLength = ZSTD_count(ip, match, vEnd); + if (match + matchLength != mEnd) return matchLength; + DEBUGLOG(7, "ZSTD_count_2segments: found a 2-parts match (current length==%zu)", matchLength); +- DEBUGLOG(7, "distance from match beginning to end dictionary = %zi", mEnd - match); +- DEBUGLOG(7, "distance from current pos to end buffer = %zi", iEnd - ip); ++ DEBUGLOG(7, "distance from match beginning to end dictionary = %i", (int)(mEnd - match)); ++ DEBUGLOG(7, "distance from current pos to end buffer = %i", (int)(iEnd - ip)); + DEBUGLOG(7, "next byte : ip==%02X, istart==%02X", ip[matchLength], *iStart); + DEBUGLOG(7, "final match length = %zu", matchLength + 
ZSTD_count(ip+matchLength, iStart, iEnd)); + return matchLength + ZSTD_count(ip+matchLength, iStart, iEnd); +@@ -783,32 +888,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) 
>> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +936,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. + */ +@@ -881,11 +1015,12 @@ MEM_STATIC U64 ZSTD_rollingHash_rotate(U64 hash, BYTE toRemove, BYTE toAdd, U64 + /*-************************************* + * Round buffer management + ***************************************/ +-#if (ZSTD_WINDOWLOG_MAX_64 > 31) +-# error "ZSTD_WINDOWLOG_MAX is too large : would overflow ZSTD_CURRENT_MAX" +-#endif +-/* Max current allowed */ +-#define ZSTD_CURRENT_MAX ((3U << 29) + (1U << ZSTD_WINDOWLOG_MAX)) ++/* Max @current value allowed: ++ * In 32-bit mode: we want to avoid crossing the 2 GB limit, ++ * reducing risks of side effects in case of signed operations on indexes. ++ * In 64-bit mode: we want to ensure that adding the maximum job size (512 MB) ++ * doesn't overflow U32 index capacity (4 GB) */ ++#define ZSTD_CURRENT_MAX (MEM_64bits() ? 3500U MB : 2000U MB) + /* Maximum chunk size before overflow correction needs to be called again */ + #define ZSTD_CHUNKSIZE_MAX \ + ( ((U32)-1) /* Maximum ending current index */ \ +@@ -925,7 +1060,7 @@ MEM_STATIC U32 ZSTD_window_hasExtDict(ZSTD_window_t const window) + * Inspects the provided matchState and figures out what dictMode should be + * passed to the compressor. + */ +-MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_matchState_t *ms) ++MEM_STATIC ZSTD_dictMode_e ZSTD_matchState_dictMode(const ZSTD_MatchState_t *ms) + { + return ZSTD_window_hasExtDict(ms->window) ? + ZSTD_extDict : +@@ -1011,7 +1146,9 @@ MEM_STATIC U32 ZSTD_window_needOverflowCorrection(ZSTD_window_t const window, + * The least significant cycleLog bits of the indices must remain the same, + * which may be 0. Every index up to maxDist in the past must be valid. + */ +-MEM_STATIC U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_correctOverflow(ZSTD_window_t* window, U32 cycleLog, + U32 maxDist, void const* src) + { + /* preemptive overflow correction: +@@ -1112,7 +1249,7 @@ ZSTD_window_enforceMaxDist(ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + U32 const blockEndIdx = (U32)((BYTE const*)blockEnd - window->base); + U32 const loadedDictEnd = (loadedDictEndPtr != NULL) ? 
*loadedDictEndPtr : 0; +@@ -1157,7 +1294,7 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + const void* blockEnd, + U32 maxDist, + U32* loadedDictEndPtr, +- const ZSTD_matchState_t** dictMatchStatePtr) ++ const ZSTD_MatchState_t** dictMatchStatePtr) + { + assert(loadedDictEndPtr != NULL); + assert(dictMatchStatePtr != NULL); +@@ -1167,10 +1304,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1199,9 +1341,11 @@ MEM_STATIC void ZSTD_window_init(ZSTD_window_t* window) { + * forget about the extDict. Handles overlap of the prefix and extDict. + * Returns non-zero if the segment is contiguous. + */ +-MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, +- void const* src, size_t srcSize, +- int forceNonContiguous) ++MEM_STATIC ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_window_update(ZSTD_window_t* window, ++ const void* src, size_t srcSize, ++ int forceNonContiguous) + { + BYTE const* const ip = (BYTE const*)src; + U32 contiguous = 1; +@@ -1228,8 +1372,9 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* if input and dictionary overlap : reduce dictionary (area presumed modified by input) */ + if ( (ip+srcSize > window->dictBase + window->lowLimit) + & (ip < window->dictBase + window->dictLimit)) { +- ptrdiff_t const highInputIdx = (ip + srcSize) - window->dictBase; +- U32 const lowLimitMax = (highInputIdx > (ptrdiff_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ size_t const highInputIdx = (size_t)((ip + srcSize) - window->dictBase); ++ U32 const lowLimitMax = (highInputIdx > (size_t)window->dictLimit) ? window->dictLimit : (U32)highInputIdx; ++ assert(highInputIdx < UINT_MAX); + window->lowLimit = lowLimitMax; + DEBUGLOG(5, "Overlapping extDict and input : new lowLimit = %u", window->lowLimit); + } +@@ -1239,7 +1384,7 @@ MEM_STATIC U32 ZSTD_window_update(ZSTD_window_t* window, + /* + * Returns the lowest allowed match index. It may either be in the ext-dict or the prefix. + */ +-MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.lowLimit; +@@ -1256,7 +1401,7 @@ MEM_STATIC U32 ZSTD_getLowestMatchIndex(const ZSTD_matchState_t* ms, U32 curr, u + /* + * Returns the lowest allowed match index in the prefix. 
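
The hash helpers a little earlier in this hunk (ZSTD_hash3 .. ZSTD_hash8 and their *PtrS wrappers) now take a salt that is XORed in before the final shift. The standalone sketch below restates the 4-byte formula with the same prime, just to show the salt steering identical input to a different bucket; the sample value, salt and hashLog are arbitrary and every *_sketch name is invented.

    #include <stdio.h>

    static unsigned const prime4bytes_sketch = 2654435761U;  /* same constant as prime4bytes */

    /* ((value * prime4bytes) ^ salt) >> (32 - hashLog), as in the salted ZSTD_hash4 */
    static unsigned hash4Salted_sketch(unsigned value, unsigned hashLog, unsigned salt)
    {
        return ((value * prime4bytes_sketch) ^ salt) >> (32 - hashLog);
    }

    int main(void)
    {
        unsigned const value   = 0x64636261u;  /* the bytes "abcd" read little-endian */
        unsigned const hashLog = 17;
        printf("salt 0          -> bucket %u\n", hash4Salted_sketch(value, hashLog, 0));
        printf("salt 0xA5A5A5A5 -> bucket %u\n", hash4Salted_sketch(value, hashLog, 0xA5A5A5A5u));
        return 0;
    }
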
+ */ +-MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, unsigned windowLog) ++MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_MatchState_t* ms, U32 curr, unsigned windowLog) + { + U32 const maxDistance = 1U << windowLog; + U32 const lowestValid = ms->window.dictLimit; +@@ -1269,6 +1414,13 @@ MEM_STATIC U32 ZSTD_getLowestPrefixIndex(const ZSTD_matchState_t* ms, U32 curr, + return matchLowest; + } + ++/* index_safety_check: ++ * intentional underflow : ensure repIndex isn't overlapping dict + prefix ++ * @return 1 if values are not overlapping, ++ * 0 otherwise */ ++MEM_STATIC int ZSTD_index_overlap_check(const U32 prefixLowestIndex, const U32 repIndex) { ++ return ((U32)((prefixLowestIndex-1) - repIndex) >= 3); ++} + + + /* debug functions */ +@@ -1302,7 +1454,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} + ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. 
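
The Short Cache comment above explains how an 8-bit tag is packed next to the table index so a matchfinder probe can skip dereferencing base + index whenever the tags disagree. The standalone sketch below restates that packing, mirroring ZSTD_writeTaggedIndex and the tag comparison; the *_sketch names and the example index/tag values are illustrative only, not taken from the patch.

    #include <assert.h>
    #include <stdio.h>

    #define TAG_BITS_SKETCH 8                             /* mirrors ZSTD_SHORT_CACHE_TAG_BITS */
    #define TAG_MASK_SKETCH ((1u << TAG_BITS_SKETCH) - 1)

    /* pack (index, tag) into one 32-bit table entry, as ZSTD_writeTaggedIndex does */
    static unsigned packTaggedIndex_sketch(unsigned index, unsigned tag)
    {
        assert((index >> (32 - TAG_BITS_SKETCH)) == 0);   /* index must fit in 24 bits */
        return (index << TAG_BITS_SKETCH) | (tag & TAG_MASK_SKETCH);
    }

    int main(void)
    {
        unsigned const entry = packTaggedIndex_sketch(0x123456, 0xAB);
        unsigned const index = entry >> TAG_BITS_SKETCH;
        unsigned const tag   = entry &  TAG_MASK_SKETCH;
        /* the expensive load at base+index only happens when the 8-bit tags agree */
        printf("index=%#x tag=%#x tagsMatch=%d\n", index, tag, tag == 0xABu);
        return 0;
    }
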
*/ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + /* =============================================================== + * Shared internal declarations +@@ -1319,6 +1506,25 @@ size_t ZSTD_loadCEntropy(ZSTD_compressedBlockState_t* bs, void* workspace, + + void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_SequencePosition; ++ ++/* for benchmark */ ++size_t ZSTD_convertBlockSequences(ZSTD_CCtx* cctx, ++ const ZSTD_Sequence* const inSeqs, size_t nbSequences, ++ int const repcodeResolution); ++ ++typedef struct { ++ size_t nbSequences; ++ size_t blockSize; ++ size_t litSize; ++} BlockSummary; ++ ++BlockSummary ZSTD_get1BlockSummary(const ZSTD_Sequence* seqs, size_t nbSeqs); ++ + /* ============================================================== + * Private declarations + * These prototypes shall only be called from within lib/compress +@@ -1330,7 +1536,7 @@ void ZSTD_reset_compressedBlockState(ZSTD_compressedBlockState_t* bs); + * Note: srcSizeHint == 0 means 0! + */ + ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( +- const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); ++ const ZSTD_CCtx_params* CCtxParams, U64 srcSizeHint, size_t dictSize, ZSTD_CParamMode_e mode); + + /*! ZSTD_initCStream_internal() : + * Private use only. Init streaming operation. +@@ -1342,7 +1548,7 @@ size_t ZSTD_initCStream_internal(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + const ZSTD_CCtx_params* params, unsigned long long pledgedSrcSize); + +-void ZSTD_resetSeqStore(seqStore_t* ssPtr); ++void ZSTD_resetSeqStore(SeqStore_t* ssPtr); + + /*! ZSTD_getCParamsFromCDict() : + * as the name implies */ +@@ -1381,11 +1587,10 @@ size_t ZSTD_writeLastEmptyBlock(void* dst, size_t dstCapacity); + * This cannot be used when long range matching is enabled. + * Zstd will use these sequences, and pass the literals to a secondary block + * compressor. +- * @return : An error code on failure. + * NOTE: seqs are not verified! Invalid sequences can cause out-of-bounds memory + * access and data corruption. + */ +-size_t ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); ++void ZSTD_referenceExternalSequences(ZSTD_CCtx* cctx, rawSeq* seq, size_t nbSeq); + + /* ZSTD_cycleLog() : + * condition for correct operation : hashLog > 1 */ +@@ -1396,4 +1601,28 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 1 if an external sequence producer is registered, otherwise returns 0. */ ++MEM_STATIC int ZSTD_hasExtSeqProd(const ZSTD_CCtx_params* params) { ++ return params->extSeqProdFunc != NULL; ++} ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. 
++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..ec39b4299b6f 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. 
++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; +- symbolEncodingType_e hType = set_compressed; ++ SymbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? 
+- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. 
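
ZSTD_minLiteralsToCompress() above turns the strategy into a byte threshold: shift = MIN(9 - strategy, 3), so the cut-off is 8 << shift (64 bytes for the fast strategies down to 8 for btultra2), or a flat 6 when a previous Huffman table is reusable. The small program below only re-derives those numbers; it assumes the usual ZSTD_strategy numbering (ZSTD_fast = 1 .. ZSTD_btultra2 = 9) and uses invented *_sketch names.

    #include <stdio.h>

    /* same arithmetic as ZSTD_minLiteralsToCompress(): 8 << MIN(9 - strategy, 3), or 6 on table reuse */
    static size_t minLitThreshold_sketch(int strategy, int tableIsReusable)
    {
        int const shift = (9 - strategy) < 3 ? (9 - strategy) : 3;
        return tableIsReusable ? 6 : (size_t)8 << shift;
    }

    int main(void)
    {
        int s;
        for (s = 1; s <= 9; s++)   /* prints 64 for strategies 1..6, then 32, 16, 8 */
            printf("strategy %d -> %zu bytes\n", s, minLitThreshold_sketch(s, 0));
        printf("with reusable table -> %zu bytes\n", minLitThreshold_sketch(9, 1));
        return 0;
    }
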
*/ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..256980c9d85a 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -153,20 +154,20 @@ size_t ZSTD_crossEntropyCost(short const* norm, unsigned accuracyLog, + return cost >> 8; + } + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy) + { + ZSTD_STATIC_ASSERT(ZSTD_defaultDisallowed == 0 && ZSTD_defaultAllowed != 0); + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +@@ -241,7 +242,7 @@ typedef struct { + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -293,7 +294,7 @@ ZSTD_encodeSequences_body( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + BIT_CStream_t blockStream; + FSE_CState_t stateMatchLength; +@@ -387,7 +388,7 @@ ZSTD_encodeSequences_default( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -405,7 +406,7 @@ ZSTD_encodeSequences_bmi2( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets) + { + return ZSTD_encodeSequences_body(dst, dstCapacity, + CTable_MatchLength, mlCodeTable, +@@ -421,7 +422,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2) + { + DEBUGLOG(5, "ZSTD_encodeSequences: dstCapacity = %u", (unsigned)dstCapacity); + #if DYNAMIC_BMI2 +diff --git 
a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..14fdccb6547f 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,26 +12,27 @@ + #ifndef ZSTD_COMPRESS_SEQUENCES_H + #define ZSTD_COMPRESS_SEQUENCES_H + ++#include "zstd_compress_internal.h" /* SeqDef */ + #include "../common/fse.h" /* FSE_repeat, FSE_CTable */ +-#include "../common/zstd_internal.h" /* symbolEncodingType_e, ZSTD_strategy */ ++#include "../common/zstd_internal.h" /* SymbolEncodingType_e, ZSTD_strategy */ + + typedef enum { + ZSTD_defaultDisallowed = 0, + ZSTD_defaultAllowed = 1 +-} ZSTD_defaultPolicy_e; ++} ZSTD_DefaultPolicy_e; + +-symbolEncodingType_e ++SymbolEncodingType_e + ZSTD_selectEncodingType( + FSE_repeat* repeatMode, unsigned const* count, unsigned const max, + size_t const mostFrequent, size_t nbSeq, unsigned const FSELog, + FSE_CTable const* prevCTable, + short const* defaultNorm, U32 defaultNormLog, +- ZSTD_defaultPolicy_e const isDefaultAllowed, ++ ZSTD_DefaultPolicy_e const isDefaultAllowed, + ZSTD_strategy const strategy); + + size_t + ZSTD_buildCTable(void* dst, size_t dstCapacity, +- FSE_CTable* nextCTable, U32 FSELog, symbolEncodingType_e type, ++ FSE_CTable* nextCTable, U32 FSELog, SymbolEncodingType_e type, + unsigned* count, U32 max, + const BYTE* codeTable, size_t nbSeq, + const S16* defaultNorm, U32 defaultNormLog, U32 defaultMax, +@@ -42,7 +44,7 @@ size_t ZSTD_encodeSequences( + FSE_CTable const* CTable_MatchLength, BYTE const* mlCodeTable, + FSE_CTable const* CTable_OffsetBits, BYTE const* ofCodeTable, + FSE_CTable const* CTable_LitLength, BYTE const* llCodeTable, +- seqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); ++ SeqDef const* sequences, size_t nbSeq, int longOffsets, int bmi2); + + size_t ZSTD_fseBitCost( + FSE_CTable const* ctable, +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dc12d64e935c 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. 
+ * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -50,11 +52,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart + lhSize; + U32 const singleStream = lhSize == 3; +- symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; ++ SymbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, (size_t)(oend-op), literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -103,7 +103,7 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)litSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } +@@ -123,26 +123,30 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + } + *entropyWritten = 1; + DEBUGLOG(5, "Compressed literals: %u -> %u", (U32)litSize, (U32)(op-ostart)); +- return op-ostart; ++ return (size_t)(op-ostart); + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { +- const seqDef* const sstart = sequences; +- const seqDef* const send = sequences + nbSeq; +- const seqDef* sp = sstart; ++static size_t ++ZSTD_seqDecompressedSize(SeqStore_t const* seqStore, ++ const SeqDef* sequences, size_t nbSeqs, ++ size_t litSize, int lastSubBlock) ++{ + size_t matchLengthSum = 0; + size_t litLengthSum = 0; +- (void)(litLengthSum); /* suppress unused variable warning on some environments */ +- while (send-sp > 0) { +- ZSTD_sequenceLength const seqLen = ZSTD_getSequenceLength(seqStore, sp); ++ size_t n; ++ for (n=0; ncParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -176,14 +181,14 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + /* Sequences Header */ + RETURN_ERROR_IF((oend-op) < 3 /*max nbSeq Size*/ + 1 
/*seqHead*/, + dstSize_tooSmall, ""); +- if (nbSeq < 0x7F) ++ if (nbSeq < 128) + *op++ = (BYTE)nbSeq; + else if (nbSeq < LONGNBSEQ) + op[0] = (BYTE)((nbSeq>>8) + 0x80), op[1] = (BYTE)nbSeq, op+=2; + else + op[0]=0xFF, MEM_writeLE16(op+1, (U16)(nbSeq - LONGNBSEQ)), op+=3; + if (nbSeq==0) { +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* seqHead : flags for FSE encoding type */ +@@ -205,7 +210,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +- op, oend - op, ++ op, (size_t)(oend - op), + fseTables->matchlengthCTable, mlCode, + fseTables->offcodeCTable, ofCode, + fseTables->litlengthCTable, llCode, +@@ -249,7 +254,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + #endif + + *entropyWritten = 1; +- return op - ostart; ++ return (size_t)(op - ostart); + } + + /* ZSTD_compressSubBlock() : +@@ -258,7 +263,7 @@ static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables + * Or 0 if it failed to compress. */ + static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- const seqDef* sequences, size_t nbSeq, ++ const SeqDef* sequences, size_t nbSeq, + const BYTE* literals, size_t litSize, + const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, + const ZSTD_CCtx_params* cctxParams, +@@ -275,7 +280,8 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + litSize, nbSeq, writeLitEntropy, writeSeqEntropy, lastBlock); + { size_t cLitSize = ZSTD_compressSubBlock_literal((const HUF_CElt*)entropy->huf.CTable, + &entropyMetadata->hufMetadata, literals, litSize, +- op, oend-op, bmi2, writeLitEntropy, litEntropyWritten); ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, litEntropyWritten); + FORWARD_IF_ERROR(cLitSize, "ZSTD_compressSubBlock_literal failed"); + if (cLitSize == 0) return 0; + op += cLitSize; +@@ -285,18 +291,18 @@ static size_t ZSTD_compressSubBlock(const ZSTD_entropyCTables_t* entropy, + sequences, nbSeq, + llCode, mlCode, ofCode, + cctxParams, +- op, oend-op, ++ op, (size_t)(oend-op), + bmi2, writeSeqEntropy, seqEntropyWritten); + FORWARD_IF_ERROR(cSeqSize, "ZSTD_compressSubBlock_sequences failed"); + if (cSeqSize == 0) return 0; + op += cSeqSize; + } + /* Write block header */ +- { size_t cSize = (op-ostart)-ZSTD_blockHeaderSize; ++ { size_t cSize = (size_t)(op-ostart) - ZSTD_blockHeaderSize; + U32 const cBlockHeader24 = lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); + MEM_writeLE24(ostart, cBlockHeader24); + } +- return op-ostart; ++ return (size_t)(op-ostart); + } + + static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t litSize, +@@ -322,7 +328,7 @@ static size_t ZSTD_estimateSubBlockSize_literal(const BYTE* literals, size_t lit + return 0; + } + +-static size_t ZSTD_estimateSubBlockSize_symbolType(symbolEncodingType_e type, ++static size_t ZSTD_estimateSubBlockSize_symbolType(SymbolEncodingType_e type, + const BYTE* codeTable, unsigned maxCode, + size_t nbSeq, const FSE_CTable* fseCTable, + const U8* additionalBits, +@@ -385,7 +391,11 @@ static size_t ZSTD_estimateSubBlockSize_sequences(const BYTE* ofCodeTable, + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + +-static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, ++typedef struct { ++ size_t estLitSize; ++ size_t estBlockSize; ++} EstimatedBlockSize; ++static EstimatedBlockSize 
ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const BYTE* ofCodeTable, + const BYTE* llCodeTable, + const BYTE* mlCodeTable, +@@ -393,15 +403,17 @@ static size_t ZSTD_estimateSubBlockSize(const BYTE* literals, size_t litSize, + const ZSTD_entropyCTables_t* entropy, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, + void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { +- size_t cSizeEstimate = 0; +- cSizeEstimate += ZSTD_estimateSubBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); +- cSizeEstimate += ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, ++ int writeLitEntropy, int writeSeqEntropy) ++{ ++ EstimatedBlockSize ebs; ++ ebs.estLitSize = ZSTD_estimateSubBlockSize_literal(literals, litSize, ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); ++ ebs.estBlockSize = ZSTD_estimateSubBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, + nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, + workspace, wkspSize, writeSeqEntropy); +- return cSizeEstimate + ZSTD_blockHeaderSize; ++ ebs.estBlockSize += ebs.estLitSize + ZSTD_blockHeaderSize; ++ return ebs; + } + + static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMetadata) +@@ -415,14 +427,57 @@ static int ZSTD_needSequenceEntropyTables(ZSTD_fseCTablesMetadata_t const* fseMe + return 0; + } + ++static size_t countLiterals(SeqStore_t const* seqStore, const SeqDef* sp, size_t seqCount) ++{ ++ size_t n, total = 0; ++ assert(sp != NULL); ++ for (n=0; n %zu bytes", seqCount, (const void*)sp, total); ++ return total; ++} ++ ++#define BYTESCALE 256 ++ ++static size_t sizeBlockSequences(const SeqDef* sp, size_t nbSeqs, ++ size_t targetBudget, size_t avgLitCost, size_t avgSeqCost, ++ int firstSubBlock) ++{ ++ size_t n, budget = 0, inSize=0; ++ /* entropy headers */ ++ size_t const headerSize = (size_t)firstSubBlock * 120 * BYTESCALE; /* generous estimate */ ++ assert(firstSubBlock==0 || firstSubBlock==1); ++ budget += headerSize; ++ ++ /* first sequence => at least one sequence*/ ++ budget += sp[0].litLength * avgLitCost + avgSeqCost; ++ if (budget > targetBudget) return 1; ++ inSize = sp[0].litLength + (sp[0].mlBase+MINMATCH); ++ ++ /* loop over sequences */ ++ for (n=1; n targetBudget) ++ /* though continue to expand until the sub-block is deemed compressible */ ++ && (budget < inSize * BYTESCALE) ) ++ break; ++ } ++ ++ return n; ++} ++ + /* ZSTD_compressSubBlock_multi() : + * Breaks super-block into multiple sub-blocks and compresses them. +- * Entropy will be written to the first block. +- * The following blocks will use repeat mode to compress. +- * All sub-blocks are compressed blocks (no raw or rle blocks). +- * @return : compressed size of the super block (which is multiple ZSTD blocks) +- * Or 0 if it failed to compress. */ +-static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, ++ * Entropy will be written into the first block. ++ * The following blocks use repeat_mode to compress. ++ * Sub-blocks are all compressed, except the last one when beneficial. ++ * @return : compressed size of the super block (which features multiple ZSTD blocks) ++ * or 0 if it failed to compress. 
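
sizeBlockSequences() above accounts costs in fixed-point units of 1/256 byte (BYTESCALE): each sequence adds litLength * avgLitCost + avgSeqCost to the running budget, which is then compared against the scaled target. The snippet below just walks through that arithmetic once, with made-up costs of 0.5 byte per literal and 3 bytes per sequence; none of the numbers or *_sketch names come from the patch.

    #include <stdio.h>

    #define BYTESCALE_SKETCH 256  /* same fixed-point scale as BYTESCALE above */

    int main(void)
    {
        unsigned const avgLitCost = 128;  /* 0.5 byte per literal, in 1/256-byte units */
        unsigned const avgSeqCost = 768;  /* 3 bytes per sequence */
        unsigned const litLength  = 20;

        unsigned const scaled = litLength * avgLitCost + avgSeqCost;  /* 3328 scaled units */
        printf("one sequence costs ~%u bytes (%u/256)\n", scaled / BYTESCALE_SKETCH, scaled);
        return 0;
    }
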
*/ ++static size_t ZSTD_compressSubBlock_multi(const SeqStore_t* seqStorePtr, + const ZSTD_compressedBlockState_t* prevCBlock, + ZSTD_compressedBlockState_t* nextCBlock, + const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +@@ -432,12 +487,14 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const int bmi2, U32 lastBlock, + void* workspace, size_t wkspSize) + { +- const seqDef* const sstart = seqStorePtr->sequencesStart; +- const seqDef* const send = seqStorePtr->sequences; +- const seqDef* sp = sstart; ++ const SeqDef* const sstart = seqStorePtr->sequencesStart; ++ const SeqDef* const send = seqStorePtr->sequences; ++ const SeqDef* sp = sstart; /* tracks progresses within seqStorePtr->sequences */ ++ size_t const nbSeqs = (size_t)(send - sstart); + const BYTE* const lstart = seqStorePtr->litStart; + const BYTE* const lend = seqStorePtr->lit; + const BYTE* lp = lstart; ++ size_t const nbLiterals = (size_t)(lend - lstart); + BYTE const* ip = (BYTE const*)src; + BYTE const* const iend = ip + srcSize; + BYTE* const ostart = (BYTE*)dst; +@@ -446,112 +503,171 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + const BYTE* llCodePtr = seqStorePtr->llCode; + const BYTE* mlCodePtr = seqStorePtr->mlCode; + const BYTE* ofCodePtr = seqStorePtr->ofCode; +- size_t targetCBlockSize = cctxParams->targetCBlockSize; +- size_t litSize, seqCount; +- int writeLitEntropy = entropyMetadata->hufMetadata.hType == set_compressed; ++ size_t const minTarget = ZSTD_TARGETCBLOCKSIZE_MIN; /* enforce minimum size, to reduce undesirable side effects */ ++ size_t const targetCBlockSize = MAX(minTarget, cctxParams->targetCBlockSize); ++ int writeLitEntropy = (entropyMetadata->hufMetadata.hType == set_compressed); + int writeSeqEntropy = 1; +- int lastSequence = 0; +- +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi (litSize=%u, nbSeq=%u)", +- (unsigned)(lend-lp), (unsigned)(send-sstart)); +- +- litSize = 0; +- seqCount = 0; +- do { +- size_t cBlockSizeEstimate = 0; +- if (sstart == send) { +- lastSequence = 1; +- } else { +- const seqDef* const sequence = sp + seqCount; +- lastSequence = sequence == send - 1; +- litSize += ZSTD_getSequenceLength(seqStorePtr, sequence).litLength; +- seqCount++; +- } +- if (lastSequence) { +- assert(lp <= lend); +- assert(litSize <= (size_t)(lend - lp)); +- litSize = (size_t)(lend - lp); ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi (srcSize=%u, litSize=%u, nbSeq=%u)", ++ (unsigned)srcSize, (unsigned)(lend-lstart), (unsigned)(send-sstart)); ++ ++ /* let's start by a general estimation for the full block */ ++ if (nbSeqs > 0) { ++ EstimatedBlockSize const ebs = ++ ZSTD_estimateSubBlockSize(lp, nbLiterals, ++ ofCodePtr, llCodePtr, mlCodePtr, nbSeqs, ++ &nextCBlock->entropy, entropyMetadata, ++ workspace, wkspSize, ++ writeLitEntropy, writeSeqEntropy); ++ /* quick estimation */ ++ size_t const avgLitCost = nbLiterals ? 
(ebs.estLitSize * BYTESCALE) / nbLiterals : BYTESCALE; ++ size_t const avgSeqCost = ((ebs.estBlockSize - ebs.estLitSize) * BYTESCALE) / nbSeqs; ++ const size_t nbSubBlocks = MAX((ebs.estBlockSize + (targetCBlockSize/2)) / targetCBlockSize, 1); ++ size_t n, avgBlockBudget, blockBudgetSupp=0; ++ avgBlockBudget = (ebs.estBlockSize * BYTESCALE) / nbSubBlocks; ++ DEBUGLOG(5, "estimated fullblock size=%u bytes ; avgLitCost=%.2f ; avgSeqCost=%.2f ; targetCBlockSize=%u, nbSubBlocks=%u ; avgBlockBudget=%.0f bytes", ++ (unsigned)ebs.estBlockSize, (double)avgLitCost/BYTESCALE, (double)avgSeqCost/BYTESCALE, ++ (unsigned)targetCBlockSize, (unsigned)nbSubBlocks, (double)avgBlockBudget/BYTESCALE); ++ /* simplification: if estimates states that the full superblock doesn't compress, just bail out immediately ++ * this will result in the production of a single uncompressed block covering @srcSize.*/ ++ if (ebs.estBlockSize > srcSize) return 0; ++ ++ /* compress and write sub-blocks */ ++ assert(nbSubBlocks>0); ++ for (n=0; n < nbSubBlocks-1; n++) { ++ /* determine nb of sequences for current sub-block + nbLiterals from next sequence */ ++ size_t const seqCount = sizeBlockSequences(sp, (size_t)(send-sp), ++ avgBlockBudget + blockBudgetSupp, avgLitCost, avgSeqCost, n==0); ++ /* if reached last sequence : break to last sub-block (simplification) */ ++ assert(seqCount <= (size_t)(send-sp)); ++ if (sp + seqCount == send) break; ++ assert(seqCount > 0); ++ /* compress sub-block */ ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = countLiterals(seqStorePtr, sp, seqCount); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 0); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ 0); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* check compressibility, update state components */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Committed sub-block compressing %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; ++ } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; ++ blockBudgetSupp = 0; ++ } } ++ /* otherwise : do not compress yet, coalesce current sub-block with following one */ + } +- /* I think there is an optimization opportunity here. +- * Calling ZSTD_estimateSubBlockSize for every sequence can be wasteful +- * since it recalculates estimate from scratch. +- * For example, it would recount literal distribution and symbol codes every time. 
+- */ +- cBlockSizeEstimate = ZSTD_estimateSubBlockSize(lp, litSize, ofCodePtr, llCodePtr, mlCodePtr, seqCount, +- &nextCBlock->entropy, entropyMetadata, +- workspace, wkspSize, writeLitEntropy, writeSeqEntropy); +- if (cBlockSizeEstimate > targetCBlockSize || lastSequence) { +- int litEntropyWritten = 0; +- int seqEntropyWritten = 0; +- const size_t decompressedSize = ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, lastSequence); +- const size_t cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, +- sp, seqCount, +- lp, litSize, +- llCodePtr, mlCodePtr, ofCodePtr, +- cctxParams, +- op, oend-op, +- bmi2, writeLitEntropy, writeSeqEntropy, +- &litEntropyWritten, &seqEntropyWritten, +- lastBlock && lastSequence); +- FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); +- if (cSize > 0 && cSize < decompressedSize) { +- DEBUGLOG(5, "Committed the sub-block"); +- assert(ip + decompressedSize <= iend); +- ip += decompressedSize; +- sp += seqCount; +- lp += litSize; +- op += cSize; +- llCodePtr += seqCount; +- mlCodePtr += seqCount; +- ofCodePtr += seqCount; +- litSize = 0; +- seqCount = 0; +- /* Entropy only needs to be written once */ +- if (litEntropyWritten) { +- writeLitEntropy = 0; +- } +- if (seqEntropyWritten) { +- writeSeqEntropy = 0; +- } ++ } /* if (nbSeqs > 0) */ ++ ++ /* write last block */ ++ DEBUGLOG(5, "Generate last sub-block: %u sequences remaining", (unsigned)(send - sp)); ++ { int litEntropyWritten = 0; ++ int seqEntropyWritten = 0; ++ size_t litSize = (size_t)(lend - lp); ++ size_t seqCount = (size_t)(send - sp); ++ const size_t decompressedSize = ++ ZSTD_seqDecompressedSize(seqStorePtr, sp, seqCount, litSize, 1); ++ size_t const cSize = ZSTD_compressSubBlock(&nextCBlock->entropy, entropyMetadata, ++ sp, seqCount, ++ lp, litSize, ++ llCodePtr, mlCodePtr, ofCodePtr, ++ cctxParams, ++ op, (size_t)(oend-op), ++ bmi2, writeLitEntropy, writeSeqEntropy, ++ &litEntropyWritten, &seqEntropyWritten, ++ lastBlock); ++ FORWARD_IF_ERROR(cSize, "ZSTD_compressSubBlock failed"); ++ ++ /* update pointers, the nb of literals borrowed from next sequence must be preserved */ ++ if (cSize > 0 && cSize < decompressedSize) { ++ DEBUGLOG(5, "Last sub-block compressed %u bytes => %u bytes", ++ (unsigned)decompressedSize, (unsigned)cSize); ++ assert(ip + decompressedSize <= iend); ++ ip += decompressedSize; ++ lp += litSize; ++ op += cSize; ++ llCodePtr += seqCount; ++ mlCodePtr += seqCount; ++ ofCodePtr += seqCount; ++ /* Entropy only needs to be written once */ ++ if (litEntropyWritten) { ++ writeLitEntropy = 0; + } ++ if (seqEntropyWritten) { ++ writeSeqEntropy = 0; ++ } ++ sp += seqCount; + } +- } while (!lastSequence); ++ } ++ ++ + if (writeLitEntropy) { +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has literal entropy tables unwritten"); ++ DEBUGLOG(5, "Literal entropy tables were never written"); + ZSTD_memcpy(&nextCBlock->entropy.huf, &prevCBlock->entropy.huf, sizeof(prevCBlock->entropy.huf)); + } + if (writeSeqEntropy && ZSTD_needSequenceEntropyTables(&entropyMetadata->fseMetadata)) { + /* If we haven't written our entropy tables, then we've violated our contract and + * must emit an uncompressed block. 
+ */ +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi has sequence entropy tables unwritten"); ++ DEBUGLOG(5, "Sequence entropy tables were never written => cancel, emit an uncompressed block"); + return 0; + } ++ + if (ip < iend) { +- size_t const cSize = ZSTD_noCompressBlock(op, oend - op, ip, iend - ip, lastBlock); +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi last sub-block uncompressed, %zu bytes", (size_t)(iend - ip)); ++ /* some data left : last part of the block sent uncompressed */ ++ size_t const rSize = (size_t)((iend - ip)); ++ size_t const cSize = ZSTD_noCompressBlock(op, (size_t)(oend - op), ip, rSize, lastBlock); ++ DEBUGLOG(5, "Generate last uncompressed sub-block of %u bytes", (unsigned)(rSize)); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + assert(cSize != 0); + op += cSize; + /* We have to regenerate the repcodes because we've skipped some sequences */ + if (sp < send) { +- seqDef const* seq; +- repcodes_t rep; ++ const SeqDef* seq; ++ Repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } + } +- DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed"); +- return op-ostart; ++ ++ DEBUGLOG(5, "ZSTD_compressSubBlock_multi compressed all subBlocks: total compressed size = %u", ++ (unsigned)(op-ostart)); ++ return (size_t)(op-ostart); + } + + size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, +- void const* src, size_t srcSize, +- unsigned lastBlock) { ++ const void* src, size_t srcSize, ++ unsigned lastBlock) ++{ + ZSTD_entropyCTablesMetadata_t entropyMetadata; + + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(&zc->seqStore, +@@ -559,7 +675,7 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + &entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */), ""); + + return ZSTD_compressSubBlock_multi(&zc->seqStore, + zc->blockState.prevCBlock, +@@ -569,5 +685,5 @@ size_t ZSTD_compressSuperBlock(ZSTD_CCtx* zc, + dst, dstCapacity, + src, srcSize, + zc->bmi2, lastBlock, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */); ++ zc->tmpWorkspace, zc->tmpWkspSize /* statically allocated in resetCCtx */); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..dce42f653bae 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,8 +15,10 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" +- ++#include "../common/portability_macros.h" ++#include "../common/compiler.h" /* ZS2_isPower2 */ + + /*-************************************* + * Constants +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. 
+ */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,14 +184,16 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* + * Align must be a power of 2. + */ +-MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { ++MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t align) { + size_t const mask = align - 1; +- assert((align & mask) == 0); ++ assert(ZSTD_isPower2(align)); + return (size + mask) & ~mask; + } + +@@ -189,7 +207,7 @@ MEM_STATIC size_t ZSTD_cwksp_align(size_t size, size_t const align) { + * to figure out how much space you need for the matchState tables. Everything + * else is though. + * +- * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned_alloc_size(). ++ * Do not use for sizing aligned buffers. Instead, use ZSTD_cwksp_aligned64_alloc_size(). + */ + MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + if (size == 0) +@@ -197,12 +215,16 @@ MEM_STATIC size_t ZSTD_cwksp_alloc_size(size_t size) { + return size; + } + ++MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size, size_t alignment) { ++ return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, alignment)); ++} ++ + /* + * Returns an adjusted alloc size that is the nearest larger multiple of 64 bytes. + * Used to determine the number of bytes required for a given "aligned". + */ +-MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { +- return ZSTD_cwksp_alloc_size(ZSTD_cwksp_align(size, ZSTD_CWKSP_ALIGNMENT_BYTES)); ++MEM_STATIC size_t ZSTD_cwksp_aligned64_alloc_size(size_t size) { ++ return ZSTD_cwksp_aligned_alloc_size(size, ZSTD_CWKSP_ALIGNMENT_BYTES); + } + + /* +@@ -210,14 +232,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -229,11 +247,23 @@ MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { + MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignBytes) { + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; +- assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(ZSTD_isPower2(alignBytes)); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) ++{ ++ char* endPtr = (char*)ws->workspaceEnd; ++ assert(ZSTD_isPower2(ZSTD_CWKSP_ALIGNMENT_BYTES)); ++ endPtr = endPtr - ((size_t)endPtr % ZSTD_CWKSP_ALIGNMENT_BYTES); ++ return (void*)endPtr; ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -246,7 +276,7 @@ ZSTD_cwksp_reserve_internal_buffer_space(ZSTD_cwksp* ws, size_t const bytes) + { + void* const alloc = (BYTE*)ws->allocStart - bytes; + void* const bottom = ws->tableEnd; +- DEBUGLOG(5, "cwksp: reserving %p %zd bytes, %zd bytes remaining", ++ DEBUGLOG(5, "cwksp: reserving [0x%p]:%zd bytes; %zd bytes remaining", + alloc, bytes, ZSTD_cwksp_available_space(ws) - bytes); + ZSTD_cwksp_assert_internal_consistency(ws); + assert(alloc >= bottom); +@@ -274,27 +304,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +321,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +335,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -345,29 +366,61 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). + */ +-MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) + { +- void* ptr = ZSTD_cwksp_reserve_internal(ws, ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), +- ZSTD_cwksp_alloc_aligned); +- assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned64(ZSTD_cwksp* ws, size_t bytes) ++{ ++ void* const ptr = ZSTD_cwksp_reserve_internal(ws, ++ ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES), ++ ZSTD_cwksp_alloc_aligned); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return ptr; + } + + /* + * Aligned on 64 bytes. 
These buffers have the special property that +- * their values remain constrained, allowing us to re-use them without ++ * their values remain constrained, allowing us to reuse them without + * memset()-ing them. + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -387,7 +440,7 @@ MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + + + assert((bytes & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); +- assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ assert(((size_t)alloc & (ZSTD_CWKSP_ALIGNMENT_BYTES-1)) == 0); + return alloc; + } + +@@ -421,6 +474,20 @@ MEM_STATIC void* ZSTD_cwksp_reserve_object(ZSTD_cwksp* ws, size_t bytes) + + return alloc; + } ++/* ++ * with alignment control ++ * Note : should happen only once, at workspace first initialization ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_object_aligned(ZSTD_cwksp* ws, size_t byteSize, size_t alignment) ++{ ++ size_t const mask = alignment - 1; ++ size_t const surplus = (alignment > sizeof(void*)) ? alignment - sizeof(void*) : 0; ++ void* const start = ZSTD_cwksp_reserve_object(ws, byteSize + surplus); ++ if (start == NULL) return NULL; ++ if (surplus == 0) return start; ++ assert(ZSTD_isPower2(alignment)); ++ return (void*)(((size_t)start + surplus) & ~mask); ++} + + MEM_STATIC void ZSTD_cwksp_mark_tables_dirty(ZSTD_cwksp* ws) + { +@@ -451,7 +518,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -460,7 +527,8 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + * Invalidates table allocations. + * All other allocations remain valid. 
+ */ +-MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) { ++MEM_STATIC void ZSTD_cwksp_clear_tables(ZSTD_cwksp* ws) ++{ + DEBUGLOG(4, "cwksp: clearing tables!"); + + +@@ -478,14 +546,23 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } + ++MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); ++} ++ ++MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { ++ return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) ++ + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); ++} ++ + /* + * The provided workspace takes ownership of the buffer [start, start+size). + * Any existing values in the workspace are ignored (the previously managed +@@ -498,6 +575,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -529,15 +607,6 @@ MEM_STATIC void ZSTD_cwksp_move(ZSTD_cwksp* dst, ZSTD_cwksp* src) { + ZSTD_memset(src, 0, sizeof(ZSTD_cwksp)); + } + +-MEM_STATIC size_t ZSTD_cwksp_sizeof(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->workspace); +-} +- +-MEM_STATIC size_t ZSTD_cwksp_used(const ZSTD_cwksp* ws) { +- return (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->workspace) +- + (size_t)((BYTE*)ws->workspaceEnd - (BYTE*)ws->allocStart); +-} +- + MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + return ws->allocFailed; + } +@@ -550,17 +619,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +@@ -591,5 +654,4 @@ MEM_STATIC void ZSTD_cwksp_bump_oversized_duration( + } + } + +- + #endif /* ZSTD_CWKSP_H */ +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..995e83f3a183 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,49 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCDict(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillDoubleHashTableForCCtx(ZSTD_MatchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,13 +85,26 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls /* template */) + { + ZSTD_compressionParameters const* cParams = &ms->cParams; +@@ -67,7 +122,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -88,9 +143,14 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* matchl0; /* the long match for ip */ + const BYTE* matchs0; /* the short match for ip */ + const BYTE* matchl1; /* the long match for ip1 */ ++ const BYTE* matchs0_safe; /* matchs0 or safe address */ + + const BYTE* ip = istart; /* the current position */ + const BYTE* ip1; /* the next position */ ++ /* Array of ~random data, should have low probability of matching data ++ * we load from here instead of from tables, if matchl0/matchl1 are ++ * invalid indices. Used to avoid unpredictable branches. 
*/ ++ const BYTE dummy[] = {0x12,0x34,0x56,0x78,0x9a,0xbc,0xde,0xf0,0xe2,0xb4}; + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_noDict_generic"); + +@@ -100,8 +160,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,30 +191,35 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + + hl1 = ZSTD_hashPtr(ip1, hBitsL, 8); + +- if (idxl0 > prefixLowestIndex) { ++ /* idxl0 > prefixLowestIndex is a (somewhat) unpredictable branch. ++ * However expression below complies into conditional move. Since ++ * match is unlikely and we only *branch* on idxl0 > prefixLowestIndex ++ * if there is a match, all branches become predictable. */ ++ { const BYTE* const matchl0_safe = ZSTD_selectAddr(idxl0, prefixLowestIndex, matchl0, &dummy[0]); ++ + /* check prefix long match */ +- if (MEM_read64(matchl0) == MEM_read64(ip)) { ++ if (MEM_read64(matchl0_safe) == MEM_read64(ip) && matchl0_safe == matchl0) { + mLength = ZSTD_count(ip+8, matchl0+8, iend) + 8; + offset = (U32)(ip-matchl0); + while (((ip>anchor) & (matchl0>prefixLowest)) && (ip[-1] == matchl0[-1])) { ip--; matchl0--; mLength++; } /* catch up */ + goto _match_found; +- } +- } ++ } } + + idxl1 = hashLong[hl1]; + matchl1 = base + idxl1; + +- if (idxs0 > prefixLowestIndex) { +- /* check prefix short match */ +- if (MEM_read32(matchs0) == MEM_read32(ip)) { +- goto _search_next_long; +- } ++ /* Same optimization as matchl0 above */ ++ matchs0_safe = ZSTD_selectAddr(idxs0, prefixLowestIndex, matchs0, &dummy[0]); ++ ++ /* check prefix short match */ ++ if(MEM_read32(matchs0_safe) == MEM_read32(ip) && matchs0_safe == matchs0) { ++ goto _search_next_long; + } + + if (ip1 >= nextStep) { +@@ -175,30 +240,36 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + + _search_next_long: + +- /* check prefix long +1 match */ +- if (idxl1 > prefixLowestIndex) { +- if (MEM_read64(matchl1) == MEM_read64(ip1)) { ++ /* short match found: let's check for a longer one */ ++ mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; ++ offset = (U32)(ip - matchs0); ++ ++ /* check long match at +1 position */ ++ if ((idxl1 > prefixLowestIndex) && (MEM_read64(matchl1) == MEM_read64(ip1))) { ++ size_t const l1len = ZSTD_count(ip1+8, matchl1+8, iend) + 8; ++ if (l1len > mLength) { ++ /* use the long match instead */ + ip = ip1; +- mLength = ZSTD_count(ip+8, matchl1+8, iend) + 8; ++ mLength = l1len; + offset = (U32)(ip-matchl1); +- while (((ip>anchor) & (matchl1>prefixLowest)) && (ip[-1] == matchl1[-1])) { ip--; matchl1--; mLength++; } /* catch up */ +- goto _match_found; ++ matchs0 = matchl1; + } + } + +- /* if no long +1 match, explore the short match we found */ +- mLength = ZSTD_count(ip+4, matchs0+4, iend) + 4; +- offset = (U32)(ip - matchs0); +- while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* catch up */ ++ while (((ip>anchor) & (matchs0>prefixLowest)) && (ip[-1] == matchs0[-1])) { ip--; matchs0--; mLength++; } /* complete backward */ + + /* fall-through */ + +@@ -217,7 +288,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +314,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -254,8 +325,9 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -275,9 +347,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; + const U32* const dictHashLong = dms->hashTable; + const U32* const dictHashSmall = dms->chainTable; +@@ -286,8 +357,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +366,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes); ++ PREFETCH_AREA(dictHashSmall, chainTableBytes); ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +387,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; +@@ -323,26 +405,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + hashLong[h2] = hashSmall[h] = curr; /* update hash tables */ + + /* check repcode */ +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* 
repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +- if (matchIndexL > prefixLowestIndex) { ++ if ((matchIndexL >= prefixLowestIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + /* check prefix long match */ +- if (MEM_read64(matchLong) == MEM_read64(ip)) { +- mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; +- offset = (U32)(ip-matchLong); +- while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ mLength = ZSTD_count(ip+8, matchLong+8, iend) + 8; ++ offset = (U32)(ip-matchLong); ++ while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -354,13 +434,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } } + + if (matchIndexS > prefixLowestIndex) { +- /* check prefix short match */ ++ /* short match candidate */ + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,25 +455,24 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + + /* check prefix long +1 match */ +- if (matchIndexL3 > prefixLowestIndex) { +- if (MEM_read64(matchL3) == MEM_read64(ip+1)) { +- mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; +- ip++; +- offset = (U32)(ip-matchL3); +- while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ +- goto _match_found; +- } +- } else { ++ if ((matchIndexL3 >= prefixLowestIndex) && (MEM_read64(matchL3) == MEM_read64(ip+1))) { ++ mLength = ZSTD_count(ip+9, matchL3+8, iend) + 8; ++ ip++; ++ offset = (U32)(ip-matchL3); ++ while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ ++ goto _match_found; ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + 
dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +498,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -443,12 +522,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatch2 = repIndex2 < prefixLowestIndex ? + dictBase + repIndex2 - dictIndexDelta : + base + repIndex2; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex2)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +540,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -470,7 +549,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + + #define ZSTD_GEN_DFAST_FN(dictMode, mls) \ + static size_t ZSTD_compressBlock_doubleFast_##dictMode##_##mls( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ + return ZSTD_compressBlock_doubleFast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls); \ +@@ -488,7 +567,7 @@ ZSTD_GEN_DFAST_FN(dictMatchState, 7) + + + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -508,7 +587,7 @@ size_t ZSTD_compressBlock_doubleFast( + + + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + const U32 mls = ms->cParams.minMatch; +@@ -527,8 +606,10 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_doubleFast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_doubleFast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, + U32 const mls /* template */) + { +@@ -579,13 +660,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + size_t mLength; + hashSmall[hSmall] = 
hashLong[hLong] = curr; /* update hash table */ + +- if ((((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex doesn't overlap dict + prefix */ ++ if (((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) + & (offset_1 <= curr+1 - dictStartIndex)) /* note: we are searching at curr+1 */ + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +677,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +702,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -647,13 +728,13 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + U32 const current2 = (U32)(ip-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) /* intentional overflow : ensure repIndex2 doesn't overlap dict + prefix */ ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) + & (offset_2 <= current2 - dictStartIndex)) + && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -677,7 +758,7 @@ ZSTD_GEN_DFAST_FN(extDict, 6) + ZSTD_GEN_DFAST_FN(extDict, 7) + + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -694,3 +775,5 @@ size_t ZSTD_compressBlock_doubleFast_extDict( + return ZSTD_compressBlock_doubleFast_extDict_7(ms, seqStore, rep, src, srcSize); + } + } ++ ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..011556ce56f7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,22 +12,32 @@ + #ifndef ZSTD_DOUBLE_FAST_H + #define ZSTD_DOUBLE_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ++void ZSTD_fillDoubleHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); ++ + size_t ZSTD_compressBlock_doubleFast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_doubleFast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST ZSTD_compressBlock_doubleFast ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE ZSTD_compressBlock_doubleFast_dictMatchState ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT ZSTD_compressBlock_doubleFast_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_DOUBLEFAST_EXTDICT NULL ++#endif /* ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR */ + + #endif /* ZSTD_DOUBLE_FAST_H */ +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..60e07e839e5f 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann 
Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,46 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCDict(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; ++ ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_fillHashTableForCCtx(ZSTD_MatchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +64,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. + */ +@@ -42,6 +85,60 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ ++ ++typedef int (*ZSTD_match4Found) (const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit); ++ ++static int ++ZSTD_match4Found_cmov(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* Array of ~random data, should have low probability of matching data. ++ * Load from here if the index is invalid. ++ * Used to avoid unpredictable branches. */ ++ static const BYTE dummy[] = {0x12,0x34,0x56,0x78}; ++ ++ /* currentIdx >= lowLimit is a (somewhat) unpredictable branch. ++ * However expression below compiles into conditional move. 
++ */ ++ const BYTE* mvalAddr = ZSTD_selectAddr(matchIdx, idxLowLimit, matchAddress, dummy); ++ /* Note: this used to be written as : return test1 && test2; ++ * Unfortunately, once inlined, these tests become branches, ++ * in which case it becomes critical that they are executed in the right order (test1 then test2). ++ * So we have to write these tests in a specific manner to ensure their ordering. ++ */ ++ if (MEM_read32(currentPtr) != MEM_read32(mvalAddr)) return 0; ++ /* force ordering of these tests, which matters once the function is inlined, as they become branches */ ++ __asm__(""); ++ return matchIdx >= idxLowLimit; ++} ++ ++static int ++ZSTD_match4Found_branch(const BYTE* currentPtr, const BYTE* matchAddress, U32 matchIdx, U32 idxLowLimit) ++{ ++ /* using a branch instead of a cmov, ++ * because it's faster in scenarios where matchIdx >= idxLowLimit is generally true, ++ * aka almost all candidates are within range */ ++ U32 mval; ++ if (matchIdx >= idxLowLimit) { ++ mval = MEM_read32(matchAddress); ++ } else { ++ mval = MEM_read32(currentPtr) ^ 1; /* guaranteed to not match. */ ++ } ++ ++ return (MEM_read32(currentPtr) == mval); ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -89,17 +186,17 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + * + * This is also the work we do at the beginning to enter the loop initially. + */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_fast_noDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_noDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, +- U32 const mls, U32 const hasStep) ++ U32 const mls, int useCmov) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; +- /* support stepSize of 0 */ +- size_t const stepSize = hasStep ? (cParams->targetLength + !(cParams->targetLength) + 1) : 2; ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; /* min 2 */ + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); +@@ -117,12 +214,11 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +- U32 idx; /* match idx for ip0 */ +- U32 mval; /* src value at match idx */ ++ U32 matchIdx; /* match idx for ip0 */ + + U32 offcode; + const BYTE* match0; +@@ -135,14 +231,15 @@ ZSTD_compressBlock_fast_noDict_generic( + size_t step; + const BYTE* nextStep; + const size_t kStepIncr = (1 << (kSearchStrength - 1)); ++ const ZSTD_match4Found matchFound = useCmov ? 
ZSTD_match4Found_cmov : ZSTD_match4Found_branch; + + DEBUGLOG(5, "ZSTD_compressBlock_fast_generic"); + ip0 += (ip0 == prefixStart); + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -163,7 +260,7 @@ ZSTD_compressBlock_fast_noDict_generic( + hash0 = ZSTD_hashPtr(ip0, hlog, mls); + hash1 = ZSTD_hashPtr(ip1, hlog, mls); + +- idx = hashTable[hash0]; ++ matchIdx = hashTable[hash0]; + + do { + /* load repcode match for ip[2]*/ +@@ -180,26 +277,28 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* Write next hash table entry: it's already calculated. ++ * This write is known to be safe because ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry (it's already calculated). ++ * This write is known to be safe because the ip1 == ip0 + 1, ++ * so searching will resume after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); + +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -214,21 +313,19 @@ ZSTD_compressBlock_fast_noDict_generic( + current0 = (U32)(ip0 - base); + hashTable[hash0] = current0; + +- /* load match for ip[0] */ +- if (idx >= prefixStartIndex) { +- mval = MEM_read32(base + idx); +- } else { +- mval = MEM_read32(ip0) ^ 1; /* guaranteed to not match. */ +- } +- +- /* check match at ip[0] */ +- if (MEM_read32(ip0) == mval) { +- /* found a match! */ ++ if (matchFound(ip0, base + matchIdx, matchIdx, prefixStartIndex)) { ++ /* Write next hash table entry, since it's already calculated */ ++ if (step <= 4) { ++ /* Avoid writing an index if it's >= position where search will resume. ++ * The minimum possible match has length 4, so search can resume at ip0 + 4. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } + goto _offset; + } + + /* lookup ip[1] */ +- idx = hashTable[hash1]; ++ matchIdx = hashTable[hash1]; + + /* hash ip[2] */ + hash0 = hash1; +@@ -250,13 +347,28 @@ ZSTD_compressBlock_fast_noDict_generic( + } while (ip3 < ilimit); + + _cleanup: +- /* Note that there are probably still a couple positions we could search. ++ /* Note that there are probably still a couple positions one could search. + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. 
We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. ++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -264,10 +376,10 @@ ZSTD_compressBlock_fast_noDict_generic( + _offset: /* Requires: ip0, idx */ + + /* Compute the offset code. */ +- match0 = base + idx; ++ match0 = base + matchIdx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +399,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +413,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) 
*/ + } } } +@@ -314,12 +421,12 @@ ZSTD_compressBlock_fast_noDict_generic( + goto _start; + } + +-#define ZSTD_GEN_FAST_FN(dictMode, mls, step) \ +- static size_t ZSTD_compressBlock_fast_##dictMode##_##mls##_##step( \ +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ ++#define ZSTD_GEN_FAST_FN(dictMode, mml, cmov) \ ++ static size_t ZSTD_compressBlock_fast_##dictMode##_##mml##_##cmov( \ ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], \ + void const* src, size_t srcSize) \ + { \ +- return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mls, step); \ ++ return ZSTD_compressBlock_fast_##dictMode##_generic(ms, seqStore, rep, src, srcSize, mml, cmov); \ + } + + ZSTD_GEN_FAST_FN(noDict, 4, 1) +@@ -333,13 +440,15 @@ ZSTD_GEN_FAST_FN(noDict, 6, 0) + ZSTD_GEN_FAST_FN(noDict, 7, 0) + + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- U32 const mls = ms->cParams.minMatch; ++ U32 const mml = ms->cParams.minMatch; ++ /* use cmov when "candidate in range" branch is likely unpredictable */ ++ int const useCmov = ms->cParams.windowLog < 19; + assert(ms->dictMatchState == NULL); +- if (ms->cParams.targetLength > 1) { +- switch(mls) ++ if (useCmov) { ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -352,7 +461,8 @@ size_t ZSTD_compressBlock_fast( + return ZSTD_compressBlock_fast_noDict_7_1(ms, seqStore, rep, src, srcSize); + } + } else { +- switch(mls) ++ /* use a branch instead */ ++ switch(mml) + { + default: /* includes case 3 */ + case 4 : +@@ -364,13 +474,13 @@ size_t ZSTD_compressBlock_fast( + case 7 : + return ZSTD_compressBlock_fast_noDict_7_0(ms, seqStore, rep, src, srcSize); + } +- + } + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_fast_dictMatchState_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -380,16 +490,16 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; + const U32* const dictHashTable = dms->hashTable; + const U32 dictStartIndex = dms->window.dictLimit; +@@ -397,13 +507,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - 
prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. */ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +523,154 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes); ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? ++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if ((ZSTD_index_overlap_check(prefixStartIndex, repIndex)) ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (ZSTD_match4Found_cmov(ip0, match, matchIndex, prefixStartIndex)) { ++ /* found a regular match of size >= 4 */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); /* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* 
repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; +- if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ if ( (ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -525,7 +683,7 @@ ZSTD_GEN_FAST_FN(dictMatchState, 6, 0) + ZSTD_GEN_FAST_FN(dictMatchState, 7, 0) + + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; +@@ -545,19 +703,20 @@ size_t ZSTD_compressBlock_fast_dictMatchState( + } + + +-static size_t ZSTD_compressBlock_fast_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_fast_extDict_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize, U32 const mls, U32 const hasStep) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +729,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const 
BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +760,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? 
dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? 
dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. */ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( ((ZSTD_index_overlap_check(prefixStartIndex, repIndex2)) & (offset_2 > 0)) ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? 
dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -656,10 +964,11 @@ ZSTD_GEN_FAST_FN(extDict, 6, 0) + ZSTD_GEN_FAST_FN(extDict, 7, 0) + + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..04fde0a72a4e 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,21 +12,20 @@ + #ifndef ZSTD_FAST_H + #define ZSTD_FAST_H + +- + #include "../common/mem.h" /* U32 */ + #include "zstd_compress_internal.h" + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++void ZSTD_fillHashTable(ZSTD_MatchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_fast_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..88e2501fe3ef 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,14 +11,23 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++ ++#define kLazySkippingStep 8 + + + /*-************************************* + * Binary Tree search + ***************************************/ + +-static void +-ZSTD_updateDUBT(ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_updateDUBT(ZSTD_MatchState_t* ms, + const BYTE* ip, const BYTE* iend, + U32 mls) + { +@@ -60,8 +70,9 @@ ZSTD_updateDUBT(ZSTD_matchState_t* ms, + * sort one already inserted but unsorted position + * assumption : curr >= btlow == (curr - btmask) + * doesn't fail */ +-static void +-ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_insertDUBT1(const ZSTD_MatchState_t* ms, + U32 curr, const BYTE* inputEnd, + U32 nbCompares, U32 btLow, + const ZSTD_dictMode_e dictMode) +@@ -149,9 +160,10 @@ ZSTD_insertDUBT1(const ZSTD_matchState_t* ms, + } + + +-static size_t +-ZSTD_DUBT_findBetterDictMatch ( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBetterDictMatch ( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + size_t* offsetPtr, + size_t bestLength, +@@ -159,7 +171,7 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +- const ZSTD_matchState_t * const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t * const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dmsCParams = &dms->cParams; + const U32 * const dictHashTable = dms->hashTable; + U32 const hashLog = dmsCParams->hashLog; +@@ -197,8 +209,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +230,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -227,10 +239,11 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + +-static size_t +-ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, ++static 
++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_DUBT_findBestMatch(ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +340,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode == ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +374,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -378,24 +391,25 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + + + /* ZSTD_BtFindBestMatch() : Tree updater, providing best match */ +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_BtFindBestMatch( ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* + * Dedicated dict search + ***********************************/ + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip) ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); +@@ -514,7 +528,7 @@ void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const B + */ + FORCE_INLINE_TEMPLATE + size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nbAttempts, +- const ZSTD_matchState_t* const dms, ++ const ZSTD_MatchState_t* const dms, + const BYTE* const ip, const BYTE* const iLimit, + const BYTE* const prefixStart, const U32 curr, + const U32 dictLimit, const size_t ddsIdx) { +@@ -561,7 +575,7 @@ size_t 
ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +612,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -614,10 +628,12 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + + /* Update chains up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( +- ZSTD_matchState_t* ms, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndex_internal( ++ ZSTD_MatchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,21 +648,25 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. */ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; + return hashTable[ZSTD_hashPtr(ip, hashLog, mls)]; + } + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_HcFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode) +@@ -670,7 +690,7 @@ size_t ZSTD_HcFindBestMatch( + U32 nbAttempts = 1U << cParams->searchLog; + size_t ml=4-1; + +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 ddsHashLog = dictMode == ZSTD_dedicatedDictSearch + ? 
dms->cParams.hashLog - ZSTD_LAZY_DDSS_BUCKET_LOG : 0; + const size_t ddsIdx = dictMode == ZSTD_dedicatedDictSearch +@@ -684,14 +704,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +724,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +760,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +777,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +788,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. + */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. 
+- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +814,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -859,18 +833,20 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* ta + * Fill up the hash cache starting at idx, prefetching up to ZSTD_ROW_HASH_CACHE_SIZE entries, + * but not beyond iLimit. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const BYTE* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_fillHashCache(ZSTD_MatchState_t* ms, const BYTE* base, + U32 const rowLog, U32 const mls, + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -885,12 +861,15 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * Returns the hash of base + idx, and replaces the hash in the hash cache with the byte at + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ +-FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -902,28 +881,29 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTab + /* ZSTD_row_update_internalImpl(): + * Updates the hash table with positions starting from updateStartIdx until updateEndIdx. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, +- U32 updateStartIdx, U32 const updateEndIdx, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internalImpl(ZSTD_MatchState_t* ms, ++ U32 updateStartIdx, U32 const updateEndIdx, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
+- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -932,9 +912,11 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + * Inserts the byte at ip into the appropriate position in the hash table, and updates ms->nextToUpdate. + * Skips sections of long matches as is necessary. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const BYTE* ip, +- U32 const mls, U32 const rowLog, +- U32 const rowMask, U32 const useCache) ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_row_update_internal(ZSTD_MatchState_t* ms, const BYTE* ip, ++ U32 const mls, U32 const rowLog, ++ U32 const rowMask, U32 const useCache) + { + U32 idx = ms->nextToUpdate; + const BYTE* const base = ms->window.base; +@@ -965,13 +947,41 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internal(ZSTD_matchState_t* ms, const + * External wrapper for ZSTD_row_update_internal(). Used for filling the hashtable during dictionary + * processing. + */ +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip) { + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + const U32 rowMask = (1u << rowLog) - 1; + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +1004,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". 
So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. ++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". 
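To make the grouped-mask arithmetic described above concrete, here is a standalone sketch of how the search loop walks such a mask. The rowEntries/groupWidth pair mirrors the 16-entry NEON case from the comment, while the mask itself is a handmade example rather than real vector output.

/* Walking a grouped match mask: each row entry owns `groupWidth` bits, only the
 * lowest bit of a group is kept after the AND with 0x88... / 0x55... above, and
 * the mask has already been rotated so that bit 0 corresponds to "headGrouped". */
#include <stdint.h>
#include <stdio.h>

static unsigned ctz64(uint64_t v) { unsigned n = 0; while (!(v & 1)) { v >>= 1; ++n; } return n; }

int main(void)
{
    uint32_t const rowEntries  = 16;            /* the 16-entry NEON case */
    uint32_t const groupWidth  = 4;             /* 4 mask bits per entry for 16-entry rows */
    uint32_t const rowMask     = rowEntries - 1;
    uint32_t const head        = 5;             /* head position stored in tagRow byte 0 */
    uint32_t const headGrouped = head * groupWidth;

    /* pretend the entries 6 and 9 positions after the head matched the tag */
    uint64_t matches = (1ull << (6 * groupWidth)) | (1ull << (9 * groupWidth));

    for (; matches != 0; matches &= matches - 1) {   /* clear lowest set bit, like the search loop */
        uint32_t const matchPos = ((headGrouped + ctz64(matches)) / groupWidth) & rowMask;
        if (matchPos == 0) continue;                 /* position 0 holds the head byte, never a tag */
        printf("candidate at row position %u\n", matchPos);
    }
    return 0;
}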
So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 
0xFF; +@@ -1091,11 +1112,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1103,29 +1124,30 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + + /* The high-level approach of the SIMD row based match finder is as follows: + * - Figure out where to insert the new entry: +- * - Generate a hash from a byte along with an additional 1-byte "short hash". The additional byte is our "tag" +- * - The hashTable is effectively split into groups or "rows" of 16 or 32 entries of U32, and the hash determines ++ * - Generate a hash for current input position and split it into a one byte of tag and `rowHashLog` bits of index. ++ * - The hash is salted by a value that changes on every context reset, so when the same table is used ++ * we will avoid collisions that would otherwise slow us down by introducing phantom matches. ++ * - The hashTable is effectively split into groups or "rows" of 15 or 31 entries of U32, and the index determines + * which row to insert into. +- * - Determine the correct position within the row to insert the entry into. Each row of 16 or 32 can +- * be considered as a circular buffer with a "head" index that resides in the tagTable. +- * - Also insert the "tag" into the equivalent row and position in the tagTable. +- * - Note: The tagTable has 17 or 33 1-byte entries per row, due to 16 or 32 tags, and 1 "head" entry. +- * The 17 or 33 entry rows are spaced out to occur every 32 or 64 bytes, respectively, +- * for alignment/performance reasons, leaving some bytes unused. +- * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte "short hash" and ++ * - Determine the correct position within the row to insert the entry into. Each row of 15 or 31 can ++ * be considered as a circular buffer with a "head" index that resides in the tagTable (overall 16 or 32 bytes ++ * per row). ++ * - Use SIMD to efficiently compare the tags in the tagTable to the 1-byte tag calculated for the position and + * generate a bitfield that we can cycle through to check the collisions in the hash table. + * - Pick the longest match. ++ * - Insert the tag into the equivalent row and position in the tagTable. 
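The row layout summarised in this comment block can be modelled in a few lines of standalone C. The table sizes and the hash value below are arbitrary stand-ins; only the split into an 8-bit tag plus a row index, the "one head byte plus tags" row shape, and the backwards-cycling head follow the description.

/* Toy model of one row-matchfinder row: the hash is split into a row index and
 * an 8-bit tag (ZSTD_ROW_HASH_TAG_BITS == 8); hashTable rows hold 32-bit match
 * indices, tagTable rows hold one "head" byte followed by the 1-byte tags. */
#include <stdint.h>
#include <stdio.h>

#define ROW_LOG      4                      /* 16 slots per row: 1 head byte + 15 tags */
#define ROW_ENTRIES  (1u << ROW_LOG)
#define ROW_MASK     (ROW_ENTRIES - 1)
#define TAG_BITS     8
#define TAG_MASK     ((1u << TAG_BITS) - 1)
#define HASH_LOG     10                     /* 1024 rows in this toy table */

static uint32_t hashTable[(1u << HASH_LOG) << ROW_LOG];   /* match indices */
static uint8_t  tagTable [(1u << HASH_LOG) << ROW_LOG];   /* head byte + tags */

/* like ZSTD_row_nextIndex(): cycle the head backwards, never landing on byte 0 */
static uint32_t nextIndex(uint8_t* tagRow)
{
    uint32_t next = (uint32_t)(*tagRow - 1) & ROW_MASK;
    next += (next == 0) ? ROW_MASK : 0;
    *tagRow = (uint8_t)next;
    return next;
}

static uint32_t insertPosition(uint32_t hash, uint32_t matchIndex)
{
    uint32_t const rowIdx = (hash >> TAG_BITS) & ((1u << HASH_LOG) - 1);
    uint32_t const relRow = rowIdx << ROW_LOG;               /* row start, in entries */
    uint8_t* const tagRow = tagTable + relRow;
    uint32_t const pos = nextIndex(tagRow);
    tagRow[pos] = (uint8_t)(hash & TAG_MASK);                /* 1-byte tag for the SIMD compare */
    hashTable[relRow + pos] = matchIndex;                    /* full position for verification */
    return pos;
}

int main(void)
{
    uint32_t const hash = 0x12345u;                          /* pretend (salted) hash */
    printf("first insert lands at row position %u\n", insertPosition(hash, 42));
    printf("second insert lands at row position %u\n", insertPosition(hash, 43));
    return 0;
}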
+ */ + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_RowFindBestMatch( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, + size_t* offsetPtr, + const U32 mls, const ZSTD_dictMode_e dictMode, + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,11 +1165,14 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + + /* Initialize the following variables to satisfy static analyzer */ + size_t ddsIdx = 0; +@@ -1168,7 +1193,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1203,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. 
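The lazy-skipping mode mentioned here (and set further down via ms->lazySkipping) reduces to a simple step rule. The sketch below reproduces that rule with stand-in constants chosen to match the "8 bytes at a time" and "2KB without a match" figures from the later comment; they are not the actual zstd defines.

/* Step rule of the lazy parsers: the longer we go without a match, the bigger
 * the jump, and once a single jump exceeds 8 bytes we stop inserting every
 * position into the tables (lazySkipping = 1) until the next match is found. */
#include <stddef.h>
#include <stdio.h>

#define SEARCH_STRENGTH    8   /* stand-in for kSearchStrength */
#define LAZY_SKIPPING_STEP 8   /* stand-in for kLazySkippingStep */

int main(void)
{
    size_t const distances[] = { 0, 255, 2047, 2048, 8192 };  /* ip - anchor since last match */
    size_t i;
    for (i = 0; i < sizeof(distances)/sizeof(distances[0]); ++i) {
        size_t const step = (distances[i] >> SEARCH_STRENGTH) + 1;
        int const lazySkipping = step > LAZY_SKIPPING_STEP;
        printf("ip-anchor=%zu -> step=%zu, lazySkipping=%d\n",
               distances[i], step, lazySkipping);
    }
    return 0;   /* lazySkipping flips to 1 once ip-anchor reaches 2KB, as the comment states */
}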
++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1240,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. */ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1261,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1274,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1292,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ 
for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1325,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1301,7 +1341,7 @@ size_t ZSTD_RowFindBestMatch( + * ZSTD_searchMax() dispatches to the correct implementation function. + * + * TODO: The start of the search function involves loading and calculating a +- * bunch of constants from the ZSTD_matchState_t. These computations could be ++ * bunch of constants from the ZSTD_MatchState_t. These computations could be + * done in an initialization function, and saved somewhere in the match state. + * Then we could pass a pointer to the saved state instead of the match state, + * and avoid duplicate computations. +@@ -1325,7 +1365,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_BT_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_BT_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offBasePtr) \ + { \ +@@ -1335,7 +1375,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_HC_SEARCH_FN(dictMode, mls) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_HC_SEARCH_FN(dictMode, mls)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1345,7 +1385,7 @@ size_t ZSTD_RowFindBestMatch( + + #define GEN_ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog) \ + ZSTD_SEARCH_FN_ATTRS size_t ZSTD_ROW_SEARCH_FN(dictMode, mls, rowLog)( \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + const BYTE* ip, const BYTE* const iLimit, \ + size_t* offsetPtr) \ + { \ +@@ -1446,7 +1486,7 @@ typedef enum { search_hashChain=0, search_binaryTree=1, search_rowHash=2 } searc + * If a match is found its offset is stored in @p offsetPtr. 
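Throughout this hunk a single offsetPtr/offBase value carries either a repcode index or a real match offset. The sketch below models that single-namespace encoding; the exact bias is an assumption based on the usual zstd convention (repcodes occupy the low values, real offsets are shifted past them) and is not defined anywhere in this patch.

/* Toy version of the OFFBASE encoding used by the lazy parsers: one integer
 * carries either "repcode r" (small values) or "real offset o" (shifted up),
 * so the gain heuristic can apply ZSTD_highbit32() to it uniformly. */
#include <assert.h>
#include <stdio.h>

#define REP_NUM 3                                   /* number of repcodes kept in history */
#define REPCODE_TO_OFFBASE(r)  (r)                  /* assumed: repcodes 1..3 map to 1..3 */
#define OFFSET_TO_OFFBASE(o)   ((o) + REP_NUM)      /* assumed: real offsets are biased past them */
#define OFFBASE_IS_OFFSET(ob)  ((ob) > REP_NUM)
#define OFFBASE_TO_OFFSET(ob)  ((ob) - REP_NUM)

int main(void)
{
    unsigned const repcode1 = REPCODE_TO_OFFBASE(1);     /* analogous to REPCODE1_TO_OFFBASE */
    unsigned const offbase  = OFFSET_TO_OFFBASE(1000);   /* a 1000-byte back-reference */

    assert(!OFFBASE_IS_OFFSET(repcode1));                /* repcodes stay below the bias */
    assert(OFFBASE_IS_OFFSET(offbase));                  /* real offsets sit above it */
    assert(OFFBASE_TO_OFFSET(offbase) == 1000);          /* and decode back losslessly */
    printf("repcode1 encodes as %u, offset 1000 encodes as %u\n", repcode1, offbase);
    return 0;
}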
+ */ + FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* ip, + const BYTE* iend, + size_t* offsetPtr, +@@ -1472,9 +1512,10 @@ FORCE_INLINE_TEMPLATE size_t ZSTD_searchMax( + * Common parser - lazy strategy + *********************************/ + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_lazy_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_compressBlock_lazy_generic( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth, +@@ -1491,12 +1532,13 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; + const int isDxS = isDMS || isDDS; +- const ZSTD_matchState_t* const dms = ms->dictMatchState; ++ const ZSTD_MatchState_t* const dms = ms->dictMatchState; + const U32 dictLowestIndex = isDxS ? dms->window.dictLimit : 0; + const BYTE* const dictBase = isDxS ? dms->window.base : NULL; + const BYTE* const dictLowest = isDxS ? dictBase + dictLowestIndex : NULL; +@@ -1512,8 +1554,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1564,11 @@ ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1580,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1548,7 +1591,7 @@ ZSTD_compressBlock_lazy_generic( + && repIndex < prefixLowestIndex) ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; +@@ -1562,14 +1605,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,34 +1631,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,34 +1667,34 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; + const BYTE* repMatch = repIndex < prefixLowestIndex ? + dictBase + (repIndex - dictIndexDelta) : + base + repIndex; +- if (((U32)((prefixLowestIndex-1) - repIndex) >= 3 /* intentional underflow */) ++ if ((ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1705,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1682,12 +1741,12 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatch = repIndex < prefixLowestIndex ? 
+ dictBase - dictIndexDelta + repIndex : + base + repIndex; +- if ( ((U32)((prefixLowestIndex-1) - (U32)repIndex) >= 3 /* intentional overflow */) ++ if ( (ZSTD_index_overlap_check(prefixLowestIndex, repIndex)) + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,168 +1760,183 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return 
ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dictMatchState); + } + +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); + } + +-/* Row-based matchfinder */ +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t 
ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_noDict); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2, ZSTD_dedicatedDictSearch); + } + +-size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dictMatchState); + } + +- + size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2, ZSTD_dedicatedDictSearch); + } ++#endif + +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_noDict); + } + +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { +- return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0, ZSTD_dedicatedDictSearch); ++ return ZSTD_compressBlock_lazy_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2, ZSTD_dictMatchState); + } ++#endif + ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || 
!defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_compressBlock_lazy_extDict_generic( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const searchMethod_e searchMethod, const U32 depth) +@@ -1886,12 +1960,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1978,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1912,7 +1987,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = (U32)(curr+1 - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr+1 - windowLow) ) /* note: we are searching at curr+1 */ + if (MEM_read32(ip+1) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ +@@ -1922,14 +1997,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,30 +2023,30 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? 
dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,50 +2055,57 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_1 <= curr - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2023,14 +2114,14 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const U32 repIndex = repCurrent - offset_2; + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; + const BYTE* const repMatch = repBase + repIndex; +- if ( ((U32)((dictLimit-1) - repIndex) >= 3) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ if ( (ZSTD_index_overlap_check(dictLimit, repIndex)) + & (offset_2 <= repCurrent - windowLow) ) /* equivalent to `curr > repIndex >= windowLow` */ + if (MEM_read32(ip) == MEM_read32(repMatch)) { + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -2045,58 +2136,65 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* Return the last literals size */ + return (size_t)(iend - anchor); + } ++#endif /* build exclusions */ + +- ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 0); + } + +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 1); + } + +-size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); + } ++#endif + +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) ++ + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 0); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_hashChain, 2); + } + +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 1); ++ return 
ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } ++#endif + +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) + + { +- return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); ++ return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_binaryTree, 2); + } ++#endif +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..987a036d8bde 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LAZY_H + #define ZSTD_LAZY_H + +- + #include "zstd_compress_internal.h" + + /* +@@ -22,98 +22,173 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + +-U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); +-void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ ++#if !defined(ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) ++U32 ZSTD_insertAndFindFirstIndex(ZSTD_MatchState_t* ms, const BYTE* ip); ++void ZSTD_row_update(ZSTD_MatchState_t* const ms, const BYTE* ip); + +-void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_matchState_t* ms, const BYTE* const ip); ++void ZSTD_dedicatedDictSearch_lazy_loadDictionary(ZSTD_MatchState_t* ms, const BYTE* const ip); + + void ZSTD_preserveUnsortedMark (U32* const table, U32 const size, U32 const reducerValue); /*! used in ZSTD_reduceIndex(). 
preemptively increase value of ZSTD_DUBT_UNSORTED_MARK */ ++#endif + +-size_t ZSTD_compressBlock_btlazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_GREEDY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_greedy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_greedy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_greedy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + +-size_t ZSTD_compressBlock_btlazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#define ZSTD_COMPRESSBLOCK_GREEDY ZSTD_compressBlock_greedy ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW ZSTD_compressBlock_greedy_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE ZSTD_compressBlock_greedy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW ZSTD_compressBlock_greedy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH ZSTD_compressBlock_greedy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_greedy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT ZSTD_compressBlock_greedy_extDict ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW ZSTD_compressBlock_greedy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_GREEDY NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_GREEDY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void 
const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dictMatchState_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY ZSTD_compressBlock_lazy ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW ZSTD_compressBlock_lazy_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE ZSTD_compressBlock_lazy_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT ZSTD_compressBlock_lazy_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW ZSTD_compressBlock_lazy_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_LAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_lazy2( ++ ZSTD_MatchState_t* ms, 
SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_dedicatedDictSearch_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_greedy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dictMatchState_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); ++size_t ZSTD_compressBlock_lazy2_dedicatedDictSearch_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_lazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_greedy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_lazy2_extDict_row( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ++#define ZSTD_COMPRESSBLOCK_LAZY2 ZSTD_compressBlock_lazy2 ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW ZSTD_compressBlock_lazy2_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE ZSTD_compressBlock_lazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW ZSTD_compressBlock_lazy2_dictMatchState_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH ZSTD_compressBlock_lazy2_dedicatedDictSearch ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW ZSTD_compressBlock_lazy2_dedicatedDictSearch_row ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT ZSTD_compressBlock_lazy2_extDict ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW ZSTD_compressBlock_lazy2_extDict_row ++#else ++#define ZSTD_COMPRESSBLOCK_LAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DICTMATCHSTATE_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_DEDICATEDDICTSEARCH_ROW NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_LAZY2_EXTDICT_ROW NULL ++#endif ++ ++#ifndef ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btlazy2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_lazy2_extDict_row( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btlazy2_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* 
src, size_t srcSize); + size_t ZSTD_compressBlock_btlazy2_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- + ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 ZSTD_compressBlock_btlazy2 ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE ZSTD_compressBlock_btlazy2_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT ZSTD_compressBlock_btlazy2_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTLAZY2 NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTLAZY2_EXTDICT NULL ++#endif + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..54eefad9cae6 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #include "zstd_double_fast.h" /* ZSTD_fillDoubleHashTable() */ + #include "zstd_ldm_geartab.h" + +-#define LDM_BUCKET_SIZE_LOG 3 ++#define LDM_BUCKET_SIZE_LOG 4 + #define LDM_MIN_MATCH_LENGTH 64 + #define LDM_HASH_RLOG 7 + +@@ -133,21 +134,35 @@ static size_t ZSTD_ldm_gear_feed(ldmRollingHashState_t* state, + } + + void ZSTD_ldm_adjustParameters(ldmParams_t* params, +- ZSTD_compressionParameters const* cParams) ++ const ZSTD_compressionParameters* cParams) + { + params->windowLog = cParams->windowLog; + ZSTD_STATIC_ASSERT(LDM_BUCKET_SIZE_LOG <= ZSTD_LDM_BUCKETSIZELOG_MAX); + DEBUGLOG(4, "ZSTD_ldm_adjustParameters"); +- if (!params->bucketSizeLog) params->bucketSizeLog = LDM_BUCKET_SIZE_LOG; +- if (!params->minMatchLength) params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (params->hashRateLog == 0) { ++ if (params->hashLog > 0) { ++ /* if params->hashLog is set, derive hashRateLog from it */ ++ assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ if (params->windowLog > params->hashLog) { ++ params->hashRateLog = params->windowLog - params->hashLog; ++ } ++ } else { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ /* mapping from [fast, rate7] to [btultra2, rate4] */ ++ params->hashRateLog = 7 - (cParams->strategy/3); ++ } ++ } + if (params->hashLog == 0) { +- params->hashLog = MAX(ZSTD_HASHLOG_MIN, params->windowLog - LDM_HASH_RLOG); +- assert(params->hashLog <= ZSTD_HASHLOG_MAX); ++ params->hashLog = BOUNDED(ZSTD_HASHLOG_MIN, params->windowLog - params->hashRateLog, ZSTD_HASHLOG_MAX); + } +- if (params->hashRateLog == 0) { +- params->hashRateLog = params->windowLog < params->hashLog +- ? 0 +- : params->windowLog - params->hashLog; ++ if (params->minMatchLength == 0) { ++ params->minMatchLength = LDM_MIN_MATCH_LENGTH; ++ if (cParams->strategy >= ZSTD_btultra) ++ params->minMatchLength /= 2; ++ } ++ if (params->bucketSizeLog==0) { ++ assert(1 <= (int)cParams->strategy && (int)cParams->strategy <= 9); ++ params->bucketSizeLog = BOUNDED(LDM_BUCKET_SIZE_LOG, (U32)cParams->strategy, ZSTD_LDM_BUCKETSIZELOG_MAX); + } + params->bucketSizeLog = MIN(params->bucketSizeLog, params->hashLog); + } +@@ -170,22 +185,22 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize) + /* ZSTD_ldm_getBucket() : + * Returns a pointer to the start of the bucket associated with hash. 
*/ + static ldmEntry_t* ZSTD_ldm_getBucket( +- ldmState_t* ldmState, size_t hash, ldmParams_t const ldmParams) ++ const ldmState_t* ldmState, size_t hash, U32 const bucketSizeLog) + { +- return ldmState->hashTable + (hash << ldmParams.bucketSizeLog); ++ return ldmState->hashTable + (hash << bucketSizeLog); + } + + /* ZSTD_ldm_insertEntry() : + * Insert the entry with corresponding hash into the hash table */ + static void ZSTD_ldm_insertEntry(ldmState_t* ldmState, + size_t const hash, const ldmEntry_t entry, +- ldmParams_t const ldmParams) ++ U32 const bucketSizeLog) + { + BYTE* const pOffset = ldmState->bucketOffsets + hash; + unsigned const offset = *pOffset; + +- *(ZSTD_ldm_getBucket(ldmState, hash, ldmParams) + offset) = entry; +- *pOffset = (BYTE)((offset + 1) & ((1u << ldmParams.bucketSizeLog) - 1)); ++ *(ZSTD_ldm_getBucket(ldmState, hash, bucketSizeLog) + offset) = entry; ++ *pOffset = (BYTE)((offset + 1) & ((1u << bucketSizeLog) - 1)); + + } + +@@ -234,7 +249,7 @@ static size_t ZSTD_ldm_countBackwardsMatch_2segments( + * + * The tables for the other strategies are filled within their + * block compressors. */ +-static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, ++static size_t ZSTD_ldm_fillFastTables(ZSTD_MatchState_t* ms, + void const* end) + { + const BYTE* const iend = (const BYTE*)end; +@@ -242,11 +257,15 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++#ifndef ZSTD_EXCLUDE_DFAST_BLOCK_COMPRESSOR ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); ++#else ++ assert(0); /* shouldn't be called: cparams should've been adjusted. */ ++#endif + break; + + case ZSTD_greedy: +@@ -269,7 +288,8 @@ void ZSTD_ldm_fillHashTable( + const BYTE* iend, ldmParams_t const* params) + { + U32 const minMatchLength = params->minMatchLength; +- U32 const hBits = params->hashLog - params->bucketSizeLog; ++ U32 const bucketSizeLog = params->bucketSizeLog; ++ U32 const hBits = params->hashLog - bucketSizeLog; + BYTE const* const base = ldmState->window.base; + BYTE const* const istart = ip; + ldmRollingHashState_t hashState; +@@ -284,7 +304,7 @@ void ZSTD_ldm_fillHashTable( + unsigned n; + + numSplits = 0; +- hashed = ZSTD_ldm_gear_feed(&hashState, ip, iend - ip, splits, &numSplits); ++ hashed = ZSTD_ldm_gear_feed(&hashState, ip, (size_t)(iend - ip), splits, &numSplits); + + for (n = 0; n < numSplits; n++) { + if (ip + splits[n] >= istart + minMatchLength) { +@@ -295,7 +315,7 @@ void ZSTD_ldm_fillHashTable( + + entry.offset = (U32)(split - base); + entry.checksum = (U32)(xxhash >> 32); +- ZSTD_ldm_insertEntry(ldmState, hash, entry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, entry, params->bucketSizeLog); + } + } + +@@ -309,7 +329,7 @@ void ZSTD_ldm_fillHashTable( + * Sets cctx->nextToUpdate to a position corresponding closer to anchor + * if it is far way + * (after a long match, only update tables a limited amount). 
*/ +-static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) ++static void ZSTD_ldm_limitTableUpdate(ZSTD_MatchState_t* ms, const BYTE* anchor) + { + U32 const curr = (U32)(anchor - ms->window.base); + if (curr > ms->nextToUpdate + 1024) { +@@ -318,8 +338,10 @@ static void ZSTD_ldm_limitTableUpdate(ZSTD_matchState_t* ms, const BYTE* anchor) + } + } + +-static size_t ZSTD_ldm_generateSequences_internal( +- ldmState_t* ldmState, rawSeqStore_t* rawSeqStore, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_ldm_generateSequences_internal( ++ ldmState_t* ldmState, RawSeqStore_t* rawSeqStore, + ldmParams_t const* params, void const* src, size_t srcSize) + { + /* LDM parameters */ +@@ -373,7 +395,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + candidates[n].split = split; + candidates[n].hash = hash; + candidates[n].checksum = (U32)(xxhash >> 32); +- candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, *params); ++ candidates[n].bucket = ZSTD_ldm_getBucket(ldmState, hash, params->bucketSizeLog); + PREFETCH_L1(candidates[n].bucket); + } + +@@ -396,7 +418,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + * the previous one, we merely register it in the hash table and + * move on */ + if (split < anchor) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -443,7 +465,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + /* No match found -- insert an entry into the hash table + * and process the next candidate match */ + if (bestEntry == NULL) { +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + continue; + } + +@@ -464,7 +486,7 @@ static size_t ZSTD_ldm_generateSequences_internal( + + /* Insert the current entry into the hash table --- it must be + * done after the previous block to avoid clobbering bestEntry */ +- ZSTD_ldm_insertEntry(ldmState, hash, newEntry, *params); ++ ZSTD_ldm_insertEntry(ldmState, hash, newEntry, params->bucketSizeLog); + + anchor = split + forwardMatchLength; + +@@ -503,7 +525,7 @@ static void ZSTD_ldm_reduceTable(ldmEntry_t* const table, U32 const size, + } + + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldmState, rawSeqStore_t* sequences, ++ ldmState_t* ldmState, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize) + { + U32 const maxDist = 1U << params->windowLog; +@@ -549,7 +571,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -580,7 +602,7 @@ size_t ZSTD_ldm_generateSequences( + } + + void +-ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) ++ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, U32 const minMatch) + { + while (srcSize > 0 && rawSeqStore->pos < rawSeqStore->size) { + rawSeq* seq = rawSeqStore->seq + rawSeqStore->pos; +@@ -616,7 +638,7 @@ ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, U32 const min + * Returns the current sequence to handle, or if the rest of the block should + * be literals, it returns a sequence with offset == 0. 
+ */ +-static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, ++static rawSeq maybeSplitSequence(RawSeqStore_t* rawSeqStore, + U32 const remaining, U32 const minMatch) + { + rawSeq sequence = rawSeqStore->seq[rawSeqStore->pos]; +@@ -640,7 +662,7 @@ static rawSeq maybeSplitSequence(rawSeqStore_t* rawSeqStore, + return sequence; + } + +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { + rawSeq currSeq = rawSeqStore->seq[rawSeqStore->pos]; +@@ -657,14 +679,14 @@ void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) { + } + } + +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + unsigned const minMatch = cParams->minMatch; +- ZSTD_blockCompressor const blockCompressor = ++ ZSTD_BlockCompressor_f const blockCompressor = + ZSTD_selectBlockCompressor(cParams->strategy, useRowMatchFinder, ZSTD_matchState_dictMode(ms)); + /* Input bounds */ + BYTE const* const istart = (BYTE const*)src; +@@ -689,7 +711,6 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* maybeSplitSequence updates rawSeqStore->pos */ + rawSeq const sequence = maybeSplitSequence(rawSeqStore, + (U32)(iend - ip), minMatch); +- int i; + /* End signal */ + if (sequence.offset == 0) + break; +@@ -702,6 +723,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + /* Run the block compressor */ + DEBUGLOG(5, "pos %u : calling block compressor on segment of size %u", (unsigned)(ip-istart), sequence.litLength); + { ++ int i; + size_t const newLitLength = + blockCompressor(ms, seqStore, rep, ip, sequence.litLength); + ip += sequence.litLength; +@@ -711,7 +733,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..41400a7191b2 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,7 +12,6 @@ + #ifndef ZSTD_LDM_H + #define ZSTD_LDM_H + +- + #include "zstd_compress_internal.h" /* ldmParams_t, U32 */ + #include /* ZSTD_CCtx, size_t */ + +@@ -40,7 +40,7 @@ void ZSTD_ldm_fillHashTable( + * sequences. + */ + size_t ZSTD_ldm_generateSequences( +- ldmState_t* ldms, rawSeqStore_t* sequences, ++ ldmState_t* ldms, RawSeqStore_t* sequences, + ldmParams_t const* params, void const* src, size_t srcSize); + + /* +@@ -61,9 +61,9 @@ size_t ZSTD_ldm_generateSequences( + * two. 
We handle that case correctly, and update `rawSeqStore` appropriately. + * NOTE: This function does not return any errors. + */ +-size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- ZSTD_paramSwitch_e useRowMatchFinder, ++size_t ZSTD_ldm_blockCompress(RawSeqStore_t* rawSeqStore, ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_ParamSwitch_e useRowMatchFinder, + void const* src, size_t srcSize); + + /* +@@ -73,7 +73,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + * Avoids emitting matches less than `minMatch` bytes. + * Must be called for data that is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, ++void ZSTD_ldm_skipSequences(RawSeqStore_t* rawSeqStore, size_t srcSize, + U32 const minMatch); + + /* ZSTD_ldm_skipRawSeqStoreBytes(): +@@ -81,7 +81,7 @@ void ZSTD_ldm_skipSequences(rawSeqStore_t* rawSeqStore, size_t srcSize, + * Not to be used in conjunction with ZSTD_ldm_skipSequences(). + * Must be called for data with is not passed to ZSTD_ldm_blockCompress(). + */ +-void ZSTD_ldm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes); ++void ZSTD_ldm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes); + + /* ZSTD_ldm_getTableSize() : + * Estimate the space needed for long distance matching tables or 0 if LDM is +@@ -107,5 +107,4 @@ size_t ZSTD_ldm_getMaxNbSeq(ldmParams_t params, size_t maxChunkSize); + void ZSTD_ldm_adjustParameters(ldmParams_t* params, + ZSTD_compressionParameters const* cParams); + +- + #endif /* ZSTD_FAST_H */ +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..b62fd1b0d83e 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,11 +13,14 @@ + #include "hist.h" + #include "zstd_opt.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +30,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +69,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +100,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s> shift); +- sum += table[s]; ++ unsigned const base = base1 ? 
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +128,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +147,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +210,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +247,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -246,6 +268,7 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + const optState_t* const optPtr, + int optLevel) + { ++ DEBUGLOG(8, "ZSTD_rawLiteralsCost (%u literals)", litLength); + if (litLength == 0) return 0; + + if (!ZSTD_compressedLiterals(optPtr)) +@@ -255,11 +278,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics 
*/ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +298,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +316,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. 
+- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +353,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +372,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -379,9 +407,11 @@ MEM_STATIC U32 ZSTD_readMINMATCH(const void* memPtr, U32 length) + + /* Update hashTable3 up to ip (excluded) + Assumption : always within prefix (i.e. not within extDict) */ +-static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip) + { + U32* const hashTable3 = ms->hashTable3; + U32 const hashLog3 = ms->hashLog3; +@@ -408,8 +438,10 @@ static U32 ZSTD_insertAndFindFirstIndexHash3 (const ZSTD_matchState_t* ms, + * @param ip assumed <= iend-8 . 
+ * @param target The target of ZSTD_updateTree_internal() - we are filling to this position + * @return : nb of positions added */ +-static U32 ZSTD_insertBt1( +- const ZSTD_matchState_t* ms, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_insertBt1( ++ const ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + U32 const target, + U32 const mls, const int extDict) +@@ -527,15 +559,16 @@ static U32 ZSTD_insertBt1( + } + + FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_updateTree_internal( +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + const BYTE* const ip, const BYTE* const iend, + const U32 mls, const ZSTD_dictMode_e dictMode) + { + const BYTE* const base = ms->window.base; + U32 const target = (U32)(ip - base); + U32 idx = ms->nextToUpdate; +- DEBUGLOG(6, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", ++ DEBUGLOG(7, "ZSTD_updateTree_internal, from %u to %u (dictMode:%u)", + idx, target, dictMode); + + while(idx < target) { +@@ -548,20 +581,23 @@ void ZSTD_updateTree_internal( + ms->nextToUpdate = target; + } + +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + + FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_MatchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -590,7 +626,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + U32 mnum = 0; + U32 nbCompares = 1U << cParams->searchLog; + +- const ZSTD_matchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; ++ const ZSTD_MatchState_t* dms = dictMode == ZSTD_dictMatchState ? ms->dictMatchState : NULL; + const ZSTD_compressionParameters* const dmsCParams = + dictMode == ZSTD_dictMatchState ? &dms->cParams : NULL; + const BYTE* const dmsBase = dictMode == ZSTD_dictMatchState ? 
dms->window.base : NULL; +@@ -629,13 +665,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + assert(curr >= windowLow); + if ( dictMode == ZSTD_extDict + && ( ((repOffset-1) /*intentional overflow*/ < curr - windowLow) /* equivalent to `curr > repIndex >= windowLow` */ +- & (((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */) ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dictEnd, prefixStart) + minMatch; + } + if (dictMode == ZSTD_dictMatchState + && ( ((repOffset-1) /*intentional overflow*/ < curr - (dmsLowLimit + dmsIndexDelta)) /* equivalent to `curr > repIndex >= dmsLowLimit` */ +- & ((U32)((dictLimit-1) - repIndex) >= 3) ) /* intentional overflow : do not test positions overlapping 2 memory segments */ ++ & (ZSTD_index_overlap_check(dictLimit, repIndex)) ) + && (ZSTD_readMINMATCH(ip, minMatch) == ZSTD_readMINMATCH(repMatch, minMatch)) ) { + repLen = (U32)ZSTD_count_2segments(ip+minMatch, repMatch+minMatch, iLimit, dmsEnd, prefixStart) + minMatch; + } } +@@ -644,7 +680,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +709,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +742,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +790,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -784,7 +820,7 @@ U32 
ZSTD_insertBtAndGetAllMatches ( + + typedef U32 (*ZSTD_getAllMatchesFn)( + ZSTD_match_t*, +- ZSTD_matchState_t*, ++ ZSTD_MatchState_t*, + U32*, + const BYTE*, + const BYTE*, +@@ -792,9 +828,11 @@ typedef U32 (*ZSTD_getAllMatchesFn)( + U32 const ll0, + U32 const lengthToBeat); + +-FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++U32 ZSTD_btGetAllMatches_internal( + ZSTD_match_t* matches, +- ZSTD_matchState_t* ms, ++ ZSTD_MatchState_t* ms, + U32* nextToUpdate3, + const BYTE* ip, + const BYTE* const iHighLimit, +@@ -817,7 +855,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_btGetAllMatches_internal( + #define GEN_ZSTD_BT_GET_ALL_MATCHES_(dictMode, mls) \ + static U32 ZSTD_BT_GET_ALL_MATCHES_FN(dictMode, mls)( \ + ZSTD_match_t* matches, \ +- ZSTD_matchState_t* ms, \ ++ ZSTD_MatchState_t* ms, \ + U32* nextToUpdate3, \ + const BYTE* ip, \ + const BYTE* const iHighLimit, \ +@@ -849,7 +887,7 @@ GEN_ZSTD_BT_GET_ALL_MATCHES(dictMatchState) + } + + static ZSTD_getAllMatchesFn +-ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const dictMode) ++ZSTD_selectBtGetAllMatches(ZSTD_MatchState_t const* ms, ZSTD_dictMode_e const dictMode) + { + ZSTD_getAllMatchesFn const getAllMatchesFns[3][4] = { + ZSTD_BT_GET_ALL_MATCHES_ARRAY(noDict), +@@ -868,7 +906,7 @@ ZSTD_selectBtGetAllMatches(ZSTD_matchState_t const* ms, ZSTD_dictMode_e const di + + /* Struct containing info needed to make decision about ldm inclusion */ + typedef struct { +- rawSeqStore_t seqStore; /* External match candidates store for this block */ ++ RawSeqStore_t seqStore; /* External match candidates store for this block */ + U32 startPosInBlock; /* Start position of the current match candidate */ + U32 endPosInBlock; /* End position of the current match candidate */ + U32 offset; /* Offset of the match candidate */ +@@ -878,7 +916,7 @@ typedef struct { + * Moves forward in @rawSeqStore by @nbBytes, + * which will update the fields 'pos' and 'posInSequence'. + */ +-static void ZSTD_optLdm_skipRawSeqStoreBytes(rawSeqStore_t* rawSeqStore, size_t nbBytes) ++static void ZSTD_optLdm_skipRawSeqStoreBytes(RawSeqStore_t* rawSeqStore, size_t nbBytes) + { + U32 currPos = (U32)(rawSeqStore->posInSequence + nbBytes); + while (currPos && rawSeqStore->pos < rawSeqStore->size) { +@@ -935,7 +973,7 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + return; + } + +- /* Matches may be < MINMATCH by this process. In that case, we will reject them ++ /* Matches may be < minMatch by this process. In that case, we will reject them + when we are deciding whether or not to add the ldm */ + optLdm->startPosInBlock = currPosInBlock + literalsBytesRemaining; + optLdm->endPosInBlock = optLdm->startPosInBlock + matchBytesRemaining; +@@ -957,25 +995,26 @@ ZSTD_opt_getNextMatchAndUpdateSeqStore(ZSTD_optLdm_t* optLdm, U32 currPosInBlock + * into 'matches'. Maintains the correct ordering of 'matches'. 
+ */ + static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, +- const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) ++ const ZSTD_optLdm_t* optLdm, U32 currPosInBlock, ++ U32 minMatch) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ + if (currPosInBlock < optLdm->startPosInBlock + || currPosInBlock >= optLdm->endPosInBlock +- || candidateMatchLength < MINMATCH) { ++ || candidateMatchLength < minMatch) { + return; + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -986,7 +1025,8 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + static void + ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + ZSTD_match_t* matches, U32* nbMatches, +- U32 currPosInBlock, U32 remainingBytes) ++ U32 currPosInBlock, U32 remainingBytes, ++ U32 minMatch) + { + if (optLdm->seqStore.size == 0 || optLdm->seqStore.pos >= optLdm->seqStore.size) { + return; +@@ -1003,7 +1043,7 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + } + ZSTD_opt_getNextMatchAndUpdateSeqStore(optLdm, currPosInBlock, remainingBytes); + } +- ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock); ++ ZSTD_optLdm_maybeAddMatch(matches, nbMatches, optLdm, currPosInBlock, minMatch); + } + + +@@ -1011,11 +1051,6 @@ ZSTD_optLdm_processMatchCandidate(ZSTD_optLdm_t* optLdm, + * Optimal parser + *********************************/ + +-static U32 ZSTD_totalLen(ZSTD_optimal_t sol) +-{ +- return sol.litlen + sol.mlen; +-} +- + #if 0 /* debug */ + + static void +@@ -1033,9 +1068,15 @@ listStats(const U32* table, int lastEltID) + + #endif + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, ++#define LIT_PRICE(_p) (int)ZSTD_rawLiteralsCost(_p, 1, optStatePtr, optLevel) ++#define LL_PRICE(_l) (int)ZSTD_litLengthPrice(_l, optStatePtr, optLevel) ++#define LL_INCPRICE(_l) (LL_PRICE(_l) - LL_PRICE(_l-1)) ++ ++FORCE_INLINE_TEMPLATE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ++ZSTD_compressBlock_opt_generic(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, + U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, + const int optLevel, +@@ -1059,9 +1100,11 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + ZSTD_optimal_t* const opt = optStatePtr->priceTable; + ZSTD_match_t* const matches = optStatePtr->matchTable; +- ZSTD_optimal_t lastSequence; ++ ZSTD_optimal_t lastStretch; + ZSTD_optLdm_t optLdm; + ++ 
ZSTD_memset(&lastStretch, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? *ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1082,103 +1125,140 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const ll0 = !litlen; + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, ip, iend, rep, ll0, minMatch); + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(ip-istart), (U32)(iend - ip)); +- if (!nbMatches) { ip++; continue; } ++ (U32)(ip-istart), (U32)(iend-ip), ++ minMatch); ++ if (!nbMatches) { ++ DEBUGLOG(8, "no match found at cPos %u", (unsigned)(ip-istart)); ++ ip++; ++ continue; ++ } ++ ++ /* Match found: let's store this solution, and eventually find more candidates. ++ * During this forward pass, @opt is used to store stretches, ++ * defined as "a match followed by N literals". ++ * Note how this is different from a Sequence, which is "N literals followed by a match". ++ * Storing stretches allows us to store different match predecessors ++ * for each literal position part of a literals run. */ + + /* initialize opt[0] */ +- { U32 i ; for (i=0; i immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { +- lastSequence.litlen = litlen; +- lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; +- DEBUGLOG(6, "large match (%u>%u), immediate encoding", ++ lastStretch.litlen = 0; ++ lastStretch.mlen = maxML; ++ lastStretch.off = maxOffBase; ++ DEBUGLOG(6, "large match (%u>%u) => immediate encoding", + maxML, sufficient_len); + cur = 0; +- last_pos = ZSTD_totalLen(lastSequence); ++ last_pos = maxML; + goto _shortestPath; + } } + + /* set prices for first matches starting position == 0 */ + assert(opt[0].price >= 0); +- { U32 const literalsPrice = (U32)opt[0].price + ZSTD_litLengthPrice(0, optStatePtr, optLevel); +- U32 pos; ++ { U32 pos; + U32 matchNb; + for (pos = 1; pos < minMatch; pos++) { +- opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ ++ opt[pos].price = ZSTD_MAX_PRICE; ++ opt[pos].mlen = 0; ++ opt[pos].litlen = litlen + pos; + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); +- U32 const sequencePrice = literalsPrice + matchPrice; ++ int const matchPrice = (int)ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); ++ int const sequencePrice = opt[0].price + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", + pos, ZSTD_fCost(sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; +- opt[pos].litlen = litlen; +- opt[pos].price = (int)sequencePrice; +- } } ++ opt[pos].off = offBase; ++ opt[pos].litlen = 0; /* end of match */ ++ opt[pos].price = sequencePrice + LL_PRICE(0); ++ } ++ } + last_pos = 
pos-1; ++ opt[pos].price = ZSTD_MAX_PRICE; + } + } + + /* check further positions */ + for (cur = 1; cur <= last_pos; cur++) { + const BYTE* const inr = ip + cur; +- assert(cur < ZSTD_OPT_NUM); +- DEBUGLOG(7, "cPos:%zi==rPos:%u", inr-istart, cur) ++ assert(cur <= ZSTD_OPT_NUM); ++ DEBUGLOG(7, "cPos:%i==rPos:%u", (int)(inr-istart), cur); + + /* Fix current position with one literal if cheaper */ +- { U32 const litlen = (opt[cur-1].mlen == 0) ? opt[cur-1].litlen + 1 : 1; ++ { U32 const litlen = opt[cur-1].litlen + 1; + int const price = opt[cur-1].price +- + (int)ZSTD_rawLiteralsCost(ip+cur-1, 1, optStatePtr, optLevel) +- + (int)ZSTD_litLengthPrice(litlen, optStatePtr, optLevel) +- - (int)ZSTD_litLengthPrice(litlen-1, optStatePtr, optLevel); ++ + LIT_PRICE(ip+cur-1) ++ + LL_INCPRICE(litlen); + assert(price < 1000000000); /* overflow check */ + if (price <= opt[cur].price) { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, ++ ZSTD_optimal_t const prevMatch = opt[cur]; ++ DEBUGLOG(7, "cPos:%i==rPos:%u : better price (%.2f<=%.2f) using literal (ll==%u) (hist:%u,%u,%u)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), litlen, + opt[cur-1].rep[0], opt[cur-1].rep[1], opt[cur-1].rep[2]); +- opt[cur].mlen = 0; +- opt[cur].off = 0; ++ opt[cur] = opt[cur-1]; + opt[cur].litlen = litlen; + opt[cur].price = price; ++ if ( (optLevel >= 1) /* additional check only for higher modes */ ++ && (prevMatch.litlen == 0) /* replace a match */ ++ && (LL_INCPRICE(1) < 0) /* ll1 is cheaper than ll0 */ ++ && LIKELY(ip + cur < iend) ++ ) { ++ /* check next position, in case it would be cheaper */ ++ int with1literal = prevMatch.price + LIT_PRICE(ip+cur) + LL_INCPRICE(1); ++ int withMoreLiterals = price + LIT_PRICE(ip+cur) + LL_INCPRICE(litlen+1); ++ DEBUGLOG(7, "then at next rPos %u : match+1lit %.2f vs %ulits %.2f", ++ cur+1, ZSTD_fCost(with1literal), litlen+1, ZSTD_fCost(withMoreLiterals)); ++ if ( (with1literal < withMoreLiterals) ++ && (with1literal < opt[cur+1].price) ) { ++ /* update offset history - before it disappears */ ++ U32 const prev = cur - prevMatch.mlen; ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, prevMatch.off, opt[prev].litlen==0); ++ assert(cur >= prevMatch.mlen); ++ DEBUGLOG(7, "==> match+1lit is cheaper (%.2f < %.2f) (hist:%u,%u,%u) !", ++ ZSTD_fCost(with1literal), ZSTD_fCost(withMoreLiterals), ++ newReps.rep[0], newReps.rep[1], newReps.rep[2] ); ++ opt[cur+1] = prevMatch; /* mlen & offbase */ ++ ZSTD_memcpy(opt[cur+1].rep, &newReps, sizeof(Repcodes_t)); ++ opt[cur+1].litlen = 1; ++ opt[cur+1].price = with1literal; ++ if (last_pos < cur+1) last_pos = cur+1; ++ } ++ } + } else { +- DEBUGLOG(7, "cPos:%zi==rPos:%u : literal would cost more (%.2f>%.2f) (hist:%u,%u,%u)", +- inr-istart, cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price), +- opt[cur].rep[0], opt[cur].rep[1], opt[cur].rep[2]); ++ DEBUGLOG(7, "cPos:%i==rPos:%u : literal would cost more (%.2f>%.2f)", ++ (int)(inr-istart), cur, ZSTD_fCost(price), ZSTD_fCost(opt[cur].price)); + } + } + +- /* Set the repcodes of the current position. We must do it here +- * because we rely on the repcodes of the 2nd to last sequence being +- * correct to set the next chunks repcodes during the backward +- * traversal. ++ /* Offset history is not updated during match comparison. ++ * Do it here, now that the match is selected and confirmed. 
+ */ +- ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(repcodes_t)); ++ ZSTD_STATIC_ASSERT(sizeof(opt[cur].rep) == sizeof(Repcodes_t)); + assert(cur >= opt[cur].mlen); +- if (opt[cur].mlen != 0) { ++ if (opt[cur].litlen == 0) { ++ /* just finished a match => alter offset history */ + U32 const prev = cur - opt[cur].mlen; +- repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[cur].litlen==0); +- ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(repcodes_t)); +- } else { +- ZSTD_memcpy(opt[cur].rep, opt[cur - 1].rep, sizeof(repcodes_t)); ++ Repcodes_t const newReps = ZSTD_newRep(opt[prev].rep, opt[cur].off, opt[prev].litlen==0); ++ ZSTD_memcpy(opt[cur].rep, &newReps, sizeof(Repcodes_t)); + } + + /* last match must start at a minimum distance of 8 from oend */ +@@ -1188,38 +1268,37 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + if ( (optLevel==0) /*static_test*/ + && (opt[cur+1].price <= opt[cur].price + (BITCOST_MULTIPLIER/2)) ) { +- DEBUGLOG(7, "move to next rPos:%u : price is <=", cur+1); ++ DEBUGLOG(7, "skip current position : next rPos(%u) price is cheaper", cur+1); + continue; /* skip unpromising positions; about ~+6% speed, -0.01 ratio */ + } + + assert(opt[cur].price >= 0); +- { U32 const ll0 = (opt[cur].mlen != 0); +- U32 const litlen = (opt[cur].mlen == 0) ? opt[cur].litlen : 0; +- U32 const previousPrice = (U32)opt[cur].price; +- U32 const basePrice = previousPrice + ZSTD_litLengthPrice(0, optStatePtr, optLevel); ++ { U32 const ll0 = (opt[cur].litlen == 0); ++ int const previousPrice = opt[cur].price; ++ int const basePrice = previousPrice + LL_PRICE(0); + U32 nbMatches = getAllMatches(matches, ms, &nextToUpdate3, inr, iend, opt[cur].rep, ll0, minMatch); + U32 matchNb; + + ZSTD_optLdm_processMatchCandidate(&optLdm, matches, &nbMatches, +- (U32)(inr-istart), (U32)(iend-inr)); ++ (U32)(inr-istart), (U32)(iend-inr), ++ minMatch); + + if (!nbMatches) { + DEBUGLOG(7, "rPos:%u : no match found", cur); + continue; + } + +- { U32 const maxML = matches[nbMatches-1].len; +- DEBUGLOG(7, "cPos:%zi==rPos:%u, found %u matches, of maxLength=%u", +- inr-istart, cur, nbMatches, maxML); +- +- if ( (maxML > sufficient_len) +- || (cur + maxML >= ZSTD_OPT_NUM) ) { +- lastSequence.mlen = maxML; +- lastSequence.off = matches[nbMatches-1].off; +- lastSequence.litlen = litlen; +- cur -= (opt[cur].mlen==0) ? opt[cur].litlen : 0; /* last sequence is actually only literals, fix cur to last match - note : may underflow, in which case, it's first sequence, and it's okay */ +- last_pos = cur + ZSTD_totalLen(lastSequence); +- if (cur > ZSTD_OPT_NUM) cur = 0; /* underflow => first match */ ++ { U32 const longestML = matches[nbMatches-1].len; ++ DEBUGLOG(7, "cPos:%i==rPos:%u, found %u matches, of longest ML=%u", ++ (int)(inr-istart), cur, nbMatches, longestML); ++ ++ if ( (longestML > sufficient_len) ++ || (cur + longestML >= ZSTD_OPT_NUM) ++ || (ip + cur + longestML >= iend) ) { ++ lastStretch.mlen = longestML; ++ lastStretch.off = matches[nbMatches-1].off; ++ lastStretch.litlen = 0; ++ last_pos = cur + longestML; + goto _shortestPath; + } } + +@@ -1230,20 +1309,25 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? 
matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", +- matchNb, matches[matchNb].off, lastML, litlen); ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", ++ matchNb, matches[matchNb].off, lastML, opt[cur].litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ + U32 const pos = cur + mlen; +- int const price = (int)basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); ++ int const price = basePrice + (int)ZSTD_getMatchPrice(offset, mlen, optStatePtr, optLevel); + + if ((pos > last_pos) || (price < opt[pos].price)) { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new better price (%.2f<%.2f)", + pos, mlen, ZSTD_fCost(price), ZSTD_fCost(opt[pos].price)); +- while (last_pos < pos) { opt[last_pos+1].price = ZSTD_MAX_PRICE; last_pos++; } /* fill empty positions */ ++ while (last_pos < pos) { ++ /* fill empty positions, for future comparisons */ ++ last_pos++; ++ opt[last_pos].price = ZSTD_MAX_PRICE; ++ opt[last_pos].litlen = !0; /* just needs to be != 0, to mean "not an end of match" */ ++ } + opt[pos].mlen = mlen; + opt[pos].off = offset; +- opt[pos].litlen = litlen; ++ opt[pos].litlen = 0; + opt[pos].price = price; + } else { + DEBUGLOG(7, "rPos:%u (ml=%2u) => new price is worse (%.2f>=%.2f)", +@@ -1251,55 +1335,89 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + if (optLevel==0) break; /* early update abort; gets ~+10% speed for about -0.01 ratio loss */ + } + } } } ++ opt[last_pos+1].price = ZSTD_MAX_PRICE; + } /* for (cur = 1; cur <= last_pos; cur++) */ + +- lastSequence = opt[last_pos]; +- cur = last_pos > ZSTD_totalLen(lastSequence) ? last_pos - ZSTD_totalLen(lastSequence) : 0; /* single sequence, and it starts before `ip` */ +- assert(cur < ZSTD_OPT_NUM); /* control overflow*/ ++ lastStretch = opt[last_pos]; ++ assert(cur >= lastStretch.mlen); ++ cur = last_pos - lastStretch.mlen; + + _shortestPath: /* cur, last_pos, best_mlen, best_off have to be set */ + assert(opt[0].mlen == 0); ++ assert(last_pos >= lastStretch.mlen); ++ assert(cur == last_pos - lastStretch.mlen); + +- /* Set the next chunk's repcodes based on the repcodes of the beginning +- * of the last match, and the last sequence. This avoids us having to +- * update them while traversing the sequences. +- */ +- if (lastSequence.mlen != 0) { +- repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastSequence.off, lastSequence.litlen==0); +- ZSTD_memcpy(rep, &reps, sizeof(reps)); ++ if (lastStretch.mlen==0) { ++ /* no solution : all matches have been converted into literals */ ++ assert(lastStretch.litlen == (ip - anchor) + last_pos); ++ ip += last_pos; ++ continue; ++ } ++ assert(lastStretch.off > 0); ++ ++ /* Update offset history */ ++ if (lastStretch.litlen == 0) { ++ /* finishing on a match : update offset history */ ++ Repcodes_t const reps = ZSTD_newRep(opt[cur].rep, lastStretch.off, opt[cur].litlen==0); ++ ZSTD_memcpy(rep, &reps, sizeof(Repcodes_t)); + } else { +- ZSTD_memcpy(rep, opt[cur].rep, sizeof(repcodes_t)); ++ ZSTD_memcpy(rep, lastStretch.rep, sizeof(Repcodes_t)); ++ assert(cur >= lastStretch.litlen); ++ cur -= lastStretch.litlen; + } + +- { U32 const storeEnd = cur + 1; ++ /* Let's write the shortest path solution. ++ * It is stored in @opt in reverse order, ++ * starting from @storeEnd (==cur+2), ++ * effectively partially @opt overwriting. 
++ * Content is changed too: ++ * - So far, @opt stored stretches, aka a match followed by literals ++ * - Now, it will store sequences, aka literals followed by a match ++ */ ++ { U32 const storeEnd = cur + 2; + U32 storeStart = storeEnd; +- U32 seqPos = cur; ++ U32 stretchPos = cur; + + DEBUGLOG(6, "start reverse traversal (last_pos:%u, cur:%u)", + last_pos, cur); (void)last_pos; +- assert(storeEnd < ZSTD_OPT_NUM); +- DEBUGLOG(6, "last sequence copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- storeEnd, lastSequence.litlen, lastSequence.mlen, lastSequence.off); +- opt[storeEnd] = lastSequence; +- while (seqPos > 0) { +- U32 const backDist = ZSTD_totalLen(opt[seqPos]); ++ assert(storeEnd < ZSTD_OPT_SIZE); ++ DEBUGLOG(6, "last stretch copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", ++ storeEnd, lastStretch.litlen, lastStretch.mlen, lastStretch.off); ++ if (lastStretch.litlen > 0) { ++ /* last "sequence" is unfinished: just a bunch of literals */ ++ opt[storeEnd].litlen = lastStretch.litlen; ++ opt[storeEnd].mlen = 0; ++ storeStart = storeEnd-1; ++ opt[storeStart] = lastStretch; ++ } else { ++ opt[storeEnd] = lastStretch; /* note: litlen will be fixed */ ++ storeStart = storeEnd; ++ } ++ while (1) { ++ ZSTD_optimal_t nextStretch = opt[stretchPos]; ++ opt[storeStart].litlen = nextStretch.litlen; ++ DEBUGLOG(6, "selected sequence (llen=%u,mlen=%u,ofc=%u)", ++ opt[storeStart].litlen, opt[storeStart].mlen, opt[storeStart].off); ++ if (nextStretch.mlen == 0) { ++ /* reaching beginning of segment */ ++ break; ++ } + storeStart--; +- DEBUGLOG(6, "sequence from rPos=%u copied into pos=%u (llen=%u,mlen=%u,ofc=%u)", +- seqPos, storeStart, opt[seqPos].litlen, opt[seqPos].mlen, opt[seqPos].off); +- opt[storeStart] = opt[seqPos]; +- seqPos = (seqPos > backDist) ? seqPos - backDist : 0; ++ opt[storeStart] = nextStretch; /* note: litlen will be fixed */ ++ assert(nextStretch.litlen + nextStretch.mlen <= stretchPos); ++ stretchPos -= nextStretch.litlen + nextStretch.mlen; + } + + /* save sequences */ +- DEBUGLOG(6, "sending selected sequences into seqStore") ++ DEBUGLOG(6, "sending selected sequences into seqStore"); + { U32 storePos; + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; +- DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", +- anchor - istart, (unsigned)llen, (unsigned)mlen); ++ DEBUGLOG(6, "considering seq starting at %i, llen=%u, mlen=%u", ++ (int)(anchor - istart), (unsigned)llen, (unsigned)mlen); + + if (mlen==0) { /* only literals => must be last "sequence", actually starting a new stream of sequences */ + assert(storePos == storeEnd); /* must be last sequence */ +@@ -1308,11 +1426,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } ++ DEBUGLOG(7, "new offset history : %u, %u, %u", rep[0], rep[1], rep[2]); ++ ++ /* update all costs */ + ZSTD_setBasePrices(optStatePtr, optLevel); + } + } /* while (ip < ilimit) */ +@@ -1320,42 +1441,51 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + /* Return the last literals size */ + return
(size_t)(iend - anchor); + } ++#endif /* build exclusions */ + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt0( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 0 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + static size_t ZSTD_compressBlock_opt2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize, const ZSTD_dictMode_e dictMode) + { + return ZSTD_compressBlock_opt_generic(ms, seqStore, rep, src, srcSize, 2 /* optLevel */, dictMode); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btopt"); + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + + + + ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. ++ * this function cannot error out, its narrow contract must be respected. + */ +-static void +-ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +- seqStore_t* seqStore, +- U32 rep[ZSTD_REP_NUM], +- const void* src, size_t srcSize) ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++void ZSTD_initStats_ultra(ZSTD_MatchState_t* ms, ++ SeqStore_t* seqStore, ++ U32 rep[ZSTD_REP_NUM], ++ const void* src, size_t srcSize) + { + U32 tmpRep[ZSTD_REP_NUM]; /* updated rep codes will sink here */ + ZSTD_memcpy(tmpRep, rep, sizeof(tmpRep)); +@@ -1368,7 +1498,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1378,7 +1508,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + } + + size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock_btultra (srcSize=%zu)", srcSize); +@@ -1386,16 +1516,16 @@ size_t ZSTD_compressBlock_btultra( + } + + size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. 
++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +@@ -1404,42 +1534,47 @@ size_t ZSTD_compressBlock_btultra2( + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } + + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_noDict); + } ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + +-size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); ++ return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { +- return ZSTD_compressBlock_opt0(ms, seqStore, rep, src, srcSize, ZSTD_extDict); ++ return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_dictMatchState); + } + + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + const void* src, size_t srcSize) + { + return ZSTD_compressBlock_opt2(ms, seqStore, rep, src, srcSize, ZSTD_extDict); + } ++#endif + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..fbdc540ec9d1 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,40 +12,62 @@ + #ifndef ZSTD_OPT_H + #define ZSTD_OPT_H + +- + #include "zstd_compress_internal.h" + ++#if !defined(ZSTD_EXCLUDE_BTLAZY2_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR) \ ++ || !defined(ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR) + /* used in ZSTD_loadDictionaryContent() */ +-void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend); ++void ZSTD_updateTree(ZSTD_MatchState_t* ms, const BYTE* ip, const BYTE* iend); ++#endif + ++#ifndef ZSTD_EXCLUDE_BTOPT_BLOCK_COMPRESSOR + size_t ZSTD_compressBlock_btopt( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_dictMatchState( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +-size_t ZSTD_compressBlock_btultra2( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++size_t ZSTD_compressBlock_btopt_extDict( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTOPT ZSTD_compressBlock_btopt ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE ZSTD_compressBlock_btopt_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT ZSTD_compressBlock_btopt_extDict ++#else ++#define ZSTD_COMPRESSBLOCK_BTOPT NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTOPT_EXTDICT NULL ++#endif + +-size_t ZSTD_compressBlock_btopt_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++#ifndef ZSTD_EXCLUDE_BTULTRA_BLOCK_COMPRESSOR ++size_t ZSTD_compressBlock_btultra( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_dictMatchState( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], +- void const* src, size_t srcSize); +- +-size_t ZSTD_compressBlock_btopt_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + size_t ZSTD_compressBlock_btultra_extDict( +- ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); + + /* note : no btultra2 variant for extDict nor dictMatchState, + * because btultra2 is not meant to work with dictionaries + * and is only specific for the first block (no prefix) */ ++size_t ZSTD_compressBlock_btultra2( ++ ZSTD_MatchState_t* ms, SeqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], ++ void const* src, size_t srcSize); + ++#define ZSTD_COMPRESSBLOCK_BTULTRA ZSTD_compressBlock_btultra ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE ZSTD_compressBlock_btultra_dictMatchState ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT ZSTD_compressBlock_btultra_extDict ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 ZSTD_compressBlock_btultra2 ++#else ++#define ZSTD_COMPRESSBLOCK_BTULTRA NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_DICTMATCHSTATE NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA_EXTDICT NULL ++#define ZSTD_COMPRESSBLOCK_BTULTRA2 NULL ++#endif + + #endif /* ZSTD_OPT_H */ +diff --git 
a/lib/zstd/compress/zstd_preSplit.c b/lib/zstd/compress/zstd_preSplit.c +new file mode 100644 +index 000000000000..7d9403c9a3bc +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.c +@@ -0,0 +1,239 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#include "../common/compiler.h" /* ZSTD_ALIGNOF */ ++#include "../common/mem.h" /* S64 */ ++#include "../common/zstd_deps.h" /* ZSTD_memset */ ++#include "../common/zstd_internal.h" /* ZSTD_STATIC_ASSERT */ ++#include "hist.h" /* HIST_add */ ++#include "zstd_preSplit.h" ++ ++ ++#define BLOCKSIZE_MIN 3500 ++#define THRESHOLD_PENALTY_RATE 16 ++#define THRESHOLD_BASE (THRESHOLD_PENALTY_RATE - 2) ++#define THRESHOLD_PENALTY 3 ++ ++#define HASHLENGTH 2 ++#define HASHLOG_MAX 10 ++#define HASHTABLESIZE (1 << HASHLOG_MAX) ++#define HASHMASK (HASHTABLESIZE - 1) ++#define KNUTH 0x9e3779b9 ++ ++/* for hashLog > 8, hash 2 bytes. ++ * for hashLog == 8, just take the byte, no hashing. ++ * The speed of this method relies on compile-time constant propagation */ ++FORCE_INLINE_TEMPLATE unsigned hash2(const void *p, unsigned hashLog) ++{ ++ assert(hashLog >= 8); ++ if (hashLog == 8) return (U32)((const BYTE*)p)[0]; ++ assert(hashLog <= HASHLOG_MAX); ++ return (U32)(MEM_read16(p)) * KNUTH >> (32 - hashLog); ++} ++ ++ ++typedef struct { ++ unsigned events[HASHTABLESIZE]; ++ size_t nbEvents; ++} Fingerprint; ++typedef struct { ++ Fingerprint pastEvents; ++ Fingerprint newEvents; ++} FPStats; ++ ++static void initStats(FPStats* fpstats) ++{ ++ ZSTD_memset(fpstats, 0, sizeof(FPStats)); ++} ++ ++FORCE_INLINE_TEMPLATE void ++addEvents_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ const char* p = (const char*)src; ++ size_t limit = srcSize - HASHLENGTH + 1; ++ size_t n; ++ assert(srcSize >= HASHLENGTH); ++ for (n = 0; n < limit; n+=samplingRate) { ++ fp->events[hash2(p+n, hashLog)]++; ++ } ++ fp->nbEvents += limit/samplingRate; ++} ++ ++FORCE_INLINE_TEMPLATE void ++recordFingerprint_generic(Fingerprint* fp, const void* src, size_t srcSize, size_t samplingRate, unsigned hashLog) ++{ ++ ZSTD_memset(fp, 0, sizeof(unsigned) * ((size_t)1 << hashLog)); ++ fp->nbEvents = 0; ++ addEvents_generic(fp, src, srcSize, samplingRate, hashLog); ++} ++ ++typedef void (*RecordEvents_f)(Fingerprint* fp, const void* src, size_t srcSize); ++ ++#define FP_RECORD(_rate) ZSTD_recordFingerprint_##_rate ++ ++#define ZSTD_GEN_RECORD_FINGERPRINT(_rate, _hSize) \ ++ static void FP_RECORD(_rate)(Fingerprint* fp, const void* src, size_t srcSize) \ ++ { \ ++ recordFingerprint_generic(fp, src, srcSize, _rate, _hSize); \ ++ } ++ ++ZSTD_GEN_RECORD_FINGERPRINT(1, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(5, 10) ++ZSTD_GEN_RECORD_FINGERPRINT(11, 9) ++ZSTD_GEN_RECORD_FINGERPRINT(43, 8) ++ ++ ++static U64 abs64(S64 s64) { return (U64)((s64 < 0) ? 
-s64 : s64); } ++ ++static U64 fpDistance(const Fingerprint* fp1, const Fingerprint* fp2, unsigned hashLog) ++{ ++ U64 distance = 0; ++ size_t n; ++ assert(hashLog <= HASHLOG_MAX); ++ for (n = 0; n < ((size_t)1 << hashLog); n++) { ++ distance += ++ abs64((S64)fp1->events[n] * (S64)fp2->nbEvents - (S64)fp2->events[n] * (S64)fp1->nbEvents); ++ } ++ return distance; ++} ++ ++/* Compare newEvents with pastEvents ++ * return 1 when considered "too different" ++ */ ++static int compareFingerprints(const Fingerprint* ref, ++ const Fingerprint* newfp, ++ int penalty, ++ unsigned hashLog) ++{ ++ assert(ref->nbEvents > 0); ++ assert(newfp->nbEvents > 0); ++ { U64 p50 = (U64)ref->nbEvents * (U64)newfp->nbEvents; ++ U64 deviation = fpDistance(ref, newfp, hashLog); ++ U64 threshold = p50 * (U64)(THRESHOLD_BASE + penalty) / THRESHOLD_PENALTY_RATE; ++ return deviation >= threshold; ++ } ++} ++ ++static void mergeEvents(Fingerprint* acc, const Fingerprint* newfp) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ acc->events[n] += newfp->events[n]; ++ } ++ acc->nbEvents += newfp->nbEvents; ++} ++ ++static void flushEvents(FPStats* fpstats) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ fpstats->pastEvents.events[n] = fpstats->newEvents.events[n]; ++ } ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents; ++ ZSTD_memset(&fpstats->newEvents, 0, sizeof(fpstats->newEvents)); ++} ++ ++static void removeEvents(Fingerprint* acc, const Fingerprint* slice) ++{ ++ size_t n; ++ for (n = 0; n < HASHTABLESIZE; n++) { ++ assert(acc->events[n] >= slice->events[n]); ++ acc->events[n] -= slice->events[n]; ++ } ++ acc->nbEvents -= slice->nbEvents; ++} ++ ++#define CHUNKSIZE (8 << 10) ++static size_t ZSTD_splitBlock_byChunks(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ static const RecordEvents_f records_fs[] = { ++ FP_RECORD(43), FP_RECORD(11), FP_RECORD(5), FP_RECORD(1) ++ }; ++ static const unsigned hashParams[] = { 8, 9, 10, 10 }; ++ const RecordEvents_f record_f = (assert(0<=level && level<=3), records_fs[level]); ++ FPStats* const fpstats = (FPStats*)workspace; ++ const char* p = (const char*)blockStart; ++ int penalty = THRESHOLD_PENALTY; ++ size_t pos = 0; ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ record_f(&fpstats->pastEvents, p, CHUNKSIZE); ++ for (pos = CHUNKSIZE; pos <= blockSize - CHUNKSIZE; pos += CHUNKSIZE) { ++ record_f(&fpstats->newEvents, p + pos, CHUNKSIZE); ++ if (compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, penalty, hashParams[level])) { ++ return pos; ++ } else { ++ mergeEvents(&fpstats->pastEvents, &fpstats->newEvents); ++ if (penalty > 0) penalty--; ++ } ++ } ++ assert(pos == blockSize); ++ return blockSize; ++ (void)flushEvents; (void)removeEvents; ++} ++ ++/* ZSTD_splitBlock_fromBorders(): very fast strategy : ++ * compare fingerprint from beginning and end of the block, ++ * derive from their difference if it's preferable to split in the middle, ++ * repeat the process a second time, for finer grained decision. ++ * 3 times did not brought improvements, so I stopped at 2. ++ * Benefits are good enough for a cheap heuristic. ++ * More accurate splitting saves more, but speed impact is also more perceptible. 
++ * For better accuracy, use more elaborate variant *_byChunks. ++ */ ++static size_t ZSTD_splitBlock_fromBorders(const void* blockStart, size_t blockSize, ++ void* workspace, size_t wkspSize) ++{ ++#define SEGMENT_SIZE 512 ++ FPStats* const fpstats = (FPStats*)workspace; ++ Fingerprint* middleEvents = (Fingerprint*)(void*)((char*)workspace + 512 * sizeof(unsigned)); ++ assert(blockSize == (128 << 10)); ++ assert(workspace != NULL); ++ assert((size_t)workspace % ZSTD_ALIGNOF(FPStats) == 0); ++ ZSTD_STATIC_ASSERT(ZSTD_SLIPBLOCK_WORKSPACESIZE >= sizeof(FPStats)); ++ assert(wkspSize >= sizeof(FPStats)); (void)wkspSize; ++ ++ initStats(fpstats); ++ HIST_add(fpstats->pastEvents.events, blockStart, SEGMENT_SIZE); ++ HIST_add(fpstats->newEvents.events, (const char*)blockStart + blockSize - SEGMENT_SIZE, SEGMENT_SIZE); ++ fpstats->pastEvents.nbEvents = fpstats->newEvents.nbEvents = SEGMENT_SIZE; ++ if (!compareFingerprints(&fpstats->pastEvents, &fpstats->newEvents, 0, 8)) ++ return blockSize; ++ ++ HIST_add(middleEvents->events, (const char*)blockStart + blockSize/2 - SEGMENT_SIZE/2, SEGMENT_SIZE); ++ middleEvents->nbEvents = SEGMENT_SIZE; ++ { U64 const distFromBegin = fpDistance(&fpstats->pastEvents, middleEvents, 8); ++ U64 const distFromEnd = fpDistance(&fpstats->newEvents, middleEvents, 8); ++ U64 const minDistance = SEGMENT_SIZE * SEGMENT_SIZE / 3; ++ if (abs64((S64)distFromBegin - (S64)distFromEnd) < minDistance) ++ return 64 KB; ++ return (distFromBegin > distFromEnd) ? 32 KB : 96 KB; ++ } ++} ++ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize) ++{ ++ DEBUGLOG(6, "ZSTD_splitBlock (level=%i)", level); ++ assert(0<=level && level<=4); ++ if (level == 0) ++ return ZSTD_splitBlock_fromBorders(blockStart, blockSize, workspace, wkspSize); ++ /* level >= 1*/ ++ return ZSTD_splitBlock_byChunks(blockStart, blockSize, level-1, workspace, wkspSize); ++} +diff --git a/lib/zstd/compress/zstd_preSplit.h b/lib/zstd/compress/zstd_preSplit.h +new file mode 100644 +index 000000000000..f98f797fe191 +--- /dev/null ++++ b/lib/zstd/compress/zstd_preSplit.h +@@ -0,0 +1,34 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. ++ */ ++ ++#ifndef ZSTD_PRESPLIT_H ++#define ZSTD_PRESPLIT_H ++ ++#include /* size_t */ ++ ++#define ZSTD_SLIPBLOCK_WORKSPACESIZE 8208 ++ ++/* ZSTD_splitBlock(): ++ * @level must be a value between 0 and 4. ++ * higher levels spend more energy to detect block boundaries. ++ * @workspace must be aligned for size_t. ++ * @wkspSize must be at least >= ZSTD_SLIPBLOCK_WORKSPACESIZE ++ * note: ++ * For the time being, this function only accepts full 128 KB blocks. ++ * Therefore, @blockSize must be == 128 KB. ++ * While this could be extended to smaller sizes in the future, ++ * it is not yet clear if this would be useful. TBD. 
++ */ ++size_t ZSTD_splitBlock(const void* blockStart, size_t blockSize, ++ int level, ++ void* workspace, size_t wkspSize); ++ ++#endif /* ZSTD_PRESPLIT_H */ +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..ac8b87f48f84 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. 
++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,43 +144,66 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilowest [in] - The beginning of the valid range of the input. Decoders may read ++ * down to this pointer. It may be below iend[0]. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilowest, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; + U64 bits[4]; + void const* dt; +- BYTE const* ilimit; ++ BYTE const* ilowest; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. 
+- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; + +- const BYTE* const ilimit = (const BYTE*)src + 6 + 8; ++ const BYTE* const istart = (const BYTE*)src; + +- BYTE* const oend = (BYTE*)dst + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; ++ ++ /* Avoid nullptr addition */ ++ if (dstSize == 0) ++ return 0; ++ assert(dst != NULL); + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,11 +214,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +- const BYTE* const istart = (const BYTE*)src; + size_t const length1 = MEM_readLE16(istart); + size_t const length2 = MEM_readLE16(istart+2); + size_t const length3 = MEM_readLE16(istart+4); +@@ -195,13 +227,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. +- * length1 must be >= 16 so that ip[0] >= ilimit before the loop +- * starts. + */ +- if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ if (length1 < 8 || length2 < 8 || length3 < 8 || length4 < 8) ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. */ +@@ -218,7 +248,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,24 +257,25 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); +- +- /* If ip[] >= ilimit, it is guaranteed to be safe to +- * reload bits[]. It may be beyond its section, but is +- * guaranteed to be valid (>= istart). 
+- */ +- args->ilimit = ilimit; ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); ++ ++ /* The decoders must be sure to never read beyond ilowest. ++ * This is lower than iend[0], but allowing decoders to read ++ * down to ilowest can allow an extra iteration or two in the ++ * fast loop. ++ */ ++ args->ilowest = istart; + + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +289,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); +- bit->start = (const char*)args->iend[0]; ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); ++ bit->start = (const char*)args->ilowest; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ do { \ ++ X(0); \ ++ X(1); \ ++ X(2); \ ++ X(3); \ ++ } while (0) ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ do { \ ++ X(0, (var)); \ ++ X(1, (var)); \ ++ X(2, (var)); \ ++ X(3, (var)); \ ++ } while (0) + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +332,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +379,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +394,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +421,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +449,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. + */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; w<tableLog+1; ++w) { + int const symbolCount = wksp->rankVal[w]; + int const length = (1 << w) >> 1; +@@ -483,15 +525,19 @@ HUF_decodeSymbolX1(BIT_DStream_t* Dstream, const HUF_DEltX1* dt, const U32 dtLog + } + + #define HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) \ +- *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog) ++ do { *ptr++ = HUF_decodeSymbolX1(DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr) ++#define HUF_DECODE_SYMBOLX1_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ HUF_DECODE_SYMBOLX1_0(ptr, DStreamPtr); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, const HUF_DEltX1* const dt, const U32 dtLog) +@@ -519,7 +565,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -529,7 +575,7 @@ HUF_decompress1X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + BYTE* op = (BYTE*)dst; +- BYTE* const oend = op + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(op, dstSize); + const void* dtPtr = DTable + 1; + const HUF_DEltX1* const dt = (const HUF_DEltX1*)dtPtr; + BIT_DStream_t bitD; +@@ -545,6 +591,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -553,6 +603,7 @@ HUF_decompress4X1_usingDTable_internal_body( + const HUF_DTable* DTable) + { + /* Check */ + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -588,6 +639,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize)
return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6); /* validated above */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,52 +702,173 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilowest) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ do { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } while (0) ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ do { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4); ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS ++_out: ++ ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ BYTE const* const ilowest = (BYTE const*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + +- /* Our loop guarantees that ip[] >= ilimit and that we haven't ++ /* Our loop guarantees that ip[] >= ilowest and that we haven't + * overwritten any op[]. 
+ */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bit streams one by one. */ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +885,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} + +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; +- +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- 
return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1120,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1175,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1197,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1159,15 +1287,19 @@ HUF_decodeLastSymbolX2(void* op, BIT_DStream_t* DStream, const HUF_DEltX2* dt, c + } + + #define HUF_DECODE_SYMBOLX2_0(ptr, DStreamPtr) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++ do { ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); } while (0) + +-#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ +- if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_1(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits() || (HUF_TABLELOG_MAX<=12)) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + +-#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ +- if (MEM_64bits()) \ +- ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog) ++#define HUF_DECODE_SYMBOLX2_2(ptr, DStreamPtr) \ ++ do { \ ++ if (MEM_64bits()) \ ++ ptr += HUF_decodeSymbolX2(ptr, DStreamPtr, dt, dtLog); \ ++ } while (0) + + HINT_INLINE size_t + HUF_decodeStreamX2(BYTE* p, BIT_DStream_t* bitDPtr, BYTE* const pEnd, +@@ -1227,7 +1359,7 @@ HUF_decompress1X2_usingDTable_internal_body( + + /* decode */ + { BYTE* const ostart = (BYTE*) dst; +- BYTE* const oend = ostart + dstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, dstSize); + const void* const dtPtr = DTable+1; /* force compiler to not use strict-aliasing */ + const HUF_DEltX2* const dt = (const HUF_DEltX2*)dtPtr; + DTableDesc const dtd = HUF_getDTableDesc(DTable); +@@ -1240,6 +1372,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1247,6 +1384,7 @@ HUF_decompress4X2_usingDTable_internal_body( + const HUF_DTable* DTable) + { + if (cSrcSize < 10) return ERROR(corruption_detected); /* strict minimum : jump table + 1 byte per stream */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + + { const BYTE* const istart = (const BYTE*) cSrc; + BYTE* const ostart = (BYTE*) dst; +@@ -1280,8 +1418,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ assert(dstSize >= 6 /* validated above */); + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,44 +1505,191 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* 
DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilowest = args->ilowest; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilowest); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilowest) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. ++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop once we reach the end. */ ++ if (op[3] == olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. 
++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ do { \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } \ ++ } while (0) ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ do { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1); \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } while (0) ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0); ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1); ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM); ++ } while (op[3] < olimit); ++ } ++ ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; +- const BYTE* const iend = (const BYTE*)cSrc + 6; +- BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ const BYTE* const ilowest = (const BYTE*)cSrc; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd((BYTE*)dst, dstSize); ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + +- assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ assert(args.ip[0] >= args.ilowest); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ +- assert(args.ip[0] >= iend); +- assert(args.ip[1] >= iend); +- assert(args.ip[2] >= iend); +- assert(args.ip[3] >= iend); ++ assert(args.ip[0] >= ilowest); ++ assert(args.ip[1] >= ilowest); ++ assert(args.ip[2] >= ilowest); ++ assert(args.ip[3] >= ilowest); + assert(args.op[3] <= oend); +- (void)iend; ++ ++ assert(ilowest == args.ilowest); ++ assert(ilowest + 6 == args.iend[0]); ++ (void)ilowest; + + /* finish bitStreams one by one */ + { +@@ -1426,91 +1712,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, 
dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1785,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1839,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1854,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1928,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..da8b4cf116e3 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -53,13 +54,15 @@ + * Dependencies + *********************************************************/ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ ++#include "../common/error_private.h" ++#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "../common/mem.h" /* low level memory routines */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ +-#include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ +@@ -72,11 +75,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +240,8 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; ++ dctx->maxBlockSizeParam = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -253,6 +258,7 @@ static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; + dctx->oversizedDuration = 0; ++ dctx->isFrameDecompression = 1; + #if DYNAMIC_BMI2 + dctx->bmi2 = ZSTD_cpuSupportsBmi2(); + #endif +@@ -421,16 +427,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) ++** or an error code, which can be tested using ZSTD_isError() */ ++size_t ZSTD_getFrameHeader_advanced(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, 
"ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -438,8 +468,10 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + if (srcSize < ZSTD_SKIPPABLEHEADERSIZE) + return ZSTD_SKIPPABLEHEADERSIZE; /* magic number + frame length */ + ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); +- zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + zfhPtr->frameType = ZSTD_skippableFrame; ++ zfhPtr->dictID = MEM_readLE32(src) - ZSTD_MAGIC_SKIPPABLE_START; ++ zfhPtr->headerSize = ZSTD_SKIPPABLEHEADERSIZE; ++ zfhPtr->frameContentSize = MEM_readLE32((const char *)src + ZSTD_FRAMEIDSIZE); + return 0; + } + RETURN_ERROR(prefix_unknown, ""); +@@ -508,7 +540,7 @@ size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, s + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, + * or an error code, which can be tested using ZSTD_isError() */ +-size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize) ++size_t ZSTD_getFrameHeader(ZSTD_FrameHeader* zfhPtr, const void* src, size_t srcSize) + { + return ZSTD_getFrameHeader_advanced(zfhPtr, src, srcSize, ZSTD_f_zstd1); + } +@@ -520,7 +552,7 @@ size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t src + * - ZSTD_CONTENTSIZE_ERROR if an error occurred (e.g. 
invalid magic number, srcSize too small) */ + unsigned long long ZSTD_getFrameContentSize(const void *src, size_t srcSize) + { +- { ZSTD_frameHeader zfh; ++ { ZSTD_FrameHeader zfh; + if (ZSTD_getFrameHeader(&zfh, src, srcSize) != 0) + return ZSTD_CONTENTSIZE_ERROR; + if (zfh.frameType == ZSTD_skippableFrame) { +@@ -540,61 +572,62 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. + */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible 
with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { +- unsigned long long totalDstSize = 0; ++ U64 totalDstSize = 0; + + while (srcSize >= ZSTD_startingInputLength(ZSTD_f_zstd1)) { + U32 const magicNumber = MEM_readLE32(src); + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +635,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (U64_MAX - totalDstSize < fcs) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -676,13 +709,13 @@ static ZSTD_frameSizeInfo ZSTD_errorFrameSizeInfo(size_t ret) + return frameSizeInfo; + } + +-static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize) ++static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize, ZSTD_format_e format) + { + ZSTD_frameSizeInfo frameSizeInfo; + ZSTD_memset(&frameSizeInfo, 0, sizeof(ZSTD_frameSizeInfo)); + + +- if ((srcSize >= ZSTD_SKIPPABLEHEADERSIZE) ++ if (format == ZSTD_f_zstd1 && (srcSize >= ZSTD_SKIPPABLEHEADERSIZE) + && (MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + frameSizeInfo.compressedSize = readSkippableFrameSize(src, srcSize); + assert(ZSTD_isError(frameSizeInfo.compressedSize) || +@@ -693,10 +726,10 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + const BYTE* const ipstart = ip; + size_t remainingSize = srcSize; + size_t nbBlocks = 0; +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + + /* Extract Frame Header */ +- { size_t const ret = ZSTD_getFrameHeader(&zfh, src, srcSize); ++ { size_t const ret = ZSTD_getFrameHeader_advanced(&zfh, src, srcSize, format); + if (ZSTD_isError(ret)) + return ZSTD_errorFrameSizeInfo(ret); + if (ret > 0) +@@ -730,28 +763,31 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? 
zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } + ++static size_t ZSTD_findFrameCompressedSize_advanced(const void *src, size_t srcSize, ZSTD_format_e format) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, format); ++ return frameSizeInfo.compressedSize; ++} ++ + /* ZSTD_findFrameCompressedSize() : +- * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame, ZSTD legacy frame, or skippable frame +- * `srcSize` must be at least as large as the frame contained +- * @return : the compressed size of the frame starting at `src` */ ++ * See docs in zstd.h ++ * Note: compatible with legacy mode */ + size_t ZSTD_findFrameCompressedSize(const void *src, size_t srcSize) + { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); +- return frameSizeInfo.compressedSize; ++ return ZSTD_findFrameCompressedSize_advanced(src, srcSize, ZSTD_f_zstd1); + } + + /* ZSTD_decompressBound() : + * compatible with legacy mode +- * `src` must point to the start of a ZSTD frame or a skippeable frame ++ * `src` must point to the start of a ZSTD frame or a skippable frame + * `srcSize` must be at least as large as the frame contained + * @return : the maximum decompressed size of the compressed source + */ +@@ -760,7 +796,7 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + unsigned long long bound = 0; + /* Iterate over each frame */ + while (srcSize > 0) { +- ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); + size_t const compressedSize = frameSizeInfo.compressedSize; + unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; + if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) +@@ -773,6 +809,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize, ZSTD_f_zstd1); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_FrameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -815,7 +893,7 @@ static size_t ZSTD_setRleBlock(void* dst, size_t dstCapacity, + return regenSize; + } + +-static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, unsigned streaming) ++static void ZSTD_DCtx_trace_end(ZSTD_DCtx const* dctx, U64 uncompressedSize, U64 compressedSize, int streaming) + { + (void)dctx; + (void)uncompressedSize; +@@ -856,6 +934,10 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + ip += frameHeaderSize; remainingSrcSize -= frameHeaderSize; + } + ++ /* Shrink the blockSizeMax if enabled */ ++ if (dctx->maxBlockSizeParam != 0) ++ dctx->fParams.blockSizeMax = MIN(dctx->fParams.blockSizeMax, (unsigned)dctx->maxBlockSizeParam); ++ + /* Loop on each block */ + while (1) { + BYTE* oBlockEnd = oend; +@@ -888,7 +970,8 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + switch(blockProperties.blockType) + { + case bt_compressed: +- decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, /* frame */ 1, not_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ decodedSize = ZSTD_decompressBlock_internal(dctx, op, (size_t)(oBlockEnd-op), ip, cBlockSize, not_streaming); + break; + case bt_raw : + /* Use oend instead of oBlockEnd because this function is safe to overlap. It uses memmove. */ +@@ -901,12 +984,14 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + default: + RETURN_ERROR(corruption_detected, "invalid block type"); + } +- +- if (ZSTD_isError(decodedSize)) return decodedSize; +- if (dctx->validateChecksum) ++ FORWARD_IF_ERROR(decodedSize, "Block decompression failure"); ++ DEBUGLOG(5, "Decompressed block of dSize = %u", (unsigned)decodedSize); ++ if (dctx->validateChecksum) { + xxh64_update(&dctx->xxhState, op, decodedSize); +- if (decodedSize != 0) ++ } ++ if (decodedSize) /* support dst = NULL,0 */ { + op += decodedSize; ++ } + assert(ip != NULL); + ip += cBlockSize; + remainingSrcSize -= cBlockSize; +@@ -930,12 +1015,15 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %i, consuming %i bytes of input", (int)(op-ostart), (int)(ip - (const BYTE*)*srcPtr)); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); + } + +-static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, ++static ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR ++size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + const void* dict, size_t dictSize, +@@ -955,17 +1043,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (dctx->format == ZSTD_f_zstd1 && srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ 
FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1150,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. + * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1181,7 +1270,8 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + { + case bt_compressed: + DEBUGLOG(5, "ZSTD_decompressContinue: case bt_compressed"); +- rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 1, is_streaming); ++ assert(dctx->isFrameDecompression == 1); ++ rSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, is_streaming); + dctx->expected = 0; /* Streaming not supported */ + break; + case bt_raw : +@@ -1250,6 +1340,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + case ZSTDds_decodeSkippableHeader: + assert(src != NULL); + assert(srcSize <= ZSTD_SKIPPABLEHEADERSIZE); ++ assert(dctx->format != ZSTD_f_zstd1_magicless); + ZSTD_memcpy(dctx->headerBuffer + (ZSTD_SKIPPABLEHEADERSIZE - srcSize), src, srcSize); /* complete skippable header */ + dctx->expected = MEM_readLE32(dctx->headerBuffer + ZSTD_FRAMEIDSIZE); /* note : dctx->expected can grow seriously large, beyond local buffer size */ + dctx->stage = ZSTDds_skipFrame; +@@ -1262,7 +1353,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1394,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,10 +1494,11 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; ++ dctx->isFrameDecompression = 1; + ZSTD_STATIC_ASSERT(sizeof(dctx->entropy.rep) == sizeof(repStartValue)); + ZSTD_memcpy(dctx->entropy.rep, 
repStartValue, sizeof(repStartValue)); /* initial repcodes */ + dctx->LLTptr = dctx->entropy.LLTable; +@@ -1465,7 +1557,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1566,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. */ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_FrameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1673,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1683,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1694,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1766,15 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ case ZSTD_d_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1815,12 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ *value = dctx->maxBlockSizeParam; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1854,14 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case 
ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; ++ case ZSTD_d_maxBlockSize: ++ if (value != 0) CHECK_DBOUNDS(ZSTD_d_maxBlockSize, value); ++ dctx->maxBlockSizeParam = value; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1754,6 +1873,7 @@ size_t ZSTD_DCtx_reset(ZSTD_DCtx* dctx, ZSTD_ResetDirective reset) + || (reset == ZSTD_reset_session_and_parameters) ) { + dctx->streamStage = zdss_init; + dctx->noForwardProgress = 0; ++ dctx->isFrameDecompression = 1; + } + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { +@@ -1770,11 +1890,17 @@ size_t ZSTD_sizeof_DStream(const ZSTD_DStream* dctx) + return ZSTD_sizeof_DCtx(dctx); + } + +-size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++static size_t ZSTD_decodingBufferSize_internal(unsigned long long windowSize, unsigned long long frameContentSize, size_t blockSizeMax) + { +- size_t const blockSize = (size_t) MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +- /* space is needed to store the litbuffer after the output of a given block without stomping the extDict of a previous run, as well as to cover both windows against wildcopy*/ +- unsigned long long const neededRBSize = windowSize + blockSize + ZSTD_BLOCKSIZE_MAX + (WILDCOPY_OVERLENGTH * 2); ++ size_t const blockSize = MIN((size_t)MIN(windowSize, ZSTD_BLOCKSIZE_MAX), blockSizeMax); ++ /* We need blockSize + WILDCOPY_OVERLENGTH worth of buffer so that if a block ++ * ends at windowSize + WILDCOPY_OVERLENGTH + 1 bytes, we can start writing ++ * the block at the beginning of the output buffer, and maintain a full window. ++ * ++ * We need another blockSize worth of buffer so that we can store split ++ * literals at the end of the block without overwriting the extDict window. 
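A minimal caller-side sketch of the two decompression parameters wired up above (ZSTD_d_maxBlockSize and ZSTD_d_disableHuffmanAssembly). It is not part of the patch: the driver function, the 64 KiB target and the omitted error handling are illustrative assumptions; only the ZSTD_* identifiers are taken from the patched lib/zstd sources, which are assumed to be visible to the caller.

    /* Clamp the decoder's per-block budget and force the portable Huffman path. */
    static void demo_tune_dctx(ZSTD_DCtx *dctx)
    {
        ZSTD_bounds const b = ZSTD_dParam_getBounds(ZSTD_d_maxBlockSize);
        int blockLimit = 64 * 1024;                    /* hypothetical 64 KiB target */

        if (!ZSTD_isError(b.error)) {
            if (blockLimit < b.lowerBound) blockLimit = b.lowerBound;
            if (blockLimit > b.upperBound) blockLimit = b.upperBound;
        }
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_maxBlockSize, blockLimit);          /* 0 would mean "no limit" */
        ZSTD_DCtx_setParameter(dctx, ZSTD_d_disableHuffmanAssembly, 1);
    }

Setting ZSTD_d_maxBlockSize also shrinks the streaming output buffer, since ZSTD_decompressStream() below clamps fParams.blockSizeMax before sizing its buffers.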
++ */ ++ unsigned long long const neededRBSize = windowSize + (blockSize * 2) + (WILDCOPY_OVERLENGTH * 2); + unsigned long long const neededSize = MIN(frameContentSize, neededRBSize); + size_t const minRBSize = (size_t) neededSize; + RETURN_ERROR_IF((unsigned long long)minRBSize != neededSize, +@@ -1782,6 +1908,11 @@ size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long + return minRBSize; + } + ++size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize) ++{ ++ return ZSTD_decodingBufferSize_internal(windowSize, frameContentSize, ZSTD_BLOCKSIZE_MAX); ++} ++ + size_t ZSTD_estimateDStreamSize(size_t windowSize) + { + size_t const blockSize = MIN(windowSize, ZSTD_BLOCKSIZE_MAX); +@@ -1793,7 +1924,7 @@ size_t ZSTD_estimateDStreamSize(size_t windowSize) + size_t ZSTD_estimateDStreamSize_fromFrame(const void* src, size_t srcSize) + { + U32 const windowSizeMax = 1U << ZSTD_WINDOWLOG_MAX; /* note : should be user-selectable, but requires an additional parameter (or a dctx) */ +- ZSTD_frameHeader zfh; ++ ZSTD_FrameHeader zfh; + size_t const err = ZSTD_getFrameHeader(&zfh, src, srcSize); + if (ZSTD_isError(err)) return err; + RETURN_ERROR_IF(err>0, srcSize_wrong, ""); +@@ -1888,6 +2019,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + U32 someMoreWork = 1; + + DEBUGLOG(5, "ZSTD_decompressStream"); ++ assert(zds != NULL); + RETURN_ERROR_IF( + input->pos > input->size, + srcSize_wrong, +@@ -1918,7 +2050,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2063,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1943,14 +2079,15 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->fParams.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN + && zds->fParams.frameType != ZSTD_skippableFrame + && (U64)(size_t)(oend-op) >= zds->fParams.frameContentSize) { +- size_t const cSize = ZSTD_findFrameCompressedSize(istart, (size_t)(iend-istart)); ++ size_t const cSize = ZSTD_findFrameCompressedSize_advanced(istart, (size_t)(iend-istart), zds->format); + if (cSize <= (size_t)(iend-istart)) { + /* shortcut : using single-pass mode */ + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; +- DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()"); ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? 
op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -1969,7 +2106,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + DEBUGLOG(4, "Consume header"); + FORWARD_IF_ERROR(ZSTD_decompressBegin_usingDDict(zds, ZSTD_getDDict(zds)), ""); + +- if ((MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ ++ if (zds->format == ZSTD_f_zstd1 ++ && (MEM_readLE32(zds->headerBuffer) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { /* skippable frame */ + zds->expected = MEM_readLE32(zds->headerBuffer + ZSTD_FRAMEIDSIZE); + zds->stage = ZSTDds_skipFrame; + } else { +@@ -1985,11 +2123,13 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->fParams.windowSize = MAX(zds->fParams.windowSize, 1U << ZSTD_WINDOWLOG_ABSOLUTEMIN); + RETURN_ERROR_IF(zds->fParams.windowSize > zds->maxWindowSize, + frameParameter_windowTooLarge, ""); ++ if (zds->maxBlockSizeParam != 0) ++ zds->fParams.blockSizeMax = MIN(zds->fParams.blockSizeMax, (unsigned)zds->maxBlockSizeParam); + + /* Adapt buffer sizes to frame header instructions */ + { size_t const neededInBuffSize = MAX(zds->fParams.blockSizeMax, 4 /* frame checksum */); + size_t const neededOutBuffSize = zds->outBufferMode == ZSTD_bm_buffered +- ? ZSTD_decodingBufferSize_min(zds->fParams.windowSize, zds->fParams.frameContentSize) ++ ? ZSTD_decodingBufferSize_internal(zds->fParams.windowSize, zds->fParams.frameContentSize, zds->fParams.blockSizeMax) + : 0; + + ZSTD_DCtx_updateOversizedDuration(zds, neededInBuffSize, neededOutBuffSize); +@@ -2034,6 +2174,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2189,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2198,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2212,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2236,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2249,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2287,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..710eb0ffd5a3 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
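A usage sketch for ZSTD_decompressStream_simpleArgs(), rewritten above; not part of the patch. The loop shape, buffer names and the "decompress everything" policy are assumptions, while the function, ZSTD_isError() and the new noForwardProgress_* error codes come from the patched sources.

    /* Feed one input buffer through the decoder; returns bytes written or an error code. */
    static size_t demo_decompress_all(ZSTD_DCtx *dctx,
                                      void *dst, size_t dstCapacity,
                                      const void *src, size_t srcSize)
    {
        size_t dstPos = 0, srcPos = 0;

        while (srcPos < srcSize) {
            size_t const ret = ZSTD_decompressStream_simpleArgs(dctx,
                                    dst, dstCapacity, &dstPos,
                                    src, srcSize, &srcPos);
            if (ZSTD_isError(ret))
                return ret;   /* includes noForwardProgress_destFull / noForwardProgress_inputEmpty */
            /* ret == 0 means a frame just completed; keep looping for concatenated frames */
        }
        return dstPos;
    }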
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -51,6 +52,13 @@ static void ZSTD_copy4(void* dst, const void* src) { ZSTD_memcpy(dst, src, 4); } + * Block decoding + ***************************************************************/ + ++static size_t ZSTD_blockSizeMax(ZSTD_DCtx const* dctx) ++{ ++ size_t const blockSizeMax = dctx->isFrameDecompression ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX; ++ assert(blockSizeMax <= ZSTD_BLOCKSIZE_MAX); ++ return blockSizeMax; ++} ++ + /*! ZSTD_getcBlockSize() : + * Provides the size of compressed block from block header `src` */ + size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, +@@ -73,41 +81,49 @@ size_t ZSTD_getcBlockSize(const void* src, size_t srcSize, + static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const size_t dstCapacity, const size_t litSize, + const streaming_operation streaming, const size_t expectedWriteSize, const unsigned splitImmediately) + { +- if (streaming == not_streaming && dstCapacity > ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) +- { +- /* room for litbuffer to fit without read faulting */ +- dctx->litBuffer = (BYTE*)dst + ZSTD_BLOCKSIZE_MAX + WILDCOPY_OVERLENGTH; ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); ++ assert(litSize <= blockSizeMax); ++ assert(dctx->isFrameDecompression || streaming == not_streaming); ++ assert(expectedWriteSize <= blockSizeMax); ++ if (streaming == not_streaming && dstCapacity > blockSizeMax + WILDCOPY_OVERLENGTH + litSize + WILDCOPY_OVERLENGTH) { ++ /* If we aren't streaming, we can just put the literals after the output ++ * of the current block. We don't need to worry about overwriting the ++ * extDict of our window, because it doesn't exist. ++ * So if we have space after the end of the block, just put it there. ++ */ ++ dctx->litBuffer = (BYTE*)dst + blockSizeMax + WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize; + dctx->litBufferLocation = ZSTD_in_dst; +- } +- else if (litSize > ZSTD_LITBUFFEREXTRASIZE) +- { +- /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ ++ } else if (litSize <= ZSTD_LITBUFFEREXTRASIZE) { ++ /* Literals fit entirely within the extra buffer, put them there to avoid ++ * having to split the literals. ++ */ ++ dctx->litBuffer = dctx->litExtraBuffer; ++ dctx->litBufferEnd = dctx->litBuffer + litSize; ++ dctx->litBufferLocation = ZSTD_not_in_dst; ++ } else { ++ assert(blockSizeMax > ZSTD_LITBUFFEREXTRASIZE); ++ /* Literals must be split between the output block and the extra lit ++ * buffer. We fill the extra lit buffer with the tail of the literals, ++ * and put the rest of the literals at the end of the block, with ++ * WILDCOPY_OVERLENGTH of buffer room to allow for overreads. ++ * This MUST not write more than our maxBlockSize beyond dst, because in ++ * streaming mode, that could overwrite part of our extDict window. 
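As an illustration of the literals-placement policy described in the comments above, here is a standalone decision sketch; it is not part of the patch. The enum, the function and the numeric stand-ins (64 KiB for ZSTD_LITBUFFEREXTRASIZE, 32 for WILDCOPY_OVERLENGTH) are assumptions used only to show the three branches of ZSTD_allocateLiteralsBuffer().

    enum demo_lit_location { LIT_AFTER_BLOCK, LIT_EXTRA_BUFFER, LIT_SPLIT };

    static enum demo_lit_location
    demo_pick_lit_location(size_t dstCapacity, size_t blockSizeMax, size_t litSize, int streaming)
    {
        size_t const extraSize  = 64 * 1024;   /* stand-in for ZSTD_LITBUFFEREXTRASIZE */
        size_t const overlength = 32;          /* stand-in for WILDCOPY_OVERLENGTH */

        if (!streaming && dstCapacity > blockSizeMax + overlength + litSize + overlength)
            return LIT_AFTER_BLOCK;    /* no extDict to protect: park literals past the block in dst */
        if (litSize <= extraSize)
            return LIT_EXTRA_BUFFER;   /* small enough for the fixed side buffer, no split needed */
        return LIT_SPLIT;              /* tail goes to the side buffer, head to the end of the block */
    }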
++ */ + if (splitImmediately) { + /* won't fit in litExtraBuffer, so it will be split between end of dst and extra buffer */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; +- } +- else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ } else { ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } + dctx->litBufferLocation = ZSTD_split; +- } +- else +- { +- /* fits entirely within litExtraBuffer, so no split is necessary */ +- dctx->litBuffer = dctx->litExtraBuffer; +- dctx->litBufferEnd = dctx->litBuffer + litSize; +- dctx->litBufferLocation = ZSTD_not_in_dst; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + expectedWriteSize); + } + } + +-/* Hidden declaration for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, +- const void* src, size_t srcSize, +- void* dst, size_t dstCapacity, const streaming_operation streaming); + /*! ZSTD_decodeLiteralsBlock() : + * Where it is possible to do so without being stomped by the output during decompression, the literals block will be stored + * in the dstBuffer. If there is room to do so, it will be stored in full in the excess dst space after where the current +@@ -116,7 +132,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * + * @return : nb of bytes read from src (< srcSize ) + * note : symbol not declared but exposed for fullbench */ +-size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, ++static size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, /* note : srcSize < BLOCKSIZE */ + void* dst, size_t dstCapacity, const streaming_operation streaming) + { +@@ -124,7 +140,8 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + RETURN_ERROR_IF(srcSize < MIN_CBLOCK_SIZE, corruption_detected, ""); + + { const BYTE* const istart = (const BYTE*) src; +- symbolEncodingType_e const litEncType = (symbolEncodingType_e)(istart[0] & 3); ++ SymbolEncodingType_e const litEncType = (SymbolEncodingType_e)(istart[0] & 3); ++ size_t const blockSizeMax = ZSTD_blockSizeMax(dctx); + + switch(litEncType) + { +@@ -134,13 +151,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -164,7 +184,11 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +200,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,26 +215,28 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) + { ++ assert(litSize > ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memcpy(dctx->litExtraBuffer, dctx->litBufferEnd - ZSTD_LITBUFFEREXTRASIZE, ZSTD_LITBUFFEREXTRASIZE); + ZSTD_memmove(dctx->litBuffer + ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH, dctx->litBuffer, litSize - ZSTD_LITBUFFEREXTRASIZE); + dctx->litBuffer += ZSTD_LITBUFFEREXTRASIZE - WILDCOPY_OVERLENGTH; + dctx->litBufferEnd -= WILDCOPY_OVERLENGTH; ++ assert(dctx->litBufferEnd <= (BYTE*)dst + blockSizeMax); + } + + RETURN_ERROR_IF(HUF_isError(hufSuccess), corruption_detected, ""); +@@ -224,7 +251,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_basic: + { size_t litSize, lhSize; + U32 const lhlCode = ((istart[0]) >> 2) & 3; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -237,11 +264,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ 
RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } + + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (lhSize+litSize+WILDCOPY_OVERLENGTH > srcSize) { /* risk reading beyond src buffer with wildcopy */ +@@ -270,7 +299,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + case set_rle: + { U32 const lhlCode = ((istart[0]) >> 2) & 3; + size_t litSize, lhSize; +- size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ size_t expectedWriteSize = MIN(blockSizeMax, dstCapacity); + switch(lhlCode) + { + case 0: case 2: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -279,16 +308,17 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +- RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ RETURN_ERROR_IF(litSize > blockSizeMax, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize, dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 1); + if (dctx->litBufferLocation == ZSTD_split) +@@ -310,6 +340,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + } + ++/* Hidden declaration for fullbench */ ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity); ++size_t ZSTD_decodeLiteralsBlock_wrapper(ZSTD_DCtx* dctx, ++ const void* src, size_t srcSize, ++ void* dst, size_t dstCapacity) ++{ ++ dctx->isFrameDecompression = 0; ++ return ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, not_streaming); ++} ++ + /* Default FSE distribution tables. + * These are pre-calculated FSE decoding tables using default distributions as defined in specification : + * https://github.com/facebook/zstd/blob/release/doc/zstd_compression_format.md#default-distributions +@@ -317,7 +359,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + * - start from default distributions, present in /lib/common/zstd_internal.h + * - generate tables normally, using ZSTD_buildFSETable() + * - printout the content of tables +- * - pretify output, report below, test with fuzzer to ensure it's correct */ ++ * - prettify output, report below, test with fuzzer to ensure it's correct */ + + /* Default FSE distribution table for Literal Lengths */ + static const ZSTD_seqSymbol LL_defaultDTable[(1<=0); ++ pos += (size_t)n; + } + } + /* Now we spread those positions across the table. 
+- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +583,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +594,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; u 0x7F) { + if (nbSeq == 0xFF) { + RETURN_ERROR_IF(ip+2 > iend, srcSize_wrong, ""); +@@ -681,11 +719,19 @@ size_t ZSTD_decodeSeqHeaders(ZSTD_DCtx* dctx, int* nbSeqPtr, + } + *nbSeqPtr = nbSeq; + ++ if (nbSeq == 0) { ++ /* No sequence : section ends immediately */ ++ RETURN_ERROR_IF(ip != iend, corruption_detected, ++ "extraneous data present in the Sequences section"); ++ return (size_t)(ip - istart); ++ } ++ + /* FSE table descriptors */ + RETURN_ERROR_IF(ip+1 > iend, srcSize_wrong, ""); /* minimum possible size: 1 byte for symbol encoding types */ +- { symbolEncodingType_e const LLtype = (symbolEncodingType_e)(*ip >> 6); +- symbolEncodingType_e const OFtype = (symbolEncodingType_e)((*ip >> 4) & 3); +- symbolEncodingType_e const MLtype = (symbolEncodingType_e)((*ip >> 2) & 3); ++ RETURN_ERROR_IF(*ip & 3, corruption_detected, ""); /* The last field, Reserved, must be all-zeroes. */ ++ { SymbolEncodingType_e const LLtype = (SymbolEncodingType_e)(*ip >> 6); ++ SymbolEncodingType_e const OFtype = (SymbolEncodingType_e)((*ip >> 4) & 3); ++ SymbolEncodingType_e const MLtype = (SymbolEncodingType_e)((*ip >> 2) & 3); + ip++; + + /* Build DTables */ +@@ -829,7 +875,7 @@ static void ZSTD_safecopy(BYTE* op, const BYTE* const oend_w, BYTE const* ip, pt + /* ZSTD_safecopyDstBeforeSrc(): + * This version allows overlap with dst before src, or handles the non-overlap case with dst after src + * Kept separate from more common ZSTD_safecopy case to avoid performance impact to the safecopy common case */ +-static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length) { ++static void ZSTD_safecopyDstBeforeSrc(BYTE* op, const BYTE* ip, ptrdiff_t length) { + ptrdiff_t const diff = op - ip; + BYTE* const oend = op + length; + +@@ -858,6 +904,7 @@ static void ZSTD_safecopyDstBeforeSrc(BYTE* op, BYTE const* ip, ptrdiff_t length + * to be optimized for many small sequences, since those fall into ZSTD_execSequence(). + */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEnd(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -905,6 +952,7 @@ size_t ZSTD_execSequenceEnd(BYTE* op, + * This version is intended to be used during instances where the litBuffer is still split. It is kept separate to avoid performance impact for the good case. 
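A simplified sketch of the two-stage table spread described above, covering only the fast path where every normalized count is non-negative and the counts sum to the table size; it is not part of the patch. The function, its byte-sized table cells and the caller-provided scratch buffer are assumptions, while the stride matches FSE_TABLESTEP().

    /* Stage 1 lays each symbol out contiguously in a scratch array; stage 2 scatters that
     * array into the table with a fixed odd stride, so the inner loop has no
     * data-dependent trip count (and therefore far fewer branch misses). */
    static void demo_two_stage_spread(unsigned char *table, unsigned tableLog,
                                      const short *normalizedCounter, unsigned maxSymbol,
                                      unsigned char *scratch /* tableSize bytes */)
    {
        size_t const tableSize = (size_t)1 << tableLog;
        size_t const tableMask = tableSize - 1;
        size_t const step = (tableSize >> 1) + (tableSize >> 3) + 3;  /* FSE_TABLESTEP() */
        size_t pos = 0, position = 0, i;
        unsigned s;

        for (s = 0; s <= maxSymbol; s++)            /* stage 1: one contiguous run per symbol */
            for (i = 0; i < (size_t)normalizedCounter[s]; i++)
                scratch[pos++] = (unsigned char)s;

        for (i = 0; i < tableSize; i++) {           /* stage 2: scatter with a fixed stride */
            table[position] = scratch[i];
            position = (position + step) & tableMask;
        }
    }

Because the stride is odd and the table size is a power of two, stage 2 visits every cell exactly once before returning to position 0, mirroring the assert in the real code.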
+ */ + FORCE_NOINLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -950,6 +998,7 @@ size_t ZSTD_execSequenceEndSplitLitBuffer(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequence(BYTE* op, + BYTE* const oend, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -964,6 +1013,11 @@ size_t ZSTD_execSequence(BYTE* op, + + assert(op != NULL /* Precondition */); + assert(oend_w < oend /* No underflow */); ++ ++#if defined(__aarch64__) ++ /* prefetch sequence starting from match that will be used for copy later */ ++ PREFETCH_L1(match); ++#endif + /* Handle edge cases in a slow path: + * - Read beyond end of literals + * - Match end is within WILDCOPY_OVERLIMIT of oend +@@ -1043,6 +1097,7 @@ size_t ZSTD_execSequence(BYTE* op, + } + + HINT_INLINE ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + size_t ZSTD_execSequenceSplitLitBuffer(BYTE* op, + BYTE* const oend, const BYTE* const oend_w, seq_t sequence, + const BYTE** litPtr, const BYTE* const litLimit, +@@ -1154,7 +1209,7 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + } + + /* We need to add at most (ZSTD_WINDOWLOG_MAX_32 - 1) bits to read the maximum +- * offset bits. But we can only read at most (STREAM_ACCUMULATOR_MIN_32 - 1) ++ * offset bits. But we can only read at most STREAM_ACCUMULATOR_MIN_32 + * bits before reloading. This value is the maximum number of bytes we read + * after reloading when we are decoding long offsets. + */ +@@ -1165,13 +1220,37 @@ ZSTD_updateFseStateWithDInfo(ZSTD_fseState* DStatePtr, BIT_DStream_t* bitD, U16 + + typedef enum { ZSTD_lo_isRegularOffset, ZSTD_lo_isLongOffset=1 } ZSTD_longOffset_e; + ++/* ++ * ZSTD_decodeSequence(): ++ * @p longOffsets : tells the decoder to reload more bit while decoding large offsets ++ * only used in 32-bit mode ++ * @return : Sequence (litL + matchL + offset) ++ */ + FORCE_INLINE_TEMPLATE seq_t +-ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) ++ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets, const int isLastSeq) + { + seq_t seq; ++ /* ++ * ZSTD_seqSymbol is a 64 bits wide structure. ++ * It can be loaded in one operation ++ * and its fields extracted by simply shifting or bit-extracting on aarch64. ++ * GCC doesn't recognize this and generates more unnecessary ldr/ldrb/ldrh ++ * operations that cause performance drop. This can be avoided by using this ++ * ZSTD_memcpy hack. 
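A standalone sketch of the aarch64 workaround explained in the ZSTD_decodeSequence() comment below; not part of the patch. The demo type mirrors the ZSTD_seqSymbol layout (U16, BYTE, BYTE, U32), and the claim about the generated code is a typical outcome, not a guarantee.

    #include <stdint.h>
    #include <string.h>

    typedef struct {
        uint16_t nextState;
        uint8_t  nbAdditionalBits;
        uint8_t  nbBits;
        uint32_t baseValue;
    } demo_seqSymbol;                    /* 8 bytes, like ZSTD_seqSymbol */

    static demo_seqSymbol demo_load_entry(const demo_seqSymbol *table, size_t state)
    {
        demo_seqSymbol local;
        /* Copying the whole entry typically lets GCC emit one 64-bit load instead of
         * separate ldrh/ldrb/ldr accesses to the packed fields. */
        memcpy(&local, table + state, sizeof(local));
        return local;
    }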
++ */ ++#if defined(__aarch64__) && (defined(__GNUC__) && !defined(__clang__)) ++ ZSTD_seqSymbol llDInfoS, mlDInfoS, ofDInfoS; ++ ZSTD_seqSymbol* const llDInfo = &llDInfoS; ++ ZSTD_seqSymbol* const mlDInfo = &mlDInfoS; ++ ZSTD_seqSymbol* const ofDInfo = &ofDInfoS; ++ ZSTD_memcpy(llDInfo, seqState->stateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1265,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. ++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1224,7 +1306,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + } else { + offset = ofBase + ll0 + BIT_readBitsFast(&seqState->DStream, 1); + { size_t temp = (offset==3) ? 
seqState->prevOffset[0] - 1 : seqState->prevOffset[offset]; +- temp += !temp; /* 0 is not valid; input is corrupted; force offset to 1 */ ++ temp -= !temp; /* 0 is not valid: input corrupted => force offset to -1 => corruption detected at execSequence */ + if (offset != 1) seqState->prevOffset[2] = seqState->prevOffset[1]; + seqState->prevOffset[1] = seqState->prevOffset[0]; + seqState->prevOffset[0] = offset = temp; +@@ -1232,11 +1314,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1324,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1259,17 +1333,22 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + DEBUGLOG(6, "seq: litL=%u, matchL=%u, offset=%u", + (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); + +- ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ +- if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ +- ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ if (!isLastSeq) { ++ /* don't update FSE state for last Sequence */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateLL, &seqState->DStream, llNext, llnbBits); /* <= 9 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateML, &seqState->DStream, mlNext, mlnbBits); /* <= 9 bits */ ++ if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); /* <= 18 bits */ ++ ZSTD_updateFseStateWithDInfo(&seqState->stateOffb, &seqState->DStream, ofNext, ofnbBits); /* <= 8 bits */ ++ BIT_reloadDStream(&seqState->DStream); ++ } + } + + return seq; + } + +-#ifdef FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION +-MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) ++#if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) ++#if DEBUGLEVEL >= 1 ++static int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefixStart, BYTE const* oLitEnd) + { + size_t const windowSize = dctx->fParams.windowSize; + /* No dictionary used. */ +@@ -1283,30 +1362,33 @@ MEM_STATIC int ZSTD_dictionaryIsActive(ZSTD_DCtx const* dctx, BYTE const* prefix + /* Dictionary is active. 
*/ + return 1; + } ++#endif + +-MEM_STATIC void ZSTD_assertValidSequence( ++static void ZSTD_assertValidSequence( + ZSTD_DCtx const* dctx, + BYTE const* op, BYTE const* oend, + seq_t const seq, + BYTE const* prefixStart, BYTE const* virtualStart) + { + #if DEBUGLEVEL >= 1 +- size_t const windowSize = dctx->fParams.windowSize; +- size_t const sequenceSize = seq.litLength + seq.matchLength; +- BYTE const* const oLitEnd = op + seq.litLength; +- DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", +- (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); +- assert(op <= oend); +- assert((size_t)(oend - op) >= sequenceSize); +- assert(sequenceSize <= ZSTD_BLOCKSIZE_MAX); +- if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { +- size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); +- /* Offset must be within the dictionary. */ +- assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); +- assert(seq.offset <= windowSize + dictSize); +- } else { +- /* Offset must be within our window. */ +- assert(seq.offset <= windowSize); ++ if (dctx->isFrameDecompression) { ++ size_t const windowSize = dctx->fParams.windowSize; ++ size_t const sequenceSize = seq.litLength + seq.matchLength; ++ BYTE const* const oLitEnd = op + seq.litLength; ++ DEBUGLOG(6, "Checking sequence: litL=%u matchL=%u offset=%u", ++ (U32)seq.litLength, (U32)seq.matchLength, (U32)seq.offset); ++ assert(op <= oend); ++ assert((size_t)(oend - op) >= sequenceSize); ++ assert(sequenceSize <= ZSTD_blockSizeMax(dctx)); ++ if (ZSTD_dictionaryIsActive(dctx, prefixStart, oLitEnd)) { ++ size_t const dictSize = (size_t)((char const*)dctx->dictContentEndForFuzzing - (char const*)dctx->dictContentBeginForFuzzing); ++ /* Offset must be within the dictionary. */ ++ assert(seq.offset <= (size_t)(oLitEnd - virtualStart)); ++ assert(seq.offset <= windowSize + dictSize); ++ } else { ++ /* Offset must be within our window. 
*/ ++ assert(seq.offset <= windowSize); ++ } + } + #else + (void)dctx, (void)op, (void)oend, (void)seq, (void)prefixStart, (void)virtualStart; +@@ -1322,23 +1404,21 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = ostart + maxDstSize; ++ BYTE* const oend = ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const vBase = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer (%i seqs)", nbSeq); + +- /* Regen sequences */ ++ /* Literals are split between internal buffer & output buffer */ + if (nbSeq) { + seqState_t seqState; + dctx->fseEntropy = 1; +@@ -1357,8 +1437,7 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + BIT_DStream_completed < BIT_DStream_overflow); + + /* decompress without overrunning litPtr begins */ +- { +- seq_t sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ { seq_t sequence = {0,0,0}; /* some static analyzer believe that @sequence is not initialized (it necessarily is, since for(;;) loop as at least one iteration) */ + /* Align the decompression loop to 32 + 16 bytes. + * + * zstd compiled with gcc-9 on an Intel i9-9900k shows 10% decompression +@@ -1420,27 +1499,26 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + #endif + + /* Handle the initial state where litBuffer is currently split between dst and litExtraBuffer */ +- for (; litPtr + sequence.litLength <= dctx->litBufferEnd; ) { +- size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ for ( ; nbSeq; nbSeq--) { ++ sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); ++ if (litPtr + sequence.litLength > dctx->litBufferEnd) break; ++ { size_t const oneSeqSize = ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequence.litLength - WILDCOPY_OVERLENGTH, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif +- if (UNLIKELY(ZSTD_isError(oneSeqSize))) +- return oneSeqSize; +- DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); +- op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); +- sequence = ZSTD_decodeSequence(&seqState, isLongOffset); +- } ++ if (UNLIKELY(ZSTD_isError(oneSeqSize))) ++ return oneSeqSize; ++ DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); ++ op += oneSeqSize; ++ } } ++ DEBUGLOG(6, "reached: (litPtr + sequence.litLength > dctx->litBufferEnd)"); + + /* If there are more sequences, they 
will need to read literals from litExtraBuffer; copy over the remainder from dst and update litPtr and litEnd */ + if (nbSeq > 0) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ DEBUGLOG(6, "There are %i sequences left, and %zu/%zu literals left in buffer", nbSeq, leftoverLit, sequence.litLength); ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence.litLength -= leftoverLit; +@@ -1449,24 +1527,22 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (--nbSeq) +- BIT_reloadDStream(&(seqState.DStream)); + } ++ nbSeq--; + } + } + +- if (nbSeq > 0) /* there is remaining lit from extra buffer */ +- { ++ if (nbSeq > 0) { ++ /* there is remaining lit from extra buffer */ + + #if defined(__x86_64__) + __asm__(".p2align 6"); +@@ -1485,35 +1561,34 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + # endif + #endif + +- for (; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litBufferEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + } + + /* check if reached exact end */ + DEBUGLOG(5, "ZSTD_decompressSequences_bodySplitLitBuffer: after decode loop, remaining nbSeq : %i", nbSeq); + RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ DEBUGLOG(5, "bitStream : start=%p, ptr=%p, bitsConsumed=%u", seqState.DStream.start, seqState.DStream.ptr, seqState.DStream.bitsConsumed); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ +- { +- size_t 
const lastLLSize = litBufferEnd - litPtr; ++ if (dctx->litBufferLocation == ZSTD_split) { ++ /* split hasn't been reached yet, first get dst then copy litExtraBuffer */ ++ size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from segment : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memmove(op, litPtr, lastLLSize); +@@ -1523,15 +1598,17 @@ ZSTD_decompressSequences_bodySplitLitBuffer( ZSTD_DCtx* dctx, + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; + } +- { size_t const lastLLSize = litBufferEnd - litPtr; ++ /* copy last literals from internal buffer */ ++ { size_t const lastLLSize = (size_t)(litBufferEnd - litPtr); ++ DEBUGLOG(6, "copy last literals from internal buffer : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -1539,21 +1616,19 @@ DONT_VECTORIZE + ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? ostart + maxDstSize : dctx->litBuffer; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_not_in_dst ? 
ZSTD_maybeNullPtrAdd(ostart, maxDstSize) : dctx->litBuffer; + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* const litEnd = litPtr + dctx->litSize; + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); +- (void)frame; ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + + /* Regen sequences */ + if (nbSeq) { +@@ -1568,11 +1643,6 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + assert(dst != NULL); + +- ZSTD_STATIC_ASSERT( +- BIT_DStream_unfinished < BIT_DStream_completed && +- BIT_DStream_endOfBuffer < BIT_DStream_completed && +- BIT_DStream_completed < BIT_DStream_overflow); +- + #if defined(__x86_64__) + __asm__(".p2align 6"); + __asm__("nop"); +@@ -1587,73 +1657,70 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + # endif + #endif + +- for ( ; ; ) { +- seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset); ++ for ( ; nbSeq ; nbSeq--) { ++ seq_t const sequence = ZSTD_decodeSequence(&seqState, isLongOffset, nbSeq==1); + size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequence, &litPtr, litEnd, prefixStart, vBase, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); ++ ZSTD_assertValidSequence(dctx, op, oend, sequence, prefixStart, vBase); + #endif + if (UNLIKELY(ZSTD_isError(oneSeqSize))) + return oneSeqSize; + DEBUGLOG(6, "regenerated sequence size : %u", (U32)oneSeqSize); + op += oneSeqSize; +- if (UNLIKELY(!--nbSeq)) +- break; +- BIT_reloadDStream(&(seqState.DStream)); + } + + /* check if reached exact end */ +- DEBUGLOG(5, "ZSTD_decompressSequences_body: after decode loop, remaining nbSeq : %i", nbSeq); +- RETURN_ERROR_IF(nbSeq, corruption_detected, ""); +- RETURN_ERROR_IF(BIT_reloadDStream(&seqState.DStream) < BIT_DStream_completed, corruption_detected, ""); ++ assert(nbSeq == 0); ++ RETURN_ERROR_IF(!BIT_endOfDStream(&seqState.DStream), corruption_detected, ""); + /* save reps for next block */ + { U32 i; for (i=0; ientropy.rep[i] = (U32)(seqState.prevOffset[i]); } + } + + /* last literal segment */ +- { size_t const lastLLSize = litEnd - litPtr; ++ { size_t const lastLLSize = (size_t)(litEnd - litPtr); ++ DEBUGLOG(6, "copy last literals : %u", (U32)lastLLSize); + RETURN_ERROR_IF(lastLLSize > (size_t)(oend-op), dstSize_tooSmall, ""); + if (op != NULL) { + ZSTD_memcpy(op, litPtr, lastLLSize); + op += lastLLSize; +- } +- } ++ } } + +- return op-ostart; ++ DEBUGLOG(6, "decoded block of size %u bytes", (U32)(op - ostart)); ++ return (size_t)(op - ostart); + } + + static size_t + ZSTD_decompressSequences_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + + static size_t + ZSTD_decompressSequencesSplitLitBuffer_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- 
const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + +-FORCE_INLINE_TEMPLATE size_t +-ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, ++FORCE_INLINE_TEMPLATE ++ ++size_t ZSTD_prefetchMatch(size_t prefetchPos, seq_t const sequence, + const BYTE* const prefixStart, const BYTE* const dictEnd) + { + prefetchPos += sequence.litLength; + { const BYTE* const matchBase = (sequence.offset > prefetchPos) ? dictEnd : prefixStart; +- const BYTE* const match = matchBase + prefetchPos - sequence.offset; /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. +- * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ /* note : this operation can overflow when seq.offset is really too large, which can only happen when input is corrupted. ++ * No consequence though : memory address is only used for prefetching, not for dereferencing */ ++ const BYTE* const match = ZSTD_wrappedPtrSub(ZSTD_wrappedPtrAdd(matchBase, prefetchPos), sequence.offset); + PREFETCH_L1(match); PREFETCH_L1(match+CACHELINE_SIZE); /* note : it's safe to invoke PREFETCH() on any memory address, including invalid ones */ + } + return prefetchPos + sequence.matchLength; +@@ -1668,20 +1735,18 @@ ZSTD_decompressSequencesLong_body( + ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + const BYTE* ip = (const BYTE*)seqStart; + const BYTE* const iend = ip + seqSize; + BYTE* const ostart = (BYTE*)dst; +- BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? dctx->litBuffer : ostart + maxDstSize; ++ BYTE* const oend = dctx->litBufferLocation == ZSTD_in_dst ? 
dctx->litBuffer : ZSTD_maybeNullPtrAdd(ostart, maxDstSize); + BYTE* op = ostart; + const BYTE* litPtr = dctx->litPtr; + const BYTE* litBufferEnd = dctx->litBufferEnd; + const BYTE* const prefixStart = (const BYTE*) (dctx->prefixStart); + const BYTE* const dictStart = (const BYTE*) (dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*) (dctx->dictEnd); +- (void)frame; + + /* Regen sequences */ + if (nbSeq) { +@@ -1706,20 +1771,17 @@ ZSTD_decompressSequencesLong_body( + ZSTD_initFseState(&seqState.stateML, &seqState.DStream, dctx->MLTptr); + + /* prepare in advance */ +- for (seqNb=0; (BIT_reloadDStream(&seqState.DStream) <= BIT_DStream_completed) && (seqNblitBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength > dctx->litBufferEnd) { + /* lit buffer is reaching split point, empty out the first buffer and transition to litExtraBuffer */ + const size_t leftoverLit = dctx->litBufferEnd - litPtr; + if (leftoverLit) +@@ -1732,26 +1794,26 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) +- assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ assert(!ZSTD_isError(oneSeqSize)); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif +- if (ZSTD_isError(oneSeqSize)) return oneSeqSize; ++ if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +- prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); +- sequences[seqNb & STORED_SEQS_MASK] = sequence; +- op += oneSeqSize; +- } ++ prefetchPos = ZSTD_prefetchMatch(prefetchPos, sequence, prefixStart, dictEnd); ++ sequences[seqNb & STORED_SEQS_MASK] = sequence; ++ op += oneSeqSize; ++ } } + else + { + /* lit buffer is either wholly contained in first or second split, or not split at all*/ +- oneSeqSize = dctx->litBufferLocation == ZSTD_split ? ++ size_t const oneSeqSize = dctx->litBufferLocation == ZSTD_split ? 
+ ZSTD_execSequenceSplitLitBuffer(op, oend, litPtr + sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK].litLength - WILDCOPY_OVERLENGTH, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd) : + ZSTD_execSequence(op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[(seqNb - ADVANCED_SEQS) & STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + +@@ -1760,17 +1822,15 @@ ZSTD_decompressSequencesLong_body( + op += oneSeqSize; + } + } +- RETURN_ERROR_IF(seqNblitBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) +- { ++ if (dctx->litBufferLocation == ZSTD_split && litPtr + sequence->litLength > dctx->litBufferEnd) { + const size_t leftoverLit = dctx->litBufferEnd - litPtr; +- if (leftoverLit) +- { ++ if (leftoverLit) { + RETURN_ERROR_IF(leftoverLit > (size_t)(oend - op), dstSize_tooSmall, "remaining lit must fit within dstBuffer"); + ZSTD_safecopyDstBeforeSrc(op, litPtr, leftoverLit); + sequence->litLength -= leftoverLit; +@@ -1779,11 +1839,10 @@ ZSTD_decompressSequencesLong_body( + litPtr = dctx->litExtraBuffer; + litBufferEnd = dctx->litExtraBuffer + ZSTD_LITBUFFEREXTRASIZE; + dctx->litBufferLocation = ZSTD_not_in_dst; +- { +- size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); ++ { size_t const oneSeqSize = ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1796,7 +1855,7 @@ ZSTD_decompressSequencesLong_body( + ZSTD_execSequence(op, oend, *sequence, &litPtr, litBufferEnd, prefixStart, dictStart, dictEnd); + #if defined(FUZZING_BUILD_MODE_UNSAFE_FOR_PRODUCTION) && defined(FUZZING_ASSERT_VALID_SEQUENCE) + assert(!ZSTD_isError(oneSeqSize)); +- if (frame) ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); ++ ZSTD_assertValidSequence(dctx, op, oend, sequences[seqNb&STORED_SEQS_MASK], prefixStart, dictStart); + #endif + if (ZSTD_isError(oneSeqSize)) return oneSeqSize; + op += oneSeqSize; +@@ -1808,8 +1867,7 @@ ZSTD_decompressSequencesLong_body( + } + + /* last literal segment */ +- if (dctx->litBufferLocation == ZSTD_split) /* first deplete literal buffer in dst, then copy litExtraBuffer */ +- { ++ if (dctx->litBufferLocation == ZSTD_split) { /* first deplete literal buffer in dst, then copy litExtraBuffer */ + size_t const lastLLSize = litBufferEnd - litPtr; + RETURN_ERROR_IF(lastLLSize > (size_t)(oend - op), dstSize_tooSmall, ""); + if (op != NULL) { +@@ -1827,17 +1885,16 @@ ZSTD_decompressSequencesLong_body( + } + } + +- return op-ostart; ++ return (size_t)(op - ostart); + } + 
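A schematic sketch of the software pipeline used by the "Long" variant above: decode a few sequences ahead of execution so each match can be prefetched before it is copied. It is not part of the patch; the ring size, the advance distance and the callback shape are illustrative assumptions (the real code uses its STORED_SEQS/ADVANCED_SEQS constants and also manages the split literal buffer, which this sketch omits).

    #include <stddef.h>

    #define DEMO_STORED   8                      /* ring size; a power of two */
    #define DEMO_ADVANCE  (DEMO_STORED - 1)      /* how far decode runs ahead of execute */
    #define DEMO_MASK     (DEMO_STORED - 1)

    typedef struct { size_t litLength, matchLength, offset; } demo_seq;
    typedef demo_seq (*demo_decode_fn)(void *state);
    typedef void     (*demo_prefetch_fn)(demo_seq s);
    typedef size_t   (*demo_execute_fn)(demo_seq s);

    static size_t demo_pipelined_decode(void *state, int nbSeq,
                                        demo_decode_fn decode,
                                        demo_prefetch_fn prefetch,
                                        demo_execute_fn execute)
    {
        demo_seq ring[DEMO_STORED];
        int const advance = nbSeq < DEMO_ADVANCE ? nbSeq : DEMO_ADVANCE;
        int seqNb;
        size_t written = 0;

        for (seqNb = 0; seqNb < advance; seqNb++) {      /* prime: decode + prefetch only */
            ring[seqNb] = decode(state);
            prefetch(ring[seqNb]);
        }
        for ( ; seqNb < nbSeq; seqNb++) {                /* steady state: one in, one out */
            demo_seq const next = decode(state);
            prefetch(next);
            written += execute(ring[(seqNb - advance) & DEMO_MASK]);
            ring[seqNb & DEMO_MASK] = next;
        }
        for ( ; seqNb < nbSeq + advance; seqNb++)        /* drain the ring */
            written += execute(ring[(seqNb - advance) & DEMO_MASK]);
        return written;
    }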
+ static size_t + ZSTD_decompressSequencesLong_default(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + +@@ -1851,20 +1908,18 @@ DONT_VECTORIZE + ZSTD_decompressSequences_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static BMI2_TARGET_ATTRIBUTE size_t + DONT_VECTORIZE + ZSTD_decompressSequencesSplitLitBuffer_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bodySplitLitBuffer(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1873,50 +1928,40 @@ static BMI2_TARGET_ATTRIBUTE size_t + ZSTD_decompressSequencesLong_bmi2(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { +- return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_body(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + #endif /* DYNAMIC_BMI2 */ + +-typedef size_t (*ZSTD_decompressSequences_t)( +- ZSTD_DCtx* dctx, +- void* dst, size_t maxDstSize, +- const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame); +- + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + static size_t + ZSTD_decompressSequences(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequences"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + static size_t + ZSTD_decompressSequencesSplitLitBuffer(ZSTD_DCtx* dctx, void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const 
ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesSplitLitBuffer"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG */ + +@@ -1931,69 +1976,114 @@ static size_t + ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + void* dst, size_t maxDstSize, + const void* seqStart, size_t seqSize, int nbSeq, +- const ZSTD_longOffset_e isLongOffset, +- const int frame) ++ const ZSTD_longOffset_e isLongOffset) + { + DEBUGLOG(5, "ZSTD_decompressSequencesLong"); + #if DYNAMIC_BMI2 + if (ZSTD_DCtx_get_bmi2(dctx)) { +- return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_bmi2(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif +- return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong_default(dctx, dst, maxDstSize, seqStart, seqSize, nbSeq, isLongOffset); + } + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. ++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. 
++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming) ++ const void* src, size_t srcSize, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); +- DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); +- +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ DEBUGLOG(5, "ZSTD_decompressBlock_internal (cSize : %u)", (unsigned)srcSize); ++ ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_blockSizeMax(dctx). ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. ++ * As a consequence, avoid generating compressed blocks of size ZSTD_blockSizeMax(dctx) ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_blockSizeMax(dctx), srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2091,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, ZSTD_blockSizeMax(dctx)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize(ZSTD_maybeNullPtrAdd((BYTE*)dst, blockSizeMax), (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. 
If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2115,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,40 +2127,55 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT +- return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ + if (dctx->litBufferLocation == ZSTD_split) +- return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequencesSplitLitBuffer(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + else +- return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); ++ return ZSTD_decompressSequences(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset); + #endif + } + } + + ++ZSTD_ALLOW_POINTER_OVERFLOW_ATTR + void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + { + if (dst != dctx->previousDstEnd && dstSize > 0) { /* not contiguous */ +@@ -2060,13 +2187,24 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; ++ dctx->isFrameDecompression = 0; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +- dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, /* frame */ 0, not_streaming); ++ dSize = ZSTD_decompressBlock_internal(dctx, dst, dstCapacity, src, srcSize, not_streaming); ++ FORWARD_IF_ERROR(dSize, ""); + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..becffbd89364 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -47,7 +48,7 @@ typedef enum { + */ + size_t ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, const int frame, const streaming_operation streaming); ++ const void* src, size_t srcSize, const streaming_operation streaming); + + /* ZSTD_buildFSETable() : + * generate FSE decoding table for one symbol (ll, ml or off) +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. */ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..2a225d1811c4 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -135,7 +137,7 @@ struct ZSTD_DCtx_s + const void* virtualStart; /* virtual start of previous segment if it was just before current one */ + const void* dictEnd; /* end of previous segment */ + size_t expected; +- ZSTD_frameHeader fParams; ++ ZSTD_FrameHeader fParams; + U64 processedCSize; + U64 decodedSize; + blockType_e bType; /* used in ZSTD_decompressContinue(), store blockType between block header decoding and block decompression stages */ +@@ -152,7 +154,8 @@ struct ZSTD_DCtx_s + size_t litSize; + size_t rleSize; + size_t staticSize; +-#if DYNAMIC_BMI2 != 0 ++ int isFrameDecompression; ++#if DYNAMIC_BMI2 + int bmi2; /* == 1 if the CPU supports BMI2 and 0 otherwise. CPU support is determined dynamically once per context lifetime. */ + #endif + +@@ -164,6 +167,8 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. 
Default == 0 (disabled) */ ++ int disableHufAsm; ++ int maxBlockSizeParam; + + /* streaming */ + ZSTD_dStreamStage streamStage; +@@ -199,11 +204,11 @@ struct ZSTD_DCtx_s + }; /* typedef'd to ZSTD_DCtx within "zstd.h" */ + + MEM_STATIC int ZSTD_DCtx_get_bmi2(const struct ZSTD_DCtx_s *dctx) { +-#if DYNAMIC_BMI2 != 0 +- return dctx->bmi2; ++#if DYNAMIC_BMI2 ++ return dctx->bmi2; + #else + (void)dctx; +- return 0; ++ return 0; + #endif + } + +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index bd8784449b31..7651b53551c8 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,6 +16,7 @@ + + #include "common/zstd_deps.h" + #include "common/zstd_internal.h" ++#include "compress/zstd_compress_internal.h" + + #define ZSTD_FORWARD_IF_ERR(ret) \ + do { \ +@@ -92,12 +93,64 @@ zstd_compression_parameters zstd_get_cparams(int level, + } + EXPORT_SYMBOL(zstd_get_cparams); + ++size_t zstd_cctx_set_param(zstd_cctx *cctx, ZSTD_cParameter param, int value) ++{ ++ return ZSTD_CCtx_setParameter(cctx, param, value); ++} ++EXPORT_SYMBOL(zstd_cctx_set_param); ++ + size_t zstd_cctx_workspace_bound(const zstd_compression_parameters *cparams) + { + return ZSTD_estimateCCtxSize_usingCParams(*cparams); + } + EXPORT_SYMBOL(zstd_cctx_workspace_bound); + ++// Used by zstd_cctx_workspace_bound_with_ext_seq_prod() ++static size_t dummy_external_sequence_producer( ++ void *sequenceProducerState, ++ ZSTD_Sequence *outSeqs, size_t outSeqsCapacity, ++ const void *src, size_t srcSize, ++ const void *dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize) ++{ ++ (void)sequenceProducerState; ++ (void)outSeqs; (void)outSeqsCapacity; ++ (void)src; (void)srcSize; ++ (void)dict; (void)dictSize; ++ (void)compressionLevel; ++ (void)windowSize; ++ return ZSTD_SEQUENCE_PRODUCER_ERROR; ++} ++ ++static void init_cctx_params_from_compress_params( ++ ZSTD_CCtx_params *cctx_params, ++ const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_parameters zstd_params; ++ memset(&zstd_params, 0, sizeof(zstd_params)); ++ zstd_params.cParams = *compress_params; ++ ZSTD_CCtxParams_init_advanced(cctx_params, zstd_params); ++} ++ ++size_t zstd_cctx_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCCtxSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cctx_workspace_bound_with_ext_seq_prod); ++ ++size_t zstd_cstream_workspace_bound_with_ext_seq_prod(const zstd_compression_parameters *compress_params) ++{ ++ ZSTD_CCtx_params cctx_params; ++ init_cctx_params_from_compress_params(&cctx_params, compress_params); ++ ZSTD_CCtxParams_registerSequenceProducer(&cctx_params, NULL, dummy_external_sequence_producer); ++ return ZSTD_estimateCStreamSize_usingCCtxParams(&cctx_params); ++} ++EXPORT_SYMBOL(zstd_cstream_workspace_bound_with_ext_seq_prod); ++ + zstd_cctx *zstd_init_cctx(void *workspace, size_t workspace_size) + { + if (workspace == NULL) +@@ -209,5 +262,25 @@ size_t zstd_end_stream(zstd_cstream *cstream, zstd_out_buffer *output) + } + EXPORT_SYMBOL(zstd_end_stream); + ++void zstd_register_sequence_producer( ++ zstd_cctx *cctx, ++ void* sequence_producer_state, ++ zstd_sequence_producer_f sequence_producer ++) { ++ ZSTD_registerSequenceProducer(cctx, sequence_producer_state, sequence_producer); ++} ++EXPORT_SYMBOL(zstd_register_sequence_producer); ++ ++size_t zstd_compress_sequences_and_literals(zstd_cctx *cctx, void* dst, size_t dst_capacity, ++ const zstd_sequence *in_seqs, size_t in_seqs_size, ++ const void* literals, size_t lit_size, size_t lit_capacity, ++ size_t decompressed_size) ++{ ++ return ZSTD_compressSequencesAndLiterals(cctx, dst, dst_capacity, in_seqs, ++ in_seqs_size, literals, lit_size, ++ lit_capacity, decompressed_size); ++} ++EXPORT_SYMBOL(zstd_compress_sequences_and_literals); ++ + MODULE_LICENSE("Dual BSD/GPL"); + 
MODULE_DESCRIPTION("Zstd Compressor"); +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index 469fc3059be0..0ae819f0c927 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -113,7 +113,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.49.0.634.g8613c2bb6c + diff --git a/sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 b/sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 new file mode 120000 index 0000000..9c73995 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.14/gentoo-sources-6.15 @@ -0,0 +1 @@ +gentoo-sources-6.15 \ No newline at end of file diff --git a/sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip b/sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip new file mode 100644 index 0000000..2a4aa7f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/0001-amd-pstate.patch.skip @@ -0,0 +1,402 @@ +From 93b3c85030525027181d7ae26378331eeea06a29 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 4 Jun 2025 16:40:31 +0200 +Subject: [PATCH 1/8] amd-pstate + +Signed-off-by: Peter Jung +--- + drivers/cpufreq/amd-pstate.c | 111 +++++++++++++++++++++++++-------- + drivers/cpufreq/amd-pstate.h | 2 + + include/linux/sched/topology.h | 6 ++ + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 5 +- + kernel/sched/topology.c | 58 +++++++++++++++++ + 6 files changed, 158 insertions(+), 28 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index b961f3a3b580..12331e127d96 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -389,7 +389,8 @@ static inline int amd_pstate_cppc_enable(struct cpufreq_policy *policy) + static int msr_init_perf(struct amd_cpudata *cpudata) + { + union perf_cached perf = READ_ONCE(cpudata->perf); +- u64 cap1, numerator; ++ u64 cap1, numerator, cppc_req; ++ u8 min_perf; + + int ret = rdmsrl_safe_on_cpu(cpudata->cpu, MSR_AMD_CPPC_CAP1, + &cap1); +@@ -400,6 +401,22 @@ static int msr_init_perf(struct amd_cpudata *cpudata) + if (ret) + return ret; + ++ ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &cppc_req); ++ if (ret) ++ return ret; ++ ++ WRITE_ONCE(cpudata->cppc_req_cached, cppc_req); ++ min_perf = FIELD_GET(AMD_CPPC_MIN_PERF_MASK, cppc_req); ++ ++ /* ++ * Clear out the min_perf part to check if the rest of the MSR is 0, if yes, this is an ++ * indication that the min_perf value is the one specified through the BIOS option ++ */ ++ cppc_req &= ~(AMD_CPPC_MIN_PERF_MASK); ++ ++ if (!cppc_req) ++ perf.bios_min_perf = min_perf; ++ + perf.highest_perf = numerator; + perf.max_limit_perf = numerator; + perf.min_limit_perf = FIELD_GET(AMD_CPPC_LOWEST_PERF_MASK, cap1); +@@ -554,6 +571,10 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + if (!policy) + return; + ++ /* limit the max perf when core performance boost feature is disabled */ ++ if (!cpudata->boost_supported) ++ max_perf = min_t(u8, perf.nominal_perf, max_perf); ++ + des_perf = clamp_t(u8, des_perf, min_perf, max_perf); + + policy->cur = perf_to_freq(perf, 
cpudata->nominal_freq, des_perf); +@@ -563,10 +584,6 @@ static void amd_pstate_update(struct amd_cpudata *cpudata, u8 min_perf, + des_perf = 0; + } + +- /* limit the max perf when core performance boost feature is disabled */ +- if (!cpudata->boost_supported) +- max_perf = min_t(u8, perf.nominal_perf, max_perf); +- + if (trace_amd_pstate_perf_enabled() && amd_pstate_sample(cpudata)) { + trace_amd_pstate_perf(min_perf, des_perf, max_perf, cpudata->freq, + cpudata->cur.mperf, cpudata->cur.aperf, cpudata->cur.tsc, +@@ -580,20 +597,26 @@ static int amd_pstate_verify(struct cpufreq_policy_data *policy_data) + { + /* + * Initialize lower frequency limit (i.e.policy->min) with +- * lowest_nonlinear_frequency which is the most energy efficient +- * frequency. Override the initial value set by cpufreq core and +- * amd-pstate qos_requests. ++ * lowest_nonlinear_frequency or the min frequency (if) specified in BIOS, ++ * Override the initial value set by cpufreq core and amd-pstate qos_requests. + */ + if (policy_data->min == FREQ_QOS_MIN_DEFAULT_VALUE) { + struct cpufreq_policy *policy __free(put_cpufreq_policy) = + cpufreq_cpu_get(policy_data->cpu); + struct amd_cpudata *cpudata; ++ union perf_cached perf; + + if (!policy) + return -EINVAL; + + cpudata = policy->driver_data; +- policy_data->min = cpudata->lowest_nonlinear_freq; ++ perf = READ_ONCE(cpudata->perf); ++ ++ if (perf.bios_min_perf) ++ policy_data->min = perf_to_freq(perf, cpudata->nominal_freq, ++ perf.bios_min_perf); ++ else ++ policy_data->min = cpudata->lowest_nonlinear_freq; + } + + cpufreq_verify_within_cpu_limits(policy_data); +@@ -831,8 +854,10 @@ static void amd_pstate_update_limits(unsigned int cpu) + if (highest_perf_changed) { + WRITE_ONCE(cpudata->prefcore_ranking, cur_high); + +- if (cur_high < CPPC_MAX_PERF) ++ if (cur_high < CPPC_MAX_PERF) { + sched_set_itmt_core_prio((int)cur_high, cpu); ++ sched_update_asym_prefer_cpu(cpu, prev_high, cur_high); ++ } + } + } + +@@ -1024,6 +1049,10 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) + static void amd_pstate_cpu_exit(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ ++ /* Reset CPPC_REQ MSR to the BIOS value */ ++ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + + freq_qos_remove_request(&cpudata->req[1]); + freq_qos_remove_request(&cpudata->req[0]); +@@ -1419,7 +1448,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + struct amd_cpudata *cpudata; + union perf_cached perf; + struct device *dev; +- u64 value; + int ret; + + /* +@@ -1484,12 +1512,6 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) + cpudata->epp_default = AMD_CPPC_EPP_BALANCE_PERFORMANCE; + } + +- if (cpu_feature_enabled(X86_FEATURE_CPPC)) { +- ret = rdmsrl_on_cpu(cpudata->cpu, MSR_AMD_CPPC_REQ, &value); +- if (ret) +- return ret; +- WRITE_ONCE(cpudata->cppc_req_cached, value); +- } + ret = amd_pstate_set_epp(policy, cpudata->epp_default); + if (ret) + return ret; +@@ -1509,6 +1531,11 @@ static void amd_pstate_epp_cpu_exit(struct cpufreq_policy *policy) + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata) { ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ ++ /* Reset CPPC_REQ MSR to the BIOS value */ ++ amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); ++ + kfree(cpudata); + policy->driver_data = NULL; + } +@@ -1559,21 +1586,38 @@ static int amd_pstate_epp_set_policy(struct cpufreq_policy 
*policy) + return 0; + } + +-static int amd_pstate_epp_cpu_online(struct cpufreq_policy *policy) ++static int amd_pstate_cpu_online(struct cpufreq_policy *policy) + { +- pr_debug("AMD CPU Core %d going online\n", policy->cpu); +- + return amd_pstate_cppc_enable(policy); + } + +-static int amd_pstate_epp_cpu_offline(struct cpufreq_policy *policy) ++static int amd_pstate_cpu_offline(struct cpufreq_policy *policy) + { +- return 0; ++ struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ ++ /* ++ * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified ++ * min_perf value across kexec reboots. If this CPU is just onlined normally after this, the ++ * limits, epp and desired perf will get reset to the cached values in cpudata struct ++ */ ++ return amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); + } + +-static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) ++static int amd_pstate_suspend(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ int ret; ++ ++ /* ++ * Reset CPPC_REQ MSR to the BIOS value, this will allow us to retain the BIOS specified ++ * min_perf value across kexec reboots. If this CPU is just resumed back without kexec, ++ * the limits, epp and desired perf will get reset to the cached values in cpudata struct ++ */ ++ ret = amd_pstate_update_perf(policy, perf.bios_min_perf, 0U, 0U, 0U, false); ++ if (ret) ++ return ret; + + /* invalidate to ensure it's rewritten during resume */ + cpudata->cppc_req_cached = 0; +@@ -1584,6 +1628,17 @@ static int amd_pstate_epp_suspend(struct cpufreq_policy *policy) + return 0; + } + ++static int amd_pstate_resume(struct cpufreq_policy *policy) ++{ ++ struct amd_cpudata *cpudata = policy->driver_data; ++ union perf_cached perf = READ_ONCE(cpudata->perf); ++ int cur_perf = freq_to_perf(perf, cpudata->nominal_freq, policy->cur); ++ ++ /* Set CPPC_REQ to last sane value until the governor updates it */ ++ return amd_pstate_update_perf(policy, perf.min_limit_perf, cur_perf, perf.max_limit_perf, ++ 0U, false); ++} ++ + static int amd_pstate_epp_resume(struct cpufreq_policy *policy) + { + struct amd_cpudata *cpudata = policy->driver_data; +@@ -1609,6 +1664,10 @@ static struct cpufreq_driver amd_pstate_driver = { + .fast_switch = amd_pstate_fast_switch, + .init = amd_pstate_cpu_init, + .exit = amd_pstate_cpu_exit, ++ .online = amd_pstate_cpu_online, ++ .offline = amd_pstate_cpu_offline, ++ .suspend = amd_pstate_suspend, ++ .resume = amd_pstate_resume, + .set_boost = amd_pstate_set_boost, + .update_limits = amd_pstate_update_limits, + .name = "amd-pstate", +@@ -1621,9 +1680,9 @@ static struct cpufreq_driver amd_pstate_epp_driver = { + .setpolicy = amd_pstate_epp_set_policy, + .init = amd_pstate_epp_cpu_init, + .exit = amd_pstate_epp_cpu_exit, +- .offline = amd_pstate_epp_cpu_offline, +- .online = amd_pstate_epp_cpu_online, +- .suspend = amd_pstate_epp_suspend, ++ .offline = amd_pstate_cpu_offline, ++ .online = amd_pstate_cpu_online, ++ .suspend = amd_pstate_suspend, + .resume = amd_pstate_epp_resume, + .update_limits = amd_pstate_update_limits, + .set_boost = amd_pstate_set_boost, +diff --git a/drivers/cpufreq/amd-pstate.h b/drivers/cpufreq/amd-pstate.h +index fbe1c08d3f06..2f7ae364d331 100644 +--- a/drivers/cpufreq/amd-pstate.h ++++ b/drivers/cpufreq/amd-pstate.h +@@ -30,6 +30,7 @@ + * @lowest_perf: the absolute lowest performance level of the 
processor + * @min_limit_perf: Cached value of the performance corresponding to policy->min + * @max_limit_perf: Cached value of the performance corresponding to policy->max ++ * @bios_min_perf: Cached perf value corresponding to the "Requested CPU Min Frequency" BIOS option + */ + union perf_cached { + struct { +@@ -39,6 +40,7 @@ union perf_cached { + u8 lowest_perf; + u8 min_limit_perf; + u8 max_limit_perf; ++ u8 bios_min_perf; + }; + u64 val; + }; +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 7b4301b7235f..198bb5cc1774 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -195,6 +195,8 @@ struct sched_domain_topology_level { + }; + + extern void __init set_sched_topology(struct sched_domain_topology_level *tl); ++extern void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio); ++ + + # define SD_INIT_NAME(type) .name = #type + +@@ -223,6 +225,10 @@ static inline bool cpus_share_resources(int this_cpu, int that_cpu) + return true; + } + ++static inline void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) ++{ ++} ++ + #endif /* !CONFIG_SMP */ + + #if defined(CONFIG_ENERGY_MODEL) && defined(CONFIG_CPU_FREQ_GOV_SCHEDUTIL) +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 56ae54e0ce6a..557246880a7e 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -588,6 +588,10 @@ static void register_sd(struct sched_domain *sd, struct dentry *parent) + debugfs_create_file("flags", 0444, parent, &sd->flags, &sd_flags_fops); + debugfs_create_file("groups_flags", 0444, parent, &sd->groups->flags, &sd_flags_fops); + debugfs_create_u32("level", 0444, parent, (u32 *)&sd->level); ++ ++ if (sd->flags & SD_ASYM_PACKING) ++ debugfs_create_u32("group_asym_prefer_cpu", 0444, parent, ++ (u32 *)&sd->groups->asym_prefer_cpu); + } + + void update_sched_domain_debugfs(void) +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fb9bf995a47..8d0f462e8c8b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10256,7 +10256,7 @@ sched_group_asym(struct lb_env *env, struct sg_lb_stats *sgs, struct sched_group + (sgs->group_weight - sgs->idle_cpus != 1)) + return false; + +- return sched_asym(env->sd, env->dst_cpu, group->asym_prefer_cpu); ++ return sched_asym(env->sd, env->dst_cpu, READ_ONCE(group->asym_prefer_cpu)); + } + + /* One group has more than one SMT CPU while the other group does not */ +@@ -10493,7 +10493,8 @@ static bool update_sd_pick_busiest(struct lb_env *env, + + case group_asym_packing: + /* Prefer to move from lowest priority CPU's work */ +- return sched_asym_prefer(sds->busiest->asym_prefer_cpu, sg->asym_prefer_cpu); ++ return sched_asym_prefer(READ_ONCE(sds->busiest->asym_prefer_cpu), ++ READ_ONCE(sg->asym_prefer_cpu)); + + case group_misfit_task: + /* +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index f1ebc60d967f..8426de317835 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -1333,6 +1333,64 @@ static void init_sched_groups_capacity(int cpu, struct sched_domain *sd) + update_group_capacity(sd, cpu); + } + ++#ifdef CONFIG_SMP ++ ++/* Update the "asym_prefer_cpu" when arch_asym_cpu_priority() changes. 
*/ ++void sched_update_asym_prefer_cpu(int cpu, int old_prio, int new_prio) ++{ ++ int asym_prefer_cpu = cpu; ++ struct sched_domain *sd; ++ ++ guard(rcu)(); ++ ++ for_each_domain(cpu, sd) { ++ struct sched_group *sg; ++ int group_cpu; ++ ++ if (!(sd->flags & SD_ASYM_PACKING)) ++ continue; ++ ++ /* ++ * Groups of overlapping domain are replicated per NUMA ++ * node and will require updating "asym_prefer_cpu" on ++ * each local copy. ++ * ++ * If you are hitting this warning, consider moving ++ * "sg->asym_prefer_cpu" to "sg->sgc->asym_prefer_cpu" ++ * which is shared by all the overlapping groups. ++ */ ++ WARN_ON_ONCE(sd->flags & SD_OVERLAP); ++ ++ sg = sd->groups; ++ if (cpu != sg->asym_prefer_cpu) { ++ /* ++ * Since the parent is a superset of the current group, ++ * if the cpu is not the "asym_prefer_cpu" at the ++ * current level, it cannot be the preferred CPU at a ++ * higher levels either. ++ */ ++ if (!sched_asym_prefer(cpu, sg->asym_prefer_cpu)) ++ return; ++ ++ WRITE_ONCE(sg->asym_prefer_cpu, cpu); ++ continue; ++ } ++ ++ /* Ranking has improved; CPU is still the preferred one. */ ++ if (new_prio >= old_prio) ++ continue; ++ ++ for_each_cpu(group_cpu, sched_group_span(sg)) { ++ if (sched_asym_prefer(group_cpu, asym_prefer_cpu)) ++ asym_prefer_cpu = group_cpu; ++ } ++ ++ WRITE_ONCE(sg->asym_prefer_cpu, asym_prefer_cpu); ++ } ++} ++ ++#endif /* CONFIG_SMP */ ++ + /* + * Set of available CPUs grouped by their corresponding capacities + * Each list entry contains a CPU mask reflecting CPUs that share the same +-- +2.50.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.15/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.15/0004-bbr3.patch new file mode 100644 index 0000000..4a0e492 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/0004-bbr3.patch @@ -0,0 +1,3404 @@ +From 103efa50b54199447f56196e0b1b2f6d13db2a54 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 4 Jun 2025 16:41:07 +0200 +Subject: [PATCH 4/8] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 73 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2232 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 4 +- + 16 files changed, 1941 insertions(+), 555 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 1669d95bb0f9..951a5ed55a27 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -248,7 +248,8 @@ struct tcp_sock { + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); + #endif + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -305,7 +306,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 1735db332aab..2c4a94af7093 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 4450c384ef17..61f73ca30be3 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -379,11 +379,14 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + ++ + static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) + { + return tp->ecn_flags & TCP_ECN_MODE_ANY; +@@ -840,6 +843,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -945,6 +957,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1043,9 +1060,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1158,6 +1180,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1180,7 +1203,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1200,10 +1227,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1214,7 +1244,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1238,8 +1270,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1305,6 +1340,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1324,6 +1367,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1336,6 +1380,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2489,7 +2548,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dc8fdc80e16b..6b2003dbae81 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -184,6 +184,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ ++#define TCPI_OPT_ECN_LOW 128 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 6d2c97f8e9ef..ddc116ef22cb 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 6edc441b3702..bf52c5744acf 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3411,6 +3411,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4158,6 +4159,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..066da5e5747c 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,122 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return (tcp_ecn_mode_any(tp)) && (tp->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +383,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +410,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +434,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +457,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +474,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +535,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +548,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +580,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +600,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +671,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +682,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +711,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +740,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +796,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +804,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +850,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +859,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +887,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +924,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +947,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +972,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2361,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2398,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index a35018e2d0ba..b849d76b24da 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -381,7 +381,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -392,7 +392,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1139,7 +1139,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1511,6 +1516,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3848,7 +3864,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3865,6 +3882,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3875,6 +3893,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3994,6 +4017,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4059,7 +4083,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4083,6 +4107,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4103,7 +4128,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5782,13 +5807,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index fb9349be36b8..3c53e39f8201 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -472,6 +472,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 13295a59d22e..3effb6e51e96 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1614,7 +1617,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1689,6 +1692,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2045,13 +2072,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2777,6 +2803,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2989,6 +3016,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index e4c616bbd727..e4a7a25d667d 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -565,7 +565,7 @@ void tcp_retransmit_timer(struct sock *sk) + struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + +- rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: ++ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; +@@ -702,6 +702,8 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.50.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.15/0005-block.patch.skip b/sys-kernel/gentoo-sources-6.15/0005-block.patch.skip new file mode 100644 index 0000000..2b076d6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/0005-block.patch.skip @@ -0,0 +1,288 @@ +From 4ef24b41f1c812f829943ac1b0f2f245cee2eba8 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Wed, 4 Jun 2025 16:41:18 +0200 +Subject: [PATCH 5/8] block + +Signed-off-by: Peter Jung +--- + block/bfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++------ + block/bfq-iosched.h | 12 +++++++++-- + block/mq-deadline.c | 48 +++++++++++++++++++++++++++++++++++------ + 3 files changed, 96 insertions(+), 16 deletions(-) + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index abd80dc13562..cd06c79c4e92 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) + return icq; + } + ++static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) ++{ ++ if (!current->io_context) ++ return NULL; ++ if 
(spin_trylock_irq(&q->queue_lock)) { ++ struct bfq_io_cq *icq; ++ ++ icq = icq_to_bic(ioc_lookup_icq(q)); ++ spin_unlock_irq(&q->queue_lock); ++ return icq; ++ } ++ ++ return NULL; ++} ++ + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. +@@ -2465,10 +2480,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. + */ +- struct bfq_io_cq *bic = bfq_bic_lookup(q); ++ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); + bool ret; + +- spin_lock_irq(&bfqd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock_irq(&bfqd->lock)) ++ return false; + + if (bic) { + /* +@@ -5317,6 +5343,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. ++ */ ++ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || ++ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) ++ return NULL; ++ + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; +@@ -5328,6 +5366,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + ++ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, +@@ -6250,10 +6289,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, + + static struct bfq_queue *bfq_init_rq(struct request *rq); + +-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags) + { +- struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; +@@ -6315,7 +6353,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- bfq_insert_request(hctx, rq, flags); ++ bfq_insert_request(hctx->queue, rq, flags); + } + } + +@@ -7254,6 +7292,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + ++ spin_lock_init(&bfqd->lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
+ * Grab a permanent reference to it, so that the normal code flow +@@ -7371,8 +7411,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + +- spin_lock_init(&bfqd->lock); +- + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 687a3a7ba784..8589b58af79f 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -504,12 +504,22 @@ struct bfq_io_cq { + unsigned int requests; /* Number of requests this process has in flight */ + }; + ++enum { ++ BFQ_DISPATCHING = 0, ++}; ++ + /** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ + struct bfq_data { ++ struct { ++ spinlock_t lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; ++ + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ +@@ -795,8 +805,6 @@ struct bfq_data { + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + +- spinlock_t lock; +- + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 754f6b7415cd..a5fa8f86178d 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -79,10 +79,20 @@ struct dd_per_prio { + struct io_stats_per_prio stats; + }; + ++enum { ++ DD_DISPATCHING = 0, ++}; ++ + struct deadline_data { + /* + * run time data + */ ++ struct { ++ spinlock_t lock; ++ spinlock_t zone_lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; + + struct dd_per_prio per_prio[DD_PRIO_COUNT]; + +@@ -100,8 +110,6 @@ struct deadline_data { + int front_merges; + u32 async_depth; + int prio_aging_expire; +- +- spinlock_t lock; + }; + + /* Maps an I/O priority class to a deadline scheduler priority. */ +@@ -466,6 +474,18 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct request *rq; + enum dd_prio prio; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. 
++ */ ++ if (test_bit(DD_DISPATCHING, &dd->run_state) || ++ test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) ++ return NULL; ++ + spin_lock(&dd->lock); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) +@@ -482,6 +502,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + } + + unlock: ++ clear_bit_unlock(DD_DISPATCHING, &dd->run_state); + spin_unlock(&dd->lock); + + return rq; +@@ -585,6 +606,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + + eq->elevator_data = dd; + ++ spin_lock_init(&dd->lock); ++ spin_lock_init(&dd->zone_lock); ++ + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + +@@ -601,7 +625,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; +- spin_lock_init(&dd->lock); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); +@@ -657,7 +680,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + struct request *free = NULL; + bool ret; + +- spin_lock(&dd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock(&dd->lock)) ++ return false; ++ + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + +@@ -670,10 +705,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + /* + * add rq to rbtree and fifo + */ +-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free) + { +- struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); +@@ -731,7 +765,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- dd_insert_request(hctx, rq, flags, &free); ++ dd_insert_request(q, rq, flags, &free); + } + spin_unlock(&dd->lock); + +-- +2.50.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch new file mode 100644 index 0000000..4350db3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-01-20-sched-Cache-aware-load-balancing.patch @@ -0,0 +1,803 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0EE3F2F4A0C + for ; Wed, 18 Jun 2025 18:21:36 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + 
t=1750270898; cv=none; b=i6UpioJgMk5GxDDiJNU6ym/ql7fYtIxc3m0laytI789opI8LDjTvyDtwqrIQyQ1c4ZCnekjBz/wO4Aujx1CK9ipZtczqav2p9tw1Hd3Voibb0lwiXLdi8v6PAAo2cAsX9FlCAQdMHkE7TrPq3hrfK8cTUOewsHi11k8otsoUoF8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270898; c=relaxed/simple; + bh=TYBL04yFFQ2m3vahwqxlPWkj0L0Itx8xkCpecBkZyTo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=NvFXEMhOMy2ly3nACqhZx/B7xBuxxjLzhrCAPQhizdE6auJ2An7O1CCizePDbPUt3x9R8OXvvONlUeFHXfcGpYcXF9IC+2ogKFFrxsqx07VvjHSs8Ud1keSWy0e/3m5mCyNcyiv9/x9sdp1mHOJsM6ZH+h1t1P9HV+a9nVVwfeU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=m8zYzaEs; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="m8zYzaEs" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270896; x=1781806896; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=TYBL04yFFQ2m3vahwqxlPWkj0L0Itx8xkCpecBkZyTo=; + b=m8zYzaEsszmRk8d10p09dWuoM0QHBZbGZnY8ZU7YQ5oc73eP1UDpRePw + Iu/jPTNT8uBPQfAA79j/b4tbxQImw3Vm2EFuhqgbCGzn8WuO4p6CfGONf + nNIOGop6F+y60hE6rmyGD7GqoufIf/Xz+S8d55r5HVm1AioodkMfJsCMq + p01kqQ4AItdgoMkEnocPNDlyafzx3MsZtFHCCqR6F929sYF5LFax3HY4G + ozjUU62bKBmlkBn1eho1JY0ZSLQPetm2LAIQE4QynDS4MQ7bq112tEsWH + BJ/IXkd445+qfY1nWMwW+JO0pyelws29KEOXY7BvPQVHFy5P4DCp3UAI/ + w==; +X-CSE-ConnectionGUID: 97R+yvzpS6K/KMRnntFzgg== +X-CSE-MsgGUID: WI5jQwWESQSCLf6+tddXlg== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931324" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931324" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:35 -0700 +X-CSE-ConnectionGUID: 3Wla1slbSPWqbG0bkSSkVA== +X-CSE-MsgGUID: c3ximr2GR+C5Bp6cvb58uA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959507" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:34 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 01/20] sched: Cache aware load-balancing +Date: Wed, 18 Jun 2025 11:27:49 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Peter Zijlstra + +Hi all, + +One of the many things on the eternal todo list has been finishing the +below hackery. 
+ +It is an attempt at modelling cache affinity -- and while the patch +really only targets LLC, it could very well be extended to also apply to +clusters (L2). Specifically any case of multiple cache domains inside a +node. + +Anyway, I wrote this about a year ago, and I mentioned this at the +recent OSPM conf where Gautham and Prateek expressed interest in playing +with this code. + +So here goes, very rough and largely unproven code ahead :-) + +It applies to current tip/master, but I know it will fail the __percpu +validation that sits in -next, although that shouldn't be terribly hard +to fix up. + +As is, it only computes a CPU inside the LLC that has the highest recent +runtime, this CPU is then used in the wake-up path to steer towards this +LLC and in task_hot() to limit migrations away from it. + +More elaborate things could be done, notably there is an XXX in there +somewhere about finding the best LLC inside a NODE (interaction with +NUMA_BALANCING). + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 4 + + kernel/fork.c | 5 + + kernel/sched/core.c | 13 +- + kernel/sched/fair.c | 330 +++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 8 + + 7 files changed, 388 insertions(+), 20 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 56d07edd01f9..013291c6aaa2 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -893,6 +893,12 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++ unsigned long occ; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -983,6 +989,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1393,6 +1410,33 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f96ac1982893..d0e4cda2b3cd 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1399,6 +1399,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index bf3a920064be..e2509127b6f9 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -953,6 +953,10 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware scheduler" ++ default y ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index 168681fc4b25..da1387823b9e 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1332,6 +1332,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1341,6 +1344,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index c81cf642dba0..d9c3e75f79d1 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4524,6 +4524,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->migration_pending = NULL; + #endif + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8526,6 +8527,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; + + void __init sched_init(void) + { ++ unsigned long now = jiffies; + unsigned long ptr = 0; + int i; + +@@ -8600,7 +8602,7 @@ void __init sched_init(void) + raw_spin_lock_init(&rq->__lock); + rq->nr_running = 0; + rq->calc_load_active = 0; +- rq->calc_load_update = jiffies + LOAD_FREQ; ++ rq->calc_load_update = now + LOAD_FREQ; + init_cfs_rq(&rq->cfs); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); +@@ -8644,7 +8646,7 @@ void __init sched_init(void) + rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->balance_callback = &balance_push_callback; + rq->active_balance = 0; +- rq->next_balance = jiffies; ++ rq->next_balance = now; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; +@@ -8656,7 +8658,7 @@ void __init sched_init(void) + + rq_attach_root(rq, &def_root_domain); + #ifdef CONFIG_NO_HZ_COMMON +- rq->last_blocked_load_update_tick = jiffies; ++ rq->last_blocked_load_update_tick = now; + atomic_set(&rq->nohz_flags, 0); + + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +@@ -8681,6 +8683,11 @@ void __init sched_init(void) + 
+ rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = now; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fb9bf995a47..df7d4a324fbe 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + return delta_exec; + } + +-static inline void update_curr_task(struct task_struct *p, s64 delta_exec) ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. ++ */ ++#define EPOCH_PERIOD (HZ/100) /* 10 ms */ ++#define EPOCH_OLD 5 /* 50 ms */ ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = epoch = rq->cpu_epoch; ++ pcpu_sched->occ = -1; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ /* ++ * init_task and kthreads don't be having no mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, invalidate ++ * it's preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ mm->mm_sched_cpu = -1; ++ pcpu_sched->occ = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ int cpu, m_a_cpu = -1; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ scoped_guard (cpus_read_lock) { ++ cpumask_copy(cpus, cpu_online_mask); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, nr = 0, i; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ nr++; ++ trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", ++ per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); ++ } ++ ++ a_occ /= nr; ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", ++ per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ /* XXX threshold ? */ ++ per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ; ++ } ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ /* ++ * If the max average cache occupancy is 'small' we don't care. 
++ */ ++ if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) ++ m_a_cpu = -1; ++ ++ mm->mm_sched_cpu = m_a_cpu; ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ ++static inline ++void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec) + { + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); ++ account_mm_sched(rq, p, delta_exec); + cgroup_account_cputime(p, delta_exec); + } + +@@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq) + + delta_exec = update_curr_se(rq, &donor->se); + if (likely(delta_exec > 0)) +- update_curr_task(donor, delta_exec); ++ update_curr_task(rq, donor, delta_exec); + + return delta_exec; + } +@@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + +- update_curr_task(p, delta_exec); ++ update_curr_task(rq, p, delta_exec); + + /* + * If the fair_server is active, we need to account for the +@@ -7848,7 +8067,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); +- ++again: + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_cpu(task_util, util_min, util_max, target)) + return target; +@@ -7886,7 +8105,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + /* Check a recently used CPU as a potential idle candidate: */ + recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; +- if (recent_used_cpu != prev && ++ if (prev == p->wake_cpu && ++ recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +@@ -7939,6 +8159,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned)i < nr_cpumask_bits) + return i; + ++ if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) { ++ /* ++ * Most likely select_cache_cpu() will have re-directed ++ * the wakeup, but getting here means the preferred cache is ++ * too busy, so re-try with the actual previous. ++ * ++ * XXX wake_affine is lost for this pass. 
++ */ ++ prev = target = p->wake_cpu; ++ goto again; ++ } ++ + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster +@@ -8561,6 +8793,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + return target; + } + ++#ifdef CONFIG_SCHED_CACHE ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++ ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ struct mm_struct *mm = p->mm; ++ int cpu; ++ ++ if (!mm || p->nr_cpus_allowed == 1) ++ return prev_cpu; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return prev_cpu; ++ ++ ++ if (static_branch_likely(&sched_numa_balancing) && ++ __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { ++ /* ++ * XXX look for max occupancy inside prev_cpu's node ++ */ ++ return prev_cpu; ++ } ++ ++ return cpu; ++} ++#else ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ return prev_cpu; ++} ++#endif ++ ++ + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, +@@ -8586,6 +8852,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); ++ guard(rcu)(); ++ + if (wake_flags & WF_TTWU) { + record_wakee(p); + +@@ -8593,6 +8861,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + ++ new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu); ++ + if (!is_rd_overutilized(this_rq()->rd)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) +@@ -8603,7 +8873,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); + } + +- rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, +@@ -8636,7 +8905,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } +- rcu_read_unlock(); + + return new_cpu; + } +@@ -9286,6 +9554,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm && p->mm->pcpu_sched) { ++ /* ++ * XXX things like Skylake have non-inclusive L3 and might not ++ * like this L3 centric view. What to do about L2 stickyness ? ++ */ ++ return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > ++ per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; ++ } ++#endif ++ + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -9297,27 +9576,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. 
+ */ +-static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) + { + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + +- if (!static_branch_likely(&sched_numa_balancing)) +- return 0; +- +- if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) ++ if (!p->numa_faults) + return 0; + +- src_nid = cpu_to_node(env->src_cpu); +- dst_nid = cpu_to_node(env->dst_cpu); ++ src_nid = cpu_to_node(src_cpu); ++ dst_nid = cpu_to_node(dst_cpu); + + if (src_nid == dst_nid) + return 0; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { +- if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) ++ struct rq *src_rq = cpu_rq(src_cpu); ++ if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return 0; +@@ -9328,7 +9605,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return -1; + + /* Leaving a core idle is often worse than degrading locality. */ +- if (env->idle == CPU_IDLE) ++ if (idle) + return 0; + + dist = node_distance(src_nid, dst_nid); +@@ -9343,7 +9620,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return src_weight - dst_weight; + } + ++static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++{ ++ if (!static_branch_likely(&sched_numa_balancing)) ++ return 0; ++ ++ if (!(env->sd->flags & SD_NUMA)) ++ return 0; ++ ++ return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu, ++ env->idle == CPU_IDLE); ++} ++ + #else ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++{ ++ return 0; ++} ++ + static inline long migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) + { +@@ -13102,8 +13396,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} + */ + static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; ++ struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +@@ -13113,6 +13407,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 47972f34ea70..d16ccd66ca07 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1171,6 +1171,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3861,6 +3867,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_SMP +-- +2.32.0 + + diff --git 
a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch new file mode 100644 index 0000000..2527cbc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-02-20-sched-Several-fixes-for-cache-aware-scheduling.patch @@ -0,0 +1,230 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C4E492F4A06 + for ; Wed, 18 Jun 2025 18:21:47 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270909; cv=none; b=OfHuG3LBktIYQ17A0wezDcBygFYIqQGnVMGi+J74hfqxXToXJOkcfe/QAshk8VQr3iHhepGalcue2+Gh9lXUo6YIap3bPlMoXEKyEF/uKj/HOqBRTfPfSFVzKLCzuG1BPrKVWm/9VqF9CtRE/PxcAkoHlqkKJK38pOqYBIlTkpU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270909; c=relaxed/simple; + bh=bDfBIxEdnv+hYygaV+u3o+TV5wT/EFTyHlaYTI7nFpw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=UKMgGxYucBL+0GlD7p3d/Zom0U72DS+gR1yhIB67WE5LMqkpy3l5lREKGfo/WMbkvVplPyT3O4LIWAcMuzVNNgwy1U2yRPrfXUYrbe55jB5Ido4zUO7riYoUV38Tur4ZomgT4/03W4QWQnIXvK43x1VsVDq4rpLILrJkc0MLRz4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZPYgHNmn; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZPYgHNmn" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270907; x=1781806907; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=bDfBIxEdnv+hYygaV+u3o+TV5wT/EFTyHlaYTI7nFpw=; + b=ZPYgHNmnGkft4GpxmGCyeKoUkQ215gIizexSy0kMiXa/NTiZo/gGlfyT + kUMF1ZHUyeWicU44K6z5ga9ude/u1b7dOInRSMexJBl7xg2wWzt43htgN + 0SZD8bKm1Psl9VbJQzK0J75KsgRFBxuGgnNUxw1QIktunEAn4cwXF97df + RDtco1RU/rA4YaqRY//20xf5f/vHjT5XptItMKZ8mzjiE8ikBvrqvp2HQ + 6dxxSgUzxm0LyOJqknyMhoG24RtRQixTDOHtv0zR/a3Etu3Yfy4fbLcgR + 4fJb3VSDSPIj3CLVFTTh/id7nCY7gUsIPDC9MnN4GNRnUeVr8jGQaTFWv + Q==; +X-CSE-ConnectionGUID: YR1pxp3JSTSWW1r6sY2fCw== +X-CSE-MsgGUID: N3vXqxYJSWukJNGgS8J6EQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931471" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931471" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:47 -0700 +X-CSE-ConnectionGUID: 4Lza8fDiSzyAbCqKOh54VQ== +X-CSE-MsgGUID: sumQVaeBRRG5ZeV1Vg8XHA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959777" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:46 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham 
R . Shenoy" +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org +Subject: [RFC patch v3 02/20] sched: Several fixes for cache aware scheduling +Date: Wed, 18 Jun 2025 11:27:50 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +1. Fix compile error on percpu allocation. +2. Enqueue to the target CPU rather than the current CPU. +3. NULL LLC sched domain check(Libo Chen). +4. Introduce sched feature SCHED_CACHE to control cache aware scheduling +5. Fix unsigned occupancy initialization to -1. +6. If there is only 1 thread in the process, no need to enable cache + awareness +7. Add __maybe_unused to __migrate_degrades_locality() to + avoid compile warnings. + +Signed-off-by: Chen Yu +--- + include/linux/mm_types.h | 4 ++-- + kernel/sched/fair.c | 27 ++++++++++++++++----------- + kernel/sched/features.h | 1 + + 3 files changed, 19 insertions(+), 13 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 013291c6aaa2..9de4a0a13c4d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1411,11 +1411,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas + #endif /* CONFIG_SCHED_MM_CID */ + + #ifdef CONFIG_SCHED_CACHE +-extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++extern void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); + + static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + { +- struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); + if (!pcpu_sched) + return -ENOMEM; + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index df7d4a324fbe..89db97f8ef02 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + +-void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; + int i; +@@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) + + pcpu_sched->runtime = 0; + pcpu_sched->epoch = epoch = rq->cpu_epoch; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + + raw_spin_lock_init(&mm->mm_sched_lock); +@@ -1254,7 +1254,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (!mm || !mm->pcpu_sched) + return; + +- pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); +@@ -1264,12 +1264,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + } + + /* +- * If this task hasn't hit task_cache_work() for a while, invalidate ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate + * it's preferred state. 
+ */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD || ++ get_nr_threads(p) <= 1) { + mm->mm_sched_cpu = -1; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + } + +@@ -1286,9 +1288,6 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + + guard(raw_spinlock)(&mm->mm_sched_lock); + +- if (mm->mm_sched_epoch == rq->cpu_epoch) +- return; +- + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); +@@ -1322,6 +1321,9 @@ static void task_cache_work(struct callback_head *work) + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, nr = 0, i; + ++ if (!sd) ++ continue; ++ + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->pcpu_sched, i)); +@@ -8801,6 +8803,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + ++ if (!sched_feat(SCHED_CACHE)) ++ return prev_cpu; ++ + if (!mm || p->nr_cpus_allowed == 1) + return prev_cpu; + +@@ -9555,7 +9560,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + return 0; + + #ifdef CONFIG_SCHED_CACHE +- if (p->mm && p->mm->pcpu_sched) { ++ if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { + /* + * XXX things like Skylake have non-inclusive L3 and might not + * like this L3 centric view. What to do about L2 stickyness ? +@@ -9633,7 +9638,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + } + + #else +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) + { + return 0; + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch new file mode 100644 index 0000000..152e3a3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-03-20-sched-Avoid-task-migration-within-its-preferred-LLC.patch @@ -0,0 +1,112 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B98A42F4A13 + for ; Wed, 18 Jun 2025 18:21:49 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270911; cv=none; b=H37a6GaMCpKVbBfru0xhkv/YQMjuzakfh40XV8mJ06HkLTiVswK7M40TUc0iJ2+QdHbjvIsa3fkD0Ch9hrzqgWR417U/tS7He62fpoRnc/RWieBtEAO7KEIcS4LI+2bm+YmBVIN6m7jaZ7yUlmNHWqu6HcD8VDmZ1CHMeSMizgk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270911; c=relaxed/simple; + bh=56rh2PM2yAL35gap+jzrhdtnDXnsh7kQnStk2sSwL5M=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=G2idI3SEXApx4GgXZ/P+aOwo15Jk0qYkNXdo1GgZkBwTVB4/wZnkC8GaXlcdpiVIBcXRH+vvno/YvO528eUsbhW6TDkWnRait/B5YQRy1pg5uGy44IKpVEiwxTH7cssQJgE1Tsmt4x5g8AlJKz8IC1CaADUdr9RqjRgXQ9GaDLs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=nWZpvDE7; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="nWZpvDE7" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270909; x=1781806909; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=56rh2PM2yAL35gap+jzrhdtnDXnsh7kQnStk2sSwL5M=; + b=nWZpvDE7SLXGGfo3jHHGEomAw3ClY8hUPhWrfFErNHzBVjyIHSQntZd2 + 1nZXjrvyQxuRk9ZxQYH3QHfm14LYHc70BTSraxKI+8chQcuVj6tsAgrD1 + RWdBKfVBbXjt+LaiRwCbUYMll6u+jjqnmEMHSrUpZujg3klH237md9SXa + 37yFQxyarddD1nF8E+ny40AEdtC3cTGt5Ar19Wsp+W6417mEmx6ktZkZd + 2s4JDPZkFpV3gOOZumfVaiUV2iM+gjXUhTEiJqzIpaoYdgOiBcYYH7tlm + I/na3T9fEDfmsq6JxtGp9O8CDC/E09i7K1m27+a7hskfivj2Uj+kXW6uY + w==; +X-CSE-ConnectionGUID: iTxb4W8TSr+JVDmjhHc1nA== +X-CSE-MsgGUID: 3TLdAYAnTC6j+0rzbFr1IQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931484" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931484" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:47 -0700 +X-CSE-ConnectionGUID: VnM+trhjTJSnHQCXxzcG2w== +X-CSE-MsgGUID: xdjg5U3US8yEOUARte8LqQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959793" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:47 -0700 +From: Tim 
Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org +Subject: [RFC patch v3 03/20] sched: Avoid task migration within its preferred LLC +Date: Wed, 18 Jun 2025 11:27:51 -0700 +Message-Id: <284223928844c9ae64de1fa142f8db89775de27b.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +It was found that when running schbench, there is a +significant amount of in-LLC task migrations, even if +the wakee is woken up on its preferred LLC. This +leads to core-to-core latency and impairs performance. + +Inhibit task migration if the wakee is already in its +preferred LLC. + +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 89db97f8ef02..567ad2a0cfa2 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8813,6 +8813,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpu < 0) + return prev_cpu; + ++ if (cpus_share_cache(cpu, prev_cpu)) ++ return prev_cpu; + + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch new file mode 100644 index 0000000..313ef8d --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-04-20-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch @@ -0,0 +1,122 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8C23B2F4A0D + for ; Wed, 18 Jun 2025 18:21:49 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270911; cv=none; b=qgOqtbwj/b1lT+pk59OtjXjeOpPuI2zhfn0D21JXULqPLorw/ZgDKmNNS7Urzt/SfBFCUz801jIRaBO9Cslv2B7LxeJe//HjIB3+4P845payLN3vcYxAOxAbfLaUARgyfK6W4UUcOiOk0TZHcE9SbDMxaEDC/rvvNnXsMH1W0ug= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270911; c=relaxed/simple; + bh=nwpdkAoJpW4EOqPZQh+uhJ+qmZILgurvv6g4rDaWrYo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=HAZIoXDlwzbldKWFUzY/Q4Jaxhhc2Mkt0bb9WOBfRPLAMS3DspRW02WEau/R32ErYRWHNC9ZlNYoWyuZiO4M7Awr204pB/+urb4Knb8pmbJ2BIdcVmJE5vaTPh/a1tQlov0Ea2J7+pXzMfBW5Wl8AAuJzrs80wakQeKZDNOWJJ0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ivStBZKK; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ivStBZKK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270909; x=1781806909; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=nwpdkAoJpW4EOqPZQh+uhJ+qmZILgurvv6g4rDaWrYo=; + b=ivStBZKKr1kLat+jKvOOQxbf9d4bC1I/SJy7LDib/zgN3n589asFL4+r + PYsbbyBX6DjZAdbQ1Ik/G6Sc6usq0dD9Ziu/7QfHFyk6vz2whi6PSRkGc + qh0Xwo9lT9BAKy1pR/Oo42AYGiTgM5CVtwmFP8HFWa4TdxGk4w9dRkMX3 + 4DxdIZo1ar93mu1DEN1+6WJ0elyBKkJVxfHeC50jqR9/1tsGUnexjnQ7X + 3xWpf6BNsr2eTOb+JyArdlGpErCa4hDLn0ptngc6kRC2FeOvceeJAY/Rw + iKQTxZNYHsLyWzeH16AJJGbNG66qTaf60D44RqGpvBPrHgL3ZhdDghMpz + A==; +X-CSE-ConnectionGUID: x7oHEr0dTrOTH6OFzpdkdg== +X-CSE-MsgGUID: UfwNMRYcRZKjKAWmE406aQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931496" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931496" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:48 -0700 +X-CSE-ConnectionGUID: FdoRwVvNTIiAzpt945cGzw== +X-CSE-MsgGUID: TkAAX+2KQ6qAC2qWqI6SgA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959815" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:48 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 04/20] sched: Avoid calculating the cpumask if the system is overloaded +Date: Wed, 18 Jun 2025 11:27:52 -0700 +Message-Id: <2901db739c1792eb9ece08a2b8d3b8de32efd0eb.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: K Prateek Nayak + +If the SIS_UTIL cuts off idle cpu search, result of the cpumask_and() is +of no use. Since select_idle_cpu() can now be called twice per wake up +in the select_idle_sibling() due to cache aware wake up, this overhead +can be visible in benchmarks like hackbench. + +To save some additional cycles, especially in cases where we target +the LLC frequently and the search bails out because the LLC is busy, +only calculate the cpumask if the system is not overloaded. 
+ +Signed-off-by: K Prateek Nayak +--- + kernel/sched/fair.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 567ad2a0cfa2..6a2678f9d44a 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -7918,8 +7918,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; + +- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +- + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { +@@ -7931,6 +7929,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + } + } + ++ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch new file mode 100644 index 0000000..3bcee46 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-05-20-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch @@ -0,0 +1,157 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 546862F948D + for ; Wed, 18 Jun 2025 18:21:51 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270915; cv=none; b=gCit7OdxmL/z+sHqvOYlmAu0jpeZwqhZeORAcAbKfGEmC7Cut1d8DS6/6wZGZvCrl4vEp3HCh8qjH6ozHZZ6tcNiYj+z1Y0m+CEOlg676Q1clxzwBbJ2P+CbCmGGNeEg3rwixeD6+R0v81UsIKJXJfMsf8UU4IJzxCienwRrWeA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270915; c=relaxed/simple; + bh=GRt1KcpJc3uX3j1lW75IjTwwtuVzCFPIjuwEQmJ7o0c=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=FBJTd0ukxWRW+/MIo7zsf0SmGsGIBox8sqZYavmOM5fv6LOEqVyOr4wp2ndfb2WlEQFkw2Pp36oF0gqWn9mhyDwfdEsnpnqdcv1XeDKQshvz9ZWQHVjdtgZQp4BkNGTp0CFAVo9mAHC/VnVDZvOjoYt4QE/rqzevU8YFJ+7bEko= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MSgKa/Xs; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MSgKa/Xs" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270911; x=1781806911; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=GRt1KcpJc3uX3j1lW75IjTwwtuVzCFPIjuwEQmJ7o0c=; + b=MSgKa/Xs8zOJ4H2lqLt5AoI5bQ2Z5hgnpUup28q98ByliGPYLMtTuVoT + +lSE1UHq2qbmWe+CGFnXXNN0O11daPjgIRfzwTIAXeYYAwWhWm/SJOst4 + 2yClxpLCWgokA1/yxRHLW2J/20uBhmoIokqluLvohhQNoEZU8oeZgagEC + 
Urji0g1zMpdionTkeyTJrvrZh+ExyPjKEjVPQFLk6s+JnHq/wiwVKWRjm + iKAY3vicJCdgEZaqexyIOwSVKYAdj5Ds+qaro+e1pYLQIVMXZfJCil89W + 2TwNI95OlQaBFc3aLSCKuvjf4TUtpWBzOnftomqkcPANu/uLnxs7/ZZJ5 + Q==; +X-CSE-ConnectionGUID: OsOzpi1fQVmQCEIlEJnnaA== +X-CSE-MsgGUID: b5N1VeS7R72ao2jICSSBCQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931510" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931510" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:49 -0700 +X-CSE-ConnectionGUID: 03mrFPsdTSSv8opMC44BDA== +X-CSE-MsgGUID: vBS5/z1NSra3RrHanVjnzw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959835" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:49 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 05/20] sched: Add hysteresis to switch a task's preferred LLC +Date: Wed, 18 Jun 2025 11:27:53 -0700 +Message-Id: <7371f30196b317c0c5a0ae3fa463ec76a4dc69ef.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Switching a process's preferred LLC generates lots of task +migrations across LLCs. To avoid frequent switches +of home LLC, implement the following policy: + +1. Require a 2x occ change threshold to switch preferred LLC +2. Don't discard preferred LLC for a task + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6a2678f9d44a..7fb2322c5d9e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1299,6 +1307,7 @@ static void task_cache_work(struct callback_head *work) + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; ++ unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1; + cpumask_var_t cpus; + +@@ -1337,11 +1346,13 @@ static void task_cache_work(struct callback_head *work) + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } + +- a_occ /= nr; ++ // a_occ /= nr; + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ last_m_a_occ = a_occ; + + trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", + per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); +@@ -1355,13 +1366,10 @@ static void task_cache_work(struct callback_head *work) + } + } + +- /* +- * If the max average cache occupancy is 'small' we don't care. 
+- */ +- if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) +- m_a_cpu = -1; +- +- mm->mm_sched_cpu = m_a_cpu; ++ if (m_a_occ > (2 * last_m_a_occ)) { ++ /* avoid the bouncing of mm_sched_cpu */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } + + free_cpumask_var(cpus); + } +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch new file mode 100644 index 0000000..c2d0adb --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-06-20-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch @@ -0,0 +1,195 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 229A22F9488 + for ; Wed, 18 Jun 2025 18:21:51 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270912; cv=none; b=trjx3OlHRA8G6/xha0gDitGyeQDoeJ3CWSLlk3i5tVcRLQ172YBtygleCjw//E9Ox3BBaWN+ph357z6VKUSjNpOMgeWNiH6GkUqOMtdSlowllHpGMFXca9dnbLicNEyUsvDUBI3SRpuyiOyhA3wQi6hex0PK3QdUEepICMIjXZg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270912; c=relaxed/simple; + bh=o9ssIKZoupCUY+RNkWM8+C2a5S5kPuUisQEegnB07u4=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=iGg9GEruTAvKeZoWaAvOBUnqouWOODirEVRQZXGYRmNvKmUBtFTbO+hnJ2kyJkTtn96ZqswISYcW/8MaFKP99lSrk/CuZH9xxItsJABocbfde14vreP3VS50k7ELI9JacoChlVu0tLEaIQCSl73iwrgLFU0W1jkrM5FMDR2/bjQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=gB32GMi+; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gB32GMi+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270911; x=1781806911; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=o9ssIKZoupCUY+RNkWM8+C2a5S5kPuUisQEegnB07u4=; + b=gB32GMi+ZsCEz3HcthojXq5IZgpULXknP3fum+zJzV6sVIWDquFR+WL7 + p4aRxUvmzJTtGtb6DbOhdfWcTMwfYPeBoThNWTLkO+kN/Gx/5mzI4RDN3 + 3JDAP4eXcOHwI+Xgzs+L46NGuc3oyWxTcDIB8oNEL0esdpvR2zH9nzgTk + s/AtWUC3ubNeM+NWKgu756KSw8M2pErJkISkQA7CeZMciVqZKfTbgApS0 + EUcI9uAwecjVzzaaA+BEUO8jqFgqqlw3NL3G9rPT6t50c2BEIqPjRs/ed + abvo42eYS3OUZKzeI3iJnnkUKHv/OSzErtpgNUBNHgkLcvq3ke4HuEw9u + A==; +X-CSE-ConnectionGUID: PSo7rU+bQcqeonMNyHQ+LA== +X-CSE-MsgGUID: ps4xpG3nQJSMcwDcH4WDhg== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931523" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931523" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:50 -0700 +X-CSE-ConnectionGUID: SALh/A1xQRqjZut3f9eZCA== 
+X-CSE-MsgGUID: 8pi5QrQxSdKOJKN1adsLzw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959857" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:49 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org +Subject: [RFC patch v3 06/20] sched: Save the per LLC utilization for better cache aware scheduling +Date: Wed, 18 Jun 2025 11:27:54 -0700 +Message-Id: <22f5c52b3e904bd782c43bc4bfc6fcd4b447ea54.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system gets busy and a process's preferred LLC +is saturated by too many threads within this process, there are significant +in-LLC task migrations within its preferred LLC. This leads to migration +latency and degrades performance. Ideally, task aggregation should be +inhibited if the task's preferred LLC is overloaded. This implies that a +metric is needed to indicate whether the LLC is busy. + +Store the per-LLC utilization calculated via periodic load +balancing. These statistics will be used in subsequent patches to +determine whether tasks should be aggregated to their preferred LLC. + +Signed-off-by: Chen Yu +--- + include/linux/sched/topology.h | 3 ++ + kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++ + 2 files changed, 56 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 7b4301b7235f..b3115bc1cbc0 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -78,6 +78,9 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7fb2322c5d9e..02f104414b9a 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8806,6 +8806,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + #ifdef CONFIG_SCHED_CACHE + static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); + ++/* expected to be protected by rcu_read_lock() */ ++static bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = per_cpu(sd_llc_size, cpu) * SCHED_CAPACITY_SCALE; ++ ++ return true; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +@@ -10646,6 +10662,42 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Save this sched group's statistic for later use: ++ * The task wakeup and load balance can make better ++ * decision based on these statistics. 
++ */ ++static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* Find the sched domain that spans this group. */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care the sched domain that spans 1 LLC */ ++ if (!sd || !(sd->flags & SD_SHARE_LLC) || ++ !sd->parent || (sd->parent->flags & SD_SHARE_LLC)) ++ return; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util)) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++} ++#else ++static inline void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. +@@ -10735,6 +10787,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ update_sg_if_llc(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch new file mode 100644 index 0000000..10724fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-07-20-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch @@ -0,0 +1,279 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id CFC872F949E + for ; Wed, 18 Jun 2025 18:21:51 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270915; cv=none; b=Z36ep254EkAHaKvpr8i7KwV4mxwpymtfd2E0A6r1XAf82xXaqo+m0qmyyZX4NW84q+tYHhFu/VEpulhnEvbYzsslAlOsEdbj/cgL3d/z0RoO88Yz4a6r2b06VMNmTH912fsGsTfN/YW+VYbD10CiJVQVXES+s/I6OYnWE4BIsqk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270915; c=relaxed/simple; + bh=HEb10UrMOCgohlL9Nxhp873UqhXKYVZZ5sEwo42BWsw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=V5Z/EUYBBE/1u61niCwmDRwy0KcujoUnnkZEXjAt3pPVOfg6lPDVWI8oDLucO2nivKtjepMD9B9l2n0gliPOmSpL08JDjYQMNdIIyu7FMX+A1RKWk/VwrZpb/ie9Q5tCUnYK9oOIFmI9VCF2TdqXlfQyE/9gjbxg71N+r5vzmpE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=iTHUAET4; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass 
(2048-bit key) header.d=intel.com header.i=@intel.com header.b="iTHUAET4" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270911; x=1781806911; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=HEb10UrMOCgohlL9Nxhp873UqhXKYVZZ5sEwo42BWsw=; + b=iTHUAET4QNzU15KipInKDKa553rGd8p4EGv5ueHYEHtVMDJBGVWb9jyb + 0f+AUwD60jIkZnWy1Hh0X/GY6o6skz2X0cFXL5PYANaViVdrNCYYcBI9+ + 93Yy0pV59RjM4ec4buLe0pykhrmHjPvgPH4t2P2rCGCOr9UAxesex95B2 + ljR8tWCmfhg2uyQELaySWiA4N1O7lUWXcjia1sXNj2D47V0T1Gu0IqDrm + dg1Y/Am2QjVh/PycKohb5TLEWxUNqGp1dzcMX1OUkpYS92qdY5o4yxWJp + mPLxjSRGG+jTFdBLrVXXU3kBfrZAbV2I+WaHLGmP5yRhH+/iv/CBputDH + g==; +X-CSE-ConnectionGUID: nGL6WccUTluIsoPploYN4A== +X-CSE-MsgGUID: vdU2TrNrS76S5tDix/iXow== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931537" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931537" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:51 -0700 +X-CSE-ConnectionGUID: 2qoc+SXgS1igWBhKnpTp2g== +X-CSE-MsgGUID: +fdgV0C1R2CwRkSohE+TnA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959878" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:50 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 07/20] sched: Add helper function to decide whether to allow cache aware scheduling +Date: Wed, 18 Jun 2025 11:27:55 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Cache-aware scheduling is designed to aggregate threads into their +preferred LLC, either via the task wake up path or the load balancing +path. One side effect is that when the preferred LLC is saturated, +more threads will continue to be stacked on it, degrading the workload's +latency. A strategy is needed to prevent this aggregation from going too +far such that the preferred LLC is too overloaded. + +Introduce helper function _get_migrate_hint() to implement the LLC +migration policy: + +1) A task is aggregated to its preferred LLC if both source/dest LLC + are not too busy (<50% utilization, tunable), or the preferred + LLC will not be too out of balanced from the non preferred LLC + (>20% utilization, tunable, close to imbalance_pct of the LLC + domain). +2) Allow a task to be moved from the preferred LLC to the + non-preferred one if the non-preferred LLC will not be too out + of balanced from the preferred prompting an aggregation task + migration later. We are still experimenting with the aggregation + and migration policy. Some other possibilities are policy based + on LLC's load or average number of tasks running. Those could + be tried out by tweaking _get_migrate_hint(). + +The function _get_migrate_hint() returns migration suggestions for the upper-level +functions. 
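As a rough illustration of the policy described above, the following standalone C sketch (not the kernel code added later in this patch) shows how the two tunables - a 50% capacity cut-off and a 20% imbalance margin, mirroring sysctl_llc_aggr_cap and sysctl_llc_aggr_imb - combine into an allow/ignore/forbid hint for a task moving toward its preferred LLC. All names and numbers here are illustrative stand-ins, not the in-tree implementation:

/*
 * Illustrative user-space sketch of the migration-hint policy above.
 * Not the kernel implementation; the 50/20 defaults and helper names
 * merely mirror sysctl_llc_aggr_cap / sysctl_llc_aggr_imb.
 */
#include <stdbool.h>
#include <stdio.h>

#define LLC_AGGR_CAP 50	/* "not too busy" threshold, percent of capacity */
#define LLC_AGGR_IMB 20	/* allowed imbalance between two LLCs, percent */

static bool fits_llc_capacity(unsigned long util, unsigned long cap)
{
	return util * 100 < cap * LLC_AGGR_CAP;
}

static bool util_greater(unsigned long util1, unsigned long util2)
{
	return util1 * 100 > util2 * (100 + LLC_AGGR_IMB);
}

/* Hint for moving a task of tsk_util toward its preferred LLC. */
static const char *migrate_hint(unsigned long src_util, unsigned long dst_util,
				unsigned long cap, unsigned long tsk_util)
{
	/* both LLCs already saturated: aggregation will not help */
	if (!fits_llc_capacity(dst_util, cap) && !fits_llc_capacity(src_util, cap))
		return "ignore";

	src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
	dst_util += tsk_util;

	/* refuse if the preferred LLC gets too loaded and too imbalanced */
	if (!fits_llc_capacity(dst_util, cap) && util_greater(dst_util, src_util))
		return "forbid";

	return "allow";
}

int main(void)
{
	unsigned long cap = 8 * 1024;	/* say, 8 CPUs x SCHED_CAPACITY_SCALE */

	printf("lightly loaded dest: %s\n", migrate_hint(3000, 2000, cap, 512));
	printf("saturated dest:      %s\n", migrate_hint(1000, 7800, cap, 512));
	return 0;
}

With these defaults the first call reports "allow" (the preferred LLC still fits under half its capacity after taking the task), while the second reports "forbid" because the destination would be both over the cut-off and more than 20% busier than the source.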
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 110 ++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 5 ++ + 3 files changed, 118 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 56ae54e0ce6a..7271ad1152af 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -532,6 +532,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); ++ debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 02f104414b9a..10ea408d0e40 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8804,7 +8804,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + } + + #ifdef CONFIG_SCHED_CACHE +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++static long __migrate_degrades_locality(struct task_struct *p, ++ int src_cpu, int dst_cpu, ++ bool idle); ++__read_mostly unsigned int sysctl_llc_aggr_cap = 50; ++__read_mostly unsigned int sysctl_llc_aggr_imb = 20; ++ ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter sysctl_llc_aggr_cap determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * sysctl_llc_aggr_cap) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + sysctl_llc_aggr_imb)) ++ ++enum llc_mig_hint { ++ mig_allow = 0, ++ mig_ignore, ++ mig_forbid ++}; ++ + + /* expected to be protected by rcu_read_lock() */ + static bool get_llc_stats(int cpu, unsigned long *util, +@@ -8822,6 +8854,82 @@ static bool get_llc_stats(int cpu, unsigned long *util, + return true; + } + ++static enum llc_mig_hint _get_migrate_hint(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_allow; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_ignore; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * sysctl_llc_aggr_imb is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. 
++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_allow; ++} ++ ++/* ++ * Give suggestion when task p is migrated from src_cpu to dst_cpu. ++ */ ++static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ int cpu; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_allow; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return mig_allow; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), true); ++ else if (cpus_share_cache(src_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), false); ++ else ++ return mig_allow; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index d16ccd66ca07..1c6fd45c7f62 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2818,6 +2818,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int sysctl_llc_aggr_cap; ++extern unsigned int sysctl_llc_aggr_imb; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch new file mode 100644 index 0000000..5998f9e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-08-20-sched-Set-up-LLC-indexing.patch @@ -0,0 +1,224 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D922E2F94BC + for ; Wed, 18 Jun 2025 18:21:52 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270914; cv=none; b=qo1W6boom/FPcJM6aUbqBPYFsJsbF683DNq+T76orD1BhUuPT/cDgxLm/IdIt7lsAwwvhls6rRgrRp3wVI2a2orhxiRxH4pzTcUnStzKQ94lhDbiQkmwAnBP+Oe6i31HfDZbyBWWJXZl9duCrd/52c4F5rx8/huBgPpKES9g+o0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270914; c=relaxed/simple; + bh=cQUCpe4LZfLrzrqrRvhJ4zn32opdkosRxPW3YpSdQpw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=k8VqwMymU4J6e9GkRGLqFZapA+h/KmaiCqV6pWjSVsWwmgnsiVhiYCMQUYaIYCuRy8xdvkKxXLmpnnj81wxXCZFpX6tgVD+1igdqcNGnsw+8Dd7OREe7hmOl9DpLac08ZCVRrBBzXbfElsFBTfBsioIq0k6loXV4uYpQZrYBrMU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=FUltCdk8; arc=none 
smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="FUltCdk8" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270912; x=1781806912; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=cQUCpe4LZfLrzrqrRvhJ4zn32opdkosRxPW3YpSdQpw=; + b=FUltCdk86P1n8b/xtaRLXyRNaWFyT3up5/DOjijJa7AptKtTq7F17sT9 + +qgOnAsLS4hrBcE0M3b/NgTR2SCnVrsRwXJKozVz4N9t01io3n/dvVQKq + 38gbrgGuDv4YYXh0s0Tdj8hQgPW825VDrCKW2iASUc/Zz+VmPLgQmKiPp + FyR41eBRrDbzEAAwNxvUiMbjT740rIgIieuCoK8C/tv7tcqrUHNVi1T/k + b/vnTpMgt+sYhmf2tlLBsLIZRkLEKBWUqEj3rUfk1D31j6gtYWu/kqjBU + asxz7novbH7ygWHnG2F/F9OvSjwzEc0+3fkEynQRMLwKacbHoHNVa0CrB + w==; +X-CSE-ConnectionGUID: pkqYpBEfS3upf7WK/GoLIw== +X-CSE-MsgGUID: paR/450tR6eOwDodmnSOag== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931550" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931550" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:52 -0700 +X-CSE-ConnectionGUID: uNM5F0vKRkCxQO9WvZjBoQ== +X-CSE-MsgGUID: qccG1toMTWyEIVjk+nl3kg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959901" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:51 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 08/20] sched: Set up LLC indexing +Date: Wed, 18 Jun 2025 11:27:56 -0700 +Message-Id: <71e251a086be786fb2a0480bbab69142d14bd22d.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Prepare for indexing arrays that track in each run queue: the number +of tasks preferring current LLC and each of the other LLC. + +The reason to introduce LLC index is because the per LLC-scope data +is needed to do cache aware load balancing. However, the native lld_id +is usually the first CPU of that LLC domain, which is not continuous, +which might waste the space if the per LLC-scope data is stored +in an array (in current implementation). + +In the future, this LLC index could be removed after +the native llc_id is used as the key to search into xarray based +array. 
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 3 +++ + kernel/sched/fair.c | 12 ++++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 29 +++++++++++++++++++++++++++++ + 4 files changed, 46 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d0e4cda2b3cd..7ce95a32e9ff 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -810,6 +810,9 @@ struct kmap_ctrl { + #endif + }; + ++/* XXX need fix to not use magic number */ ++#define MAX_LLC 64 ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 10ea408d0e40..5549710d95cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1183,6 +1183,18 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continous index. ++ * TBD: replace by xarray with key llc_id() ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1c6fd45c7f62..74eb2f3615aa 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2037,6 +2037,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2045,6 +2046,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index f1ebc60d967f..b7bb13045dd8 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -672,6 +672,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -681,6 +682,25 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ if (idx < 0) { ++ idx = max_llcs++; ++ BUG_ON(idx > MAX_LLC); ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -699,6 +719,7 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_size, cpu) = size; + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); ++ update_llc_idx(cpu); + + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) +@@ -2394,6 +2415,14 @@ build_sched_domains(const struct cpumask 
*cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch new file mode 100644 index 0000000..043d343 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-09-20-sched-Introduce-task-preferred-LLC-field.patch @@ -0,0 +1,148 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B94F12FA65E + for ; Wed, 18 Jun 2025 18:21:54 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270916; cv=none; b=UeyCyGRjZM/3la5K7W5DZr7Fqbx/pXCPlXhvjgFOgYIXCDUEtBpJd57eBNDqgWvGJv8lBL+mUxf5kMWOHyA8RedaqjM+j02Jn9B78T1lChIZ2n/HcQ4ovyIdvMDIjh6GJZ0rAG3+mVxq530ReordkAU/8zkHRYpmSCOgeBTL+I4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270916; c=relaxed/simple; + bh=qCkIhL6rGvZ+dzcJ6E4keINBL3DRXFHBoE2OhjjXt84=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=HdR2Xf4V3RIfo4sN94qeqMUmpg9AvXnNLA7hey4fd0WeAkTF0T4KDeRwQToF9rBlqaacSEzIC3ldfo3gUF0PSPBuQsHrIDtfOE0rb1e2syBB9Uy0m0Pyuh26N9i8RFDaGsxHG8i9HPtFTwvfAZ4PLiBu3T6sgQXJ8rTnQ7LikE0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ctebiIFc; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ctebiIFc" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270914; x=1781806914; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qCkIhL6rGvZ+dzcJ6E4keINBL3DRXFHBoE2OhjjXt84=; + b=ctebiIFceLs/6XgbLjwR55dJzRZbzapx502Dxi0l/NvCbPchL4pYfpTv + mX4fL7dCkAt2uRGyN8gw9ioXWlpiuXifnjj+0MNxYIis5NlGMWrVuELt9 + p2k9M3g2gYolFGidjsV123j/xwGYMbxHzvGAIu7gZe6H/GBXSmGkX0BuE + BbuXEWcCw0iqTOGZJwjotpQh0+0BjJedRDEyx/wJT4zQv28fNmgWwOtv1 + f4suB3nLhc82MQOzvFx7z7nB0rbHQlioxhlaZW+cZpn776eX5rSkia9jE + XjCTqRmxuGrPP0O3C/HkP/FgXNG323aYUO7tahFuWXcmZCHjHmALri07M + Q==; +X-CSE-ConnectionGUID: NPicx326QWy/zNraeo2kXQ== +X-CSE-MsgGUID: V9D14qHKQa6053W/dqsxug== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931566" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931566" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:53 -0700 +X-CSE-ConnectionGUID: 7bYdnBGCRGq67Ukuexzzfg== +X-CSE-MsgGUID: bMqkNepHSg24kSctAmlfHw== 
+X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959924" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:52 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 09/20] sched: Introduce task preferred LLC field +Date: Wed, 18 Jun 2025 11:27:57 -0700 +Message-Id: <7b9df4433d73ce415a27925ce42cec53774debaf.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache aware scheduling enabled, each process is assigned +a preferred LLC id, which will be used to quickly identify +the LLC domain this thread prefers to run. This is similar to +numa_preferred_nid for NUMA balance. + +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 7ce95a32e9ff..2f1cb7445733 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1404,6 +1404,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5549710d95cf..cc804a8c7061 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1267,6 +1267,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + /* + * init_task and kthreads don't be having no mm +@@ -1293,6 +1294,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch new file mode 100644 index 0000000..cd8cf95 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-10-20-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch @@ -0,0 +1,238 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from 
mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 378672FBFE4 + for ; Wed, 18 Jun 2025 18:21:55 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270916; cv=none; b=e05IFwV/MLqo2ClgaOGWLNSlY10NaanqCNFgdhzJHiSJKN+sh3Zlln5UdTmioRlRhRjp/nMlzDaeMob8JkL6Vnprcb7T4E++++CTqUtUCV4CFP4PSK0vw5A7hPHtk/OEkDT1g3ZfjDaU9iC9y5xW8mEtVA1goM0jAs9Hl7McPfo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270916; c=relaxed/simple; + bh=WuCB5SIKo2iM8nX5ebWteeRjxdPAXbYteS+EwA2pE0w=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=oAGm3igrNrOpxMrHQQoLXScf+6Qhzs+QT0b8NDUn/Z1Lg6wRKdWBLm3Z8cBcgmvttwNrNTS1WRshJkfEt9buiwj9p9r7b5+8Pgfu5tiqdhoABS40DvzWpr6d+nOBznhgOieUV5aoD2LXxBNoP4rNF0a9Ez39F7B7HtKZvpqaKlk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=NHLCwYLq; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="NHLCwYLq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270915; x=1781806915; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=WuCB5SIKo2iM8nX5ebWteeRjxdPAXbYteS+EwA2pE0w=; + b=NHLCwYLqVTRJAuEFNcV41R7MnvuS9z2QmW5K059zPETnEpDlEodn/VG8 + IZzAuJmXE4uhiAbjaKpwaneVSv8DpipBx12ro57h14bYGeGVOnAEqtRC0 + MYOOelRwA6HB65s8wwQlNHjKsFH8Px5CvzkVOr9zfB2+Cf3ZCiBWNjFvJ + ia6JfMeXbhgywU5X/aCFqFVuO0i7U1S4e/3PZ/4lISImQE5ptcYQvqsVE + 7frWf0qbM3P8Z3xratwf4AuiFQOa5n18Y0HGvqXbmBvbVIX5w1NHG2f4g + 7u4XdLdk1q19T6udH+vKwbKmJFB9NiWgUD4pYJZpJbmpavs+mwAM+IGJz + Q==; +X-CSE-ConnectionGUID: ABRohcKYRLuwXUvFn04ETg== +X-CSE-MsgGUID: ajqbr60WT6mWJDBrVQIQ4w== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931579" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931579" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:54 -0700 +X-CSE-ConnectionGUID: wKSilpu8SRKhyBsxbDtmNA== +X-CSE-MsgGUID: LTGYQY+lTBaZzmctTTPKPQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959948" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:53 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 10/20] sched: Calculate the number of tasks that have LLC preference on a runqueue +Date: Wed, 18 Jun 2025 11:27:58 -0700 +Message-Id: <0664be8a3e805ed93eb930131951b1a84cebed66.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Track for each run queue, the number of tasks that have a LLC preference +and how many of those tasks are running in its preferred LLC. This is +similar to nr_numa_running and nr_preferred_running for NUMA balance, +and will be used by the cache-aware load balancing in subsequent patches. + +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 ++++++++++++ + kernel/sched/fair.c | 42 +++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 +++++++ + 3 files changed, 60 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index d9c3e75f79d1..34056eb79ef2 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -498,6 +498,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cc804a8c7061..88ff47194faa 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,6 +1195,18 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1298,8 +1310,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ if (p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1400,6 +1415,14 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (rq->nr_llc_running) ++ rq->nr_llc_running = 0; ++ ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1410,6 +1433,17 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++static void 
account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++void reset_llc_stats(struct rq *rq) ++{ ++} + #endif + + static inline +@@ -3939,6 +3973,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + #endif +@@ -3952,10 +3987,15 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) + #ifdef CONFIG_SMP + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + #endif + cfs_rq->nr_queued--; ++ ++ /* safeguard? */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 74eb2f3615aa..6c83a71ac8ca 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1104,6 +1104,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP + unsigned long last_blocked_load_update_tick; +@@ -1948,6 +1952,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++extern void reset_llc_stats(struct rq *rq); ++extern int task_llc(const struct task_struct *p); ++ + #ifdef CONFIG_SMP + + static inline void +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch new file mode 100644 index 0000000..101e114 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-11-20-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch @@ -0,0 +1,180 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 88EE22FBFED + for ; Wed, 18 Jun 2025 18:21:55 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270917; cv=none; b=P5gHhCOHlQM9p9ku+pEKNmIWE5uT5+S4kV5oxYtsqpEFVlr9fc0D6NpdlPtZ/gPYaVzFIEml4c8bRLDf/rApQ0P+4X3sXKjvOocZZdUlhKVWOp4g7Z3DkjKRfUK4EbZevwf1AguUUNQhOhr+jz43UGTNA9B35kqwXwKY13QXM6M= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270917; c=relaxed/simple; + bh=RUIbYeV38UehaAavCqtUVWreQbqIvMzQPunFd7dVPyE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Nzm7Ku6skORzgLcSoIMVnTsqDhfLNjvSWDaPtFERhQETbGDLm9wp9WChfMfmJr8ewsuksm4tdeOptPipC31yfOalmbU4lZM/tCb9mFe/8h4/7fz9qSGQznPwU1NdTCxhel20eNqlBKW2RqU7JuzYJfo7KMA4C1hPgt5AyDArxK4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Mvd9Vfzt; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com 
+Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Mvd9Vfzt" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270915; x=1781806915; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RUIbYeV38UehaAavCqtUVWreQbqIvMzQPunFd7dVPyE=; + b=Mvd9VfztrhHTmbjDNIxNZ/GGBzT7vnFVGVF55+uq3hkxTLk1MtmoMPMI + U1wXH7aIGF8CNiN6VHqa6PsrvDZd1CfkDD23bWW38C2q0vuFWUdOR1rsg + nQS1Vx/AFI6+tsMY9N0jzPGLqIf//4y/teLgExUZvlOCdWkv+ZRBOa19l + Q/hMdcFdmtGM8n1dub+WeL8RYjxLFhZ3ifnf7sPjEA0wCKGpnuAk0VE1o + xK9Vp73JH2YBrGV5TiS4D6hJAPIirmfsd5xm4xtPojrdCuhbtq4bZsQTB + QF0NjbAxGEPgdEHlXrMy/bsHGHXWbrQ1UYQwmOgyrnmyyqbinzuyILx3v + g==; +X-CSE-ConnectionGUID: s21awipcQP25JSLcWRQsFw== +X-CSE-MsgGUID: 8CcDA+u7TbaNoc5uJcqMDw== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931593" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931593" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:55 -0700 +X-CSE-ConnectionGUID: SW2wbjbRTeKDU7dLzQyhpA== +X-CSE-MsgGUID: /CMNaZ0xTte1NASZbF1kDA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959969" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:54 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 11/20] sched: Introduce per runqueue task LLC preference counter +Date: Wed, 18 Jun 2025 11:27:59 -0700 +Message-Id: <5334cbd97788ba58938444f6e6f07e6c433a9e1c.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned a static array, where each element indicates +the number of tasks preferring a particular LLC mapped to the +array index. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to MAX_LLC +across the entire system). With this information, the load balancer can +make better decisions to select the busiest runqueue and migrate tasks +to their preferred LLC domains. + +Note: The static array could be converted to an xarray in the future. 
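The bookkeeping can be pictured with a small user-space sketch; it is illustrative only - MAX_LLC and the index handling are stand-ins, and the real hooks are the account_llc_enqueue()/account_llc_dequeue() changes added below:

/*
 * Toy user-space illustration of the per-runqueue preference counters
 * described above; the real hooks are the account_llc_enqueue() /
 * account_llc_dequeue() changes in this patch.  MAX_LLC and the LLC
 * index are stand-ins here.
 */
#include <stdio.h>

#define MAX_LLC 64

struct toy_rq {
	unsigned int nr_pref_llc[MAX_LLC];	/* tasks preferring LLC i */
};

static void enqueue_pref(struct toy_rq *rq, int llc)
{
	if (llc >= 0 && llc < MAX_LLC)
		rq->nr_pref_llc[llc]++;
}

static void dequeue_pref(struct toy_rq *rq, int llc)
{
	/* never let the counter go negative */
	if (llc >= 0 && llc < MAX_LLC && rq->nr_pref_llc[llc] > 0)
		rq->nr_pref_llc[llc]--;
}

int main(void)
{
	struct toy_rq rq = { { 0 } };

	enqueue_pref(&rq, 3);	/* two tasks preferring LLC3 arrive */
	enqueue_pref(&rq, 3);
	dequeue_pref(&rq, 3);	/* one of them migrates away */

	printf("tasks preferring LLC3 on this rq: %u\n", rq.nr_pref_llc[3]);
	return 0;
}

After two enqueues and one dequeue for LLC3 the counter reads 1, and the dequeue path refuses to drop below zero, matching the "avoid negative counter" safeguard in the patch.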
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 1 + + 2 files changed, 36 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 88ff47194faa..ba62b445bbbb 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,16 +1195,45 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1417,8 +1446,13 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { +- if (rq->nr_llc_running) ++ int i; ++ ++ if (rq->nr_llc_running) { ++ for (i = 0; i < MAX_LLC; ++i) ++ rq->nr_pref_llc[i] = 0; + rq->nr_llc_running = 0; ++ } + + rq->nr_pref_llc_running = 0; + } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 6c83a71ac8ca..391ddc0195f8 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1107,6 +1107,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[MAX_LLC]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch new file mode 100644 index 0000000..959d150 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-12-20-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch @@ -0,0 +1,139 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A23042FC004 + for ; Wed, 18 Jun 2025 18:21:56 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270918; cv=none; b=prYV6kiZ/g5CU/Hn41eTTdcf+nsgsejRMF9YlOFqiewBWHbnrWCTP9kxBZckxiRQ1VvQpER8tjN7QgbQ4c0zij9LcckrJVkX+Cpu6SZazEmgx+hiz7gehO5ul8BA5MMZlqJwJ29H/mrdXyUEt1ZTi0aUDrhm0/ch8vBT2HSUIiY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270918; c=relaxed/simple; + bh=iM3ABT5TB/b+NuSHpzRR+MTBRchvN6UYmnNwwhN0XIw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=UorTPRN8/VKWenu+tPzkgHwZNthp7FChWvQMxBlnU9ZXHkqWwmKJBieJwmssJMgRaYf3QdYXGefsj9yI6+t1biKPLv8Rtoe+CX8vRIiRQisArNkktnElOHhLTlNzPEMBul5M5VTszzGE4dDKi0vulBDpxRNY+j7NieHcUzTr6NQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=dvIH6C13; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="dvIH6C13" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270916; x=1781806916; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=iM3ABT5TB/b+NuSHpzRR+MTBRchvN6UYmnNwwhN0XIw=; + b=dvIH6C13m8eg38MbrhvutO7tNOXJjoru+H/dxlcGqGgIEqt/3lnui+ls + Ax0AdHjwUeIvAv0vKW02IFqBb62BDr2GZeTL5v+KcPecWocJqc8AwLJPW + p6Re4BEEp9c3O5ht3z8Rh9lsWPW/V46p2aLbDPxAIC/89O9nQObGsK7fd + S23TsGqyhc3rr4+MaCrD+MN/GwL1Up9gi5S59wfKXiFZTw2VyXU6i/ieb + p3W93cwc0GbptCXluULNXuQNFNSSINbzdZ13xvmBr8sQkUjlHJttZa6ng + jxyrfFMwPCHG2cw2U4W0FjchU7U1sQuFxJb51T+CMtt3NmPwYWM3XaTsO + Q==; +X-CSE-ConnectionGUID: WGpTAeg9TUi2Z/6fkCAarg== +X-CSE-MsgGUID: 7dA/FD/PQxGrwYmk5u2dRA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931607" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931607" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:56 -0700 +X-CSE-ConnectionGUID: bXFqvuY3S6+9Zd3SiPYU/Q== +X-CSE-MsgGUID: /yrcI1KcQO2+tdFpydQB5w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180959988" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:55 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 12/20] sched: Calculate the total number of preferred LLC tasks during load balance +Date: Wed, 18 Jun 2025 11:28:00 -0700 +Message-Id: <4a37811c12bbca8cb669904ad67dad3b7e99a552.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During load balancing between LLCs, gather the number of tasks +on each runqueue of a source LLC. + +For example, consider a system with 4 sched groups LLC0, LLC1, +..., LLC3. We are balancing towards LLC3 and LLC0 has 3 tasks +preferring LLC3, LLC1 has 2 tasks preferring LLC3 and LLC2 has +1 task preferring LLC3. LLC0 with most tasks preferring LLC3 +will be chosen as the busiest LLC to pick the tasks from. + +The number of tasks preferring the destination LLC are gathered +from each run queue for a source LLC. 
+ +For example, consider the sched_group LLC0 with two CPUs, CPU0 +and CPU1. On CPU0, 2 tasks prefer to run on LLC3, and on CPU1, +one task prefers LLC3. The total number of tasks preferring +LLC3 in LLC0 is 2 + 1 = 3. + +These statistics enable the load balancer to select tasks from +a sched_group that best aligns tasks with their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ba62b445bbbb..99f3cee7b276 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10459,6 +10459,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[MAX_LLC]; ++#endif + }; + + /* +@@ -10937,6 +10940,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE)) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch new file mode 100644 index 0000000..06504e2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-13-20-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch @@ -0,0 +1,169 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 514DE2FC01C + for ; Wed, 18 Jun 2025 18:21:57 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270919; cv=none; b=nmqdaWDatMrhBkfjuY3zJis51UO9eAa5aRb1rJdWVySUjW3tfYRxyj1Xkvi+fNpajS95RQl1kNM/Uc2yZ/0qy4Yr0n5zWNCB62WmrDP+LPoiGxGjwroeiGueYQuwTtAOG6KXPOSjIfKn4GM4dEwjzo3+VttU3Mxq2/vSBP1gjkQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270919; c=relaxed/simple; + bh=qbjVDBu0+RR9cVBkMVV/EEaCntO2T94kbrD7rZkO8yA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XnMghoHFlyLEgu+Wh96kFEDjFxOvIdG1kYivt+ooFQzL7JHqy7Y2tRCCgBmmjgXcODTYXXNN9TLYbc2t4TSsUmKPzAY7GwTWviiMJPDQpqTfLl+bgoY1YdlK7e1ynWuUJ9NxwRUCfO0asQkBgDzntM+cRVZ3lV7tz/MiGA6JuHE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=coUsxsM/; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="coUsxsM/" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270917; x=1781806917; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qbjVDBu0+RR9cVBkMVV/EEaCntO2T94kbrD7rZkO8yA=; + b=coUsxsM/ixxY43XGQ61Df5pL4/CwC9wk7zMajcpFP2eKll8eBCTuGhU1 + TDmuCcPGg5tMI5ZhS8hwToyQBxfmHALCjIHPMRTTN7NWZkIjloQEW5hzf + 8OM/inZ27wXqGy9oddWdVppotNblwyx73zjRCiYiilRwXBDqWBWSJby2f + mn56QOTvTT4uucpyocRsNzlz0tvki+S25xv2mNIZJ1GFIXdpAREJ2ZZvQ + 7hlrzMUkv6jPGBx21WWsulHPgdDzFpzgrgy7hSF/p1HI793hc8L9jfEZv + KcS4ylrKsFNBqYOFqL6hfs7PvPzeeEHhVD6z0cM0apx9kBQCg3dCDjTKK + Q==; +X-CSE-ConnectionGUID: tZPMCIUPT5iVlY3/j2c6gg== +X-CSE-MsgGUID: qbFq77/RT1KHBqt/bJ6sEA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931622" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931622" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:57 -0700 +X-CSE-ConnectionGUID: btZ9nZi5QCuWcK+hX1K3bQ== +X-CSE-MsgGUID: 68bF5Nl8R3yXe/gNzr0YHw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960011" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:56 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 13/20] sched: Tag the sched group as llc_balance if it has tasks prefer other LLC +Date: Wed, 18 Jun 2025 11:28:01 -0700 +Message-Id: <936c261e6283b8fa8c2d7e60493721f6594ce176.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During load balancing between LLCs, check whether there are tasks +preferring the destination LLC. If so, balance those tasks to the +destination LLC first. + +Tag the sched_group that has tasks preferring to run on other LLCs +(non-local) with the group_llc_balance flag. This way, the load +balancer will later attempt to pull/push these tasks to their +preferred LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 99f3cee7b276..48a090c6e885 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10454,6 +10454,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10818,6 +10819,43 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + return false; + } + ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. 
++ */ ++#ifdef CONFIG_SCHED_CACHE ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, true) == mig_allow) ++ return true; ++ ++ return false; ++} ++#else ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} ++#endif ++ + static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, +@@ -11000,6 +11038,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + update_sg_if_llc(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch new file mode 100644 index 0000000..b51a45b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-14-20-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch @@ -0,0 +1,173 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 378E62FCE1A + for ; Wed, 18 Jun 2025 18:21:58 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270920; cv=none; b=ASxthUrBX6Z81qbsQocazhyRsc3w4KjSaibX0r0fmO/uPp3e/rDgPPSjzptpfRM65fVEmwToh+9nY+/mmo0DpYzyL1hx2NIpj7GZfFXAuMz7beZVpYQkrh2HNY8gyzOoVYXLKhwer420hvK4In5+4ah/Az0BdRL3g8Qqt51fikc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270920; c=relaxed/simple; + bh=cMgIIHv7v3o6hwLDWo7qzlIUuIJ0MkqCBDVD8FqJjxk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=EWQtFnwscFIHJgsfTtDl3lp0BZZED+6rJ4mkP26EG71H0KTH1swfv+jnlTEZByRE4fdbCqlRsxWJKQE1P2n1+rbfG/iNowQx5qRuzogKgl+wAixpKa+2O1Es/si7+y0czWh1Gp4kfwIn7pT6wQ0T9XCYr9+UWmHFzVEOypFRnzs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BpQBSzXY; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com 
+Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BpQBSzXY" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270918; x=1781806918; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=cMgIIHv7v3o6hwLDWo7qzlIUuIJ0MkqCBDVD8FqJjxk=; + b=BpQBSzXYfQCuuNQrY07i0oxTUklWpFxgVOI3T2jR9DS0iIIrA0R9YJ83 + A9emVOApgFn/Vtg45BEuMxyBgA2TiW4xddTrQIm5gKRorVrmWRGguFZxO + nCW/eG3N/h/KeRxeDQDhVLByESmAqIOMi1VfU1gEw2Y77ZQX7MjFWlXNH + OUxB74DFQr31EirYxBp+QPY8d/5S5jyj2WR0Nq+yVEz01jtl24VXePQsv + wBg0aK2thwbQ070vTU2iI+McTBs29ChLZRqwba7zv7kzEGNCrqmDUK6Zg + bhc7UBL3FUSUKcR5z/7hq6ahpD4cObaTjU9buWfjUFBWJ1k3FZrl+UJpI + g==; +X-CSE-ConnectionGUID: 3Abw6Bt+Qx+VsWy2ZRBi4Q== +X-CSE-MsgGUID: HFq6k7fqT72cxCbUZ+ATOA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931636" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931636" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:58 -0700 +X-CSE-ConnectionGUID: ul8Mak7HRui0K/AZNo//iw== +X-CSE-MsgGUID: NFSnEWS8RbuYiY0YXOwNnQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960034" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:57 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 14/20] sched: Introduce update_llc_busiest() to deal with groups having preferred LLC tasks +Date: Wed, 18 Jun 2025 11:28:02 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +The load balancer attempts to identify the busiest sched_group with +the highest load and migrates some tasks to a less busy sched_group +to distribute the load across different CPUs. + +When cache-aware scheduling is enabled, the busiest sched_group is +defined as the one with the highest number of tasks preferring to run +on the destination LLC. If the busiest group has llc_balance tag, +the cache aware load balance will be launched. + +Introduce the helper function update_llc_busiest() to identify +such sched group with most tasks preferring the destination LLC. 
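+
+As a minimal sketch, the selection rule boils down to comparing the
+nr_pref_llc[] statistics gathered earlier in this series: a group tagged
+for llc_balance replaces the current busiest candidate only if more of
+its tasks prefer the destination CPU's LLC. Roughly, inside
+update_llc_busiest():
+
+	idx = llc_idx(env->dst_cpu);
+	if (!sgs->group_llc_balance)
+		return false;
+	return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx];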
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 48a090c6e885..ab3d1239d6e4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10848,12 +10848,36 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance need to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + static inline long sibling_imbalance(struct lb_env *env, +@@ -11085,6 +11109,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* if there is already a busy group, skip the normal load balance */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -11991,9 +12023,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch new file mode 100644 index 0000000..8a0371a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-15-20-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch @@ -0,0 +1,183 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 571022FCFC2 + for ; Wed, 18 Jun 2025 18:21:59 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270921; cv=none; b=Au/8Jdq57vaG6zpgwbWDMgAuubIJAHPnTlsoAXwGoHognpeK/aWGydhvQxM3536916CeCjNp7EH7OJ1j+rscZhPywV3siybixACVKTWmKknqhXSmK9iQja3rE6sE7M29Xk/pKSsaah9dw+I+23TM1f6VNcw/zxHYJJuvbu42ScY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270921; c=relaxed/simple; + bh=qHzPg7pOAdSMp76icLDVAZqOGBB1+iXyIxtSLlESXR0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=UvCpUAB1cF+/DlV2OPMao3wito5w7p/P7XCMH0zVQpdX7ISAPe7+UYDSTlR5CXGTWmTgG7MhjDnYZB0VvoII8J7ZwG7QGcKzF1ITC8sBcvoSR2nl05LkQA/9d/FIPodpuCurin5CPmjX8yQEcG/PuH0gr8OoT0oFfJQ9PTGL73c= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=FmFBFr6Y; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="FmFBFr6Y" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270919; x=1781806919; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qHzPg7pOAdSMp76icLDVAZqOGBB1+iXyIxtSLlESXR0=; + b=FmFBFr6Yj94ghXvPX0OCmpCOy98F1E44OpxMfBpTuE01Up5uaW3BC4dp + LKM2y1rnTUzZVvsXBUk+n0OQLTLEDTa762KmotgATQyk408JVWd7CeTmx + a5qvM/9qZL3kEomZaLdyET8OE/W/+gBaxg35o/VfV60g6iC8kUriAFUIK + FnOkrknbKEmGtpNieAKL4Z11kucxta5+z0O7A4asBMEslen5BktgpvTBS + OaNU8TXkSuVwDP/FVVia7CCMK0h99Xst5sxVHgrZZz/hLD2iZNRH8LdJs + at3EQiEbK/gun5R/uTtPhw5w9l5xV9iGjFYl1aRfV6FTSQSAQS5govoCZ + w==; +X-CSE-ConnectionGUID: l1gKsZfFSOGB2PlCgz10uA== +X-CSE-MsgGUID: vayaBofJRAyAxRdcDQ65+A== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931650" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931650" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 
11:21:58 -0700 +X-CSE-ConnectionGUID: K6PGgga3SImXHYmzMR4M/w== +X-CSE-MsgGUID: l0kLMoLZQamEcnOsk8UfMg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960060" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:58 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 15/20] sched: Introduce a new migration_type to track the preferred LLC load balance +Date: Wed, 18 Jun 2025 11:28:03 -0700 +Message-Id: <5b9c5a9ddb5b8b16ad20fbba9d41288de95741bc.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type named migrate_llc_task to facilitate +cache-aware load balancing. + +After the busiest sched_group is identified as the one that needs +migration due to having most tasks preferring destination LLC, tag the +migration type as the newly introduced migrate_llc_task. During load +balancing, each runqueue within the busiest preferred-LLC sched_group +is checked, and the runqueue with the highest number of tasks preferring +to run on the destination CPU is chosen as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 37 ++++++++++++++++++++++++++++++++++++- + 1 file changed, 36 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ab3d1239d6e4..42222364ad9c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9711,7 +9711,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10143,6 +10144,15 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ /* ++ * Since can_migrate_task() succeed, when we reach here, it means that p ++ * can be migrated even if dst_cpu is not p's preferred_llc, because there ++ * are no idle cores for p to do in-llc load balance. 
++ */ ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11779,6 +11789,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12087,6 +12106,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12195,6 +12218,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12377,6 +12410,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch new file mode 100644 index 0000000..7a821da --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-16-20-sched-Consider-LLC-locality-for-active-balance.patch @@ -0,0 +1,182 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 10A9D2FCFE0 + for ; Wed, 18 Jun 2025 18:22:00 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270921; cv=none; b=P+N3O5LY+75YpxuXAxCziwrhSLux4hrlWTDJ+f8IcG0rzPNOVsWLmvsBedk/2+jdPdqDw1wzG7atrrNckzm5Yyg74mjwX53XlvX5jdoIe7rAPpy4h3viBerEuO6WUgh96xv+h8Lwf0GEIyOdryyHXJYAAnIgHDvJPZSQuXjuTaw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270921; c=relaxed/simple; + bh=v21t2Zwrb1Nh/wAuSkK34PJS3ovpgrP9JoFKO8zGmaY=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ajQTLgiN8PnOKWjIVod1oKlrsTKQVI7OTOmzFXtG+73wlaVQvWGhqk0WMJJXjNaJRuZH21mpCLXZaDLmAm/qdrOjIx4FzgPZZPymYhkjL6psVeBZGckQhtba+IMm3kDkbtawrmcsQMOZ+Zwe6kb6VuOntBmYomWGuxm13ODEJMI= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eNKkiPXx; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eNKkiPXx" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270920; x=1781806920; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=v21t2Zwrb1Nh/wAuSkK34PJS3ovpgrP9JoFKO8zGmaY=; + b=eNKkiPXxlLeUp9MrxUVzTWIyvBs9ufToarQslgq0bYk5s/czMppb7IvI + h6f2ZKRMsVlGKRrHwIGJjWC2qE8qaTMXUEdQj3r8D+h0SN43VWG+hMsrT + RYlk/KgtxAMp9QrgVWboKznJ7vUI8egFwzA9KbPGigmWN87qDwRCRj+PI + 1DVrQqjN70cltKuzihCwLLBClt53KvfL9NiCkywt0JuRcLKIP2iJEokrX + ajvz3jtkrzeg38383zwJHxSMbB8WBku1/QPExsvxhxX+x84ckOTN+YWR0 + r0W/M6eCut34E/W0ufbrbXos01UbUqSL73VaxS852oWlwl340CVxjghCK + g==; +X-CSE-ConnectionGUID: WN1cDgRBRjePAqtNlvvThA== +X-CSE-MsgGUID: GG9U87oTRii/aNSvr7x6sQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931663" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931663" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:21:59 -0700 +X-CSE-ConnectionGUID: RBeMMmkDTB+RrdOWQJsvcA== +X-CSE-MsgGUID: 9zB/bWW2R3GKVE6WyIDfGA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960082" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:21:59 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 16/20] sched: Consider LLC locality for active balance +Date: Wed, 18 Jun 2025 11:28:04 -0700 +Message-Id: <1ce821178bf178ce841ea94bb8139fd9a197b86b.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If busiest run queue has only one task, active balance is enlisted +to actually move the task. However, before moving the task, +we should consider whether we are moving the task from its preferred +LLC. + +Don't move the single running task in a run queue to another LLC, if +we are moving it from its desired LLC, or moving it will cause too much +imbalance between the LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 48 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 42222364ad9c..3a8f6fc52055 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12294,10 +12294,43 @@ imbalanced_active_balance(struct lb_env *env) + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return 0; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return 0; ++ /* ++ * All tasks want to stay put. 
Move only if LLC is ++ * heavily loaded or don't pull a task from its ++ * preferred CPU if it is the only one running. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable && ++ (env->src_rq->nr_running <= 1 || ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, false) == mig_forbid)) ++ return 1; ++ ++ return 0; ++} ++#else ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return 0; ++} ++#endif ++ + static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12317,7 +12350,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12762,9 +12796,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_feat(SCHED_CACHE)) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if task does not prefer target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch new file mode 100644 index 0000000..abd082e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-17-20-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch @@ -0,0 +1,193 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 1B6EE2F5476 + for ; Wed, 18 Jun 2025 18:22:00 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270922; cv=none; b=MFTVX6qRlvnmcnypqJAnNUvjVFnCj+BYWehMpFkjTBU+YkJWvxgKrAJGcPlOnWFlULsIE0HJF5adxlSs+4NcBPZqPwLUEpp3DyzPS31YqqskBVjcvxtKVfWLg48hqUzzgp9v2j0fKtLs13VTywRh7Dh2csNg/XDFtX5FiqAZvbY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270922; c=relaxed/simple; + bh=UQoBqN95xnQudsJ44o6C5oD7PSQHCIXgA6EcclP4fcc=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=nDrGI2/xX7+VWoIaZPAxSQ62lLIbd2FIobNRajXdY8S5xE+UkDaqRZV9oSWRZyNefE1ch1lBYfvNcBa+4ghO/kDKZP04UYkGh8gv4TzDurIYTenC4Ns0bIWzJq0lXhvHvCBGCuNffM2eKs5JKCc2O0pb2ptRCWrh8hgx0OzEDJU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=gwPPS4Py; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none 
smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gwPPS4Py" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270921; x=1781806921; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=UQoBqN95xnQudsJ44o6C5oD7PSQHCIXgA6EcclP4fcc=; + b=gwPPS4PyJxuWZjpqppzzQNy6oNFS2apouvtFBoztM3FRMsIZNoXOCYZS + 4ZGsvXZ1GBKYWiosJJLy3Afvyz6rPjZGp6kTCMd3SEk6QElyc++ZHbpeH + U+87HjtVKO3MPeHlo5eycdT091abyiOHsWsk02bh++KLCXtrZChonH3SN + EXN9QhBQhTsKkKvGvzRjZJXx+5ylM+EmAu0SlP86VdBwSp8bjkVa10OXt + tZ/lEtGqbQUS8nYQOIluXmFXapZZs3teRfaTMOdaD+49KrQPZjXrq2Ex4 + mkaL045bqiOr9hiagrNO4Meh5T5RKF9itXcTr61PJGxchTt1XfKGFw5XC + g==; +X-CSE-ConnectionGUID: Yl7CRBbAQ5iJFKjFcNyJkw== +X-CSE-MsgGUID: JmWjpB8BQ8SyduLVhShd6A== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931677" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931677" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:00 -0700 +X-CSE-ConnectionGUID: PriWcHtzQ+CMXjb98vjSvg== +X-CSE-MsgGUID: 4FJAHvYoQNmNJWCnl/pw1g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960102" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:00 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 17/20] sched: Consider LLC preference when picking tasks from busiest queue +Date: Wed, 18 Jun 2025 11:28:05 -0700 +Message-Id: <9d28a5a892f0413a96498bbf711eaa9b354ca895.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +When picking tasks from busiest queue for load balance, we currently +do not consider LLC preference. + +Order the task in the busiest queue such that we picked the tasks in the +following order: + 1. tasks that prefer dst cpu's LLC + 2. tasks that have no preference in LLC + 3. tasks that prefer LLC other than the ones they are on + 4. tasks that prefer the LLC that they are currently on + +This will allow tasks better chances to wind up in its preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3a8f6fc52055..c9db32c2df63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10056,6 +10056,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10064,7 +10126,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10083,6 +10145,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch new file mode 100644 index 0000000..e8d24ba --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-18-20-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch @@ -0,0 +1,155 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9EB802F5473 + for ; Wed, 18 Jun 2025 18:22:02 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270924; cv=none; b=UcUwkmFrXm1QZiHhRd9nLRLaJcXdFq15Quaiz8ZBN1nnL9SrnbVlLxUTqIyE9whgxAiEKu2+OgsC5VVcnjsA8wMU0p6jVlFPPQ7qmeBTzB6VM8FM85LAnq7ENrafpJvlDPCDIM9KyyIse0EZlGPKURu465AkRFJXtSxWwqh6Huo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270924; c=relaxed/simple; + bh=LvkJ+My0UyW3xwEVx+qylSLEWrcmxbixkOEWZ10FTps=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=OB9jn7VvZ/rPmdAtTnDXoDgCyo72RKBXKCzeOdkrfLO6e85bK8hUMUUUOJEZalXFcdLxjZn/HiIycPeaDEtK0UPQwahP6NKXPIqG2XHiIkz2fC4a83E4onHZ8UGT50ZPOtl9Nzhvr5GlVAWllkK6TCvxOkmzUawTYoseeve+8+0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GlYX1oli; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GlYX1oli" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270922; x=1781806922; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=LvkJ+My0UyW3xwEVx+qylSLEWrcmxbixkOEWZ10FTps=; + b=GlYX1olin1BGUE11MHHJj9SNVdaVYXouh/gB+4N21ppPm9OJp1+kXEIW + jYKsD82DRpSUkmfbSWF6qlHl4i9BpZM+/sHr7a8DHXRrYaxO/Rj5jbXOw + +J6epKBAznqSQbDha14UPGm8Z7tWVAxbi3VVbxuqnoizc+7JuMcPHjd8u + wYx2yGauKvj0wQL3aVlaHP9Wp4NxgHk3BFgHplWMZc9XEc1wUSkwxQzqV + T9whL5z31EzCyXebtiORr7A5MGjv8KhiLdLGBw82yyfUiT62ZER6bvm4y + FqiKWIa9GZLlg4Z0hqOBSzlk3RseQZUvCgYBNujCvQGV07FE+Rdzvhqlv + Q==; +X-CSE-ConnectionGUID: 4oIUGMDbTxGXwK/Erd3pww== +X-CSE-MsgGUID: tbl/Cv7HTzy00jIWW42WkQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931694" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931694" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:02 -0700 +X-CSE-ConnectionGUID: dreUdLK/Snq9Tth5AMDo9w== +X-CSE-MsgGUID: FI0VhmRdTFe0V4qTkbZ9qg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960127" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:00 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 18/20] sched: Do not migrate task if it is moving out of its preferred LLC +Date: Wed, 18 Jun 2025 11:28:06 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +In the final step of task migration during load balancing, +can_migrate_task() is used to determine whether a task can +be moved to the destination. If the task has an LLC preference, +consider this preference when moving it out of its preferred LLC. +With this check in place, there is no need to retain the task's +cache-hot CPU check in task_hot(); remove it accordingly. + +Besides, add more checks in detach_tasks() to avoid choosing +tasks that prefer their current LLC. 
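+
+A minimal sketch of the two added checks (get_migrate_hint(), llc_id()
+and p->preferred_llc come from earlier patches in this series):
+
+	/* can_migrate_task(): do not pull p out of its preferred LLC */
+	if (get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid)
+		return 0;
+
+	/* detach_tasks(): the source list was ordered by order_tasks_by_llc(),
+	 * so once a task preferring the source LLC is reached, stop detaching */
+	if (p->preferred_llc != -1 && llc_id(env->src_cpu) == p->preferred_llc)
+		break;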
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index c9db32c2df63..e342524481ed 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9787,17 +9787,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + +-#ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { +- /* +- * XXX things like Skylake have non-inclusive L3 and might not +- * like this L3 centric view. What to do about L2 stickyness ? +- */ +- return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > +- per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; +- } +-#endif +- + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -9992,6 +9981,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE) && ++ get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10252,6 +10247,17 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if remaining tasks want to stay: ++ * The tasks have already been sorted by order_tasks_by_llc(), ++ * they are tasks that prefer the current LLC. ++ */ ++ if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch new file mode 100644 index 0000000..d0ac1f3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-19-20-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch @@ -0,0 +1,185 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 36AF22FD899 + for ; Wed, 18 Jun 2025 18:22:03 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270924; cv=none; b=rqcnz4lr519vfb2X2M9DeLf5O+erdaRAJSoZ2E9S2odeoi76dMp/OZU1NB58Qjs+uncaH3qLdMqonjZ3kQl6htfGCrXwxMWgW2YZT8y6e/FYDEkDc76bmoSAGQbtAHi6zdd/a0QMbqOAiPFDTuQ/Av7Zd2Z3POZHWg4NK1E6gso= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270924; c=relaxed/simple; + bh=RBEoEDu2A+FwLyh++5jokg1I7TUhNUbkFf08/S4koCY=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=u++l74ct5wI88qe+ZhTxahgVxibbraZWIJUXAiGrfYKDyKJFt9lgn/tcRZtplKvjvmasXpdfWkYumF3dDOdoABJmyCFInhfV661Idkc/VE7bFkTegUBPg18Oyk856hhDCaV4uOx+JU6Wj+pkcN21wugnWmWN1myHTXggm5UEej4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none 
smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=OxZqIyLs; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="OxZqIyLs" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270923; x=1781806923; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RBEoEDu2A+FwLyh++5jokg1I7TUhNUbkFf08/S4koCY=; + b=OxZqIyLsJSroJ2i9MdsrROxH70E3NsG6kvAKifMPTRvyyrGMmXsUYxD3 + CPzVxg8vQa7ptoVtXf4Q8V1g+8odAq77fXL+wB1Yz3cOwa5oIFmdnB5YW + BortfRVvhpa+xJaYKcO1/iYGTjoGzZlBvQ4DqinF+ijFvIH3FXFHUY7Yw + dnqNv+RspKaZf5GkEERusnRttKQTb+Ybdex2YDNmVMcMaLi3YqDVwQEd+ + zvko7J7nf4iHqzRFD8LqvQWYwg1aAy+yQ4qBaHEh90PM1XJHSY8jbNW6c + NQxsij/EBLJiRtqClKTlBCTYmaEChOOO3OgR1tIMqHZLc+QmYryVKIQJ7 + Q==; +X-CSE-ConnectionGUID: DbGqwajpS0SQ1kimRCqNOw== +X-CSE-MsgGUID: IxguJ0uxQf2FmLjSrT5dxA== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931707" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931707" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:02 -0700 +X-CSE-ConnectionGUID: 55fSL6zAQZGOckEnYdnVsw== +X-CSE-MsgGUID: LUEEbBOqRk6qNZaG5v1xSg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960148" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:02 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 19/20] sched: Introduce SCHED_CACHE_LB to control cache aware load balance +Date: Wed, 18 Jun 2025 11:28:07 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce the SCHED_CACHE_LB sched feature to enable or disable +cache aware load balance in the schduler. 
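+
+Every cache-aware load-balance path is gated on both feature flags, so
+the gated paths bail out along the lines of:
+
+	if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB))
+		return false;	/* fall back to plain load balancing */
+
+This lets cache-aware load balancing be switched off at run time by
+clearing SCHED_CACHE_LB in the scheduler features debugfs file
+(typically /sys/kernel/debug/sched/features on kernels that expose it),
+without rebuilding the kernel.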
+ +Co-developed-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 18 ++++++++++-------- + kernel/sched/features.h | 1 + + 2 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e342524481ed..af742601f2d7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9982,7 +9982,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + return 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && + get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) + return 0; + #endif +@@ -10068,7 +10068,7 @@ static struct list_head + LIST_HEAD(no_pref_llc); + LIST_HEAD(pref_other_llc); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return tasks; + + if (cpus_share_cache(env->dst_cpu, env->src_cpu)) +@@ -10253,7 +10253,8 @@ static int detach_tasks(struct lb_env *env) + * The tasks have already been sorted by order_tasks_by_llc(), + * they are tasks that prefer the current LLC. + */ +- if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ p->preferred_llc != -1 && + llc_id(env->src_cpu) == p->preferred_llc) + break; + #endif +@@ -10910,7 +10911,7 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *child = env->sd->child; + int llc; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return false; + + if (env->sd->flags & SD_SHARE_LLC) +@@ -11021,7 +11022,8 @@ static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE || ++ !sched_feat(SCHED_CACHE_LB)) + return; + + /* only care the sched domain that spans 1 LLC */ +@@ -11083,7 +11085,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + *sg_overutilized = 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE)) { ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB)) { + int j; + + for (j = 0; j < max_llcs; ++j) +@@ -12368,7 +12370,7 @@ imbalanced_active_balance(struct lb_env *env) + static inline bool + break_llc_locality(struct lb_env *env) + { +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return 0; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) +@@ -12870,7 +12872,7 @@ static int active_load_balance_cpu_stop(void *data) + #ifdef CONFIG_SCHED_CACHE + int llc = llc_idx(target_cpu); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + goto out_unlock; + + if (llc < 0) +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index d2af7bfd36bf..11dbd74cd365 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -88,6 +88,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) ++SCHED_FEAT(SCHED_CACHE_LB, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch new file mode 100644 index 0000000..1ae2586 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.15/RFC-patch-v3-20-20-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch @@ -0,0 +1,136 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 37FAE2FE315 + for ; Wed, 18 Jun 2025 18:22:04 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1750270929; cv=none; b=onyBzk/JV7TsJ4rqzmPQxAynG5u8Uiv7NZBFqjIXX/vDeaVGTR6XM7u2t1DQFh6/8C8E442NQANkusHEp/W0G7MKp4l8bRLhTJDwy/WN6tGk0cfY5IF9GwVw8LyU0L2HDfqYL9FKb8t0ShAVnCE5wIeOC+RJNYKLspjyv345oV0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1750270929; c=relaxed/simple; + bh=GCa+xOiJ3NtDwAxf3UTuM7FdSBlF1t2VJbPM88NipeI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=lTB8OKjeVhk51AV9Z+yat4UlgOV4jPlwKTO3U5BqELX9+KB8jLro3KX85VFITsktx5Jba308vcVyotxvDMbwXzp5+qGVCYSvBcIyb/4B4Tot6SKFpcXHB6THYMdfQWeqbaG5Ds7ceyhFMgv2UbsDTiF0uZ6QidbwEUWatyQKrK0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=none smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=IXZaT3cj; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=none smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="IXZaT3cj" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1750270924; x=1781806924; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=GCa+xOiJ3NtDwAxf3UTuM7FdSBlF1t2VJbPM88NipeI=; + b=IXZaT3cj2WJHpQsa0eOY4RKLxD0XHxWtW3DBUY3jLShIsyHbPv6kV6PS + wBTZARncyqd81MhW2Dh6tAi77Kk2I1a86TYlMhKSh30I/NZi9Ohg6RQEG + B2e6bpm5YRM81JbZP0vAzdhRwJTJ6z+fezdmgGlo8EIBWlV8PKGUd4V1y + Q1K/xPtqmRaKI9stHeDWuocbpuMmO319jhINNuhdgtWOIH748A4vI8EIM + vIBaj9+wydAwFrFxiz/O6rePd8/Uv/i5oca2c3tnOmtRZT0khUyTei51V + l/RHQAM9KvyJRqc9LiIGKWsg+Dg76/187VJoJGXKdR6viVZo8MkXQOzWR + g==; +X-CSE-ConnectionGUID: bOCOGOeLQ5mxgMbCQRgg+A== +X-CSE-MsgGUID: LC2a0LmnQ2qip+retIyBQw== +X-IronPort-AV: E=McAfee;i="6800,10657,11468"; a="63931720" +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="63931720" +Received: from fmviesa001.fm.intel.com ([10.60.135.141]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 18 Jun 2025 11:22:03 -0700 +X-CSE-ConnectionGUID: rby7BZr4SRm2YmXdsRLRtQ== +X-CSE-MsgGUID: N6pda5x3S0GkUhyJrY38ZA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.16,246,1744095600"; + d="scan'208";a="180960173" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by 
fmviesa001.fm.intel.com with ESMTP; 18 Jun 2025 11:22:03 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Tim Chen , + Vincent Guittot , + Libo Chen , + Abel Wu , + Madadi Vineeth Reddy , + Hillf Danton , + Len Brown , + linux-kernel@vger.kernel.org, + Chen Yu +Subject: [RFC patch v3 20/20] sched: Introduce SCHED_CACHE_WAKE to control LLC aggregation on wake up +Date: Wed, 18 Jun 2025 11:28:08 -0700 +Message-Id: <1f8e7ec2d84a94ac0a31ca6182218ffaf7e166df.1750268218.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce SCHED_CACHE_WAKE feature to enable or disable cache-aware +wake up. Disable this feature by default because cache-aware wakeup +is overly aggressive in stacking wakees of the same process on the same LLC, +if they are frequently woken up. + +The wake ups can be much more frequent than load balances, adding +much overhead when load balance alone for LLC aggregation is sufficient. + +Co-developed-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 6 +++++- + kernel/sched/features.h | 1 + + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index af742601f2d7..32c90fab0d63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9028,7 +9028,7 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE)) + return prev_cpu; + + if (!mm || p->nr_cpus_allowed == 1) +@@ -9041,6 +9041,10 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpus_share_cache(cpu, prev_cpu)) + return prev_cpu; + ++ if (_get_migrate_hint(prev_cpu, cpu, ++ task_util(p), true) == mig_forbid) ++ return prev_cpu; ++ + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { + /* +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 11dbd74cd365..44b408cf0dd4 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -89,6 +89,7 @@ SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) + SCHED_FEAT(SCHED_CACHE_LB, true) ++SCHED_FEAT(SCHED_CACHE_WAKE, false) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip b/sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip new file mode 100644 index 0000000..d4243e9 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0001-bore.patch.skip @@ -0,0 +1,1032 @@ +From 22d4c29e7e688b17f8c7b25324c6b4bbfb07d52e Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Mon, 21 Jul 2025 21:13:03 +0200 +Subject: [PATCH] bore + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 18 ++ + include/linux/sched/bore.h | 42 ++++ + init/Kconfig | 17 ++ + kernel/Kconfig.hz | 17 ++ + kernel/fork.c | 8 + + kernel/sched/Makefile | 1 + + kernel/sched/bore.c | 425 +++++++++++++++++++++++++++++++++++++ + kernel/sched/core.c | 8 + + kernel/sched/debug.c | 61 +++++- + kernel/sched/fair.c | 88 +++++++- + kernel/sched/sched.h | 9 + + 11 files changed, 690 insertions(+), 4 deletions(-) + create mode 100644 include/linux/sched/bore.h + create mode 100644 kernel/sched/bore.c + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index aa9c5be7a..197a58414 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -565,6 +565,14 @@ struct sched_statistics { + #endif /* CONFIG_SCHEDSTATS */ + } ____cacheline_aligned; + ++#ifdef CONFIG_SCHED_BORE ++struct sched_burst_cache { ++ u32 value; ++ u32 count; ++ u64 timestamp; ++}; ++#endif // CONFIG_SCHED_BORE ++ + struct sched_entity { + /* For load-balancing: */ + struct load_weight load; +@@ -584,6 +592,16 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 burst_time; ++ u32 prev_burst_penalty; ++ u32 curr_burst_penalty; ++ u32 burst_penalty; ++ u8 burst_score; ++ u8 burst_count; ++ struct sched_burst_cache child_burst; ++ struct sched_burst_cache group_burst; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; + +diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h +new file mode 100644 +index 000000000..55c19da46 +--- /dev/null ++++ b/include/linux/sched/bore.h +@@ -0,0 +1,42 @@ ++ ++#include ++#include ++ ++#ifndef _LINUX_SCHED_BORE_H ++#define _LINUX_SCHED_BORE_H ++#define SCHED_BORE_AUTHOR "Masahito Suzuki" ++#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" ++ ++#define SCHED_BORE_VERSION "6.1.0" ++ ++#ifdef CONFIG_SCHED_BORE ++extern u8 __read_mostly sched_bore; ++extern u8 __read_mostly sched_burst_exclude_kthreads; ++extern u8 __read_mostly sched_burst_smoothness; ++extern u8 __read_mostly sched_burst_fork_atavistic; ++extern u8 __read_mostly sched_burst_parity_threshold; ++extern u8 __read_mostly sched_burst_penalty_offset; ++extern uint __read_mostly sched_burst_penalty_scale; ++extern uint __read_mostly sched_burst_cache_stop_count; ++extern uint __read_mostly sched_burst_cache_lifetime; ++extern uint __read_mostly sched_deadline_boost_mask; ++ ++extern void update_burst_score(struct sched_entity *se); ++extern void update_curr_bore(u64 delta_exec, struct sched_entity *se); ++ ++extern void restart_burst(struct sched_entity *se); ++extern void restart_burst_rescale_deadline(struct sched_entity *se); ++ ++extern int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++extern void sched_clone_bore( ++ struct task_struct *p, struct task_struct *parent, u64 clone_flags, u64 now); ++ ++extern void reset_task_bore(struct task_struct *p); ++extern void sched_bore_init(void); ++ ++extern void reweight_entity(struct cfs_rq *cfs_rq, ++ 
struct sched_entity *se, unsigned long weight, bool no_update_curr); ++#endif // CONFIG_SCHED_BORE ++#endif // _LINUX_SCHED_BORE_H +diff --git a/init/Kconfig b/init/Kconfig +index 666783eb5..9f32a8c27 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1381,6 +1381,23 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index ce1435cb0..b93d1f657 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,22 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = ++ 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. ++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/fork.c b/kernel/fork.c +index 1ee8eb11f..2eaaaf9e8 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -115,6 +115,10 @@ + /* For dup_mmap(). */ + #include "../mm/internal.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif // CONFIG_SCHED_BORE ++ + #include + + #define CREATE_TRACE_POINTS +@@ -2313,6 +2317,10 @@ __latent_entropy struct task_struct *copy_process( + * Need tasklist lock for parent etc handling! 
+ */ + write_lock_irq(&tasklist_lock); ++#ifdef CONFIG_SCHED_BORE ++ if (likely(p->pid)) ++ sched_clone_bore(p, current, clone_flags, p->start_time); ++#endif // CONFIG_SCHED_BORE + + /* CLONE_PARENT re-uses the old parent */ + if (clone_flags & (CLONE_PARENT|CLONE_THREAD)) { +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index 8ae86371d..b688084bc 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -37,3 +37,4 @@ obj-y += core.o + obj-y += fair.o + obj-y += build_policy.o + obj-y += build_utility.o ++obj-$(CONFIG_SCHED_BORE) += bore.o +diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c +new file mode 100644 +index 000000000..e7f80d91c +--- /dev/null ++++ b/kernel/sched/bore.c +@@ -0,0 +1,425 @@ ++/* ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2025 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include "sched.h" ++ ++#ifdef CONFIG_SCHED_BORE ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_exclude_kthreads = 1; ++u8 __read_mostly sched_burst_smoothness = 40; ++u8 __read_mostly sched_burst_fork_atavistic = 2; ++u8 __read_mostly sched_burst_parity_threshold = 2; ++u8 __read_mostly sched_burst_penalty_offset = 24; ++uint __read_mostly sched_burst_penalty_scale = 3180; ++uint __read_mostly sched_burst_cache_stop_count = 64; ++uint __read_mostly sched_burst_cache_lifetime = 75000000; ++uint __read_mostly sched_deadline_boost_mask = ENQUEUE_INITIAL ++ | ENQUEUE_WAKEUP; ++static int __maybe_unused maxval_6_bits = 63; ++static int __maybe_unused maxval_8_bits = 255; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define BURST_PENALTY_SHIFT 12 ++#define MAX_BURST_PENALTY ((40U << BURST_PENALTY_SHIFT) - 1) ++ ++static u32 log2p1_u64_u32fp(u64 v, u8 fp) { ++ if (!v) return 0; ++ u32 exponent = fls64(v); ++ u32 mantissa = (u32)(v << (64 - exponent) << 1 >> (64 - fp)); ++ return exponent << fp | mantissa; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2p1_u64_u32fp(burst_time, BURST_PENALTY_SHIFT); ++ tolerance = sched_burst_penalty_offset << BURST_PENALTY_SHIFT; ++ penalty = max(0, (s32)(greed - tolerance)); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++ ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static inline u64 __scale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_wmult[score], 22);} ++ ++static inline u64 __unscale_slice(u64 delta, u8 score) ++{return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10);} ++ ++static void reweight_task_by_prio(struct task_struct *p, int prio) { ++ struct sched_entity *se = &p->se; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ reweight_entity(cfs_rq_of(se), se, weight, true); ++ se->load.inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++static inline u8 effective_prio(struct task_struct *p) { ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ if (likely(sched_bore)) ++ prio += p->se.burst_score; ++ return min(39, prio); ++} ++ ++void update_burst_score(struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ ++ u8 burst_score = 0; ++ if (!((p->flags & PF_KTHREAD) && likely(sched_burst_exclude_kthreads))) ++ burst_score = se->burst_penalty >> BURST_PENALTY_SHIFT; ++ se->burst_score = burst_score; ++ ++ u8 new_prio = effective_prio(p); ++ if (new_prio != prev_prio) ++ reweight_task_by_prio(p, new_prio); ++} 
++ ++void update_curr_bore(u64 delta_exec, struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ ++ se->burst_time += delta_exec; ++ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); ++ if (se->curr_burst_penalty > se->prev_burst_penalty) ++ se->burst_penalty = se->prev_burst_penalty + ++ (se->curr_burst_penalty - se->prev_burst_penalty) / se->burst_count; ++ update_burst_score(se); ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old, u8 dumper) { ++ u32 abs_diff = (new > old)? (new - old): (old - new); ++ u32 adj_diff = (abs_diff / dumper) + ((abs_diff % dumper) != 0); ++ return (new > old)? (old + adj_diff): (old - adj_diff); ++} ++ ++static void __restart_burst(struct sched_entity *se) { ++ se->prev_burst_penalty = binary_smooth( ++ se->curr_burst_penalty, se->prev_burst_penalty, se->burst_count); ++ se->burst_time = 0; ++ se->curr_burst_penalty = 0; ++ ++ u8 smoothness = sched_burst_smoothness; ++ if (se->burst_count < smoothness) ++ se->burst_count++; ++ else if (unlikely(se->burst_count > smoothness)) ++ se->burst_count = smoothness; ++} ++ ++inline void restart_burst(struct sched_entity *se) { ++ __restart_burst(se); ++ se->burst_penalty = se->prev_burst_penalty; ++ update_burst_score(se); ++} ++ ++void restart_burst_rescale_deadline(struct sched_entity *se) { ++ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; ++ struct task_struct *p = task_of(se); ++ u8 prev_prio = effective_prio(p); ++ restart_burst(se); ++ u8 new_prio = effective_prio(p); ++ if (prev_prio > new_prio) { ++ wremain = __unscale_slice(abs(vremain), prev_prio); ++ vscaled = __scale_slice(wremain, new_prio); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++ ++static inline bool task_is_bore_eligible(struct task_struct *p) ++{return p && p->sched_class == &fair_sched_class && !p->exit_state;} ++ ++static inline void reset_task_weights_bore(void) { ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ write_lock_irq(&tasklist_lock); ++ for_each_process(task) { ++ if (!task_is_bore_eligible(task)) continue; ++ rq = task_rq(task); ++ rq_pin_lock(rq, &rf); ++ update_rq_clock(rq); ++ reweight_task_by_prio(task, effective_prio(task)); ++ rq_unpin_lock(rq, &rf); ++ } ++ write_unlock_irq(&tasklist_lock); ++} ++ ++int sched_bore_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ reset_task_weights_bore(); ++ ++ return 0; ++} ++ ++#define for_each_child(p, t) \ ++ list_for_each_entry(t, &(p)->children, sibling) ++ ++static inline u32 count_entries_upto2(struct list_head *head) { ++ struct list_head *next = head->next; ++ return (next != head) + (next->next != head); ++} ++ ++static inline bool burst_cache_expired(struct sched_burst_cache *bc, u64 now) ++{return (s64)(bc->timestamp + sched_burst_cache_lifetime - now) < 0;} ++ ++static void update_burst_cache(struct sched_burst_cache *bc, ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u32 avg = cnt ? 
sum / cnt : 0; ++ bc->value = max(avg, p->se.burst_penalty); ++ bc->count = cnt; ++ bc->timestamp = now; ++} ++ ++static inline void update_child_burst_direct(struct task_struct *p, u64 now) { ++ u32 cnt = 0, sum = 0; ++ struct task_struct *child; ++ ++ for_each_child(p, child) { ++ if (!task_is_bore_eligible(child)) continue; ++ cnt++; ++ sum += child->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++} ++ ++static inline u32 inherit_burst_direct( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *parent = p; ++ struct sched_burst_cache *bc; ++ ++ if (clone_flags & CLONE_PARENT) ++ parent = parent->real_parent; ++ ++ bc = &parent->se.child_burst; ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_direct(parent, now); ++ ++ return bc->value; ++} ++ ++static void update_child_burst_topological( ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { ++ u32 cnt = 0, dcnt = 0, sum = 0; ++ struct task_struct *child, *dec; ++ struct sched_burst_cache *bc __maybe_unused; ++ ++ for_each_child(p, child) { ++ dec = child; ++ while ((dcnt = count_entries_upto2(&dec->children)) == 1) ++ dec = list_first_entry(&dec->children, struct task_struct, sibling); ++ ++ if (!dcnt || !depth) { ++ if (!task_is_bore_eligible(dec)) continue; ++ cnt++; ++ sum += dec->se.burst_penalty; ++ continue; ++ } ++ bc = &dec->se.child_burst; ++ if (!burst_cache_expired(bc, now)) { ++ cnt += bc->count; ++ sum += bc->value * bc->count; ++ if (sched_burst_cache_stop_count <= cnt) break; ++ continue; ++ } ++ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); ++ } ++ ++ update_burst_cache(&p->se.child_burst, p, cnt, sum, now); ++ *acnt += cnt; ++ *asum += sum; ++} ++ ++static inline u32 inherit_burst_topological( ++ struct task_struct *p, u64 now, u64 clone_flags) { ++ struct task_struct *anc = p; ++ struct sched_burst_cache *bc; ++ u32 cnt = 0, sum = 0; ++ u32 base_child_cnt = 0; ++ ++ if (clone_flags & CLONE_PARENT) { ++ anc = anc->real_parent; ++ base_child_cnt = 1; ++ } ++ ++ for (struct task_struct *next; ++ anc != (next = anc->real_parent) && ++ count_entries_upto2(&anc->children) <= base_child_cnt;) { ++ anc = next; ++ base_child_cnt = 1; ++ } ++ ++ bc = &anc->se.child_burst; ++ if (burst_cache_expired(bc, now)) ++ update_child_burst_topological( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ ++ return bc->value; ++} ++ ++static inline void update_tg_burst(struct task_struct *p, u64 now) { ++ struct task_struct *task; ++ u32 cnt = 0, sum = 0; ++ ++ for_each_thread(p, task) { ++ if (!task_is_bore_eligible(task)) continue; ++ cnt++; ++ sum += task->se.burst_penalty; ++ } ++ ++ update_burst_cache(&p->se.group_burst, p, cnt, sum, now); ++} ++ ++static inline u32 inherit_burst_tg(struct task_struct *p, u64 now) { ++ struct task_struct *parent = p->group_leader; ++ struct sched_burst_cache *bc = &parent->se.group_burst; ++ if (burst_cache_expired(bc, now)) ++ update_tg_burst(parent, now); ++ ++ return bc->value; ++} ++ ++void sched_clone_bore(struct task_struct *p, ++ struct task_struct *parent, u64 clone_flags, u64 now) { ++ struct sched_entity *se = &p->se; ++ u32 penalty; ++ ++ if (!task_is_bore_eligible(p)) return; ++ ++ penalty = (clone_flags & CLONE_THREAD)? ++ inherit_burst_tg(parent, now): ++ (likely(sched_burst_fork_atavistic)? 
++ inherit_burst_topological(parent, now, clone_flags): ++ inherit_burst_direct(parent, now, clone_flags)); ++ ++ __restart_burst(se); ++ se->burst_penalty = se->prev_burst_penalty = ++ max(se->prev_burst_penalty, penalty); ++ se->burst_count = 1; ++ se->child_burst.timestamp = 0; ++ se->group_burst.timestamp = 0; ++} ++ ++void reset_task_bore(struct task_struct *p) { ++ p->se.burst_time = 0; ++ p->se.prev_burst_penalty = 0; ++ p->se.curr_burst_penalty = 0; ++ p->se.burst_penalty = 0; ++ p->se.burst_score = 0; ++ p->se.burst_count = 1; ++ memset(&p->se.child_burst, 0, sizeof(struct sched_burst_cache)); ++ memset(&p->se.group_burst, 0, sizeof(struct sched_burst_cache)); ++} ++ ++void __init sched_bore_init(void) { ++ printk(KERN_INFO "%s %s by %s\n", ++ SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); ++ reset_task_bore(&init_task); ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table sched_bore_sysctls[] = { ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_bore_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_exclude_kthreads", ++ .data = &sched_burst_exclude_kthreads, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = &maxval_8_bits, ++ }, ++ { ++ .procname = "sched_burst_fork_atavistic", ++ .data = &sched_burst_fork_atavistic, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_parity_threshold", ++ .data = &sched_burst_parity_threshold, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_8_bits, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_6_bits, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_stop_count", ++ .data = &sched_burst_cache_stop_count, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++}; ++ ++static int __init sched_bore_sysctl_init(void) { ++ register_sysctl_init("kernel", sched_bore_sysctls); ++ return 0; ++} ++late_initcall(sched_bore_sysctl_init); ++#endif // CONFIG_SYSCTL ++#endif // CONFIG_SCHED_BORE +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 81c6df746..45832d151 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -97,6 +97,10 @@ + #include "../../io_uring/io-wq.h" + #include "../smpboot.h" + ++#ifdef CONFIG_SCHED_BORE 
++#include <linux/sched/bore.h> ++#endif // CONFIG_SCHED_BORE ++ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + +@@ -8523,6 +8527,10 @@ void __init sched_init(void) + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_bore_init(); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 557246880..c1f6219f2 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -167,7 +167,53 @@ static const struct file_operations sched_feat_fops = { + }; + + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ ++static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int value; \ ++\ ++ if (cnt > 15) \ ++ cnt = 15; \ ++\ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++\ ++ if (kstrtouint(buf, 10, &value)) \ ++ return -EINVAL; \ ++\ ++ sysctl_sched_##name = value; \ ++ sched_update_##update_func(); \ ++\ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++\ ++static int sched_##name##_show(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_##name); \ ++ return 0; \ ++} \ ++\ ++static int sched_##name##_open(struct inode *inode, struct file *filp) \ ++{ \ ++ return single_open(filp, sched_##name##_show, NULL); \ ++} \ ++\ ++static const struct file_operations sched_##name##_fops = { \ ++ .open = sched_##name##_open, \ ++ .write = sched_##name##_write, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++}; + ++DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) ++ ++#undef DEFINE_SYSCTL_SCHED_FUNC ++#else // !CONFIG_SCHED_BORE + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -213,7 +259,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; +- ++#endif // CONFIG_SCHED_BORE + #endif /* SMP */ + + #ifdef CONFIG_PREEMPT_DYNAMIC +@@ -507,13 +553,20 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); ++#else // !CONFIG_SCHED_BORE + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif // CONFIG_SCHED_BORE + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif // CONFIG_SCHED_BORE + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -762,6 +815,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score);
++#endif // CONFIG_SCHED_BORE + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1248,6 +1304,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7a14da539..5f44bd194 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -58,6 +58,10 @@ + #include "stats.h" + #include "autogroup.h" + ++#ifdef CONFIG_SCHED_BORE ++#include <linux/sched/bore.h> ++#endif // CONFIG_SCHED_BORE ++ + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -67,17 +71,30 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant ++ * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif // CONFIG_SCHED_BORE + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice ++ * (default min_base_slice = 2000000 constant, units: nanoseconds) ++ * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + */ ++#ifdef CONFIG_SCHED_BORE ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif // CONFIG_SCHED_BORE + + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -191,6 +208,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = nsecs_per_tick * ++ max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else // !CONFIG_SCHED_BORE + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -221,6 +245,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif // CONFIG_SCHED_BORE + + void __init sched_init_granularity(void) + { +@@ -700,6 +725,9 @@ static void update_entity_lag(struct cfs_rq *cfs_rq, struct sched_entity *se) + + vlag = avg_vruntime(cfs_rq) - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#ifdef CONFIG_SCHED_BORE ++ limit >>= !!sched_bore; ++#endif // CONFIG_SCHED_BORE + + se->vlag = clamp(vlag, -limit, limit); + } +@@ -940,6 +968,10 @@ static struct sched_entity *pick_eevdf(struct cfs_rq *cfs_rq) + curr = NULL; + + if (sched_feat(RUN_TO_PARITY) && curr && protect_slice(curr)) ++#ifdef CONFIG_SCHED_BORE ++ if (!(likely(sched_bore) && likely(sched_burst_parity_threshold) && ++
sched_burst_parity_threshold < cfs_rq->nr_queued)) ++#endif // CONFIG_SCHED_BORE + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -997,6 +1029,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + * Scheduling class statistics methods: + */ + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1008,6 +1041,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif // CONFIG_SCHED_BORE + #endif + + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); +@@ -1237,6 +1271,9 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (unlikely(delta_exec <= 0)) + return; + ++#ifdef CONFIG_SCHED_BORE ++ update_curr_bore(delta_exec, curr); ++#endif // CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); + resched = update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); +@@ -3794,13 +3831,22 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) { } + + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + ++ ++#ifdef CONFIG_SCHED_BORE ++void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ++ unsigned long weight, bool no_update_curr) ++#else // !CONFIG_SCHED_BORE + static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) ++#endif // CONFIG_SCHED_BORE + { + bool curr = cfs_rq->curr == se; + + if (se->on_rq) { + /* commit outstanding execution time */ ++#ifdef CONFIG_SCHED_BORE ++ if (!no_update_curr) ++#endif // CONFIG_SCHED_BORE + update_curr(cfs_rq); + update_entity_lag(cfs_rq, se); + se->deadline -= se->vruntime; +@@ -3856,7 +3902,11 @@ static void reweight_task_fair(struct rq *rq, struct task_struct *p, + struct cfs_rq *cfs_rq = cfs_rq_of(se); + struct load_weight *load = &se->load; + ++#ifdef CONFIG_SCHED_BORE ++ reweight_entity(cfs_rq, se, lw->weight, false); ++#else // !CONFIG_SCHED_BORE + reweight_entity(cfs_rq, se, lw->weight); ++#endif // CONFIG_SCHED_BORE + load->inv_weight = lw->inv_weight; + } + +@@ -3997,7 +4047,11 @@ static void update_cfs_group(struct sched_entity *se) + shares = calc_group_shares(gcfs_rq); + #endif + if (unlikely(se->load.weight != shares)) ++#ifdef CONFIG_SCHED_BORE ++ reweight_entity(cfs_rq_of(se), se, shares, false); ++#else // !CONFIG_SCHED_BORE + reweight_entity(cfs_rq_of(se), se, shares); ++#endif // CONFIG_SCHED_BORE + } + + #else /* CONFIG_FAIR_GROUP_SCHED */ +@@ -5295,7 +5349,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->rel_deadline = 0; + return; + } +- ++#ifdef CONFIG_SCHED_BORE ++ else if (likely(sched_bore)) ++ vslice >>= !!(flags & sched_deadline_boost_mask); ++ else ++#endif // CONFIG_SCHED_BORE + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -7190,6 +7248,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ++#ifdef CONFIG_SCHED_BORE ++ struct cfs_rq *cfs_rq = &rq->cfs; ++ struct sched_entity *se = &p->se; ++ if (flags & DEQUEUE_SLEEP && entity_is_task(se)) { ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst(se); ++ } ++#endif // CONFIG_SCHED_BORE + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + +@@ -9019,16 +9086,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task 
in the tree? + */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline(se); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -13142,6 +13218,9 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + static void task_fork_fair(struct task_struct *p) + { + set_task_max_allowed_capacity(p); ++#ifdef CONFIG_SCHED_BORE ++ update_burst_score(&p->se); ++#endif // CONFIG_SCHED_BORE + } + + /* +@@ -13259,6 +13338,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + { + WARN_ON_ONCE(p->se.sched_delayed); + ++#ifdef CONFIG_SCHED_BORE ++ reset_task_bore(p); ++#endif // CONFIG_SCHED_BORE + attach_task_cfs_rq(p); + + set_task_max_allowed_capacity(p); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 83e3aa917..ef5d684df 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2119,7 +2119,11 @@ extern int group_balance_cpu(struct sched_group *sg); + extern void update_sched_domain_debugfs(void); + extern void dirty_sched_domain_sysctl(int cpu); + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else // !CONFIG_SCHED_BORE + extern int sched_update_scaling(void); ++#endif // CONFIG_SCHED_BORE + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -2825,7 +2829,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + extern __read_mostly unsigned int sysctl_sched_nr_migrate; + extern __read_mostly unsigned int sysctl_sched_migration_cost; + ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; ++#else // !CONFIG_SCHED_BORE + extern unsigned int sysctl_sched_base_slice; ++#endif // CONFIG_SCHED_BORE + + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0002-bbr3.patch b/sys-kernel/gentoo-sources-6.16/0002-bbr3.patch new file mode 100644 index 0000000..63816a2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0002-bbr3.patch @@ -0,0 +1,3404 @@ +From 66b42eef90f200265e2fc1695808c6626d50c6c5 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:37 +0700 +Subject: [PATCH 2/7] bbr3 + +Signed-off-by: Eric Naim +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 73 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2232 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 4 +- + 16 files changed, 1941 insertions(+), 555 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 29f59d50dc73..811850c240cc 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -248,7 +248,8 @@ struct tcp_sock { + void (*tcp_clean_acked)(struct sock *sk, u32 
acked_seq); + #endif + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -305,7 +306,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? */ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 1735db332aab..2c4a94af7093 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 5078ad868fee..de404e4370d4 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -379,11 +379,14 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + ++ + static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) + { + return tp->ecn_flags & TCP_ECN_MODE_ANY; +@@ -841,6 +844,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -946,6 +958,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1044,9 +1061,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1159,6 +1181,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* 
ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1181,7 +1204,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1201,10 +1228,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1215,7 +1245,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1239,8 +1271,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1306,6 +1341,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1325,6 +1368,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1337,6 +1381,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2490,7 +2549,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index bdac8c42fa82..362644a272ba 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 12850a277251..3b8b96692fb4 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 461a9ab540af..02ae796fa17e 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3442,6 +3442,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4189,6 +4190,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..066da5e5747c 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,122 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return (tcp_ecn_mode_any(tp)) && (tp->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +383,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +410,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +434,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +457,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +474,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
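A rough reading of the burst budget computed by bbr_tso_segs_generic() above, assuming the kernel's default sk_pacing_shift of 10; the sketch only restates the arithmetic and is not part of the patch:

/*
 * bytes  = pacing_rate >> sk_pacing_shift;  // shift 10: ~1/1024 s, about 1 ms of data
 * bytes += GSO_LEGACY_MAX_SIZE >> r;        // 64 KB, halved per 2^tso_rtt_shift us of min_rtt
 * bytes  = min(bytes, gso_max_size - 1 - MAX_TCP_HEADER);
 * segs   = max(bytes / mss_now, sysctl_tcp_min_tso_segs);
 *
 * e.g. at a ~1 Gbit/s pacing rate (~125 MB/s) the rate term alone is ~122 KB,
 * so the gso_max_size cap (just under 64 KB) usually decides the burst size.
 */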
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +535,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +548,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +580,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +600,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +671,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +682,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +711,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +740,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +796,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +804,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +850,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +859,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +887,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +924,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +947,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +972,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
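The ecn_alpha maintained by bbr_update_ecn_alpha() above is an EWMA of the per-round CE mark ratio; a minimal numeric sketch, assuming BBR_SCALE = 8 (so BBR_UNIT = 256 and bbr_ecn_alpha_gain = 16), not part of the patch:

/*
 * alpha' = (1 - g) * alpha + g * ce_ratio,   g = bbr_ecn_alpha_gain = BBR_UNIT/16
 *
 * e.g. starting from bbr_ecn_alpha_init == BBR_UNIT (1.0), a round with no CE
 * marks (ce_ratio == 0) gives alpha' = (256 - 16) * 256 >> 8 = 240, so alpha
 * moves toward the observed mark ratio by 1/16 (~6.25%) per round; inflight_lo
 * is then cut to (1 - ecn_factor * alpha), i.e. by up to one third at alpha = 1.0.
 */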
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2361,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2398,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 68bc79eb9019..7991a7589109 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -381,7 +381,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -392,7 +392,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1134,7 +1134,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1506,6 +1511,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3856,7 +3872,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3873,6 +3890,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3883,6 +3901,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4002,6 +4025,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4067,7 +4091,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4091,6 +4115,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4111,7 +4136,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5793,13 +5818,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 43d7852ce07e..df386419b9bf 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -475,6 +475,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 3ac8d2d17e1f..cc75963b5a4c 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1614,7 +1617,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1689,6 +1692,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2045,13 +2072,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2776,6 +2802,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2988,6 +3015,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index bb37e24b97a7..9adfc1131d1f 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -565,7 +565,7 @@ void tcp_retransmit_timer(struct sock *sk) + struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + +- rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: ++ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; +@@ -702,6 +702,8 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0003-block.patch b/sys-kernel/gentoo-sources-6.16/0003-block.patch new file mode 100644 index 0000000..61b2d59 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0003-block.patch @@ -0,0 +1,288 @@ +From e6160758a8f7593c49db07cb995ad1c3a7eb60ff Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:37 +0700 +Subject: [PATCH 3/7] block + +Signed-off-by: Eric Naim +--- + block/bfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++------ + block/bfq-iosched.h | 12 +++++++++-- + block/mq-deadline.c | 48 +++++++++++++++++++++++++++++++++++------ + 3 files changed, 96 insertions(+), 16 deletions(-) + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 0cb1e9873aab..4e3e4d3ce88c 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -467,6 +467,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) + return icq; + } + ++static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) ++{ ++ if (!current->io_context) ++ return NULL; ++ if (spin_trylock_irq(&q->queue_lock)) { 
++ struct bfq_io_cq *icq; ++ ++ icq = icq_to_bic(ioc_lookup_icq(q)); ++ spin_unlock_irq(&q->queue_lock); ++ return icq; ++ } ++ ++ return NULL; ++} ++ + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. +@@ -2465,10 +2480,21 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + * returned by bfq_bic_lookup does not go away before + * bfqd->lock is taken. + */ +- struct bfq_io_cq *bic = bfq_bic_lookup(q); ++ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); + bool ret; + +- spin_lock_irq(&bfqd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock_irq(&bfqd->lock)) ++ return false; + + if (bic) { + /* +@@ -5317,6 +5343,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. ++ */ ++ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || ++ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) ++ return NULL; ++ + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; +@@ -5328,6 +5366,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + ++ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, +@@ -6250,10 +6289,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, + + static struct bfq_queue *bfq_init_rq(struct request *rq); + +-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags) + { +- struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; +@@ -6315,7 +6353,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- bfq_insert_request(hctx, rq, flags); ++ bfq_insert_request(hctx->queue, rq, flags); + } + } + +@@ -7254,6 +7292,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + ++ spin_lock_init(&bfqd->lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues. 
+ * Grab a permanent reference to it, so that the normal code flow +@@ -7371,8 +7411,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_type *e) + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + +- spin_lock_init(&bfqd->lock); +- + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 687a3a7ba784..8589b58af79f 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -504,12 +504,22 @@ struct bfq_io_cq { + unsigned int requests; /* Number of requests this process has in flight */ + }; + ++enum { ++ BFQ_DISPATCHING = 0, ++}; ++ + /** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ + struct bfq_data { ++ struct { ++ spinlock_t lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; ++ + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ +@@ -795,8 +805,6 @@ struct bfq_data { + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + +- spinlock_t lock; +- + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 2edf1cac06d5..1bae19f17722 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -79,10 +79,20 @@ struct dd_per_prio { + struct io_stats_per_prio stats; + }; + ++enum { ++ DD_DISPATCHING = 0, ++}; ++ + struct deadline_data { + /* + * run time data + */ ++ struct { ++ spinlock_t lock; ++ spinlock_t zone_lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; + + struct dd_per_prio per_prio[DD_PRIO_COUNT]; + +@@ -100,8 +110,6 @@ struct deadline_data { + int front_merges; + u32 async_depth; + int prio_aging_expire; +- +- spinlock_t lock; + }; + + /* Maps an I/O priority class to a deadline scheduler priority. */ +@@ -466,6 +474,18 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct request *rq; + enum dd_prio prio; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. 
++ */ ++ if (test_bit(DD_DISPATCHING, &dd->run_state) || ++ test_and_set_bit_lock(DD_DISPATCHING, &dd->run_state)) ++ return NULL; ++ + spin_lock(&dd->lock); + rq = dd_dispatch_prio_aged_requests(dd, now); + if (rq) +@@ -482,6 +502,7 @@ static struct request *dd_dispatch_request(struct blk_mq_hw_ctx *hctx) + } + + unlock: ++ clear_bit_unlock(DD_DISPATCHING, &dd->run_state); + spin_unlock(&dd->lock); + + return rq; +@@ -585,6 +606,9 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + + eq->elevator_data = dd; + ++ spin_lock_init(&dd->lock); ++ spin_lock_init(&dd->zone_lock); ++ + for (prio = 0; prio <= DD_PRIO_MAX; prio++) { + struct dd_per_prio *per_prio = &dd->per_prio[prio]; + +@@ -601,7 +625,6 @@ static int dd_init_sched(struct request_queue *q, struct elevator_type *e) + dd->last_dir = DD_WRITE; + dd->fifo_batch = fifo_batch; + dd->prio_aging_expire = prio_aging_expire; +- spin_lock_init(&dd->lock); + + /* We dispatch from request queue wide instead of hw queue */ + blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); +@@ -657,7 +680,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + struct request *free = NULL; + bool ret; + +- spin_lock(&dd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path is a lot slimmer, so skipping an ++ * occassional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock(&dd->lock)) ++ return false; ++ + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + +@@ -670,10 +705,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + /* + * add rq to rbtree and fifo + */ +-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free) + { +- struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); +@@ -731,7 +765,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- dd_insert_request(hctx, rq, flags, &free); ++ dd_insert_request(q, rq, flags, &free); + } + spin_unlock(&dd->lock); + +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0005-fixes.patch b/sys-kernel/gentoo-sources-6.16/0005-fixes.patch new file mode 100644 index 0000000..2995b13 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0005-fixes.patch @@ -0,0 +1,59 @@ +From cc66b41be3df74ec55f57e9fd047315384fe1052 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:38 +0700 +Subject: [PATCH 5/7] fixes + +Signed-off-by: Eric Naim +--- + drivers/bluetooth/btusb.c | 2 ++ + drivers/gpu/drm/i915/display/intel_dsb.c | 4 ++++ + scripts/package/PKGBUILD | 5 +++++ + 3 files changed, 11 insertions(+) + +diff --git a/drivers/bluetooth/btusb.c b/drivers/bluetooth/btusb.c +index f9eeec0aed57..6b12b44f3a0d 100644 +--- a/drivers/bluetooth/btusb.c ++++ b/drivers/bluetooth/btusb.c +@@ -705,6 +705,8 @@ static const struct usb_device_id quirks_table[] = { + BTUSB_WIDEBAND_SPEECH 
}, + { USB_DEVICE(0x0489, 0xe139), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH }, ++ { USB_DEVICE(0x0489, 0xe14e), .driver_info = BTUSB_MEDIATEK | ++ BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x0489, 0xe14f), .driver_info = BTUSB_MEDIATEK | + BTUSB_WIDEBAND_SPEECH }, + { USB_DEVICE(0x0489, 0xe150), .driver_info = BTUSB_MEDIATEK | +diff --git a/drivers/gpu/drm/i915/display/intel_dsb.c b/drivers/gpu/drm/i915/display/intel_dsb.c +index 481488d1fe67..271229500c62 100644 +--- a/drivers/gpu/drm/i915/display/intel_dsb.c ++++ b/drivers/gpu/drm/i915/display/intel_dsb.c +@@ -808,6 +808,10 @@ struct intel_dsb *intel_dsb_prepare(struct intel_atomic_state *state, + if (!display->params.enable_dsb) + return NULL; + ++ /* TODO: DSB is broken in Xe KMD, so disabling it until fixed */ ++ if (!IS_ENABLED(I915)) ++ return NULL; ++ + dsb = kzalloc(sizeof(*dsb), GFP_KERNEL); + if (!dsb) + goto out; +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index 452374d63c24..08f80d7c5df0 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -90,6 +90,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/0006-s5-power.patch b/sys-kernel/gentoo-sources-6.16/0006-s5-power.patch new file mode 100644 index 0000000..b846780 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/0006-s5-power.patch @@ -0,0 +1,329 @@ +From 0fc382f7d5a69dcfabaa0d7a24b1bc1dd7af1d40 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 28 Jul 2025 11:50:38 +0700 +Subject: [PATCH 6/7] s5-power + +Signed-off-by: Eric Naim +--- + drivers/base/power/main.c | 7 ++ + drivers/gpu/drm/amd/amdgpu/amdgpu_device.c | 4 + + drivers/pci/pci-driver.c | 94 ++++++++++++++-------- + drivers/scsi/mesh.c | 1 + + drivers/scsi/stex.c | 1 + + drivers/usb/host/sl811-hcd.c | 1 + + include/linux/pm.h | 3 + + include/trace/events/power.h | 3 +- + kernel/reboot.c | 6 ++ + 9 files changed, 86 insertions(+), 34 deletions(-) + +diff --git a/drivers/base/power/main.c b/drivers/base/power/main.c +index 7a50af416cac..b8f0343a8673 100644 +--- a/drivers/base/power/main.c ++++ b/drivers/base/power/main.c +@@ -85,6 +85,8 @@ static const char *pm_verb(int event) + return "restore"; + case PM_EVENT_RECOVER: + return "recover"; ++ case PM_EVENT_POWEROFF: ++ return "poweroff"; + default: + return "(unknown PM event)"; + } +@@ -355,6 +357,7 @@ static pm_callback_t pm_op(const struct dev_pm_ops *ops, pm_message_t state) + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze; ++ case PM_EVENT_POWEROFF: + case PM_EVENT_HIBERNATE: + return ops->poweroff; + case PM_EVENT_THAW: +@@ -389,6 +392,7 @@ static pm_callback_t pm_late_early_op(const struct dev_pm_ops *ops, + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_late; ++ case PM_EVENT_POWEROFF: + case PM_EVENT_HIBERNATE: + return ops->poweroff_late; + case PM_EVENT_THAW: +@@ -423,6 +427,7 @@ static pm_callback_t pm_noirq_op(const struct dev_pm_ops *ops, pm_message_t stat + case PM_EVENT_FREEZE: + case PM_EVENT_QUIESCE: + return ops->freeze_noirq; ++ case PM_EVENT_POWEROFF: + case PM_EVENT_HIBERNATE: + return ops->poweroff_noirq; + case 
PM_EVENT_THAW: +@@ -1313,6 +1318,8 @@ static pm_message_t resume_event(pm_message_t sleep_state) + return PMSG_RECOVER; + case PM_EVENT_HIBERNATE: + return PMSG_RESTORE; ++ case PM_EVENT_POWEROFF: ++ return PMSG_ON; + } + return PMSG_ON; + } +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +index aa32df7e2fb2..839117782949 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_device.c +@@ -4961,6 +4961,10 @@ static int amdgpu_device_evict_resources(struct amdgpu_device *adev) + if (!adev->in_s4 && (adev->flags & AMD_IS_APU)) + return 0; + ++ /* No need to evict when going to S5 through S4 callbacks */ ++ if (system_state == SYSTEM_HALT || system_state == SYSTEM_POWER_OFF) ++ return 0; ++ + ret = amdgpu_ttm_evict_resources(adev, TTM_PL_VRAM); + if (ret) + DRM_WARN("evicting device resources failed\n"); +diff --git a/drivers/pci/pci-driver.c b/drivers/pci/pci-driver.c +index 67db34fd10ee..b78b98133e7d 100644 +--- a/drivers/pci/pci-driver.c ++++ b/drivers/pci/pci-driver.c +@@ -758,6 +758,56 @@ static void pci_pm_complete(struct device *dev) + + #endif /* !CONFIG_PM_SLEEP */ + ++#if defined(CONFIG_SUSPEND) || defined(CONFIG_HIBERNATE_CALLBACKS) ++/** ++ * pci_pm_set_prepare_bus_pm ++ * @pci_dev: pci device ++ * ++ * Prepare the device to go into a low power state by saving state ++ * and configure bus PM policy. ++ * ++ * Return: TRUE for bus PM will be used ++ * FALSE for bus PM will be skipped ++ */ ++static bool pci_pm_set_prepare_bus_pm(struct pci_dev *pci_dev) ++{ ++ if (!pci_dev->state_saved) { ++ pci_save_state(pci_dev); ++ ++ /* ++ * If the device is a bridge with a child in D0 below it, ++ * it needs to stay in D0, so check skip_bus_pm to avoid ++ * putting it into a low-power state in that case. ++ */ ++ if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) ++ pci_prepare_to_sleep(pci_dev); ++ } ++ ++ pci_dbg(pci_dev, "PCI PM: Sleep power state: %s\n", ++ pci_power_name(pci_dev->current_state)); ++ ++ if (pci_dev->current_state == PCI_D0) { ++ pci_dev->skip_bus_pm = true; ++ /* ++ * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any ++ * downstream device is in D0, so avoid changing the power state ++ * of the parent bridge by setting the skip_bus_pm flag for it. ++ */ ++ if (pci_dev->bus->self) ++ pci_dev->bus->self->skip_bus_pm = true; ++ } ++ ++ if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { ++ pci_dbg(pci_dev, "PCI PM: Skipped\n"); ++ return FALSE; ++ } ++ ++ pci_pm_set_unknown_state(pci_dev); ++ ++ return TRUE; ++} ++#endif /* CONFIG_SUSPEND || CONFIG_HIBERNATE_CALLBACKS */ ++ + #ifdef CONFIG_SUSPEND + static void pcie_pme_root_status_cleanup(struct pci_dev *pci_dev) + { +@@ -877,38 +927,8 @@ static int pci_pm_suspend_noirq(struct device *dev) + } + } + +- if (!pci_dev->state_saved) { +- pci_save_state(pci_dev); +- +- /* +- * If the device is a bridge with a child in D0 below it, +- * it needs to stay in D0, so check skip_bus_pm to avoid +- * putting it into a low-power state in that case. 
+- */ +- if (!pci_dev->skip_bus_pm && pci_power_manageable(pci_dev)) +- pci_prepare_to_sleep(pci_dev); +- } +- +- pci_dbg(pci_dev, "PCI PM: Suspend power state: %s\n", +- pci_power_name(pci_dev->current_state)); +- +- if (pci_dev->current_state == PCI_D0) { +- pci_dev->skip_bus_pm = true; +- /* +- * Per PCI PM r1.2, table 6-1, a bridge must be in D0 if any +- * downstream device is in D0, so avoid changing the power state +- * of the parent bridge by setting the skip_bus_pm flag for it. +- */ +- if (pci_dev->bus->self) +- pci_dev->bus->self->skip_bus_pm = true; +- } +- +- if (pci_dev->skip_bus_pm && pm_suspend_no_platform()) { +- pci_dbg(pci_dev, "PCI PM: Skipped\n"); ++ if (!pci_pm_set_prepare_bus_pm(pci_dev)) + goto Fixup; +- } +- +- pci_pm_set_unknown_state(pci_dev); + + /* + * Some BIOSes from ASUS have a bug: If a USB EHCI host controller's +@@ -1135,6 +1155,8 @@ static int pci_pm_poweroff(struct device *dev) + struct pci_dev *pci_dev = to_pci_dev(dev); + const struct dev_pm_ops *pm = dev->driver ? dev->driver->pm : NULL; + ++ pci_dev->skip_bus_pm = false; ++ + if (pci_has_legacy_pm_support(pci_dev)) + return pci_legacy_suspend(dev, PMSG_HIBERNATE); + +@@ -1198,8 +1220,8 @@ static int pci_pm_poweroff_noirq(struct device *dev) + return error; + } + +- if (!pci_dev->state_saved && !pci_has_subordinate(pci_dev)) +- pci_prepare_to_sleep(pci_dev); ++ if (!pci_pm_set_prepare_bus_pm(pci_dev)) ++ goto Fixup; + + /* + * The reason for doing this here is the same as for the analogous code +@@ -1208,6 +1230,7 @@ static int pci_pm_poweroff_noirq(struct device *dev) + if (pci_dev->class == PCI_CLASS_SERIAL_USB_EHCI) + pci_write_config_word(pci_dev, PCI_COMMAND, 0); + ++Fixup: + pci_fixup_device(pci_fixup_suspend_late, pci_dev); + + return 0; +@@ -1217,10 +1240,15 @@ static int pci_pm_restore_noirq(struct device *dev) + { + struct pci_dev *pci_dev = to_pci_dev(dev); + const struct dev_pm_ops *pm = dev->driver ? 
dev->driver->pm : NULL; ++ pci_power_t prev_state = pci_dev->current_state; ++ bool skip_bus_pm = pci_dev->skip_bus_pm; + + pci_pm_default_resume_early(pci_dev); + pci_fixup_device(pci_fixup_resume_early, pci_dev); + ++ if (!skip_bus_pm && prev_state == PCI_D3cold) ++ pci_pm_bridge_power_up_actions(pci_dev); ++ + if (pci_has_legacy_pm_support(pci_dev)) + return 0; + +diff --git a/drivers/scsi/mesh.c b/drivers/scsi/mesh.c +index 1c15cac41d80..768b85eecc8f 100644 +--- a/drivers/scsi/mesh.c ++++ b/drivers/scsi/mesh.c +@@ -1762,6 +1762,7 @@ static int mesh_suspend(struct macio_dev *mdev, pm_message_t mesg) + case PM_EVENT_SUSPEND: + case PM_EVENT_HIBERNATE: + case PM_EVENT_FREEZE: ++ case PM_EVENT_POWEROFF: + break; + default: + return 0; +diff --git a/drivers/scsi/stex.c b/drivers/scsi/stex.c +index 63ed7f9aaa93..ee9372e1f7f0 100644 +--- a/drivers/scsi/stex.c ++++ b/drivers/scsi/stex.c +@@ -1965,6 +1965,7 @@ static int stex_choice_sleep_mic(struct st_hba *hba, pm_message_t state) + case PM_EVENT_SUSPEND: + return ST_S3; + case PM_EVENT_HIBERNATE: ++ case PM_EVENT_POWEROFF: + hba->msi_lock = 0; + return ST_S4; + default: +diff --git a/drivers/usb/host/sl811-hcd.c b/drivers/usb/host/sl811-hcd.c +index ea3cab99c5d4..5d6dba681e50 100644 +--- a/drivers/usb/host/sl811-hcd.c ++++ b/drivers/usb/host/sl811-hcd.c +@@ -1748,6 +1748,7 @@ sl811h_suspend(struct platform_device *dev, pm_message_t state) + break; + case PM_EVENT_SUSPEND: + case PM_EVENT_HIBERNATE: ++ case PM_EVENT_POWEROFF: + case PM_EVENT_PRETHAW: /* explicitly discard hw state */ + port_power(sl811, 0); + break; +diff --git a/include/linux/pm.h b/include/linux/pm.h +index f0bd8fbae4f2..cb66f47631a7 100644 +--- a/include/linux/pm.h ++++ b/include/linux/pm.h +@@ -506,6 +506,7 @@ const struct dev_pm_ops name = { \ + * RECOVER Creation of a hibernation image or restoration of the main + * memory contents from a hibernation image has failed, call + * ->thaw() and ->complete() for all devices. ++ * POWEROFF System will poweroff, call ->poweroff() for all devices. + * + * The following PM_EVENT_ messages are defined for internal use by + * kernel subsystems. They are never issued by the PM core. 
+@@ -536,6 +537,7 @@ const struct dev_pm_ops name = { \ + #define PM_EVENT_USER 0x0100 + #define PM_EVENT_REMOTE 0x0200 + #define PM_EVENT_AUTO 0x0400 ++#define PM_EVENT_POWEROFF 0x0800 + + #define PM_EVENT_SLEEP (PM_EVENT_SUSPEND | PM_EVENT_HIBERNATE) + #define PM_EVENT_USER_SUSPEND (PM_EVENT_USER | PM_EVENT_SUSPEND) +@@ -550,6 +552,7 @@ const struct dev_pm_ops name = { \ + #define PMSG_QUIESCE ((struct pm_message){ .event = PM_EVENT_QUIESCE, }) + #define PMSG_SUSPEND ((struct pm_message){ .event = PM_EVENT_SUSPEND, }) + #define PMSG_HIBERNATE ((struct pm_message){ .event = PM_EVENT_HIBERNATE, }) ++#define PMSG_POWEROFF ((struct pm_message){ .event = PM_EVENT_POWEROFF, }) + #define PMSG_RESUME ((struct pm_message){ .event = PM_EVENT_RESUME, }) + #define PMSG_THAW ((struct pm_message){ .event = PM_EVENT_THAW, }) + #define PMSG_RESTORE ((struct pm_message){ .event = PM_EVENT_RESTORE, }) +diff --git a/include/trace/events/power.h b/include/trace/events/power.h +index 6c631eec23e3..8fa70f239737 100644 +--- a/include/trace/events/power.h ++++ b/include/trace/events/power.h +@@ -199,7 +199,8 @@ TRACE_EVENT(pstate_sample, + { PM_EVENT_HIBERNATE, "hibernate" }, \ + { PM_EVENT_THAW, "thaw" }, \ + { PM_EVENT_RESTORE, "restore" }, \ +- { PM_EVENT_RECOVER, "recover" }) ++ { PM_EVENT_RECOVER, "recover" }, \ ++ { PM_EVENT_POWEROFF, "poweroff" }) + + DEFINE_EVENT(cpu, cpu_frequency, + +diff --git a/kernel/reboot.c b/kernel/reboot.c +index ec087827c85c..c8835f8e5f27 100644 +--- a/kernel/reboot.c ++++ b/kernel/reboot.c +@@ -13,6 +13,7 @@ + #include + #include + #include ++#include + #include + #include + #include +@@ -305,6 +306,11 @@ static void kernel_shutdown_prepare(enum system_states state) + (state == SYSTEM_HALT) ? SYS_HALT : SYS_POWER_OFF, NULL); + system_state = state; + usermodehelper_disable(); ++#ifdef CONFIG_HIBERNATE_CALLBACKS ++ if (!dpm_suspend_start(PMSG_POWEROFF) && !dpm_suspend_end(PMSG_POWEROFF)) ++ return; ++ pr_emerg("Failed to power off devices, using shutdown instead.\n"); ++#endif + device_shutdown(); + } + /** +-- +2.50.1 + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch new file mode 100644 index 0000000..0d2a7cb --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-01-28-sched-Cache-aware-load-balancing.patch @@ -0,0 +1,810 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.19]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 93DAD125D6 + for ; Sat, 9 Aug 2025 05:07:12 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.19 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716034; cv=none; b=YUmFsTNH1uwQvpOHnTp11Akd3lgJWMfavT04pYrRO6bSLY9uShqjjFR32v7kBjYwOu9HZts4Psvms0Up5yiFkgkTpBdbC8CX/E7Z4c1Klx1PkIf3BPuhpb8ZvRx+SMdhPpzo/SQA6Ht628h/WhbmPYoJzx1WyHar5r5e0vVf1nw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716034; c=relaxed/simple; + bh=B8SncTRCfxokFw3HLq476F91kwXYiv+eNctY+3vgxDg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=LoeDB94MVSwRN9DiDv9UOzgtbfNf7Z8gM97rUyOhBmJ8xbH9EBSw6tUtmKoGA8eGV7pQZyNxS628yTpoLSby3RcBvL9Nu68rCTwLJMSC6e/upA0JZGqZ1E/H3XAf1XnjpHP133kxqHoHsAf9B7kQtb8FMbTEqziO+wZWq/wHV7c= 
+ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=EoDNlyYD; arc=none smtp.client-ip=192.198.163.19 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="EoDNlyYD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716032; x=1786252032; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=B8SncTRCfxokFw3HLq476F91kwXYiv+eNctY+3vgxDg=; + b=EoDNlyYDA5oPL7kQOXZZlmGPK3p6khDrsVNfQ0JxnGYMNTmoWPG3Trqv + G2IvQfodRnNQCYPKgy20JzG+hnRCEBJuWbDYvbBKAv1X1Y6JYcYj11fWU + ZEKDojM5x6NyBsP6fUSaKmteIt+dcABM+mQ1mSY84wSYIPWQMFhGWqxKi + 6u+a+ocT6BdIAxulicFjYoaLOtii26qUbwZRgLo92ZRGMfUm3fzaPrvmE + Ao5J3uJtLRfBswzdorTuQV5vLeCnDshzqwFinb0JTb2FOypjk2LzTN+gp + 1hpBBXiAasf2lovIh8TYi7x2VmGvVyHeq1JBHV/mgRFlzzU7UVpHT7daf + Q==; +X-CSE-ConnectionGUID: to6lNAM4S3a+AafwUff0zw== +X-CSE-MsgGUID: crHzkh/2S0a7G+FwBgPlVQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56091915" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56091915" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa113.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:07:11 -0700 +X-CSE-ConnectionGUID: BhKTydHRScuaqNFjz5BXWQ== +X-CSE-MsgGUID: m+FrJxJNTOeCg6U8j8DjQQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165475503" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:07:06 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 01/28] sched: Cache aware load-balancing +Date: Sat, 9 Aug 2025 13:00:59 +0800 +Message-Id: <9157186cf9e3fd541f62c637579ff736b3704c51.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Peter Zijlstra + +Hi all, + +One of the many things on the eternal todo list has been finishing the +below hackery. + +It is an attempt at modelling cache affinity -- and while the patch +really only targets LLC, it could very well be extended to also apply to +clusters (L2). Specifically any case of multiple cache domains inside a +node. + +Anyway, I wrote this about a year ago, and I mentioned this at the +recent OSPM conf where Gautham and Prateek expressed interest in playing +with this code. 
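For orientation before reading the diff below: the bookkeeping it introduces is a per-mm, per-CPU runtime counter that is aged geometrically once per ~10 ms epoch and compared against the CPU's equally aged total runtime to estimate the process's cache occupancy on that CPU. The following is a minimal userspace sketch of that arithmetic only; the struct layout and the age()/fraction() helpers are illustrative stand-ins, not the kernel's API, and the constants mirror the patch only loosely.

/*
 * Illustrative sketch (not patch code): per-epoch geometric decay of
 * a per-mm runtime counter and of the CPU-wide total, and the scaled
 * occupancy ratio derived from them.
 */
#include <stdint.h>
#include <stdio.h>

#define NICE_0_LOAD 1024                /* scale factor, as in the kernel */

struct occ {
        uint64_t runtime;               /* decayed runtime of one entity  */
        unsigned long epoch;            /* last epoch this entry was aged */
};

static void age(struct occ *o, unsigned long now_epoch)
{
        unsigned long n = now_epoch - o->epoch;

        o->runtime = (n >= 64) ? 0 : o->runtime >> n;  /* halve per epoch */
        o->epoch = now_epoch;
}

/* occupancy of this mm on one CPU, scaled to NICE_0_LOAD */
static uint64_t fraction(struct occ *mm_cpu, struct occ *cpu_total,
                         unsigned long now_epoch)
{
        age(mm_cpu, now_epoch);
        age(cpu_total, now_epoch);
        return NICE_0_LOAD * mm_cpu->runtime / (cpu_total->runtime + 1);
}

int main(void)
{
        struct occ mm  = { .runtime = 6000000, .epoch = 0 };
        struct occ cpu = { .runtime = 9000000, .epoch = 0 };

        /* ~2/3 of this CPU's recent runtime belongs to the mm */
        printf("occ = %llu/1024\n",
               (unsigned long long)fraction(&mm, &cpu, 3));
        return 0;
}

Because numerator and denominator decay at the same rate, the ratio tracks recent history rather than lifetime totals, which is what lets the patch pick the CPU with the highest recent occupancy per LLC.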
+ +So here goes, very rough and largely unproven code ahead :-) + +It applies to current tip/master, but I know it will fail the __percpu +validation that sits in -next, although that shouldn't be terribly hard +to fix up. + +As is, it only computes a CPU inside the LLC that has the highest recent +runtime, this CPU is then used in the wake-up path to steer towards this +LLC and in task_hot() to limit migrations away from it. + +More elaborate things could be done, notably there is an XXX in there +somewhere about finding the best LLC inside a NODE (interaction with +NUMA_BALANCING). + +Signed-off-by: Peter Zijlstra (Intel) +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 4 + + kernel/fork.c | 5 + + kernel/sched/core.c | 13 +- + kernel/sched/fair.c | 330 +++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 8 + + 7 files changed, 388 insertions(+), 20 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index d6b91e8a66d6..cf26ad8b41ab 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -928,6 +928,12 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++ unsigned long occ; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1018,6 +1024,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1432,6 +1449,33 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index aa9c5be7a632..02ff8b8be25b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1403,6 +1403,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index 666783eb50ab..27f4012347f9 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -947,6 +947,10 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware scheduler" ++ default y ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index 1ee8eb11f38b..546c49e46d48 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -1073,6 +1073,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1082,6 +1085,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 81c6df746df1..a5fb3057b1c4 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4539,6 +4539,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->migration_pending = NULL; + #endif + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8508,6 +8509,7 @@ static struct kmem_cache *task_group_cache __ro_after_init; + + void __init sched_init(void) + { ++ unsigned long now = jiffies; + unsigned long ptr = 0; + int i; + +@@ -8582,7 +8584,7 @@ void __init sched_init(void) + raw_spin_lock_init(&rq->__lock); + rq->nr_running = 0; + rq->calc_load_active = 0; +- rq->calc_load_update = jiffies + LOAD_FREQ; ++ rq->calc_load_update = now + LOAD_FREQ; + init_cfs_rq(&rq->cfs); + init_rt_rq(&rq->rt); + init_dl_rq(&rq->dl); +@@ -8626,7 +8628,7 @@ void __init sched_init(void) + rq->cpu_capacity = SCHED_CAPACITY_SCALE; + rq->balance_callback = &balance_push_callback; + rq->active_balance = 0; +- rq->next_balance = jiffies; ++ rq->next_balance = now; + rq->push_cpu = 0; + rq->cpu = i; + rq->online = 0; +@@ -8638,7 +8640,7 @@ void __init sched_init(void) + + rq_attach_root(rq, &def_root_domain); + #ifdef CONFIG_NO_HZ_COMMON +- rq->last_blocked_load_update_tick = jiffies; ++ rq->last_blocked_load_update_tick = now; + atomic_set(&rq->nohz_flags, 0); + + INIT_CSD(&rq->nohz_csd, nohz_csd_func, rq); +@@ -8663,6 +8665,11 @@ void __init sched_init(void) + 
+ rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = now; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 7a14da5396fb..e3897cd7696d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1166,10 +1166,229 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + return delta_exec; + } + +-static inline void update_curr_task(struct task_struct *p, s64 delta_exec) ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. ++ */ ++#define EPOCH_PERIOD (HZ/100) /* 10 ms */ ++#define EPOCH_OLD 5 /* 50 ms */ ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = epoch = rq->cpu_epoch; ++ pcpu_sched->occ = -1; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ /* ++ * init_task and kthreads don't be having no mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, invalidate ++ * it's preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ mm->mm_sched_cpu = -1; ++ pcpu_sched->occ = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ int cpu, m_a_cpu = -1; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ scoped_guard (cpus_read_lock) { ++ cpumask_copy(cpus, cpu_online_mask); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, nr = 0, i; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ nr++; ++ trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", ++ per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); ++ } ++ ++ a_occ /= nr; ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", ++ per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ /* XXX threshold ? */ ++ per_cpu_ptr(mm->pcpu_sched, i)->occ = a_occ; ++ } ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ /* ++ * If the max average cache occupancy is 'small' we don't care. 
++ */ ++ if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) ++ m_a_cpu = -1; ++ ++ mm->mm_sched_cpu = m_a_cpu; ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ ++static inline ++void update_curr_task(struct rq *rq, struct task_struct *p, s64 delta_exec) + { + trace_sched_stat_runtime(p, delta_exec); + account_group_exec_runtime(p, delta_exec); ++ account_mm_sched(rq, p, delta_exec); + cgroup_account_cputime(p, delta_exec); + } + +@@ -1215,7 +1434,7 @@ s64 update_curr_common(struct rq *rq) + + delta_exec = update_curr_se(rq, &donor->se); + if (likely(delta_exec > 0)) +- update_curr_task(donor, delta_exec); ++ update_curr_task(rq, donor, delta_exec); + + return delta_exec; + } +@@ -1244,7 +1463,7 @@ static void update_curr(struct cfs_rq *cfs_rq) + if (entity_is_task(curr)) { + struct task_struct *p = task_of(curr); + +- update_curr_task(p, delta_exec); ++ update_curr_task(rq, p, delta_exec); + + /* + * If the fair_server is active, we need to account for the +@@ -7862,7 +8081,7 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + * per-cpu select_rq_mask usage + */ + lockdep_assert_irqs_disabled(); +- ++again: + if ((available_idle_cpu(target) || sched_idle_cpu(target)) && + asym_fits_cpu(task_util, util_min, util_max, target)) + return target; +@@ -7900,7 +8119,8 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + /* Check a recently used CPU as a potential idle candidate: */ + recent_used_cpu = p->recent_used_cpu; + p->recent_used_cpu = prev; +- if (recent_used_cpu != prev && ++ if (prev == p->wake_cpu && ++ recent_used_cpu != prev && + recent_used_cpu != target && + cpus_share_cache(recent_used_cpu, target) && + (available_idle_cpu(recent_used_cpu) || sched_idle_cpu(recent_used_cpu)) && +@@ -7953,6 +8173,18 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) + if ((unsigned)i < nr_cpumask_bits) + return i; + ++ if (prev != p->wake_cpu && !cpus_share_cache(prev, p->wake_cpu)) { ++ /* ++ * Most likely select_cache_cpu() will have re-directed ++ * the wakeup, but getting here means the preferred cache is ++ * too busy, so re-try with the actual previous. ++ * ++ * XXX wake_affine is lost for this pass. 
++ */ ++ prev = target = p->wake_cpu; ++ goto again; ++ } ++ + /* + * For cluster machines which have lower sharing cache like L2 or + * LLC Tag, we tend to find an idle CPU in the target's cluster +@@ -8575,6 +8807,40 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + return target; + } + ++#ifdef CONFIG_SCHED_CACHE ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++ ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ struct mm_struct *mm = p->mm; ++ int cpu; ++ ++ if (!mm || p->nr_cpus_allowed == 1) ++ return prev_cpu; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return prev_cpu; ++ ++ ++ if (static_branch_likely(&sched_numa_balancing) && ++ __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { ++ /* ++ * XXX look for max occupancy inside prev_cpu's node ++ */ ++ return prev_cpu; ++ } ++ ++ return cpu; ++} ++#else ++static int select_cache_cpu(struct task_struct *p, int prev_cpu) ++{ ++ return prev_cpu; ++} ++#endif ++ ++ + /* + * select_task_rq_fair: Select target runqueue for the waking task in domains + * that have the relevant SD flag set. In practice, this is SD_BALANCE_WAKE, +@@ -8600,6 +8866,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + * required for stable ->cpus_allowed + */ + lockdep_assert_held(&p->pi_lock); ++ guard(rcu)(); ++ + if (wake_flags & WF_TTWU) { + record_wakee(p); + +@@ -8607,6 +8875,8 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + cpumask_test_cpu(cpu, p->cpus_ptr)) + return cpu; + ++ new_cpu = prev_cpu = select_cache_cpu(p, prev_cpu); ++ + if (!is_rd_overutilized(this_rq()->rd)) { + new_cpu = find_energy_efficient_cpu(p, prev_cpu); + if (new_cpu >= 0) +@@ -8617,7 +8887,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + want_affine = !wake_wide(p) && cpumask_test_cpu(cpu, p->cpus_ptr); + } + +- rcu_read_lock(); + for_each_domain(cpu, tmp) { + /* + * If both 'cpu' and 'prev_cpu' are part of this domain, +@@ -8650,7 +8919,6 @@ select_task_rq_fair(struct task_struct *p, int prev_cpu, int wake_flags) + /* Fast path */ + new_cpu = select_idle_sibling(p, prev_cpu, new_cpu); + } +- rcu_read_unlock(); + + return new_cpu; + } +@@ -9300,6 +9568,17 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm && p->mm->pcpu_sched) { ++ /* ++ * XXX things like Skylake have non-inclusive L3 and might not ++ * like this L3 centric view. What to do about L2 stickyness ? ++ */ ++ return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > ++ per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; ++ } ++#endif ++ + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -9311,27 +9590,25 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + * Returns 0, if task migration is not affected by locality. + * Returns a negative value, if task migration improves locality i.e migration preferred. 
+ */ +-static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) + { + struct numa_group *numa_group = rcu_dereference(p->numa_group); + unsigned long src_weight, dst_weight; + int src_nid, dst_nid, dist; + +- if (!static_branch_likely(&sched_numa_balancing)) +- return 0; +- +- if (!p->numa_faults || !(env->sd->flags & SD_NUMA)) ++ if (!p->numa_faults) + return 0; + +- src_nid = cpu_to_node(env->src_cpu); +- dst_nid = cpu_to_node(env->dst_cpu); ++ src_nid = cpu_to_node(src_cpu); ++ dst_nid = cpu_to_node(dst_cpu); + + if (src_nid == dst_nid) + return 0; + + /* Migrating away from the preferred node is always bad. */ + if (src_nid == p->numa_preferred_nid) { +- if (env->src_rq->nr_running > env->src_rq->nr_preferred_running) ++ struct rq *src_rq = cpu_rq(src_cpu); ++ if (src_rq->nr_running > src_rq->nr_preferred_running) + return 1; + else + return 0; +@@ -9342,7 +9619,7 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return -1; + + /* Leaving a core idle is often worse than degrading locality. */ +- if (env->idle == CPU_IDLE) ++ if (idle) + return 0; + + dist = node_distance(src_nid, dst_nid); +@@ -9357,7 +9634,24 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + return src_weight - dst_weight; + } + ++static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) ++{ ++ if (!static_branch_likely(&sched_numa_balancing)) ++ return 0; ++ ++ if (!(env->sd->flags & SD_NUMA)) ++ return 0; ++ ++ return __migrate_degrades_locality(p, env->src_cpu, env->dst_cpu, ++ env->idle == CPU_IDLE); ++} ++ + #else ++static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++{ ++ return 0; ++} ++ + static inline long migrate_degrades_locality(struct task_struct *p, + struct lb_env *env) + { +@@ -13117,8 +13411,8 @@ static inline void task_tick_core(struct rq *rq, struct task_struct *curr) {} + */ + static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + { +- struct cfs_rq *cfs_rq; + struct sched_entity *se = &curr->se; ++ struct cfs_rq *cfs_rq; + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +@@ -13128,6 +13422,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 83e3aa917142..839463027ab0 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1173,6 +1173,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3885,6 +3891,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + #ifdef CONFIG_SMP +-- +2.25.1 + + diff --git 
a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch new file mode 100644 index 0000000..118118a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-02-28-sched-Several-fixes-for-cache-aware-scheduling.patch @@ -0,0 +1,318 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 365A021D3E1 + for ; Sat, 9 Aug 2025 05:07:36 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716057; cv=none; b=gkWBJvwJg5iw9w3Hcj2a+7isBgs+dQ6fQDbY6wOnLy8+dyj/K69iJ9MXZ3iC+AHiVKMdhhAQoR1l9wBbUy+BDlfe78+DRZUcHT2UIqJWtHq9xcndAunehdB/pDXNo95Uc+pmFlmpm5x4k3E0kzRAGeqzAXJ2da+LetkIln18z8w= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716057; c=relaxed/simple; + bh=0AeD1Ue2wq1wzi/RuwjSxpYJG1oGbIqRn0kfUtN8vWQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ezySzW6WeP6U1sHlrXJpD4tSHCfswUkVtMkKRoY8mH+Is59q9EsKFce/r5LHpaugK4Vf9AWVbLfOfFJGgeU54XDE5BxdVqKFZzyDDz9t1/tqydhF9wFSbw/pomx2BYrO+hWtoQKyQHnIN8AUaxQhvGebuiVcyUt9i8bQDwF00zU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=L1tFeu3x; arc=none smtp.client-ip=198.175.65.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="L1tFeu3x" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716056; x=1786252056; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=0AeD1Ue2wq1wzi/RuwjSxpYJG1oGbIqRn0kfUtN8vWQ=; + b=L1tFeu3xpYyhkd6vtGYa4ACOStVJ3enDl+olXnAD4THPx7m0Kc/94fQX + 0NFLzELjFB+k0dXkFEcvhvn2VXQNCEOqpU4KBJdAapmZmEa5Kw2a3uSD5 + 5xGm04sNo/62GAtkSLJDhfLmYvSib+2Y+m+5iYRVQYWZMC9fcPoUUOJIk + 57s73MqGMxeACxAjkhR9PE504WxXvkEUrsCDlWBeU6A00KrTz8w5uJ8fg + 62R1OQ44QJ5eTLS/469R4lFtouEYqw6B8JU9gex0GxRi5dRP00WgMGCoV + CD4HfPgwZPIsG54V4ibpdGi2Z/RSrK2prH4PrMvMSdJ5asPDTFjEDlvfI + Q==; +X-CSE-ConnectionGUID: lQLqoBuSQl6aw8VZynnN4g== +X-CSE-MsgGUID: yMcXhXa1RZSWwEf2YK1T6Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137717" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137717" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa110.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:07:36 -0700 +X-CSE-ConnectionGUID: ofaF5mj0TIucF4p9yS/Zvw== +X-CSE-MsgGUID: bcXwCarVSY2Y3E4fjnIy5g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="170730164" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa004.fm.intel.com with ESMTP; 08 Aug 2025 22:07:29 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 02/28] sched: Several fixes for cache aware scheduling +Date: Sat, 9 Aug 2025 13:01:15 +0800 +Message-Id: <84ceaca0a1de853284b4fc9888af806b03cde8bb.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +1. Fix compile error on percpu allocation. +2. Enqueue to the target CPU rather than the current CPU. +3. NULL LLC sched domain check(Libo Chen). +4. Introduce sched feature SCHED_CACHE to control cache aware scheduling +5. Fix unsigned occupancy initialization to -1. +6. If there is only 1 thread in the process, no need to enable cache + awareness +7. Add __maybe_unused to __migrate_degrades_locality() to + avoid compile warnings. +8. Do not enable gcov coverage for task_cache_work() and + fraction_mm_sched() to avoid softlockup by gcov. +9. Make CONFIG_SCHED_CACHE depending on CONFIG_SMP to + avoid compile error on non-SMP system like microblaze + architecture. +10. Do not enable account cache aware statistics in + account_mm_sched() for non-normal tasks, as it could + be invoked by RT tasks.(Shrikanth Hegde) +11. Place cpu_epoch related fields in a dedicated cache line + to avoid interfering with clock_idle* fields. + (Shrikanth Hegde) + +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + include/linux/mm_types.h | 4 ++-- + init/Kconfig | 4 ++++ + kernel/sched/fair.c | 41 +++++++++++++++++++++++++++------------- + kernel/sched/features.h | 1 + + kernel/sched/sched.h | 2 +- + 5 files changed, 36 insertions(+), 16 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index cf26ad8b41ab..41a598a44361 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1450,11 +1450,11 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas + #endif /* CONFIG_SCHED_MM_CID */ + + #ifdef CONFIG_SCHED_CACHE +-extern void mm_init_sched(struct mm_struct *mm, struct mm_sched *pcpu_sched); ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); + + static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + { +- struct mm_sched *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); + if (!pcpu_sched) + return -ENOMEM; + +diff --git a/init/Kconfig b/init/Kconfig +index 27f4012347f9..4bab39a5254c 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -950,6 +950,10 @@ config NUMA_BALANCING + config SCHED_CACHE + bool "Cache aware scheduler" + default y ++ depends on SMP ++ help ++ If set, the scheduler will try to aggregate tasks in the same process to ++ a single LLC if possible. 
+ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e3897cd7696d..e97ab46509e3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,7 +1175,7 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + +-void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; + int i; +@@ -1186,7 +1186,7 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched *_pcpu_sched) + + pcpu_sched->runtime = 0; + pcpu_sched->epoch = epoch = rq->cpu_epoch; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + + raw_spin_lock_init(&mm->mm_sched_lock); +@@ -1227,7 +1227,7 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + } + } + +-static unsigned long fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + { + guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); + +@@ -1248,13 +1248,18 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_sched *pcpu_sched; + unsigned long epoch; + ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; + /* + * init_task and kthreads don't be having no mm + */ + if (!mm || !mm->pcpu_sched) + return; + +- pcpu_sched = this_cpu_ptr(p->mm->pcpu_sched); ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); +@@ -1264,12 +1269,14 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + } + + /* +- * If this task hasn't hit task_cache_work() for a while, invalidate ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate + * it's preferred state. 
+ */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD) { ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD || ++ get_nr_threads(p) <= 1) { + mm->mm_sched_cpu = -1; +- pcpu_sched->occ = -1; ++ pcpu_sched->occ = 0; + } + } + +@@ -1278,6 +1285,9 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ + if (!mm || !mm->pcpu_sched) + return; + +@@ -1286,16 +1296,13 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + + guard(raw_spinlock)(&mm->mm_sched_lock); + +- if (mm->mm_sched_epoch == rq->cpu_epoch) +- return; +- + if (work->next == work) { + task_work_add(p, work, TWA_RESUME); + WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); + } + } + +-static void task_cache_work(struct callback_head *work) ++static void __no_profile task_cache_work(struct callback_head *work) + { + struct task_struct *p = current; + struct mm_struct *mm = p->mm; +@@ -1322,6 +1329,9 @@ static void task_cache_work(struct callback_head *work) + unsigned long occ, m_occ = 0, a_occ = 0; + int m_cpu = -1, nr = 0, i; + ++ if (!sd) ++ continue; ++ + for_each_cpu(i, sched_domain_span(sd)) { + occ = fraction_mm_sched(cpu_rq(i), + per_cpu_ptr(mm->pcpu_sched, i)); +@@ -8815,6 +8825,9 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + ++ if (!sched_feat(SCHED_CACHE)) ++ return prev_cpu; ++ + if (!mm || p->nr_cpus_allowed == 1) + return prev_cpu; + +@@ -9569,7 +9582,7 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + return 0; + + #ifdef CONFIG_SCHED_CACHE +- if (p->mm && p->mm->pcpu_sched) { ++ if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { + /* + * XXX things like Skylake have non-inclusive L3 and might not + * like this L3 centric view. What to do about L2 stickyness ? +@@ -9647,7 +9660,9 @@ static long migrate_degrades_locality(struct task_struct *p, struct lb_env *env) + } + + #else +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle) ++static __maybe_unused long __migrate_degrades_locality(struct task_struct *p, ++ int src_cpu, int dst_cpu, ++ bool idle) + { + return 0; + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 839463027ab0..f4ab45ecca86 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1174,7 +1174,7 @@ struct rq { + u64 clock_idle_copy; + #endif + #ifdef CONFIG_SCHED_CACHE +- raw_spinlock_t cpu_epoch_lock; ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; + u64 cpu_runtime; + unsigned long cpu_epoch; + unsigned long cpu_epoch_next; +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch new file mode 100644 index 0000000..b8354d1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-03-28-sched-Avoid-task-migration-within-its-preferred-LLC.patch @@ -0,0 +1,117 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.15]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4AE9D226CFC + for ; Sat, 9 Aug 2025 05:07:58 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.15 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716079; cv=none; b=mzMpUPIBhCIfqgfGYLirSyVew1DiGNGy8kHH9pByDFwjQLg/R08SklG4sqt+h9F0MjNW8uROdXW9EhU0eQGBZx9K4bKZLpb32NTZ568kuQTL5xijzNnbyKAfpI4nekWx9gHcKn2NZrcT76Sz4xJ2qgXu5qqYX/ksmmq4ZH9u+uk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716079; c=relaxed/simple; + bh=57gcT4d4kI048m64jkmqBDrfcowgupEqZQwQ5AEee0w=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=f3RQRxzvnj66spzaJhbe9MgqAYCa98AUSZuwljPsRXInxq/Oxk06wgAT2vRlhS0ehsQOQHM82nnzblQnvrJvdVfOkRoSiG94h3cOAWLd5yBkPjkPpHdCL+rW9rkGbaTLW4RhQdXSHhYol4ZYkaUUjpFkZLW21Gb6+B8vqJUbOW8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=c7C5KF6q; arc=none smtp.client-ip=198.175.65.15 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="c7C5KF6q" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716078; x=1786252078; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=57gcT4d4kI048m64jkmqBDrfcowgupEqZQwQ5AEee0w=; + b=c7C5KF6q/sRDGDVG+lM2wu4H0TzbLyHIoFXYrwK1PhMTWehBhmUrl96S + T3NkwyIpBdauVKsc3hrNWBirHezNT+Ts0OE7838wAraS+qmqOaNyn/zFO + uMSRssAaGwLukBsRJhTXc1N5I0Xy/egiTw1fhkKvS4U8SfrTfRWrmwRa3 + RPU4tMB524z4Z6MxtH6azdWGiN57MoFd2/dFpTSaE7cXAWavDizO4/WkF + yI3XD8KwS9r/rQo9E5DRI45b4Vgd1JhUvkVPHt9fZqza6Nai4EKnx1UNE + 0Vq3A218YIyzDKDxbIwkrtpMqkq0EWb+pBp1au3p6UMfK3D2O6VNQOM+h + A==; +X-CSE-ConnectionGUID: Fqgj+0eySbSqlUZZn4AFOw== +X-CSE-MsgGUID: Y/ZbdGdBSSKyPLJRd4lBYA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60682928" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60682928" +Received: from orviesa005.jf.intel.com ([10.64.159.145]) + by orvoesa107.jf.intel.com 
with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:07:58 -0700 +X-CSE-ConnectionGUID: vDQ06Yc6TseUjHJDDorjIA== +X-CSE-MsgGUID: sVVKpPMjSQmXlszsGV2bRg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="170841605" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa005.jf.intel.com with ESMTP; 08 Aug 2025 22:07:52 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 03/28] sched: Avoid task migration within its preferred LLC +Date: Sat, 9 Aug 2025 13:01:41 +0800 +Message-Id: <37376d2072f6175d2fb909a29b66a3da0bcfcce3.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +It was found that when running schbench, there is a +significant amount of in-LLC task migrations, even if +the wakee is woken up on its preferred LLC. This +leads to core-to-core latency and impairs performance. + +Inhibit task migration if the wakee is already in its +preferred LLC. + +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 2 ++ + 1 file changed, 2 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e97ab46509e3..00bd0d25bc91 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8835,6 +8835,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpu < 0) + return prev_cpu; + ++ if (cpus_share_cache(cpu, prev_cpu)) ++ return prev_cpu; + + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch new file mode 100644 index 0000000..392f3c3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-04-28-sched-Avoid-calculating-the-cpumask-if-the-system-is-overloaded.patch @@ -0,0 +1,131 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.15]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 029841DC994 + for ; Sat, 9 Aug 2025 05:08:14 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.15 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716096; cv=none; b=WeFRqzbJuIG6rs2oBAOGSTvUwR0GN2LxtVKltBBp1IAWJ1/M5927lhVOryPqkDV68MiNoDiPeaUuIGeJXy1yxTNPU+76g8h8o2kq0++bTNlmXdtCkgRKkjAyvo6JUXPfk9qDPu5fNyxlfwrmUYWgRrIiKd3DbVL5bDFDsKlvmIg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716096; c=relaxed/simple; + bh=3Uph+Pq82/wD/SKt6Wb33FmEmMZN7GyPmlnOYYynb0Q=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=o3E1Bq0Hs0dTChFZYVm4GVOaPeXcpLPhZZEQJ43VOlsuNVair0TGmjdyd1fLlVjeODS5guLeDmjO76w+loIt+jPuKVqUMOnTWw1sIHx/QjCLlCjeJzEknh9dFn7KMZ1m1CPRGI1DknlEPNf/b1KDvycj+UhJPpZyr/+EIpZ7wE0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=KCQ9b2gg; arc=none smtp.client-ip=198.175.65.15 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="KCQ9b2gg" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716095; x=1786252095; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=3Uph+Pq82/wD/SKt6Wb33FmEmMZN7GyPmlnOYYynb0Q=; + b=KCQ9b2ggnvTfhFeV/E50fpZUEEpmBsHHnpwH10t35fPh5GZQ4EVxbF9O + iwfbtGzsyddng/NZIteqbsCZ21Nl1B6x7QxI9972j42g46j13xjdwRoZ3 + 8A5ColX2OkCXP0fikLLx5ox8/8xMCGNiAOuHNT4EVTgK2VkSLTjB4x6k2 + OuuokNSBejb3QbstBidVgae5eMr6rPiKsjUpKeIv2M/QgpCk+dAN8C98Z + 9hQOg7BYjmAjMdUmUQXdfIf7u4hNaX6qUCPOtPPWVhaIxMAKXUR8DS4hA + yBD3fm5G5+abwatbqRE6FgrAva6LfJ6mMuAuKCPYQ9SPgzZrUlqukRs7B + A==; +X-CSE-ConnectionGUID: 2W0v7dR/SgiRTYDXa05aZw== +X-CSE-MsgGUID: hTr8aLcoRrKUtcFs+hv2jQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60682947" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60682947" +Received: from orviesa005.jf.intel.com ([10.64.159.145]) + by orvoesa107.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:08:15 -0700 +X-CSE-ConnectionGUID: /DUYqfzETGitdTxBYN+2cg== +X-CSE-MsgGUID: /4qu8b2ATfih1n9Fc4FHNQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="170841644" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa005.jf.intel.com with ESMTP; 08 Aug 2025 22:08:09 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 04/28] sched: Avoid calculating the cpumask if the system is overloaded +Date: Sat, 9 Aug 2025 13:02:04 +0800 +Message-Id: <88d1c3bc1e817cc72346f566153a4618604b9ecd.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: K Prateek Nayak + +If SIS_UTIL terminates the search for idle CPUs, the result of +cpumask_and() becomes irrelevant. Given that select_idle_cpu() +may now be invoked twice per wake-up within select_idle_sibling() +due to cache-aware wake-ups, this overhead can be observed in +benchmarks such as hackbench. 
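The reordering below only pays off because the SIS_UTIL scan budget collapses as the LLC fills up, at which point building the candidate mask is wasted work. A rough, self-contained illustration of that collapse follows; the scan_budget() formula and the 85% cutoff are invented for illustration, the kernel derives its limit differently.

/*
 * Illustrative sketch (not kernel code): the busier the LLC, the fewer
 * CPUs the idle-CPU search is willing to scan, until it aborts outright.
 */
#include <stdio.h>

static int scan_budget(int llc_weight, int util_pct)
{
        if (util_pct >= 85)             /* overloaded: abort the search */
                return 0;
        return llc_weight * (100 - util_pct) / 100;
}

int main(void)
{
        for (int util = 0; util <= 100; util += 25)
                printf("util=%3d%%  budget=%2d of 16 CPUs\n",
                       util, scan_budget(16, util));
        return 0;
}

When the budget is zero the search returns before touching any mask, which is exactly the case the patch moves cpumask_and() past.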
+ +To conserve additional cycles-particularly in scenarios where +the LLC is frequently targeted and the search aborts because +the LLC is busy - calculate the cpumask only when the system is +not overloaded. + +Signed-off-by: K Prateek Nayak +--- + kernel/sched/fair.c | 4 ++-- + 1 file changed, 2 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 00bd0d25bc91..a7be5c5ecba3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -7940,8 +7940,6 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + int i, cpu, idle_cpu = -1, nr = INT_MAX; + struct sched_domain_shared *sd_share; + +- cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); +- + if (sched_feat(SIS_UTIL)) { + sd_share = rcu_dereference(per_cpu(sd_llc_shared, target)); + if (sd_share) { +@@ -7953,6 +7951,8 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool + } + } + ++ cpumask_and(cpus, sched_domain_span(sd), p->cpus_ptr); ++ + if (static_branch_unlikely(&sched_cluster_active)) { + struct sched_group *sg = sd->groups; + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch new file mode 100644 index 0000000..5c19d12 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-05-28-sched-Add-hysteresis-to-switch-a-task-s-preferred-LLC.patch @@ -0,0 +1,165 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7E3651DE3BE + for ; Sat, 9 Aug 2025 05:08:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716112; cv=none; b=PThukVAMb7sFuKxVcgD3wTM4phVJ8rI9r7+ebpAt9DmMUXFw/IDfhBgJzVDETnMraJUzTpjxdg0CH+MqsJFdZpG0+0YGabzIwZ03oS5dGUCpiuwcqMBi79EXkvVi691ZUTMUjUdIFwJpzWnQscUTpYi1EC8GJgP4BnZ2xG+wmR8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716112; c=relaxed/simple; + bh=LDYuA5WJyjvULccdhS0DHdWu2p7tGoJ/eeAWhiaAlrg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=bj8SEXlQMrLRUboVVGnQOu9JBe5g36Sf1XMNZ4Fxig9ZZalYOFEmmdY0+oe5Ky5U018MFmBrwaWfJieFQcobideyIOiTbWBmhitES6gj23mv2S9buVmE5umygmQde5ClZGVf904vuEv77RefIPDld9g1BbkRQFkgRCwN1dgJYBw= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Yz3P0As2; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Yz3P0As2" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716109; x=1786252109; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=LDYuA5WJyjvULccdhS0DHdWu2p7tGoJ/eeAWhiaAlrg=; + 
b=Yz3P0As2P7sAAeLgT7nQe+nKKbEUL/+9rKdSGXrRCerJoyPyZVJiAIca + rTCc2mIWo0r7xKXHBwJCzl+lhPLNxfq7ThXFDqw/086ptM6gsSmcdBYFy + 0XG6Bpx4G8F6WIlomNDg2uKFh3+Gf6iv4ohkTrkI1AR9d2HRIWlbSXqPg + gjIc3qKxMHdoEmw84F/oRaqsVQVLHKyRLGcXSUZ869pJdp3tCl5EFYIHx + RipzC73I4/a7J8WSfr1XW9s1QojcqMVZE0c1LndRlkFmT99Paa711cvo0 + L3/AK8mOeiqf6B9FcyzKS+XWq8jtfhVABayP9NRmDhXta4Wem5Y5cg2O/ + Q==; +X-CSE-ConnectionGUID: 1ql6YZT/RpyNFYw0G8nvRg== +X-CSE-MsgGUID: Py+W8Q8/QNGIYDmbtHMFoQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57019866" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57019866" +Received: from fmviesa005.fm.intel.com ([10.60.135.145]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:08:29 -0700 +X-CSE-ConnectionGUID: IcQ9RT+CQgWiji82ikx4LA== +X-CSE-MsgGUID: ojuGhmoFTkKNKnNcd1Z5hA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169704810" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa005.fm.intel.com with ESMTP; 08 Aug 2025 22:08:23 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 05/28] sched: Add hysteresis to switch a task's preferred LLC +Date: Sat, 9 Aug 2025 13:02:18 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Switching a process's preferred LLC generates lots of task +migrations across LLCs. To avoid frequent switches +of home LLC, implement the following policy: + +1. Require a 2x occ change threshold to switch preferred LLC +2. 
Don't discard preferred LLC for a task + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++-------- + 1 file changed, 16 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a7be5c5ecba3..9e3c6f0eb934 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,14 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1307,6 +1315,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; ++ unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1; + cpumask_var_t cpus; + +@@ -1345,11 +1354,13 @@ static void __no_profile task_cache_work(struct callback_head *work) + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } + +- a_occ /= nr; ++ // a_occ /= nr; + if (a_occ > m_a_occ) { + m_a_occ = a_occ; + m_a_cpu = m_cpu; + } ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ last_m_a_occ = a_occ; + + trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", + per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); +@@ -1363,13 +1374,10 @@ static void __no_profile task_cache_work(struct callback_head *work) + } + } + +- /* +- * If the max average cache occupancy is 'small' we don't care. +- */ +- if (m_a_occ < (NICE_0_LOAD >> EPOCH_OLD)) +- m_a_cpu = -1; +- +- mm->mm_sched_cpu = m_a_cpu; ++ if (m_a_occ > (2 * last_m_a_occ)) { ++ /* avoid the bouncing of mm_sched_cpu */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } + + free_cpumask_var(cpus); + } +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch new file mode 100644 index 0000000..4054b16 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-06-28-sched-Save-the-per-LLC-utilization-for-better-cache-aware-scheduling.patch @@ -0,0 +1,200 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F24CE21D3E1 + for ; Sat, 9 Aug 2025 05:09:05 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716147; cv=none; b=LI4h+6OJWZaBS0TR7Q3NBzoXkmy9JmDvvsP0v6h3Wr+GpKZR5W1whx1t+MpULY/tpCopCTQwtk+d4eHYbBbXGG8tw911CUm66GpfCtas8ctsmrrtOtpyFMSEQ1wSEmj4dWMkrZhPa8ugb3u+CyP9djkHSe8sZ2gx2yAfRIPx9CU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716147; c=relaxed/simple; + bh=euRWyMdufoCin//rgMJ4T3fWMZpHw1jQB7L14khTmb4=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=FhwU+egXgnivL5wAIp8WWClNxp4yT7p2+qq0OVtPsZHuRHUqSefqGa8Mw69mD2l/SvPpXJI146UF1gIL0OohXuZVBnBx9uuzjtJB0fwJPdjrZGa51C5jL55hj27fGIPyUJpmNwnKZK7cVsQFji5MJ9gcLGaigOLthcWnAQfM4Ww= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass 
smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=XB8zwFIH; arc=none smtp.client-ip=198.175.65.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="XB8zwFIH" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716146; x=1786252146; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=euRWyMdufoCin//rgMJ4T3fWMZpHw1jQB7L14khTmb4=; + b=XB8zwFIHW5tROwSjY+TkhHve8zxklhKn+ovNLIF8SV5U0iIW3uisGZ+h + hY/ESXsB+pgUaisyny/yy5pPLNKo8MGtQy5YXsFvNJCRR/qBzsPTciwpd + DWKZJ1KhyMu77ycu6eYUAzqZuN/gRnxFRxIu0gB+CDzvdaP0yW6Alm3q2 + 6uSDm53TwoaOggR9d3iPh9Z+dpDEn90e2yYpi8OZHptMKcxOxMQuhOE8g + XyVt9GJRY6uXVn+Xhk0ObrEJv4d8fU3+v015Xl9/d69ko5uk8uOcrTvoC + 5KhPHr2patZHHRizOM78ma1nH5m9MLqfkUzr1tVCq2xOpXisttW9XZBQc + Q==; +X-CSE-ConnectionGUID: fZf0KiitSgGguAZwYKknTQ== +X-CSE-MsgGUID: qtUqGlf/QgGtoI55sDwkdQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137770" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137770" +Received: from fmviesa006.fm.intel.com ([10.60.135.146]) + by orvoesa110.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:09:05 -0700 +X-CSE-ConnectionGUID: 7a50TPFuTq2l0110KDRwEA== +X-CSE-MsgGUID: IsqZoesVQLiHgA5hLOCuhg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165374544" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa006.fm.intel.com with ESMTP; 08 Aug 2025 22:09:00 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 06/28] sched: Save the per LLC utilization for better cache aware scheduling +Date: Sat, 9 Aug 2025 13:02:54 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +When a system gets busy and a process's preferred LLC +is saturated by too many threads within this process, there are significant +in-LLC task migrations within its preferred LLC. This leads to migration +latency and degrades performance. Ideally, task aggregation should be +inhibited if the task's preferred LLC is overloaded. This implies that a +metric is needed to indicate whether the LLC is busy. + +Store the per-LLC utilization calculated via periodic load +balancing. These statistics will be used in subsequent patches to +determine whether tasks should be aggregated to their preferred LLC. 
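
A minimal stand-alone sketch of the store-once/read-cheaply idea described above, assuming a hypothetical llc_stats table, capacity figures and 50% cut-off invented purely for illustration (the patch itself keeps the value in sched_domain_shared->util_avg, as the hunks below show):

/* Illustrative user-space sketch: the load balancer writes the per-LLC
 * utilization once per pass, and the wake-up path reads the cached value
 * instead of recomputing it. All names and numbers here are invented. */
#include <stdio.h>

#define NR_LLCS 4
#define SCHED_CAPACITY_SCALE 1024

struct llc_stats {
	unsigned long util_avg;   /* written by the periodic load balancer */
	unsigned long capacity;   /* nr_cpus_in_llc * SCHED_CAPACITY_SCALE */
};

static struct llc_stats stats[NR_LLCS];

/* Load-balance side: record the group utilization for this LLC. */
static void update_llc_util(int llc, unsigned long group_util)
{
	stats[llc].util_avg = group_util;
}

/* Wake-up side: consume the cached value; "busy" here means more than
 * half of the LLC capacity is in use. */
static int llc_is_busy(int llc)
{
	return stats[llc].util_avg * 2 > stats[llc].capacity;
}

int main(void)
{
	stats[0].capacity = 8 * SCHED_CAPACITY_SCALE;   /* 8-CPU LLC */
	update_llc_util(0, 5 * SCHED_CAPACITY_SCALE);   /* 5 CPUs' worth of load */
	printf("LLC0 busy: %d\n", llc_is_busy(0));      /* prints 1 */
	return 0;
}
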
+ +Signed-off-by: Chen Yu +--- + include/linux/sched/topology.h | 3 ++ + kernel/sched/fair.c | 53 ++++++++++++++++++++++++++++++++++ + 2 files changed, 56 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 198bb5cc1774..692f8a703b93 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -78,6 +78,9 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9e3c6f0eb934..4f79b7652642 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8828,6 +8828,22 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + #ifdef CONFIG_SCHED_CACHE + static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); + ++/* expected to be protected by rcu_read_lock() */ ++static bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = per_cpu(sd_llc_size, cpu) * SCHED_CAPACITY_SCALE; ++ ++ return true; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +@@ -10670,6 +10686,42 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Save this sched group's statistic for later use: ++ * The task wakeup and load balance can make better ++ * decision based on these statistics. ++ */ ++static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* Find the sched domain that spans this group. */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care the sched domain that spans 1 LLC */ ++ if (!sd || !(sd->flags & SD_SHARE_LLC) || ++ !sd->parent || (sd->parent->flags & SD_SHARE_LLC)) ++ return; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (likely(READ_ONCE(sd_share->util_avg) != sgs->group_util)) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++} ++#else ++static inline void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10759,6 +10811,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ update_sg_if_llc(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch new file mode 100644 index 0000000..e02811a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-07-28-sched-Add-helper-function-to-decide-whether-to-allow-cache-aware-scheduling.patch @@ -0,0 +1,293 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A097E125D6 + for ; Sat, 9 Aug 2025 05:09:19 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716161; cv=none; b=Rm7TcpHSNUIIoFYEPwrZQSk2+mFBFaIi8Biv/YBu8NhjoOpqLYiSDc8n/N7a+PcWKj1D5lN8yxsfXFZpqwNZo9V27otdxT/bMNi/j+pCcQsy85gMx6mqoYUfLUdWB0a4zERaoznppBe6okhDs8L/kX4GGnSX8g32CRW4pXc/jS4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716161; c=relaxed/simple; + bh=V4wQFWsJMut3Mv4WM/pNBpXZswfWqWCXgqNTuOQgQwc=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=tR/Eg9pwE3kfKHQ1XrSvfns7tRo1/54dBTa3Mcw/Vf9PUP5J9yjcpYZjzIWQ96CkLtFqrg19Zl0Cj25CGXm9QEsmQiICsQrbY8sep3kg5LmP1PeugdTFvBMNkphNKcutc0NLmiPwdKekqx5gwUaOM5x4KAy1UaLJ56yS2wqthlE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=cZoFv+co; arc=none smtp.client-ip=198.175.65.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="cZoFv+co" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716160; x=1786252160; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=V4wQFWsJMut3Mv4WM/pNBpXZswfWqWCXgqNTuOQgQwc=; + b=cZoFv+cooYycPX6fhTDFz3M70RpTXLjWW54A4RKCHWeGADL7LpNgffxW + gBfRlECeZkPmt7ZbNEgFqOrc7h37RbVfI0hpXkfJXAbSVhFqX1dyT3XdI + KBuYaf5c3EOsWZOREhMvQUtHsoLmWta+xL56O2v1gsR3leEwTYp2Wagee + zuCK7oxtqjbXilAu6g6eLj5fAL1la9xryvQW3Hx9lwncNu2ChThNoNOIL + 8rqMMvopoFaWOd9vVKtpzIX0eyrh2S0jAjm/gycY3Z9ipQFIzNCz17K0J + RJ7dwofq53rsYOMlEnDTxOs2VjSm+OkCdnAYfR59wl5PkmnVd8HuSY4K7 + g==; +X-CSE-ConnectionGUID: U6TaVEBsQ1urnPH9VT6FpQ== +X-CSE-MsgGUID: xLUFPU/zRaayB1BNHTCWGw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137798" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137798" +Received: from fmviesa006.fm.intel.com ([10.60.135.146]) + by orvoesa110.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:09:19 -0700 +X-CSE-ConnectionGUID: MAk8xVHNTeiRA2GQcBH6/g== +X-CSE-MsgGUID: rE8RpT+sQLqRJLnxvLOvng== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165374559" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa006.fm.intel.com with ESMTP; 08 Aug 2025 22:09:13 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 07/28] sched: Add helper function to decide whether to allow cache aware scheduling +Date: Sat, 9 Aug 2025 13:03:10 +0800 +Message-Id: <701c7be7f0e69582d9ad0c25025ec2e133e73fbb.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Cache-aware scheduling is designed to aggregate threads into their +preferred LLC, either via the task wake up path or the load balancing +path. One side effect is that when the preferred LLC is saturated, +more threads will continue to be stacked on it, degrading the workload's +latency. A strategy is needed to prevent this aggregation from going too +far such that the preferred LLC is too overloaded. + +Introduce helper function _get_migrate_hint() to implement the +LLC migration policy: + +1) A task is aggregated to its preferred LLC if both source/dest LLC + are not too busy (<50% utilization, tunable), or the preferred + LLC will not be too out of balanced from the non preferred LLC + (>20% utilization, tunable, close to imbalance_pct of the LLC + domain). +2) Allow a task to be moved from the preferred LLC to the + non-preferred one if the non-preferred LLC will not be too out + of balanced from the preferred prompting an aggregation task + migration later. We are still experimenting with the aggregation + and migration policy. Some other possibilities are policy based + on LLC's load or average number of tasks running. Those could + be tried out by tweaking _get_migrate_hint(). + +The function _get_migrate_hint() returns migration suggestions for +the upper-level functions. + +Aggregation will tend to make utilization on the preferred LLC to +be more than the non-preferred one. Parameter "sysctl_llc_aggr_imb" +is the imbalance allowed. If it is set to 0, as long as the preferred +LLC is not utilized more than the source LLC, we could still aggregate +towards the preferred LLC and a preference could still be there. 
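
A worked, stand-alone example of the two thresholds described above (sysctl_llc_aggr_cap = 50, sysctl_llc_aggr_imb = 20). The capacity and utilization numbers are invented; the kernel takes them from the per-LLC statistics rather than constants:

/* Sketch of the two checks used by _get_migrate_hint(): the 50%
 * "aggregation cap" and the 20% allowed imbalance. */
#include <stdio.h>

static unsigned int llc_aggr_cap = 50;   /* % of LLC capacity */
static unsigned int llc_aggr_imb = 20;   /* % allowed imbalance */

/* Mirrors fits_llc_capacity(): utilization stays below cap% of capacity. */
static int fits_llc_capacity(unsigned long util, unsigned long cap)
{
	return util * 100 < cap * llc_aggr_cap;
}

/* Mirrors util_greater(): util1 exceeds util2 by more than imb%. */
static int util_greater(unsigned long util1, unsigned long util2)
{
	return util1 * 100 > util2 * (100 + llc_aggr_imb);
}

int main(void)
{
	unsigned long cap = 8192;             /* 8 CPUs * 1024 */
	unsigned long dst = 3500, src = 2500; /* invented utilizations */

	/* Migration towards the preferred LLC is refused only when the
	 * destination no longer fits under 50% of its capacity AND it is
	 * already more than 20% busier than the source. */
	printf("dst fits: %d, dst much busier than src: %d\n",
	       fits_llc_capacity(dst, cap), util_greater(dst, src));
	return 0;
}
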
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 110 ++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 5 ++ + 3 files changed, 118 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 557246880a7e..682fd91a42a0 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -532,6 +532,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); ++ debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4f79b7652642..3128dbcf0a36 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -8826,7 +8826,39 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + } + + #ifdef CONFIG_SCHED_CACHE +-static long __migrate_degrades_locality(struct task_struct *p, int src_cpu, int dst_cpu, bool idle); ++static long __migrate_degrades_locality(struct task_struct *p, ++ int src_cpu, int dst_cpu, ++ bool idle); ++__read_mostly unsigned int sysctl_llc_aggr_cap = 50; ++__read_mostly unsigned int sysctl_llc_aggr_imb = 20; ++ ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter sysctl_llc_aggr_cap determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * sysctl_llc_aggr_cap) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + sysctl_llc_aggr_imb)) ++ ++enum llc_mig_hint { ++ mig_allow = 0, ++ mig_ignore, ++ mig_forbid ++}; ++ + + /* expected to be protected by rcu_read_lock() */ + static bool get_llc_stats(int cpu, unsigned long *util, +@@ -8844,6 +8876,82 @@ static bool get_llc_stats(int cpu, unsigned long *util, + return true; + } + ++static enum llc_mig_hint _get_migrate_hint(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_ignore; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_ignore; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * sysctl_llc_aggr_imb is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. 
++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_allow; ++} ++ ++/* ++ * Give suggestion when task p is migrated from src_cpu to dst_cpu. ++ */ ++static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ int cpu; ++ ++ if (cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_allow; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_allow; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0) ++ return mig_allow; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), true); ++ else if (cpus_share_cache(src_cpu, cpu)) ++ return _get_migrate_hint(src_cpu, dst_cpu, ++ task_util(p), false); ++ else ++ return mig_allow; ++} ++ + static int select_cache_cpu(struct task_struct *p, int prev_cpu) + { + struct mm_struct *mm = p->mm; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index f4ab45ecca86..83552aab74fb 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2844,6 +2844,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int sysctl_llc_aggr_cap; ++extern unsigned int sysctl_llc_aggr_imb; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch new file mode 100644 index 0000000..0e4cfdb --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-08-28-sched-Set-up-LLC-indexing.patch @@ -0,0 +1,232 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7A24F2749D5 + for ; Sat, 9 Aug 2025 05:09:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716177; cv=none; b=NNJRjopqrRbhFk19x1BndJWZ90HqNxRnx0H7JE+07eSr/bdUJMU/c0NJ3LB2cV94Rsi1R1AdGM1d2xlML1jh2RnTHB1Dzdvr0qBwgdFuA8zjncQEUZO6kHF1Y2GSQmE70Toj/gzstTrxtr3JAqld0iuOXw9GF3i3gZmGNxoXo9Q= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716177; c=relaxed/simple; + bh=Q9WyVWFAxaya8q+BQJRZR5gSlB4xoSQBTrL3eFp0H3k=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=OJCxUgjq3LhGvzfQnlmgDk0eZNcoMO3+ooDXAn+dS2XPaoAD00XBVRhoIDDmzlGUakQWDO9E3wgaByP2px0tcqYpCXMaz1PTKVbRf0IMfCL7wmj/Pl6WrMp5Uk7woFJsPxrVPRghCGLe/mewLfBt3ueL13B0csiHsH9qkS/YD8s= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=OW5YzRQl; arc=none smtp.client-ip=198.175.65.18 
+Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="OW5YzRQl" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716176; x=1786252176; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Q9WyVWFAxaya8q+BQJRZR5gSlB4xoSQBTrL3eFp0H3k=; + b=OW5YzRQlQh/nBHXwqrjN0l8Y7LfITnFjeexFpLxenGyiJvbsUX2LdH+f + NmdygIjqrdclruZgedLbBxuvOrc8rS64ODYq+fjwfwXQVB0yteRfnTP/u + KUJ6NLJp7E5qLizUurKDYQ/CQ8WhKvO+A1CCWJcny7Ywyk5pWHn0+ihL2 + fyfKV1cKZRnLjLxVHkt7AZCj9E7OPIlGwDuDChPwUD61pbaKxh7wR9gpr + q7g35VzJcDGPAJtv/VzN73wW2yx/6zcGH0VLxrR+XHBCqIvDlMHA2v4f0 + DYgKaiB40pfFiu4dwUi7Ps3HxC6vxt6/7c8fQVryZXz/WrZQ5I8EfdNEJ + A==; +X-CSE-ConnectionGUID: Y02Bl/8pShaUv1kRhHj39A== +X-CSE-MsgGUID: 3/idV79SS1m5g1kaTEBzDw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57137811" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57137811" +Received: from fmviesa006.fm.intel.com ([10.60.135.146]) + by orvoesa110.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:09:34 -0700 +X-CSE-ConnectionGUID: XuuWOYxBTOOma0SA+bSkWg== +X-CSE-MsgGUID: Ip2uRtmWRum9W2ItftRIPg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165374575" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa006.fm.intel.com with ESMTP; 08 Aug 2025 22:09:28 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 08/28] sched: Set up LLC indexing +Date: Sat, 9 Aug 2025 13:03:24 +0800 +Message-Id: <959d897daadc28b8115c97df04eec2af0fd79c5d.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Prepare for indexing arrays that track in each run queue: the number +of tasks preferring current LLC and each of the other LLC. + +The reason to introduce LLC index is because the per LLC-scope data +is needed to do cache aware load balancing. However, the native lld_id +is usually the first CPU of that LLC domain, which is not continuous, +which might waste the space if the per LLC-scope data is stored +in an array (in current implementation). + +In the future, this LLC index could be removed after +the native llc_id is used as the key to search into xarray based +array. 
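
A minimal sketch of the sparse-id-to-dense-index mapping described above, assuming a toy topology (4 CPUs per LLC, so llc_id takes the values 0, 4, 8, 12) invented for illustration; it follows the same assignment logic as update_llc_idx() in the hunks below:

#include <stdio.h>

#define NR_CPUS 16

static int sd_llc_id[NR_CPUS];   /* first CPU of the CPU's LLC (sparse) */
static int sd_llc_idx[NR_CPUS];  /* dense 0..N-1 index assigned below */
static int max_llcs;

static void update_llc_idx(int cpu)
{
	int llc_id = sd_llc_id[cpu];
	int idx = sd_llc_idx[llc_id];

	/* First CPU seen for this LLC allocates the next dense index. */
	if (idx < 0)
		sd_llc_idx[llc_id] = idx = max_llcs++;
	sd_llc_idx[cpu] = idx;
}

int main(void)
{
	for (int cpu = 0; cpu < NR_CPUS; cpu++) {
		sd_llc_id[cpu] = cpu & ~3;   /* 4 CPUs per LLC */
		sd_llc_idx[cpu] = -1;
	}
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		update_llc_idx(cpu);
	for (int cpu = 0; cpu < NR_CPUS; cpu++)
		printf("cpu %2d: llc_id %2d -> llc_idx %d\n",
		       cpu, sd_llc_id[cpu], sd_llc_idx[cpu]);
	return 0;
}
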
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 3 +++ + kernel/sched/fair.c | 12 ++++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 29 +++++++++++++++++++++++++++++ + 4 files changed, 46 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 02ff8b8be25b..81d92e8097f5 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -809,6 +809,9 @@ struct kmap_ctrl { + #endif + }; + ++/* XXX need fix to not use magic number */ ++#define MAX_LLC 64 ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3128dbcf0a36..f5075d287c51 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1183,6 +1183,18 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continuous index. ++ * TBD: replace by xarray with key llc_id() ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 83552aab74fb..c37c74dfce25 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2056,6 +2056,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2064,6 +2065,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index b958fe48e020..91a2b7f65fee 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -657,6 +657,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -666,6 +667,25 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ if (idx < 0) { ++ idx = max_llcs++; ++ BUG_ON(idx > MAX_LLC); ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -684,6 +704,7 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_size, cpu) = size; + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); ++ update_llc_idx(cpu); + + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) +@@ -2456,6 +2477,14 @@ build_sched_domains(const struct cpumask 
*cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch new file mode 100644 index 0000000..c115c45 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-09-28-sched-Introduce-task-preferred-LLC-field.patch @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.13]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 72B6D15665C + for ; Sat, 9 Aug 2025 05:10:00 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.13 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716202; cv=none; b=b2L1GhXR1bKfaGNtSt2TSFJgivwyGWl/zG0ke7CHSqEAcHuFzHVF+Vph/AG5ZfJphDMNbIxy4SUFIumjAOZK2TB0Z4jGWHlzOvGKs9kRxGy3WkdJTVEO3FLULtEJnBKj5AORTkYZlfIB4LE4Izx1MQm/ZkRn8Sz9XQb/WKOg49I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716202; c=relaxed/simple; + bh=yDyElrdoJP1owudVvXOmuFFmGrAdqlZ/3LSJv9270PI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=j3HhsKTtfxpLA3rJndUgWlhiEvEp59nMppnJBuBt3n57eWQsLpmRNumx60yqEWTiU+2a0Akk/6QCT1AAxHoly+zNGqSyQJ4Og7AOKWr039BhdBdB9rB0XwOlLRo6MBk6oRA3xZMgTm/i1/Glk/eLvpJOrXxvUAqHVunt8/0Gy+Q= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=jA/q+1HC; arc=none smtp.client-ip=198.175.65.13 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="jA/q+1HC" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716201; x=1786252201; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=yDyElrdoJP1owudVvXOmuFFmGrAdqlZ/3LSJv9270PI=; + b=jA/q+1HCZoWL9/if9zUiP2RagKNIlzVG2/9XmXTq+Ai1+idEA3GniBpc + 0U+c7IPLnvxvaiyKdJCCUu5unbGE6uc4OZCK3b3LFPBZpAbM8stdCZMnd + Wj/PbIIK7iHErHRNJoSuTG2Hz3Kd1S2DZTWM7lcoF8Rml/dJplEh3gVCt + vpngwG0Zm9NV0fxTmPcRsqshl1tnvy4tttj+WdiTSfQEPhYj49I+gD0bh + 3UQewsPVTarSIp+hr1KFG3cogmN+Rd4lGhrxiPXp8zr5spR5put/n5Xyn + 1MEtslmziwaMvG+ZfcPPao0HwFNVTDTVL4ngyCM61uwjqrVJWtD3T8BQ/ + Q==; +X-CSE-ConnectionGUID: W/xDy5btSjOwlHAt10VALQ== +X-CSE-MsgGUID: hD1wAyqHT7WAJ49GLugw0A== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="68139885" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="68139885" +Received: from fmviesa008.fm.intel.com ([10.60.135.148]) + by orvoesa105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:00 -0700 +X-CSE-ConnectionGUID: c2jIhCe2THSYJtrA2gygJg== +X-CSE-MsgGUID: PCF36aw+QPy8wB+b//2Ymw== +X-ExtLoop1: 1 
+X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165891237" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa008.fm.intel.com with ESMTP; 08 Aug 2025 22:09:54 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 09/28] sched: Introduce task preferred LLC field +Date: Sat, 9 Aug 2025 13:03:50 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +With cache aware scheduling enabled, each process is assigned +a preferred LLC id, which will be used to quickly identify +the LLC domain this thread prefers to run. This is similar to +numa_preferred_nid for NUMA balance. + +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 81d92e8097f5..ac4973728c3e 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1408,6 +1408,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f5075d287c51..94ad84ba19e1 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1267,6 +1267,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_feat(SCHED_CACHE)) + return; +@@ -1298,6 +1299,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch new file mode 100644 index 0000000..9f593d2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-10-28-sched-Calculate-the-number-of-tasks-that-have-LLC-preference-on-a-runqueue.patch @@ -0,0 +1,255 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from 
mgamail.intel.com (mgamail.intel.com [198.175.65.13]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 89D17157E6B + for ; Sat, 9 Aug 2025 05:10:13 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.13 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716215; cv=none; b=YUFlGTDaJF85FWib/Q7a+Wsp38xQlzoRV7AlyUsBs2T1Nm/D3GTDbCOLCu/JYT4Bz1kY7FBLiNzXW6SeRLGEdj+kiCkLEbPCq1Dkw53ko18P2N2wbe+qOYsR7L33XPzRdv8x7pin6JN4QQ3K2vGxPtYxzPr3f13C84cowPsbc0I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716215; c=relaxed/simple; + bh=z06d0DcakiSO/sbAcrP1nzo0jya94AyoNYzp6vdKu+E=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Ie5cAVqHtvb8rhtiw/uv2b7q9XOHp9GEaOPJqeh1V5G10tNLgO252LhnRQe82GxwRDsA9JqO8Z/Pn7dGb6/9pzL+eDl4+d0jt7D8uohXyByy5gEgRhnGJ+jOPjC5jrJlsyqSb7bw9iDQtCUKZPX6LrQnQOAMGwjypK1vT60zh4Q= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=A43oUD+X; arc=none smtp.client-ip=198.175.65.13 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="A43oUD+X" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716214; x=1786252214; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=z06d0DcakiSO/sbAcrP1nzo0jya94AyoNYzp6vdKu+E=; + b=A43oUD+XrpEd//+VfIwxy0/clww1jBOSaqukWPErBvX/YdTyCblU2SZc + 2RzVMdCc8oIYfY7mAnN+bu4TlDneL5nuPMn8idWsUe/ibRoK5MwcfHrk1 + 4wf705GDHnZFwZzx7MaW2tVbkko9eMPuBBX9wEZV9YRSNvgsYWWcfkB9f + X09DYxiaF6aoyy46GVmca0RePk7ZqdJVl5uzZAHcWSo20QuUXb6HVtKNk + d843I5ITdrSq6lu1g0W1GAYdjZ+obzYIC4503sdpdA31Ura1IlPBWiexd + +xto3bCIJV99nwMfneStXQQK9UCe9VrSRX+40SkrUq9jsfJUv0KlPZ3A8 + Q==; +X-CSE-ConnectionGUID: plGWHn/OT2iWZBXIShk1/w== +X-CSE-MsgGUID: HnPTC8sbTiuVF1BIb62/Dg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="68139903" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="68139903" +Received: from fmviesa008.fm.intel.com ([10.60.135.148]) + by orvoesa105.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:13 -0700 +X-CSE-ConnectionGUID: UNc6FU+9Rn6IWN1WcGQpPA== +X-CSE-MsgGUID: H1SRXM8VR6yqBYvNRQaDcA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165891283" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa008.fm.intel.com with ESMTP; 08 Aug 2025 22:10:07 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 10/28] sched: Calculate the number of tasks that have LLC preference on a runqueue +Date: Sat, 9 Aug 2025 13:04:04 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Track for each run queue, the number of tasks that have a LLC preference +and how many of those tasks are running in its preferred LLC. This is +similar to nr_numa_running and nr_preferred_running for NUMA balance, +and will be used by the cache-aware load balancing in subsequent patches. + +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 +++++++++++ + kernel/sched/fair.c | 51 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 ++++++ + 3 files changed, 69 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index a5fb3057b1c4..a97a8039ce91 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -499,6 +499,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 94ad84ba19e1..f964d5a44fcc 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,6 +1195,24 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1303,8 +1321,11 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ if (p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1408,6 +1429,17 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (rq->nr_llc_running) ++ rq->nr_llc_running = 0; ++ ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1418,6 +1450,17 @@ void init_sched_mm(struct task_struct *p) { } + + 
static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++} ++ ++void reset_llc_stats(struct rq *rq) ++{ ++} + #endif + + static inline +@@ -3957,6 +4000,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + #endif +@@ -3970,10 +4014,15 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) + #ifdef CONFIG_SMP + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + #endif + cfs_rq->nr_queued--; ++ ++ /* safeguard? */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index c37c74dfce25..8026e2c66e9f 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1106,6 +1106,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP + unsigned long last_blocked_load_update_tick; +@@ -1967,6 +1971,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++void reset_llc_stats(struct rq *rq); ++int task_llc(const struct task_struct *p); ++ + #ifdef CONFIG_SMP + + static inline void +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch new file mode 100644 index 0000000..9e101dc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-11-28-sched-Introduce-per-runqueue-task-LLC-preference-counter.patch @@ -0,0 +1,217 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id ACB411DE3BE + for ; Sat, 9 Aug 2025 05:10:33 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716235; cv=none; b=lYq4sXXnPZXpun4uCiODi1V3d/oXpyY0WTO8EB77SDdlY6hzvunzTarb/1BsD6HfY68VzKdZ/P3gkg6eJSWY/V4zZO3lamlWGMLfuQQ9r0QtHrs78GolCQXrw2NP8BYJ0Ju1m5iSnICs6lWRkPa7xXiHxKjTT1AzsfOnMwn7rTs= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716235; c=relaxed/simple; + bh=mMOSEbE/HppEmtkHE5TulvZHAhAbaSvHHXGxGBzFcgU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=j3HRMLmVG1MFAS1qP6GNt/s6/77Uh+WddEprQ5Z3l8TxNvB+P4AvujK8AWnshyXFcGmTbARtQ+BIJgmdYZlbH9m8Qs+2XeS6vuXVoCzJlMxbKBnl0JQB9Z0xEIolcdu+YlhDSc69qnES+cWMESyBPPowQbtqjplsmzkh92xzVx4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=geKQjfK/; arc=none 
smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="geKQjfK/" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716234; x=1786252234; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=mMOSEbE/HppEmtkHE5TulvZHAhAbaSvHHXGxGBzFcgU=; + b=geKQjfK/ENQzxJIVdaSMSI44R3jzrRtlLAcLLVwYHZqNhRHgxmIbm8PU + XiWO/MC/iDc0e7d2LUmRe0CKRrydnkWRCT7V1d5Ru83rqB+7K/+RrZNhk + +2sQx23IAdQbusICOeU5sYoOB7pa5uDZu1oWLgGvhnJwEFa2V+2w+qxj2 + m6YwMTmZ4b38bn4agOoOn4ktTclSJFaj3Mp772dwNENS9tmK3L6FTfdSK + gtxcuwqrQMw1U7n0bVzf8SFtoDVy9euo9ZcsqmS67rcCdjvv8Sewo0TTC + 69ZCq755Kkj3SVTPSNh4ROx9trC8pZSNM1tAluNyLdBtgKLb9L4vwsHnZ + g==; +X-CSE-ConnectionGUID: gBIugcE8QvyHdtE7rR2INg== +X-CSE-MsgGUID: omeDEy9ySEOEIT5GfzHbDg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57019934" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57019934" +Received: from fmviesa005.fm.intel.com ([10.60.135.145]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:33 -0700 +X-CSE-ConnectionGUID: 3IoQ+9+0SaCZvxJgCfv5ug== +X-CSE-MsgGUID: 3gf5XiXlTDy8oJdmBqQc4w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169705062" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa005.fm.intel.com with ESMTP; 08 Aug 2025 22:10:27 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 11/28] sched: Introduce per runqueue task LLC preference counter +Date: Sat, 9 Aug 2025 13:04:18 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Each runqueue is assigned a static array, where each element indicates +the number of tasks preferring a particular LLC mapped to the +array index. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to MAX_LLC +across the entire system). With this information, the load balancer can +make better decisions to select the busiest runqueue and migrate tasks +to their preferred LLC domains. + +Note: The static array could be converted to an xarray in the future. 
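
A toy, self-contained sketch of the per-runqueue counters described above (after enqueuing two tasks that prefer LLC3, nr_pref_llc[3] reads 2); the toy_rq/toy_task types are invented for illustration and stand in for the real rq and task_struct accounting:

#include <stdio.h>

#define MAX_LLC 64

struct toy_rq   { unsigned int nr_pref_llc[MAX_LLC]; };
struct toy_task { int pref_llc_idx; };   /* -1 means no LLC preference */

static void account_enqueue(struct toy_rq *rq, struct toy_task *p)
{
	if (p->pref_llc_idx >= 0)
		rq->nr_pref_llc[p->pref_llc_idx]++;
}

static void account_dequeue(struct toy_rq *rq, struct toy_task *p)
{
	/* Guard against going negative, as the patch does. */
	if (p->pref_llc_idx >= 0 && rq->nr_pref_llc[p->pref_llc_idx] > 0)
		rq->nr_pref_llc[p->pref_llc_idx]--;
}

int main(void)
{
	struct toy_rq rq = { { 0 } };
	struct toy_task a = { 3 }, b = { 3 }, c = { -1 };

	account_enqueue(&rq, &a);
	account_enqueue(&rq, &b);
	account_enqueue(&rq, &c);               /* no preference: not counted */
	printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 2 */
	account_dequeue(&rq, &a);
	printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 1 */
	return 0;
}
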
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 42 ++++++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 1 + + 2 files changed, 41 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f964d5a44fcc..cfae71ee870b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1195,22 +1195,51 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_feat(SCHED_CACHE)) + return; + + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_feat(SCHED_CACHE)) + return; + + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1279,6 +1308,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1321,7 +1352,9 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) { ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { + account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; + account_llc_enqueue(rq, p); +@@ -1431,11 +1464,16 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { ++ int i; ++ + if (!sched_feat(SCHED_CACHE)) + return; + +- if (rq->nr_llc_running) ++ if (rq->nr_llc_running) { ++ for (i = 0; i < MAX_LLC; ++i) ++ rq->nr_pref_llc[i] = 0; + rq->nr_llc_running = 0; ++ } + + rq->nr_pref_llc_running = 0; + } +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8026e2c66e9f..4464b92767ad 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1109,6 +1109,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[MAX_LLC]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + #ifdef CONFIG_SMP +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch new file mode 100644 index 0000000..3436e02 --- /dev/null +++ 
b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-12-28-sched-Calculate-the-total-number-of-preferred-LLC-tasks-during-load-balance.patch @@ -0,0 +1,147 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id CE60C2472BA + for ; Sat, 9 Aug 2025 05:10:49 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716251; cv=none; b=ADP9uISEjTXXmrZUS4riveuGfD0WO35c5rz53HhmhWNMrD8Dv6NEgTvyjQYnfTc2u3LwbGoaDVNp6SSxFFbbtfw/FH7XlH762INSbgUPWTafzzs+ATW6FN2x9nPTJmp96ZH+mnf1JNlGrm30zuWW2dvWodZS8ErATrdpAPxPp5I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716251; c=relaxed/simple; + bh=aqSPvY6s7QQr//GlOU5D+JTqKry164SNV/VZUR7Kspk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=t/5xtGUzGrKxeHEjmp10AAMihG56pqeTBrBMPkuq4PHQwrU2AfkoQsDB4Y9TtiCy7GzKfao5LjE7hgoUByuq8rYZKjatFGSKo04f1EvdsfMKnnI+kvm1KPBegBNQVntmqPVlcdaoa8X2w/mUXe+QOHkdUYdt1Sj1Q17LdJWTtvM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=hnHR/vdK; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="hnHR/vdK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716250; x=1786252250; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=aqSPvY6s7QQr//GlOU5D+JTqKry164SNV/VZUR7Kspk=; + b=hnHR/vdKO7tdLOhgecBrJ/L6XtQ1BO8dVbYIB82zj+iOPcT1jV8xsoaQ + qVvNm13ubKRb17a6vcXBW7sO0sqESiBXEDAA3LOy6nrman47fi+cJF4GK + /APwE4CIXL2nHpsyP/5wUxfP2JBnuaAMw69BpWv3yxh4gHtAvKYTTGlqu + yYAwTm4DfQnzeKlTLUohbS6ngMAvDbvMCBqHRIliyvHmG2k+p5eyDajr2 + YczcwUmYJan/0n1K6JWf7awWcuI7A08+OPYkjN3MABdDHrz0f/nYtETcb + muTOyFAlK1kHzNTlu61gC8CfjRqlA8omMF+AZVXL7YPft7aD6yFC9a32S + Q==; +X-CSE-ConnectionGUID: 5tGpMziNRzOAuvg9CBzp+w== +X-CSE-MsgGUID: RdOEUAbzTDSASRaq4EwAIw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="57019960" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="57019960" +Received: from fmviesa005.fm.intel.com ([10.60.135.145]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:10:49 -0700 +X-CSE-ConnectionGUID: DegXMsyUTHGOiD9Y2cNnrw== +X-CSE-MsgGUID: A+KDxoubRjeA2xjTkEMmBQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169705082" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa005.fm.intel.com with ESMTP; 08 Aug 2025 22:10:44 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 12/28] sched: Calculate the total number of preferred LLC tasks during load balance +Date: Sat, 9 Aug 2025 13:04:39 +0800 +Message-Id: <4145385d4ce232e10cae713c8449d459c325db46.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +During load balancing between LLCs, gather the number of tasks +on each runqueue of a source LLC. + +For example, consider a system with 4 sched groups LLC0, LLC1, +..., LLC3. We are balancing towards LLC3 and LLC0 has 3 tasks +preferring LLC3, LLC1 has 2 tasks preferring LLC3 and LLC2 has +1 task preferring LLC3. LLC0 with most tasks preferring LLC3 +will be chosen as the busiest LLC to pick the tasks from. + +The number of tasks preferring the destination LLC are gathered +from each run queue for a source LLC. + +For example, consider the sched_group LLC0 with two CPUs, CPU0 +and CPU1. On CPU0, 2 tasks prefer to run on LLC3, and on CPU1, +one task prefers LLC3. The total number of tasks preferring +LLC3 in LLC0 is 2 + 1 = 3. + +These statistics enable the load balancer to select tasks from +a sched_group that best aligns tasks with their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cfae71ee870b..f1697658c3b8 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10496,6 +10496,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[MAX_LLC]; ++#endif + }; + + /* +@@ -10974,6 +10977,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE)) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch new file mode 100644 index 0000000..8b9b9c1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-13-28-sched-Tag-the-sched-group-as-llc_balance-if-it-has-tasks-prefer-other-LLC.patch @@ -0,0 +1,177 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4717515665C + for ; Sat, 9 Aug 2025 05:11:54 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; 
d=subspace.kernel.org; s=arc-20240116; + t=1754716315; cv=none; b=V21sVGbgLQDQ3LIjl4XUJrmJ9J0H7EnLAxMhWiHDhYvdTQf37ITQq1SLcLqEU8QZBleGBd5opQCrWn9ZA+ka3UsL0gAkAAWeYPjzH6uXv6zStuJq71dJWgYewm9hUHjq7qSX6lm/Lgw0QNQVBR235FBzMEr5TtJzqy9vbdAZgM8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716315; c=relaxed/simple; + bh=AIuQrSu/5y/S9TIWqFOGCV+S6deOxyVrovvWy4SL2kw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=OloMkMGj39oyisSI/elKEAvpy20hxc65DCBP77XOsD6DM1R+a5A5Da/ppqs7Z7p3GBtilMBdNOBeSCw4CLzWnpi/GbZs0twrI/jHBj75g3QWh2U+l9MnhNx5slK4wgHsVE88KJtcO1SMNVHstEsw3B+CP/Ty6sI5OGUAf63XtLk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZuSndoFC; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZuSndoFC" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716314; x=1786252314; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=AIuQrSu/5y/S9TIWqFOGCV+S6deOxyVrovvWy4SL2kw=; + b=ZuSndoFC4541nSvbCP6cEW/FYV1l8zzkJP80vf6pWY1ZOHgg/RVomY/T + Ti6yNCB58L0Sd4kxkaI5sPLuF7vCi3M5pS05/bm13pSUnPEFFjtsLUyj2 + f9POeHNiXdDCLJm6AbH5YNMJHTMKKxVu+wmWQOJyUA2JQAjfjhd4Y2l7i + hwHCcBSXZp9fOvMWTMVoUR8/ktX+69hF3c7sKUUgsZ3Ez6EVCyQ/ijbHd + VIyS49HPzpJp2UXMWArIqJMLsn/1xm4WXUpNNwWGCUSD+Ru4vQomDx8lv + I+sL/FNJq4W7oKMEvp16XyroY9GrTk18XR9yqRbCDZM1KnkZXCppQk2n8 + g==; +X-CSE-ConnectionGUID: UjPqhATKTouUNB6GDkUddg== +X-CSE-MsgGUID: rAOe+a/mQeqiUy/XHrFejQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56259917" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56259917" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:11:53 -0700 +X-CSE-ConnectionGUID: WVRUnOqXSoeMjZHQRQuF4Q== +X-CSE-MsgGUID: AGpfHQaOSwWVkoL9IadjUw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165475971" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:11:48 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 13/28] sched: Tag the sched group as llc_balance if it has tasks prefer other LLC +Date: Sat, 9 Aug 2025 13:05:44 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +During load balancing between LLCs, check whether there are tasks +preferring the destination LLC. If so, balance those tasks to the +destination LLC first. + +Tag the sched_group that has tasks preferring to run on other LLCs +(non-local) with the group_llc_balance flag. This way, the load +balancer will later attempt to pull/push these tasks to their +preferred LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++++ + 1 file changed, 43 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index f1697658c3b8..30ebc7d1b999 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10491,6 +10491,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10855,6 +10856,43 @@ static inline bool smt_balance(struct lb_env *env, struct sg_lb_stats *sgs, + return false; + } + ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. 
++ */ ++#ifdef CONFIG_SCHED_CACHE ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, true) == mig_allow) ++ return true; ++ ++ return false; ++} ++#else ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} ++#endif ++ + static inline long sibling_imbalance(struct lb_env *env, + struct sd_lb_stats *sds, + struct sg_lb_stats *busiest, +@@ -11037,6 +11075,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + update_sg_if_llc(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch new file mode 100644 index 0000000..c2d89bc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-14-28-sched-Introduce-update_llc_busiest-to-deal-with-groups-having-preferred-LLC-tasks.patch @@ -0,0 +1,181 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9DFDCBA36 + for ; Sat, 9 Aug 2025 05:12:08 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716330; cv=none; b=aQ1DuKtufh+Odr/LorF9O/fM51njWURpIhUr0LEbL04DjrjFm+d09C28slCF+44U9FtSmiimp1pCffRh1R/fvm2ZZoxIwYLoRUwe6OyK5cj/+TCojsQvg2MTs+TBXvkqNEX58rHKytdwSZsbXZQ7b/69UrXdxM0ua0rXL6iOtQ0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716330; c=relaxed/simple; + bh=O81QutAo6orLIIDwAsw0ZMG10IN4T/AEDQCSSIJxqyQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=U5xozr9Mtp5uu2TvfPRuTYtKe4U1Wvg6adAEEQYPp3XhL43lFnMD5WFxuPpA676GFAHZvR3sqXgli2n0l/wcn29K4BggdO2CKIje4lN3tYfPeo9MARoA5x4puu9zfIoLrFm8QyrapPsLUWke2Sltghaenw5fxTvdXdEsap9QY/I= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MEuihH0+; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: 
smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MEuihH0+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716328; x=1786252328; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=O81QutAo6orLIIDwAsw0ZMG10IN4T/AEDQCSSIJxqyQ=; + b=MEuihH0+FVuXmARwKbtg3ZEInFXwZofL2oeZFz8hAFncHiYR3thGt37w + r1T1tf3lEswVj9+r/pBIBCI/p6tNeK/mU9z44eSxhHK5hkAbm4U0pK2Yd + kJSRq/e/BjJbTfq0wdmHL+xZeKBs3wZgVSLF/cjxbK8xkGolfMsNQtISm + W3cnADlN4qbVKCTxsMFINRPYiR7F/yD1Oj5rfzQ0wt0MvGxsmxI+X3NFf + 9X2WdZwnrcjBg1uRuKw5Ke2i5+i08CMFggHeD7mmAh+bcE6otZUr5bfyT + 1/s8kOVFgN6mguEbs8JSr9oz3bIDVZO52aj/iKxcnT/mwWA+dnZZTC1UF + g==; +X-CSE-ConnectionGUID: 7CDD+sx7Si6bye2RdBQ3gA== +X-CSE-MsgGUID: jib9w3+fQbenCvLznD37kQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56259932" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56259932" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:12:07 -0700 +X-CSE-ConnectionGUID: DOXrAiV0Se6h6BcziixHkw== +X-CSE-MsgGUID: PSBpAuryS0CRK6OjhNz0bw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165475987" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:12:02 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 14/28] sched: Introduce update_llc_busiest() to deal with groups having preferred LLC tasks +Date: Sat, 9 Aug 2025 13:05:58 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +The load balancer attempts to identify the busiest sched_group with +the highest load and migrates some tasks to a less busy sched_group +to distribute the load across different CPUs. + +When cache-aware scheduling is enabled, the busiest sched_group is +defined as the one with the highest number of tasks preferring to run +on the destination LLC. If the busiest group has llc_balance tag, +the cache aware load balance will be launched. + +Introduce the helper function update_llc_busiest() to identify +such sched group with most tasks preferring the destination LLC. 
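The selection rule described above can be condensed into a short stand-alone C sketch. This is illustrative only and not part of the upstream series: the struct and field names here are hypothetical stand-ins for the per-group counters that the series keeps in sg_lb_stats->nr_pref_llc[], as shown in the hunks below.

/*
 * Illustrative sketch: among candidate source groups, keep the one with
 * the most tasks preferring the destination LLC; groups without the
 * llc_balance tag are skipped. Names are hypothetical.
 */
struct group_stats {
	unsigned int nr_pref_dst_llc;	/* tasks preferring the destination LLC */
	int llc_balance;		/* group has tasks preferring another LLC */
};

static int pick_llc_busiest(const struct group_stats *groups, int nr_groups)
{
	unsigned int most = 0;
	int i, busiest = -1;

	for (i = 0; i < nr_groups; i++) {
		if (!groups[i].llc_balance)
			continue;
		if (groups[i].nr_pref_dst_llc > most) {
			most = groups[i].nr_pref_dst_llc;
			busiest = i;
		}
	}
	/* -1 means no such group; fall back to the normal busiest selection */
	return busiest;
}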
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++++- + 1 file changed, 35 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 30ebc7d1b999..b8cc85291351 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10885,12 +10885,36 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance need to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + static inline long sibling_imbalance(struct lb_env *env, +@@ -11122,6 +11146,14 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* if there is already a busy group, skip the normal load balance */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -12029,9 +12061,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch new file mode 100644 index 0000000..2bd9a06 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-15-28-sched-Introduce-a-new-migration_type-to-track-the-preferred-LLC-load-balance.patch @@ -0,0 +1,191 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.19]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7EC501DE3BE + for ; Sat, 9 Aug 2025 05:12:37 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.19 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716359; cv=none; b=QbiGTyG3g2KE4hVHW5ANIWTpqVt3p0S/zxdB0gitjC7ulgs6vf7jEjdLirjHI7kaK+ztqqN/rCqTa/6hfausklfNc4rz8dYqp0CG6Y6YgxkKUwvBQIrp/KqxhxJRz+O3v6tp0XTYU16LxAdHr+C5BFN93tNkPr76wtjETRyaMaw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716359; c=relaxed/simple; + bh=yGnpZ6FlvoEQkStNXEnYs/+BqL2dRHGniBRSc7+bEyI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=m1yPm73VBI2YcbcZuIT/BdavdjC/l3BdlBwnghR8mggxNfB23vX6iBYsgIl3jTAM5i/J7m6oqk0xrxs69iqlLFQ90jQ8hbG1d9nYsXQ1qBKpwmo+MO9karhCFAgqXxtek2+Fw6jBNSgn4f3uTQlC2jV9TQHxB0EG1HVMEE5ZgXA= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TEnIOB1/; arc=none smtp.client-ip=198.175.65.19 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TEnIOB1/" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716358; x=1786252358; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=yGnpZ6FlvoEQkStNXEnYs/+BqL2dRHGniBRSc7+bEyI=; + b=TEnIOB1/ILlwqYFTI6kKipRVVwx6DRo7Bg6DA7Rv5STVaUTCs0KWpJvq + Z6xP+XRjxHgLpyN1tALXTPQatPAmzpP8yoIO81oaWxQRxnjTqFILCkrJF + kCtJQ/VZjCEhVc7wgV23PMjUStSCtMH0P9OW3KNu8Za5Pnw7tMZySzv3t + NECmaJZ84sMsSF0CtDUsuTqG088mMNcuu4rS+3dzRuIJxgZ7St1Ds47Z9 + 1QQxPYZElaPgiDkZBfePvDHM5kCX7XSWYlmCxqKtAeHU7eYkQHevmw80h + wEnmX47OYGWTIQssLuDSa8NOAiVjB27DwhzduhCcmV9MOJn0mr3hsf+ZP + A==; +X-CSE-ConnectionGUID: lvcsmsR8QjyFBqcujWQmiQ== +X-CSE-MsgGUID: bzzKp3XeT8WmtT0FxfRmoA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56932532" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56932532" +Received: from orviesa006.jf.intel.com ([10.64.159.146]) + by orvoesa111.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:12:37 -0700 
+X-CSE-ConnectionGUID: cebyh5CzSdq9bdRIx+9UHw== +X-CSE-MsgGUID: 291hr3FsSYCIyZnbCahMKA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="164703738" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa006.jf.intel.com with ESMTP; 08 Aug 2025 22:12:31 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 15/28] sched: Introduce a new migration_type to track the preferred LLC load balance +Date: Sat, 9 Aug 2025 13:06:27 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Introduce a new migration type named migrate_llc_task to facilitate +cache-aware load balancing. + +After the busiest sched_group is identified as the one that needs +migration due to having most tasks preferring destination LLC, tag the +migration type as the newly introduced migrate_llc_task. During load +balancing, each runqueue within the busiest preferred-LLC sched_group +is checked, and the runqueue with the highest number of tasks preferring +to run on the destination CPU is chosen as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 37 ++++++++++++++++++++++++++++++++++++- + 1 file changed, 36 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b8cc85291351..a301b56dd2b4 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9746,7 +9746,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10180,6 +10181,15 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ /* ++ * Since can_migrate_task() succeed, when we reach here, it means that p ++ * can be migrated even if dst_cpu is not p's preferred_llc, because there ++ * are no idle cores for p to do in-llc load balance. 
++ */ ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11817,6 +11827,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12125,6 +12144,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12233,6 +12256,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12415,6 +12448,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch new file mode 100644 index 0000000..8b05b84 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-16-28-sched-Consider-LLC-locality-for-active-balance.patch @@ -0,0 +1,190 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.19]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C82CABA36 + for ; Sat, 9 Aug 2025 05:12:53 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.19 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716375; cv=none; b=aS7CLs0yVWKaF6OGVkXK8ZAgy8yUgUakdznwWDtZD4N9gQ9eA7tWRxrHj1IeAjaqTL8M+VmHHPvI8FEcOuDBcfH3oVpULXvb4/xFnoBCpg/mVg6MCRCvDJrLWdumxn7wi15V2NyagC2GII5gOWOj3odj3IWvyB3Ywa1aJoBDB/I= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716375; c=relaxed/simple; + bh=fRnw1t3Rh9UYvT00ArjMcNIiB37mwFZbA2eVDMCUX9M=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=RJUo8ambmNBfq+wvLRlOCmiE/W7wpbkuJF7yL9JEQZ65V62F8oCjVmz3qVgLvkV3PLO6TzjT6umm4UV9UMY1fHNephBb+kWs8mVMmZ6rPjQkthPxxV8sRM5GBZAKF/4w8+2Bp7vO3sUeIwn+6xlZ35XOq6ECCeBWwM5GJsRHcEU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=DO7eB20K; arc=none smtp.client-ip=198.175.65.19 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com 
+Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="DO7eB20K" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716374; x=1786252374; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=fRnw1t3Rh9UYvT00ArjMcNIiB37mwFZbA2eVDMCUX9M=; + b=DO7eB20K/f/MVe9m5JNfhy4n46O8ccIoioHjrTsbuwZlLpqzzksjVlAv + FSWzb4JVfyBHYiQaKCDVfK0CkNncYJ22CpRHB1RwD3zGRfwsq6x9aCRQH + TKpWNRyQsj3e8nZiDG6U1hLeWNbIKj/X6uKv56QKiYVXkZeKHyR4Zqnxi + U5rknviHlsICE9lsjONRBpod6oRa32YfBF17V6dJ4X6Vo8cglEVlG/FKh + aqYrgjA98DE4rWoYD97vabGK2LMtYuZO47cKW4wuRsI+yu4gtqm55Wgcy + 1wWRnlj7aVuWb9SYbQgGx83xtwUCYP9X9i36gO7Eb2NFdoWyrm8YyDqHa + g==; +X-CSE-ConnectionGUID: 0oJuosNIQM23ggPBmbF4UQ== +X-CSE-MsgGUID: LCnyHPYNSYquGLfPEsgHmQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56932555" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56932555" +Received: from orviesa006.jf.intel.com ([10.64.159.146]) + by orvoesa111.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:12:53 -0700 +X-CSE-ConnectionGUID: 0pVJsl26T9OrBTO7kMP8BQ== +X-CSE-MsgGUID: o19YVeMBQCulANK7Q6DXLA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="164703753" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa006.jf.intel.com with ESMTP; 08 Aug 2025 22:12:47 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 16/28] sched: Consider LLC locality for active balance +Date: Sat, 9 Aug 2025 13:06:42 +0800 +Message-Id: <38d036cf946223b46a20ad60ccf13f9dcb316240.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +If busiest run queue has only one task, active balance is enlisted +to actually move the task. However, before moving the task, +we should consider whether we are moving the task from its preferred +LLC. + +Don't move the single running task in a run queue to another LLC, if +we are moving it from its desired LLC, or moving it will cause too much +imbalance between the LLCs. 
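The guard described above can be summarised by a small stand-alone C sketch. It is illustrative only; the parameter names are hypothetical stand-ins for the runqueue state that break_llc_locality() in the hunk below reads from env->src_rq and the migration hint helper.

/*
 * Sketch: refuse active balance when every runnable task on the source
 * runqueue already prefers the source LLC and either the queue holds a
 * single task or the migration hint forbids moving load across LLCs.
 * Names are hypothetical; see break_llc_locality() below.
 */
static int keep_task_in_preferred_llc(unsigned int src_nr_running,
				      int all_prefer_src_llc,
				      int hint_forbids_migration)
{
	if (!all_prefer_src_llc)
		return 0;

	return src_nr_running <= 1 || hint_forbids_migration;
}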
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 48 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a301b56dd2b4..592a4034e760 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12332,10 +12332,43 @@ imbalanced_active_balance(struct lb_env *env) + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_feat(SCHED_CACHE)) ++ return 0; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return 0; ++ /* ++ * All tasks want to stay put. Move only if LLC is ++ * heavily loaded or don't pull a task from its ++ * preferred CPU if it is the only one running. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable && ++ (env->src_rq->nr_running <= 1 || ++ _get_migrate_hint(env->src_cpu, env->dst_cpu, ++ 0, false) == mig_forbid)) ++ return 1; ++ ++ return 0; ++} ++#else ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return 0; ++} ++#endif ++ + static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12355,7 +12388,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12800,9 +12834,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_feat(SCHED_CACHE)) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if task does not prefer target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. 
Originally reported by +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch new file mode 100644 index 0000000..d548fb9 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-17-28-sched-Consider-LLC-preference-when-picking-tasks-from-busiest-queue.patch @@ -0,0 +1,201 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.20]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 58996226CFC + for ; Sat, 9 Aug 2025 05:13:07 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.20 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716388; cv=none; b=ToNlzHkLLYiYeK8T8BsAhwwVTQMjfVDgwARwIDYa+wyZ8Eu38JHaFmFWhEHDz8Y4QXb3R7dulNjX2NJYnlkPmQ0FB+POvt2GUZ/4GvwbMKz42XqpgP66/Git+tq6B67e0BFqrfwmVgwWN0fqYa7Y2mT9Jw28QbyFm7zODtPY6sc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716388; c=relaxed/simple; + bh=sTSG237di6kHrTi+M/LVG5ENiqilE30WO0gE5TPh2Qg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=k07/zqOO54KVW2SXRP1BcFCx9O6eoCI0J3Yg2JN4fa4l56WOxmJSEvLVg2Qq4TaVlBV1mD6qAvItCmcERw1UNU4TYSdIrJB+dIamh6hR7WzT6I/vQu1VEkz4aED2Kp/nidg5cbmW5fT2HqFSYZjL/i79XRYjNuUmf9W31GhMzEo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=G3ntFCoX; arc=none smtp.client-ip=198.175.65.20 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="G3ntFCoX" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716387; x=1786252387; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=sTSG237di6kHrTi+M/LVG5ENiqilE30WO0gE5TPh2Qg=; + b=G3ntFCoXyBZCYeCFd2XdSpKtKywszwsAL7iG155Ga7pmma4DufkDAk1j + J0oIixy6CX3G2NDetf51jJCmgOaHTwM5/Zyy62tX553kTkCWxQYisiVUg + 1tLtppV/kH9sI0k6oKldvrjqqgkVdJpDQWrsW6zAURpZZQre0+t9sB2DH + giDN1ULvFcnaQhebg6L8k2Sk3KyDkVnyIgPtuntXxM6AYlGGbknUXkX/S + gDEJ0HpmTVhkCqcyfcxY/dueuq+yt+7fItoY/olEIlSogszYxJCyn99+x + O91JFgRzp1p1is0mJ7huD6m3c93Mm1gkIWBIs7CmYnQAnn6h0ZqAcvxVv + Q==; +X-CSE-ConnectionGUID: RA52ecx/Ti+wEdDxJMLnkg== +X-CSE-MsgGUID: /5YdSh1jQj+Ywel/XofIvA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56768552" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56768552" +Received: from fmviesa002.fm.intel.com ([10.60.135.142]) + by orvoesa112.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:06 -0700 +X-CSE-ConnectionGUID: +bCrWry6RQe4Sex8ybOxpA== +X-CSE-MsgGUID: IbTHoJawTVyJKtMAWCm2+g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="189180343" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa002.fm.intel.com with ESMTP; 08 Aug 2025 22:13:01 -0700 
+From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 17/28] sched: Consider LLC preference when picking tasks from busiest queue +Date: Sat, 9 Aug 2025 13:06:58 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +When picking tasks from busiest queue for load balance, we currently +do not consider LLC preference. + +Order the task in the busiest queue such that we picked the tasks in the +following order: + 1. tasks that prefer dst cpu's LLC + 2. tasks that have no preference in LLC + 3. tasks that prefer LLC other than the ones they are on + 4. tasks that prefer the LLC that they are currently on + +This will allow tasks better chances to wind up in its preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 592a4034e760..8d5792b9e658 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10093,6 +10093,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. ++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". 
+@@ -10101,7 +10163,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10120,6 +10182,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch new file mode 100644 index 0000000..7e42bb5 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-18-28-sched-Do-not-migrate-task-if-it-is-moving-out-of-its-preferred-LLC.patch @@ -0,0 +1,163 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.20]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B1D46274B39 + for ; Sat, 9 Aug 2025 05:13:19 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.20 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716401; cv=none; b=gwUxPhraVHLm64bXQSb1oNwdX318HEEFGQP3NJIRjG0ej1HembqLwL/AMdMnKs2idXx3KEfcQggsIlJeGxPd86ymVhFs/rlGwCRgO+oHKZRTtPkeotIYE6Skr2Z90a1CPa/LNaWTM1XdHnwRmA1ybF/xRMbrR1KLLpNUD058xfI= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716401; c=relaxed/simple; + bh=s+rR7wC7UnAbjuxbPj5c6/L98TvboEiREJprAaugmo8=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Ex3plSrAdkKiiWgy1wFo/qOyi1KRMrAmvUSGgR+Sl7c+uHNitZ3FkYOzLvNN2Kk7bcRKDLWQu/zjHkM0B/ktpD3735kBxpJu0PJ8IZ58b18B0w/r7VUcJthi/o2hdqN358rDx5jLpz4Z9VU9UsOwUlIB1nTwdFObyEGsDbo1DD0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZNX9P8Zh; arc=none smtp.client-ip=198.175.65.20 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZNX9P8Zh" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716400; x=1786252400; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=s+rR7wC7UnAbjuxbPj5c6/L98TvboEiREJprAaugmo8=; + b=ZNX9P8Zhh4or6QRvyIGlsqU0K0XXUdzdv+gCoKT4EcsJWyPrUrrpjeh6 + Qmj8TTDc2Q3+gYj6uSTMIaEUdV5BvlkAcN9NnwrPfjzZslpxwRyFTPJTx + KtvP4Sp2C8p5ushx8yObLd6nOXcFZSnue19p3r5NoF227rlrE4GYeUWKq + dM+U0/Nq/0qZLmHe33WFqOXqLI4gmE0PevCwc5pjj8qUenPxHW1kXvWF+ + 3fvMGqOlhGBBzPmI9Nt4so8fHdQ0chc/atY+kOpU5fgp8EHxTRIRoH0Lz + 6bGZ08Lr6XkBPeiz62J27S7cC3PWHkwpEOh1gP4JN0l7CxroHUJGaT1Ve + g==; +X-CSE-ConnectionGUID: b1FzoNUSROqCeN8PP7hLeA== +X-CSE-MsgGUID: PpjmRF1hSJmGHpUlPRMqJA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; 
a="56768565" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56768565" +Received: from fmviesa002.fm.intel.com ([10.60.135.142]) + by orvoesa112.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:19 -0700 +X-CSE-ConnectionGUID: TVpwBJFnSOm4LFGDl/UL7A== +X-CSE-MsgGUID: vYNYzwHuTSmQtfe59f1VcQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="189180352" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa002.fm.intel.com with ESMTP; 08 Aug 2025 22:13:13 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 18/28] sched: Do not migrate task if it is moving out of its preferred LLC +Date: Sat, 9 Aug 2025 13:07:10 +0800 +Message-Id: <081010e2c9cd8f4b3c9aa6d1b98fbe9438cd3c06.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +In the final step of task migration during load balancing, +can_migrate_task() is used to determine whether a task can +be moved to the destination. If the task has an LLC preference, +consider this preference when moving it out of its preferred LLC. +With this check in place, there is no need to retain the task's +cache-hot CPU check in task_hot(); remove it accordingly. + +Besides, add more checks in detach_tasks() to avoid choosing +tasks that prefer their current LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 28 +++++++++++++++++----------- + 1 file changed, 17 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8d5792b9e658..22b7a7fe538e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9822,17 +9822,6 @@ static int task_hot(struct task_struct *p, struct lb_env *env) + if (sysctl_sched_migration_cost == 0) + return 0; + +-#ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && p->mm && p->mm->pcpu_sched) { +- /* +- * XXX things like Skylake have non-inclusive L3 and might not +- * like this L3 centric view. What to do about L2 stickyness ? 
+- */ +- return per_cpu_ptr(p->mm->pcpu_sched, env->src_cpu)->occ > +- per_cpu_ptr(p->mm->pcpu_sched, env->dst_cpu)->occ; +- } +-#endif +- + delta = rq_clock_task(env->src_rq) - p->se.exec_start; + + return delta < (s64)sysctl_sched_migration_cost; +@@ -10029,6 +10018,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_feat(SCHED_CACHE) && ++ get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10289,6 +10284,17 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if remaining tasks want to stay: ++ * The tasks have already been sorted by order_tasks_by_llc(), ++ * they are tasks that prefer the current LLC. ++ */ ++ if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch new file mode 100644 index 0000000..29bccbd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-19-28-sched-Introduce-SCHED_CACHE_LB-to-control-cache-aware-load-balance.patch @@ -0,0 +1,194 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.9]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0E605BA36 + for ; Sat, 9 Aug 2025 05:13:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.9 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716413; cv=none; b=Y5DY4uVXJj/NXx4xa+vIglUy/e0Mz3lC23M4GrgHNX8VTYhXYab2lfrFY6mo9TrkT8w/WQHHy0ath+3g82U7f7w+5f8oq86hgIXxPEP3isbuKS9ryw2kjNuMOw6y8wqF8EHfhI4CaEef8Gm0ym0TuNTWKHvhA39IckSeqYVQ55Q= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716413; c=relaxed/simple; + bh=rJfyf/57CaUPFk3O70t1/xBAt4kiiSb/LUukMkIaaWI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=BbGezSEvMdgtuVQGY5ll4boKLDz3xf/HaQKe/BbQh2RvAxEla/bMNlbvy7fNRiyq6jqUio9sHSAT2xUSQfMGzvU0gbx6uABWuPs54UhyitX7QqsGpuhPSoWaqfjMJ+JchZxUZEza6Pv3LtvE1xF1YB8vYubCHFqLzoIDWZeEypI= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Fv6T2jfF; arc=none smtp.client-ip=198.175.65.9 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Fv6T2jfF" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716412; x=1786252412; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=rJfyf/57CaUPFk3O70t1/xBAt4kiiSb/LUukMkIaaWI=; + b=Fv6T2jfFSgH2501Wvsd9NAJpLZ95G3qQ2wpS+brwkLJI4Z9OwbGNI010 + TXV31asmWoF9+Q+nOOHJAmbtTXBwCofZZc4StvBRsyudecftX7Wk1PRur + u7QKz2FJNo0ci4Owq3KzhsOU/Zu+KxpkANT6PW233G4v7L1dfPrJmuTsr + kc0L6AcVmncnIjhBRuMo0p6BD/uY9llqRtu1k1OnH8I9Jcei+J4SP8kPW + Qrss/vUTJVcjLGz++sDQq5rXiSF8X5srU5tRisTDzgzNNua10vFLKF5a/ + +MoDPmHcbzf2Z88/IRgSh6BUtg+yyb4sVeKEc69v7SY3AfaN4pHBsvMby + w==; +X-CSE-ConnectionGUID: yRiWx0TaSqSHcni4YwFu1Q== +X-CSE-MsgGUID: KOLdWXj6RXq5qKElN0Zavw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="79620355" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="79620355" +Received: from fmviesa009.fm.intel.com ([10.60.135.149]) + by orvoesa101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:31 -0700 +X-CSE-ConnectionGUID: 66tPf5k4Q3+atsnA5jm4Jg== +X-CSE-MsgGUID: 4d6H20jNRjW/omJZg3lPVw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165844013" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa009.fm.intel.com with ESMTP; 08 Aug 2025 22:13:26 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 19/28] sched: Introduce SCHED_CACHE_LB to control cache aware load balance +Date: Sat, 9 Aug 2025 13:07:23 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Introduce the SCHED_CACHE_LB sched feature to enable or disable +cache aware load balance in the schduler. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 18 ++++++++++-------- + kernel/sched/features.h | 1 + + 2 files changed, 11 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 22b7a7fe538e..9843d4e1d84f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10019,7 +10019,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + return 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && + get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) + return 0; + #endif +@@ -10105,7 +10105,7 @@ static struct list_head + LIST_HEAD(no_pref_llc); + LIST_HEAD(pref_other_llc); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return tasks; + + if (cpus_share_cache(env->dst_cpu, env->src_cpu)) +@@ -10290,7 +10290,8 @@ static int detach_tasks(struct lb_env *env) + * The tasks have already been sorted by order_tasks_by_llc(), + * they are tasks that prefer the current LLC. 
+ */ +- if (sched_feat(SCHED_CACHE) && p->preferred_llc != -1 && ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ p->preferred_llc != -1 && + llc_id(env->src_cpu) == p->preferred_llc) + break; + #endif +@@ -10947,7 +10948,7 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *child = env->sd->child; + int llc; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return false; + + if (env->sd->flags & SD_SHARE_LLC) +@@ -11058,7 +11059,8 @@ static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE || ++ !sched_feat(SCHED_CACHE_LB)) + return; + + /* only care the sched domain that spans 1 LLC */ +@@ -11120,7 +11122,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + *sg_overutilized = 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE)) { ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB)) { + int j; + + for (j = 0; j < max_llcs; ++j) +@@ -12406,7 +12408,7 @@ imbalanced_active_balance(struct lb_env *env) + static inline bool + break_llc_locality(struct lb_env *env) + { +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + return 0; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) +@@ -12908,7 +12910,7 @@ static int active_load_balance_cpu_stop(void *data) + #ifdef CONFIG_SCHED_CACHE + int llc = llc_idx(target_cpu); + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) + goto out_unlock; + + if (llc < 0) +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index d2af7bfd36bf..11dbd74cd365 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -88,6 +88,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) ++SCHED_FEAT(SCHED_CACHE_LB, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch new file mode 100644 index 0000000..1cf1911 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-20-28-sched-Introduce-SCHED_CACHE_WAKE-to-control-LLC-aggregation-on-wake-up.patch @@ -0,0 +1,145 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.9]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7A4A02749D5 + for ; Sat, 9 Aug 2025 05:13:44 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.9 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716425; cv=none; b=VIDdU39lmJxSYrHB9S4l1mh0boKZZki9BLORb9qgZo0xYyJtYcfX+m2EFuLna+wvMqdM3b9jRoxWjfYX98zrEOAuWNoH8zCG6FpNq8YiKHKq1NGdKxQHVgzOOiLG5uy1qO9t7Wa4goaOHrkQI+arcKezllgcvY4ibca99xDzFQA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716425; c=relaxed/simple; + bh=YdhYlPfOzBX2JmU404dp2rIw4uGKUk0/reO/wjvEcdA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=AeIzKWJSFg2pTtYt5o5Vz8pc3/BIn9Yhzjs5gEx3/9nVXOR8/67BRRpJDX5hibfRtV6EY8e/fAEY/Zxa+8RbAe6m0nXD1z5eRZPQvlghGCpOjLX0XPm2maXH0OgysqVx3VFMCsFbO52VsoQD2p85NC594bUHYDFoDa6p7TKhhlA= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=RBrobR1i; arc=none smtp.client-ip=198.175.65.9 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="RBrobR1i" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716424; x=1786252424; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=YdhYlPfOzBX2JmU404dp2rIw4uGKUk0/reO/wjvEcdA=; + b=RBrobR1iwxBeJZoLKXMRg5MQx8WGXo8muZpXStdx9iyAK264sfhIG/qu + joL4lEprzIBRw8X03Yy5P97hTPxpboN85pYrMB8bQ6FHQo9ybNckkrA6U + Fm88MiZ42tIDdBFytUX2SY9R5LIWL4D6l7uxGHF/7t9G2tSrGXQZrLdmV + kEXY08yQyIUpqXqtLN/Fts7veKj7eYDCqo12PTEZYQ6XGxrwnt4HlyW3a + b9OPJyXpEcpsigfeiakBxz87spvYkl6NsSdiBGHP8WsVh/XlkJ8G7/XfP + bZjJOX3ekGPt6NQIeusuWhHKU/YI3AgkFy7IeRU2nggpBNO+zFRf78cN1 + w==; +X-CSE-ConnectionGUID: 6JJ7KzVjTq2D3T0a9H1P9w== +X-CSE-MsgGUID: pH0Ms/VYRfa3u1EQ9lfZ0A== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="79620375" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="79620375" +Received: from fmviesa009.fm.intel.com ([10.60.135.149]) + by orvoesa101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:44 -0700 +X-CSE-ConnectionGUID: p2RcioaZTnKrvWCzoclFkA== +X-CSE-MsgGUID: 7ArRWLtgSgqBcLoFA2nIvw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165844031" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa009.fm.intel.com with ESMTP; 08 Aug 2025 
22:13:38 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 20/28] sched: Introduce SCHED_CACHE_WAKE to control LLC aggregation on wake up +Date: Sat, 9 Aug 2025 13:07:35 +0800 +Message-Id: <144358df73cbb8c7d24f757fc40cb068be603bed.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Tim Chen + +Introduce SCHED_CACHE_WAKE feature to enable or disable cache-aware +wake up. Disable this feature by default because cache-aware wakeup +is overly aggressive in stacking wakees of the same process on the +same LLC, if they are frequently woken up. + +The wake ups can be much more frequent than load balances, adding +much overhead when load balance alone for LLC aggregation is sufficient. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 6 +++++- + kernel/sched/features.h | 1 + + 2 files changed, 6 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 9843d4e1d84f..6e61f9e1f628 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9063,7 +9063,7 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE)) + return prev_cpu; + + if (!mm || p->nr_cpus_allowed == 1) +@@ -9076,6 +9076,10 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + if (cpus_share_cache(cpu, prev_cpu)) + return prev_cpu; + ++ if (_get_migrate_hint(prev_cpu, cpu, ++ task_util(p), true) == mig_forbid) ++ return prev_cpu; ++ + if (static_branch_likely(&sched_numa_balancing) && + __migrate_degrades_locality(p, prev_cpu, cpu, false) > 0) { + /* +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 11dbd74cd365..44b408cf0dd4 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -89,6 +89,7 @@ SCHED_FEAT(SIS_UTIL, true) + + SCHED_FEAT(SCHED_CACHE, true) + SCHED_FEAT(SCHED_CACHE_LB, true) ++SCHED_FEAT(SCHED_CACHE_WAKE, false) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch new file mode 100644 index 0000000..263a69a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-21-28-sched-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch @@ -0,0 +1,299 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.9]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2D7E92472BA + for ; Sat, 9 Aug 2025 05:13:56 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.9 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716438; cv=none; b=oP/KP8CT9PyG8F223CbqjTnw5fAnuBqC4qrJsRtJ4FhmBZN2qpqY1eTd8fjtB3IjDADG3eDf23ECJS0GGe21q95Lgbd8aoZ2d1dhrh9ekTQMUxybtv0qdhdWt0awWabB+mPE502GRGKAWZIXsyFrdxhb/zbs1b9+pJ2xhI3lh0U= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716438; c=relaxed/simple; + bh=2aEa4h1fI5J4/1AEUoGjg0eQXawj82V6LxqSqdQWKjc=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=eLlyXwmAlVBfL8ugZpWuJv/cYDUhBSgXTmsJA1VoG4McRZM9A+e9EyJW9FEw5BbRwioynILfrxFhAe4zM1FRKim6rhs5NDIaRMWKq7+xJ+DXnEZ4q4gjxkB8JnCe7RI2fcforxvMACn4NmxTFkZ0407GDkQ+uKE5393PmwyageE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=jyvdr92q; arc=none smtp.client-ip=198.175.65.9 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="jyvdr92q" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716436; x=1786252436; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=2aEa4h1fI5J4/1AEUoGjg0eQXawj82V6LxqSqdQWKjc=; + b=jyvdr92qHqcE9M3jPHv+fDdx/YNY3xnAId3qtw3C2QzAwffEVL4ZQTBS + CePnqJIrCnZ2R6zfimwVCnLdiYu5OvVZ5ChHHhlNO+ZL+HID3ktCe7O2w + 48m583KvcVHXVXNpkpIfS7DLrauwwN0nhjxTOtWhNNA6tX0C3umtnps9k + I2871JFWkVEb0mXhuELAw1LEqE+pk38njQNVgLdHwoT5vvi9CMGrEAr/N + RU5gBb080A9sLYEGTtpnWCaPKZUTFqtKi9ostEazBVphHyMbIcwyMjLxO + F+hAW/C7GT5rTxqDXPHnZ6JnGmHGoIMYcwTWKxDujBKbcjIHs9lsLI5Ay + w==; +X-CSE-ConnectionGUID: frbwZBVeQsyYvImCyN450Q== +X-CSE-MsgGUID: O9BLeD00TFuab4r38fYeEg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="79620417" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="79620417" +Received: from fmviesa009.fm.intel.com ([10.60.135.149]) + by orvoesa101.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:13:55 -0700 +X-CSE-ConnectionGUID: 57mN5EeXQnOL1PlRN1msdQ== +X-CSE-MsgGUID: ZdnU+8XKSQuiZvpeY/fMpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165844039" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa009.fm.intel.com with ESMTP; 08 Aug 2025 
22:13:50 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 21/28] sched: Introduce a static key to enable cache aware only for multi LLCs +Date: Sat, 9 Aug 2025 13:07:47 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If there are more than one LLCs in the node, the +cache aware scheduling is enabled. Otherwise, the +cache aware scheduling is disabled. + +The definition of multiple LLCs in a node is that +every node in the system should have more than one +LLC. For example, if node0, node1, and node2 each +have 4 LLCs, while node3 has 1 LLC (possibly due +to CPU hotplug), cache-aware scheduling should be +disabled. + +Suggested-by: Libo Chen +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 31 ++++++++++++++++++++++--------- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 22 ++++++++++++++++++++-- + 3 files changed, 43 insertions(+), 11 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6e61f9e1f628..194ec594561b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,8 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++DEFINE_STATIC_KEY_FALSE(sched_cache_present); ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -1318,7 +1320,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + unsigned long epoch; + int mm_sched_llc = -1; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || ++ !static_branch_likely(&sched_cache_present)) + return; + + if (p->sched_class != &fair_sched_class) +@@ -1366,7 +1369,8 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_feat(SCHED_CACHE) || ++ !static_branch_likely(&sched_cache_present)) + return; + + if (!mm || !mm->pcpu_sched) +@@ -9063,7 +9067,8 @@ static int select_cache_cpu(struct task_struct *p, int prev_cpu) + struct mm_struct *mm = p->mm; + int cpu; + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_WAKE) || ++ !static_branch_likely(&sched_cache_present)) + return prev_cpu; + + if (!mm || p->nr_cpus_allowed == 1) +@@ -10024,6 +10029,7 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + + #ifdef CONFIG_SCHED_CACHE + if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ static_branch_likely(&sched_cache_present) && + get_migrate_hint(env->src_cpu, env->dst_cpu, p) == mig_forbid) + return 0; + #endif +@@ -10109,7 +10115,8 @@ static struct list_head + LIST_HEAD(no_pref_llc); + LIST_HEAD(pref_other_llc); + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || 
!sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return tasks; + + if (cpus_share_cache(env->dst_cpu, env->src_cpu)) +@@ -10295,6 +10302,7 @@ static int detach_tasks(struct lb_env *env) + * they are tasks that prefer the current LLC. + */ + if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ static_branch_likely(&sched_cache_present) && + p->preferred_llc != -1 && + llc_id(env->src_cpu) == p->preferred_llc) + break; +@@ -10952,7 +10960,8 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain *child = env->sd->child; + int llc; + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return false; + + if (env->sd->flags & SD_SHARE_LLC) +@@ -11064,7 +11073,8 @@ static void update_sg_if_llc(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_domain_shared *sd_share; + + if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE || +- !sched_feat(SCHED_CACHE_LB)) ++ !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return; + + /* only care the sched domain that spans 1 LLC */ +@@ -11126,7 +11136,8 @@ static inline void update_sg_lb_stats(struct lb_env *env, + *sg_overutilized = 1; + + #ifdef CONFIG_SCHED_CACHE +- if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB)) { ++ if (sched_feat(SCHED_CACHE) && sched_feat(SCHED_CACHE_LB) && ++ static_branch_likely(&sched_cache_present)) { + int j; + + for (j = 0; j < max_llcs; ++j) +@@ -12412,7 +12423,8 @@ imbalanced_active_balance(struct lb_env *env) + static inline bool + break_llc_locality(struct lb_env *env) + { +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + return 0; + + if (cpus_share_cache(env->src_cpu, env->dst_cpu)) +@@ -12914,7 +12926,8 @@ static int active_load_balance_cpu_stop(void *data) + #ifdef CONFIG_SCHED_CACHE + int llc = llc_idx(target_cpu); + +- if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB)) ++ if (!sched_feat(SCHED_CACHE) || !sched_feat(SCHED_CACHE_LB) || ++ !static_branch_likely(&sched_cache_present)) + goto out_unlock; + + if (llc < 0) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 4464b92767ad..3e60618a88e9 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2857,6 +2857,7 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int sysctl_llc_aggr_cap; + extern unsigned int sysctl_llc_aggr_imb; ++extern struct static_key_false sched_cache_present; + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 91a2b7f65fee..8483c02b4d28 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -2476,6 +2476,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + int i, ret = -ENOMEM; + bool has_asym = false; + bool has_cluster = false; ++ bool llc_has_parent_sd = false; ++ unsigned int multi_llcs_node = 1; + + #ifdef CONFIG_SCHED_CACHE + if (max_llcs < 0) { +@@ -2545,6 +2547,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + struct sched_domain __rcu *top_p; + unsigned int nr_llcs; + ++ if (!llc_has_parent_sd) ++ llc_has_parent_sd = true; + /* + * For a single LLC per node, allow an + * imbalance up to 12.5% of the 
node. This is +@@ -2566,10 +2570,19 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. + */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ /* ++ * iff all nodes have multiple LLCs, the ++ * multi_llcs_node will be set to 1. If ++ * there is at least 1 node having 1 single ++ * LLC, the multi_llcs_node remains 0. ++ */ ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ multi_llcs_node = 0; ++ } else { + imb = nr_llcs; ++ multi_llcs_node &= 1; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2617,6 +2630,11 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++#ifdef CONFIG_SCHED_CACHE ++ if (llc_has_parent_sd && multi_llcs_node && !sched_asym_cpucap_active()) ++ static_branch_inc_cpuslocked(&sched_cache_present); ++#endif ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch new file mode 100644 index 0000000..9235bda --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-22-28-sched-Turn-EPOCH_PERIOD-and-EPOCH_OLD-into-tunnable-debugfs.patch @@ -0,0 +1,164 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 72A7D2749D5 + for ; Sat, 9 Aug 2025 05:14:08 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716450; cv=none; b=IeQpUvuEInOqzZCCDB3S2FFJC3zSp4/XkEtNAu2D98UQxwsXgo2BbwLtaxH6iJwbi4gN7aV60Aez3K8ydwiJFAzlJgHRf/3+aORKqKYd4JzlvthQfT5xmROL82OUpU66n+lqag40QZb3zB8TfPDePjvzD5oeSB7VRw6J3Cixx+g= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716450; c=relaxed/simple; + bh=aXXr09emE5m1BA8+F+9P+Qieum2q8rFMJHj2OrnZISQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=S5MR0P7NeiK39j0s1bhGmWRhNoCo1BGGAp0DbGAVUu/cKCksVWgSVTBQqiegxZ+gsioGdqPIL8dWWDic13tfdX1SnoZ6qXprsQhZj7vCqF2cTb/xO8jcfhLxZaWGG0ZCiaJ0gxuXgS+7OA9TZAf4d9OKuUhX4CnNCW42X+VS4FE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GZsq9eJL; arc=none smtp.client-ip=192.198.163.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GZsq9eJL" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716449; x=1786252449; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=aXXr09emE5m1BA8+F+9P+Qieum2q8rFMJHj2OrnZISQ=; + 
b=GZsq9eJL5DGjYunKJTsof+Ln2cKfOk2BGwjzrC+sKFMATJMkyxO99PMo + 2Oyl5uVHYhRF8Tm0YmjjPIUI3d++yQ67YwJyf8GQHF8cYeMonziUJgHhH + kWHsp/STcLEVX42oVsCvQJlHA6eoqh5JKSyBBe3w1N12e5vNle7MdQRuI + 9sPdUfMBH0dbovuNFtw5OfBzc2eoiu4kiBY1XCFzj5eShFF03nf9Tv2B/ + ClF5YQoCu+HTwDDVvM9QKGz82gKXl8kYElV4byqv5tvHmI7Psovf6yI1d + zi0XGuLMMAQ/QyWVmk7U53AdlwCTHAyvtt6E7DmP/8gc3IF+ydRquzbUa + g==; +X-CSE-ConnectionGUID: Q9jwphPASBiXZUHMOabB2g== +X-CSE-MsgGUID: 2ZVJmYQUSISFufuXvDLTyg== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60860025" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60860025" +Received: from orviesa003.jf.intel.com ([10.64.159.143]) + by fmvoesa106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:08 -0700 +X-CSE-ConnectionGUID: ilrvcjmWQsiRa7CRb1DYVw== +X-CSE-MsgGUID: Uh2zL5Q0RAa3BSoaBdHAzw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169693092" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa003.jf.intel.com with ESMTP; 08 Aug 2025 22:14:02 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 22/28] sched: Turn EPOCH_PERIOD and EPOCH_OLD into tunnable debugfs +Date: Sat, 9 Aug 2025 13:07:59 +0800 +Message-Id: <79c8fdcf7e875617935cfaba2ea1f2c2ae5ce62c.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Convert EPOCH_PERIOD and EPOCH_OLD into tunable debugfs +entries. Users can adjust the decay rate as needed. +By default, occupancy decays by half every 10 ms. 
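+
+As a rough illustration of the decay these two knobs control (a user-space
+sketch only, assuming __shr_u64() in the hunk below halves the accumulated
+runtime once per elapsed epoch, as the changelog above describes; the function
+name here is illustrative, not a kernel API):
+
+	/* occupancy decays by half for every elapsed llc_period */
+	static unsigned long long decay_runtime(unsigned long long runtime,
+						unsigned long delta,
+						unsigned int llc_period)
+	{
+		/* number of epochs covered by delta, rounded up */
+		unsigned long n = (delta + llc_period - 1) / llc_period;
+
+		if (n >= 64)
+			return 0;	/* fully decayed */
+		return runtime >> n;	/* halved once per epoch */
+	}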
+ +Suggested-by: Shrikanth Hegde +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 2 ++ + kernel/sched/fair.c | 9 ++++++--- + kernel/sched/sched.h | 2 ++ + 3 files changed, 10 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 682fd91a42a0..7a9ec03704b9 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -535,6 +535,8 @@ static __init int sched_init_debug(void) + #ifdef CONFIG_SCHED_CACHE + debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); + debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); ++ debugfs_create_u32("llc_period", 0644, debugfs_sched, &sysctl_llc_period); ++ debugfs_create_u32("llc_old", 0644, debugfs_sched, &sysctl_llc_old); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 194ec594561b..64f757ad39fc 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1175,6 +1175,9 @@ static s64 update_curr_se(struct rq *rq, struct sched_entity *curr) + #define EPOCH_PERIOD (HZ/100) /* 10 ms */ + #define EPOCH_OLD 5 /* 50 ms */ + ++__read_mostly unsigned int sysctl_llc_period = EPOCH_PERIOD; ++__read_mostly unsigned int sysctl_llc_old = EPOCH_OLD; ++ + DEFINE_STATIC_KEY_FALSE(sched_cache_present); + + static int llc_id(int cpu) +@@ -1283,9 +1286,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + sysctl_llc_period - 1) / sysctl_llc_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * sysctl_llc_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1346,7 +1349,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, invalidate + * it's preferred state. 
+ */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_OLD || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > sysctl_llc_old || + get_nr_threads(p) <= 1) { + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 3e60618a88e9..d752d64d4acd 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2858,6 +2858,8 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + extern unsigned int sysctl_llc_aggr_cap; + extern unsigned int sysctl_llc_aggr_imb; + extern struct static_key_false sched_cache_present; ++extern unsigned int sysctl_llc_period; ++extern unsigned int sysctl_llc_old; + #endif + + #ifdef CONFIG_SCHED_HRTICK +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch new file mode 100644 index 0000000..962ea38 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-23-28-sched-Scan-a-task-s-preferred-node-for-preferred-LLC.patch @@ -0,0 +1,171 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 50FEB275872 + for ; Sat, 9 Aug 2025 05:14:20 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716461; cv=none; b=IojQ3o0319gniGMl43HTVAIglRucNSyn6f7mIZ2sA6nGcNZlUGEZWY6057tDsNZ4vk1O+nB32WSiImkG1cA4P3bSQwXMpAQf17p3nQR/jrpVdHP7V0+mJDJgG2Sf7l8Ti7krqUFqfX2pqdYZqFCdot+k/yWtmOk0EJDIuVmMV9c= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716461; c=relaxed/simple; + bh=vn8SQoDAOd07cpCKj326vCb8D/qCqZCs9BwVgi4KoVg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=PMXTmQv0twXoDtxt8UL0eicbBf9o7Tfv//e1wCHtE99sZhtB/nIU8wVaymmbbfxF6XMzVWor4WdG/qxhSD/wWJ4vwz9cuiTjtjsAycEcLvPhT9aUF3kUVc4kwE6SAX+4OTFtFFHuRXxnq1R/3zLjD9SIRWfAHwReiV3b07cgCmM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=nkaLairD; arc=none smtp.client-ip=192.198.163.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="nkaLairD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716461; x=1786252461; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=vn8SQoDAOd07cpCKj326vCb8D/qCqZCs9BwVgi4KoVg=; + b=nkaLairDDjkysNs4UHTZ5xKHewoxOhkUNj9VyNX4tbdn4A+qnsAVwMle + R5heWKuCY8Flip8hzeFiNi/CFABQw9zu7obpiMTotWoXuYrKrGBh3HoZh + lmLG5GkRobIJvrI3ad/N+LP8GAWOX5LCCD9ciXh9NpYENpuy7gVq79Rno + lOPq4XCXPVEuiMBh+0Se3GxDjUG9K2DZWlyzewIOPzwn2XZvGRXdUZ3Ot + w3MIJHSIsVA80TETVQPqTJE71E/W3dHyU/Fc9CdibOzm0oeRAQl7UFWIV + RWX9ArTsi+Tp+Wc1c8CtMey1/OwsGiNy5hIgFduA1bpZq8LLDwe4VQMBf + g==; +X-CSE-ConnectionGUID: Uof0EABPSOGTbrf//bNl/A== 
+X-CSE-MsgGUID: G6Abi0pKRMKI/T1SAuVRiQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60860044" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60860044" +Received: from orviesa003.jf.intel.com ([10.64.159.143]) + by fmvoesa106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:20 -0700 +X-CSE-ConnectionGUID: Y7bYuOZeSdOfQBAqGM3CtQ== +X-CSE-MsgGUID: edOp5vWRTYOveGm/1PiD4g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169693142" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa003.jf.intel.com with ESMTP; 08 Aug 2025 22:14:14 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 23/28] sched: Scan a task's preferred node for preferred LLC +Date: Sat, 9 Aug 2025 13:08:11 +0800 +Message-Id: <178bf43d7cbc9b2c9aea408dd56b87391067df37.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +When sched_cache is enabled, fully scanning all online +CPUs to find the hottest one is very costly. As a first +step, limit the scan to only the CPUs within the task's +preferred node. If the node containing the task's preferred +LLC is not in the CPU scan mask, add it. Additionally, if +the node where the current task is running is not in the +scan mask, add it too. + +Suggested-by: Jianyong Wu +Suggested-by: Shrikanth Hegde +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 36 +++++++++++++++++++++++++++++++++--- + 1 file changed, 33 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 64f757ad39fc..420d3a080990 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1390,13 +1390,36 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + } + } + ++static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, ++ int pref_nid, int curr_cpu) ++{ ++#ifdef CONFIG_NUMA_BALANCING ++ /* first honor the task's preferred node */ ++ if (pref_nid != NUMA_NO_NODE) ++ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); ++#endif ++ ++ /* secondly honor the task's cache CPU if it is not included */ ++ if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus)) ++ cpumask_or(cpus, cpus, ++ cpumask_of_node(cpu_to_node(cache_cpu))); ++ ++ /* ++ * Thirdly honor the task's current running node ++ * as the last resort. 
++ */ ++ if (!cpumask_test_cpu(curr_cpu, cpus)) ++ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); ++} ++ + static void __no_profile task_cache_work(struct callback_head *work) + { + struct task_struct *p = current; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long last_m_a_occ = 0; +- int cpu, m_a_cpu = -1; ++ int cpu, m_a_cpu = -1, cache_cpu, ++ pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(); + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1406,11 +1429,18 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + +- if (!alloc_cpumask_var(&cpus, GFP_KERNEL)) ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + ++ cache_cpu = mm->mm_sched_cpu; ++#ifdef CONFIG_NUMA_BALANCING ++ if (static_branch_likely(&sched_numa_balancing)) ++ pref_nid = p->numa_preferred_nid; ++#endif ++ + scoped_guard (cpus_read_lock) { +- cpumask_copy(cpus, cpu_online_mask); ++ get_scan_cpumasks(cpus, cache_cpu, ++ pref_nid, curr_cpu); + + for_each_cpu(cpu, cpus) { + /* XXX sched_cluster_active */ +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch new file mode 100644 index 0000000..fd94172 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-24-28-sched-Record-average-number-of-runninhg-tasks-per-process.patch @@ -0,0 +1,169 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 057372472BA + for ; Sat, 9 Aug 2025 05:14:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716474; cv=none; b=VIfN6Nj+OmuTvjcAQC5ue6+EJDcsfkI76dqTW3x38qtgbZ4hqzLAXpZlXfM9DlKB8dKo57i5wMslwoWbzJnvNz1ykOFHxPfoi2S93m+jstGBBseKp1ztbQwQ2K61GTrnAqbJdZm7pnjXyKLjWHy5vyrtWh/xw7PspOvnpd6AKpw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716474; c=relaxed/simple; + bh=FSWaU6wjxtUO8jXpCphgNEzz4lxAHBHSTH4IiVUyBUk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XjsKAevWGzT+p1INgbZoPieajKhnft6CnQg6Bbk4u1Z+7t5XRC0bswqFYNyS3V/ZFpoWqhCw4RYtVBZI+9IiO7g1Q46JsT7+09UPlrjZPLkzbNiFJI/DRRGFHTenkxTcke+8xYuOm3RbKap9vjHJ6muwNQMS05rVf9nebWSot00= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=gmFy4oU1; arc=none smtp.client-ip=192.198.163.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gmFy4oU1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716473; x=1786252473; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=FSWaU6wjxtUO8jXpCphgNEzz4lxAHBHSTH4IiVUyBUk=; + b=gmFy4oU1sZtwQyMYwHB8SIoIYYE5hI8m7TFKqp6zqiVd8rnbxM1bMhLC + 74RYM+I/O6xKXfeu/UVzPgl1+lq9og33Njeix9LSwjF6dc54BAfz6kZpm + XSL2l8zGvHKS024WUDTZLsjKB3ozB4WcNBoQCDO/MuFQPfhhiy+fkzGjd + GRjfmZ2nZEDzv9f+jC+e5CY8l12nBabKfqFG4La0LMDW1GFk7YsYd275+ + ppat6y66psYB1mii4x1wz+0D3WFtxDecRb9O1Al2JUDYi696b9W2OMbl/ + S6cwP0rZ5li5OyXKhxjLqGgxHrmb3hYgWbTMhafNcMH8p7GmxPK8BizEQ + Q==; +X-CSE-ConnectionGUID: hfmQ09kFSka1PTGb3z++tA== +X-CSE-MsgGUID: jqX5qpmwT2awTicWyNB8Gw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="60860082" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="60860082" +Received: from orviesa003.jf.intel.com ([10.64.159.143]) + by fmvoesa106.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:32 -0700 +X-CSE-ConnectionGUID: UKKGkDsGRSSXIWvNvv1UIw== +X-CSE-MsgGUID: AYQBgKAjRwGDzbPfQLudxQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="169693165" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa003.jf.intel.com with ESMTP; 08 Aug 2025 22:14:26 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 24/28] sched: Record average number of runninhg tasks per process +Date: Sat, 9 Aug 2025 13:08:23 +0800 +Message-Id: +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Performance regression was found when running hackbench +with many threads per process(the fd number is high). +To avoid this regression, process having a large number +of active threads should be excluded from cache aware +scheduling. + +With sched_cache enabled, record the number of active threads within +the process. This calculation occurs in the periodic task_cache_work(): +when iterating over the CPUs, check the currently running task on that +CPU; if the running task belongs to the same process as the task that +launches task_cache_work(), increment the active thread count by 1. + +If the number exceeds the number of CPUs in the preferred LLC, +sched_cache is prevented from aggregating too many threads in one +LLC domain. 
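+
+A note on the averaging used here: update_avg() (kernel/sched/sched.h) is, in
+current mainline, a simple moving average that steps 1/8 of the way toward
+each new sample, so nr_running_avg tracks the sampled thread count smoothly
+instead of jumping on every scan. A minimal user-space sketch of that
+behaviour (illustrative only):
+
+	static inline void update_avg(unsigned long long *avg,
+				      unsigned long long sample)
+	{
+		long long diff = sample - *avg;
+
+		*avg += diff / 8;	/* move 1/8 of the way toward the sample */
+	}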
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +--- + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 14 ++++++++++++-- + 2 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 41a598a44361..13b715357ccb 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1033,6 +1033,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 420d3a080990..2577b4225c3f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1414,12 +1414,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, +- pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(); ++ pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(), ++ nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1460,6 +1461,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_cpu = i; + } + nr++; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); ++ + trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", + per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } +@@ -1489,6 +1498,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch new file mode 100644 index 0000000..5d4f16d --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-25-28-sched-Skip-cache-aware-scheduling-if-the-process-has-many-active-threads.patch @@ -0,0 +1,160 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D7B93262FE7 + for ; Sat, 9 Aug 2025 05:14:45 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716487; cv=none; b=SwLkLgdtnhX6+OqLa78yMZ0B4CsWLaf64/W+OI5unQJxwjCB9iC7AOkHjHIaNJ3elBvx03DMvZl1I9GsmU+4HfwjpJKe6RPlgB+vcyUgYbcDZyIBhWoXYSKefXUoVChQUkXZnD99yesLvH9Ng14G5w2CQagDetRVGrHCe4izAMY= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716487; c=relaxed/simple; + bh=NXUgKwlBD+LbS7BQoNhwYmUMMNvH06C+9x5ABL9KQvU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=hLGB1Xa+0uSZNB/dGK1ZYaVry91TulpBPgnDTPCsKPDTIPOxhNaZZ9Cuzpbo38IwKkY5uJgzEZ1uUGrD/s/RAk4WaWJnoklk4db0h0lc6DTb3DdY///Tx0rE+4HGY8/VgFkFtuHBrA6i2cPS9Gpq8iWYdaB6+MhIJDjLW2gw96I= +ARC-Authentication-Results:i=1; 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=JtFt/v4t; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="JtFt/v4t" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716485; x=1786252485; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=NXUgKwlBD+LbS7BQoNhwYmUMMNvH06C+9x5ABL9KQvU=; + b=JtFt/v4t08KznPLSFaAfIMnOlZCZo8kJRJT+HfIrOEKXEd/dqH+lrk/a + pIz5BUtjCYdr/a4c7pZ1o1bWZFwpHGC/M6S2vru19uOo+9h+a6+cDG4JL + yM5/aSH0L5HRBrhBX4JQAp/3MD69CoaWhFe+GdocmtVOMYwP+erObftt/ + 1RQRwVjl1GoMs0U6JSVWCb5Sk5EwLmq/bSRzlsfuwbMavl7fO/aSz0Urq + XtvO5DV6xrhsk1Y77keeRc1mlMUGjry5fYahbKbwyaOxyKFtrNAFVv1mP + qRCo9H1Kh+G72Foi9f3RFJzd/ky65xGF3aC6FH86kLP/zxm2pRC6gsTTi + w==; +X-CSE-ConnectionGUID: IHbXoWNLTlyKIkz7HGkE0w== +X-CSE-MsgGUID: A4HbKv+ySb+99UGpcOUEYA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56259994" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56259994" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:45 -0700 +X-CSE-ConnectionGUID: tRxkh/5RR2W0QaPbtyXBXw== +X-CSE-MsgGUID: kEi0G4T2TKO4l03CeelQGA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165476161" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:14:39 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 25/28] sched: Skip cache aware scheduling if the process has many active threads +Date: Sat, 9 Aug 2025 13:08:36 +0800 +Message-Id: <463bc54a283c1b908ea286ce67f301e2d1d39ea1.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the number of active threads within the process +exceeds the number of Cores(divided by SMTs number) +in the LLC, do not enable cache-aware scheduling. +This is because there is a risk of cache contention +within the preferred LLC when too many threads are +present. 
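+
+Since sd_llc_size counts every CPU in the LLC (SMT siblings included), the
+exceed_llc_nr() check in the hunk below compares nr_running_avg * smt_nr
+against sd_llc_size, which is the same as comparing the average thread count
+against the number of physical cores in the LLC. A standalone sketch of the
+same condition (parameter names are illustrative):
+
+	/* true if the process would oversubscribe the cores of one LLC */
+	static bool exceeds_llc_cores(unsigned long long nr_running_avg,
+				      unsigned int llc_cpus,
+				      unsigned int smt_per_core)
+	{
+		/* llc_cpus / smt_per_core == physical cores in the LLC */
+		return nr_running_avg * smt_per_core > llc_cpus;
+	}
+
+	/*
+	 * e.g. 8 cores x 2 SMT = 16 CPUs: an average of 9 threads (9*2 > 16)
+	 * skips aggregation, an average of 8 (8*2 == 16) does not.
+	 */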
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +--- + kernel/sched/fair.c | 24 +++++++++++++++++++++++- + 1 file changed, 23 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2577b4225c3f..4bf794f170cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,18 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1350,7 +1362,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * it's preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > sysctl_llc_old || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } +@@ -1430,6 +1443,11 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ mm->mm_sched_cpu = -1; ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9095,6 +9113,10 @@ static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cp + if (cpu < 0) + return mig_allow; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_allow; ++ + if (cpus_share_cache(dst_cpu, cpu)) + return _get_migrate_hint(src_cpu, dst_cpu, + task_util(p), true); +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch new file mode 100644 index 0000000..186a2a4 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-26-28-sched-Do-not-enable-cache-aware-scheduling-for-process-with-large-RSS.patch @@ -0,0 +1,196 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C0E67221540 + for ; Sat, 9 Aug 2025 05:14:58 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716500; cv=none; b=MmUb4c5SUS1z0wOntqBT1lBWA98RaJXvOGOJgyKpL3css2V1PB3tSyqGl6uL4LpYMOyo+rKX3+Or66/w3kvy4IrnK/1zKzbYlsK4uG0lHZdaI/ylANl1HNqUIDGNZvHQU/rlxzJ01GSoxw4kvO56Gsq+Q3Dt9ImEFj8vhMltr7o= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716500; c=relaxed/simple; + bh=ihWK71wp/q5X7Akw0a3NfcmXIvV26+nk2ItbYopgMBI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=jLyq+PE48ZqCxRzNfCECXbgKXorBAOraTW1R5D8fjI5Zh6EGFB8ZxO9LxYUE8qGvLCGMlNFmK1+oj1nTLMrg0x4R6BkGXURfRaIdO4gl1uU8D++Mr+VA477bh1glz9u1Ll6+Hks8Jtf0M6xJo+lTeh2jLF0wW7on+hBzGyeTVsU= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass 
smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eixROvUd; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eixROvUd" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716498; x=1786252498; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ihWK71wp/q5X7Akw0a3NfcmXIvV26+nk2ItbYopgMBI=; + b=eixROvUdtZ+jX6c9PUzsFkzZYy58X17+XSMG4TAKB4dMmEuzDFKNJS5/ + y6mSE9FUJZqVr6Z/MJgBg/rnrSxKx4WNtLRYRaKlxeoov9FqlTgFQ0cm3 + xrL/E9j5rOidep+PGoL8jF0Vi5sxq3zlPOp19TXKYmYCBYSnGBQNe2AtY + mgT3vaD7Elxg79E7NgAUMSiS4MZSj26K4v5ujKu8dsw3shTBxA6CvPmPO + rVrBtoWLK3XI3ZTP4tki0uiAJIjOH7fmVd/U18FdmdONY3ZWGGHpTVNYs + SDtMc5zQFr8yS7suux1xYdFcGTMSbKUOlN8esjYEKijdSJZB/ZI2rOcmc + Q==; +X-CSE-ConnectionGUID: 2OIbjhYrRs6bZ6T4QRg5pg== +X-CSE-MsgGUID: 1Tg83zzKRiy+BCTOQqeOJA== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56260022" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56260022" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:14:58 -0700 +X-CSE-ConnectionGUID: iiCU+no/RZqZ47l6ZSVsuQ== +X-CSE-MsgGUID: gBZUGA4RTmG67FOC847xZA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="165476169" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:14:52 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 26/28] sched: Do not enable cache aware scheduling for process with large RSS +Date: Sat, 9 Aug 2025 13:08:49 +0800 +Message-Id: <881a665a94858d4fb6f13491f4dffe58c8fc3870.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +It has been reported that when running memory-intensive workloads +such as stream, sched_cache may saturate the memory bandwidth on +the preferred LLC. + +To prevent this from happening, evaluate the process's memory +footprint by checking the size of RSS (anonymous pages and shared +pages) and comparing it to the size of the LLC. If the former is +larger, skip cache-aware scheduling. This is because if tasks +do not actually share data, aggregating tasks with large RSS will +likely result in cache contention and performance depredation. + +However, in theory, RSS is not the same as memory footprint. +This is just an estimated approach to prevent over-aggregation. +The default behavior is to strictly compare the size of RSS with +the size of the LLC. 
The next patch will introduce a user-provided +hint to customize this comparison. + +Reported-by: K Prateek Nayak +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 47 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 44 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4bf794f170cf..cbda7dad1305 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,34 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cpu_cacheinfo *this_cpu_ci; ++ struct cacheinfo *l3_leaf; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use get_cpu_cacheinfo() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ this_cpu_ci = get_cpu_cacheinfo(cpu); ++ if (!this_cpu_ci->info_list || ++ this_cpu_ci->num_leaves < 3) ++ return true; ++ ++ l3_leaf = this_cpu_ci->info_list + 3; ++ llc = l3_leaf->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1363,7 +1391,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > sysctl_llc_old || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + mm->mm_sched_cpu = -1; + pcpu_sched->occ = 0; + } +@@ -1448,6 +1477,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + return; + } + ++ /* ++ * Do not check exceed_llc_nr() because ++ * the active number of threads needs to ++ * been updated anyway. ++ */ ++ if (exceed_llc_capacity(mm, curr_cpu)) ++ return; ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9113,8 +9150,12 @@ static __maybe_unused enum llc_mig_hint get_migrate_hint(int src_cpu, int dst_cp + if (cpu < 0) + return mig_allow; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * skip cache aware load balance for single/too many threads ++ * and large footprint. 
++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_allow; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch new file mode 100644 index 0000000..e91dd01 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-27-28-sched-Allow-the-user-space-to-tune-the-scale-factor-for-RSS-comparison.patch @@ -0,0 +1,303 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [192.198.163.18]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8BD69278E77 + for ; Sat, 9 Aug 2025 05:15:14 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=192.198.163.18 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716517; cv=none; b=f0wCf0A1e24ot1gDONm8873CUbeJO7p+XOYXyd1L81oXyHjWyDUgzMJcD0hJ3DF8nImLeld/DZRB4Rw1t1WEKTNhLr+PgIxdQt1pezZAV7PflwC9pScJMoIsibbNOHtKzaO++na+m07o/7UQdsk+sPTfO2f6+LpbJhepVHPoteU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716517; c=relaxed/simple; + bh=/Ba0fCfCtChaUyr8nA+Reo2+vDWT1X8nPBpOr+cNYjk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=EDkKLloCKeEh8IprYHQ5cLYC7XD5yrfwdWMtkoq85n7q8KXHTNKJtaTomDYe5XtQCLNBxnEyUumHNx4/C6VKj/drV2J40y00jxpuHKC2otW2Agu5fvbDIBaIndWVIczwxgu3StyAleOP2GSxyPvAaYACU8MlGinZXBKpz5KxMhg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Y90Y97GH; arc=none smtp.client-ip=192.198.163.18 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Y90Y97GH" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716515; x=1786252515; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=/Ba0fCfCtChaUyr8nA+Reo2+vDWT1X8nPBpOr+cNYjk=; + b=Y90Y97GHLKUMgwRE30WxTWa+nQqnGyCMmuH77cLq8i/vdaPuuN2Ktztv + zBg5w7QsE4Tap803U0WzbsFCGqUe1e5QS4yNcC38D0ELqu1BwsHy6z+jR + WhnmKOXM13ylYTNfpsosT8H/fTVU1o4HavW4jD2mb3Xd/w2lpl/NwsKv1 + lUdWhkLxUWSHTQXwZSwGQLULWx/qg/CpYON81o3vjH2gAxshxAoSNBtaB + P0r4Ex0Lc94pbcpxMXlN2Yvf/QybuyXM6p6DUPSf0Ju1um2BF6tKB2GEf + jmV33bJNWsB01mTnWfFZ55QI4h+P2NvGIgzeEkg3JsyzHbN7Qffq9u85f + Q==; +X-CSE-ConnectionGUID: ktJ3gmikQC2MxSJlFd01xQ== +X-CSE-MsgGUID: gbMwARUNRLGJ2ccZVMMRBw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="56260030" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="56260030" +Received: from orviesa007.jf.intel.com ([10.64.159.147]) + by fmvoesa112.fm.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:15:13 -0700 +X-CSE-ConnectionGUID: lbKBkeQYTzKHrxkrdXEGtA== +X-CSE-MsgGUID: NxMJcQ1jQD24A6f12YxIFA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; 
+ d="scan'208";a="165476192" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by orviesa007.jf.intel.com with ESMTP; 08 Aug 2025 22:15:08 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 27/28] sched: Allow the user space to tune the scale factor for RSS comparison +Date: Sat, 9 Aug 2025 13:09:02 +0800 +Message-Id: <81c197882b7c9f4325a5cb32f8a9d1e1fc900297.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +sched_cache compares the process's resident pages with +the size of the LLC to determine whether task aggregation +on the preferred LLC might cause cache contention. If the +former is larger than the latter, skip cache-aware task +aggregation. However, some workloads with large resident +pages have a small memory footprint; such workloads could +benefit from cache-aware scheduling. The kernel lacks a +efficient mechanism to track the task's memory footprint +(yes, we have resctrl, but it is for user-space query, +and not process scope), so it is up to userspace to pass +this hint to the kernel. + +Introduce /sys/kernel/debug/sched/sched_cache_ignore_rss +to control the extent to which users ignore the RSS +restriction. This value ranges from 0 to 100. A value of +0 means that the user disables the cache aware scheduling. +1 means if a process's RSS is larger than the LLC size, +cache-aware scheduling will be skipped. 100 means cache +aware scheduling is alwasy enabled regardless of RSS size. +N (between 1 and 100) means turn off cache aware scheduling +when RSS is greater than (N-1) * 256 * LLC size + +For example, suppose the L3 size is 32MB. If the +sysctl_sched_cache_ignore_rss is 1: When the RSS is larger +than 32MB, the process is regarded as exceeding the LLC capacity. +If the sysctl_sched_cache_ignore_rss is 99: When the RSS is +larger than 784GB, the process is regarded as exceeding the +LLC capacity(please refer to the code): +784GB = (1 + (99 - 1) * 256) * 32MB + +Additionally, the number of SMTs is also considered for +sysctl_sched_cache_aggr_cap; if there are many SMTs in the core, +sysctl_llc_aggr_cap will be reduced. This inhibits task aggregation +from cache-aware scheduling on systems with a high number of SMTs, +like Power 10 and Power 11. 
+ +Reported-by: K Prateek Nayak +Reported-by: Madadi Vineeth Reddy +Reported-by: Shrikanth Hegde +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/debug.c | 82 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/fair.c | 10 ++++-- + kernel/sched/sched.h | 3 +- + 3 files changed, 90 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 7a9ec03704b9..6676fc2a8c08 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -166,6 +166,83 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name, val) \ ++static int sysctl_sched_cache_##name = val; \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int percent; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &percent)) \ ++ return -EINVAL; \ ++ if (percent > 100) \ ++ return -EINVAL; \ ++ sysctl_sched_cache_##name = percent; \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_cache_##name); \ ++ return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(ignore_rss, 1); ++int get_sched_cache_rss_scale(void) ++{ ++ if (!sysctl_sched_cache_ignore_rss) ++ return 0; ++ ++ if (sysctl_sched_cache_ignore_rss >= 100) ++ return INT_MAX; ++ /* ++ * Suppose the L3 size is 32MB. If the ++ * sysctl_sched_cache_ignore_rss is 1: ++ * When the RSS is larger than 32MB, ++ * the process is regarded as exceeding ++ * the LLC capacity. 
If the ++ * sysctl_sched_cache_ignore_rss is 99: ++ * When the RSS is larger than 784GB, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ return (1 + (sysctl_sched_cache_ignore_rss - 1) * 256); ++} ++ ++SCHED_CACHE_CREATE_CONTROL(aggr_cap, 50); ++int get_sched_cache_cap_scale(void) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = ++ cpumask_weight(cpu_smt_mask(raw_smp_processor_id())); ++#endif ++ return (sysctl_sched_cache_aggr_cap / smt_nr); ++} ++#endif /* SCHED_CACHE */ ++ + #ifdef CONFIG_SMP + + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, +@@ -533,10 +610,13 @@ static __init int sched_init_debug(void) + #endif + + #ifdef CONFIG_SCHED_CACHE +- debugfs_create_u32("llc_aggr_cap", 0644, debugfs_sched, &sysctl_llc_aggr_cap); + debugfs_create_u32("llc_aggr_imb", 0644, debugfs_sched, &sysctl_llc_aggr_imb); + debugfs_create_u32("llc_period", 0644, debugfs_sched, &sysctl_llc_period); + debugfs_create_u32("llc_old", 0644, debugfs_sched, &sysctl_llc_old); ++ debugfs_create_file("llc_aggr_cap", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_cap); ++ debugfs_create_file("llc_ignore_rss", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_ignore_rss); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cbda7dad1305..018825f04063 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1211,6 +1211,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + struct cacheinfo *l3_leaf; + unsigned long rss; + unsigned int llc; ++ int scale; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1230,7 +1231,11 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ scale = get_sched_cache_rss_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) +@@ -9037,7 +9042,6 @@ static int find_energy_efficient_cpu(struct task_struct *p, int prev_cpu) + static long __migrate_degrades_locality(struct task_struct *p, + int src_cpu, int dst_cpu, + bool idle); +-__read_mostly unsigned int sysctl_llc_aggr_cap = 50; + __read_mostly unsigned int sysctl_llc_aggr_imb = 20; + + /* +@@ -9049,7 +9053,7 @@ __read_mostly unsigned int sysctl_llc_aggr_imb = 20; + * (default: ~50%) + */ + #define fits_llc_capacity(util, max) \ +- ((util) * 100 < (max) * sysctl_llc_aggr_cap) ++ ((util) * 100 < (max) * get_sched_cache_cap_scale()) + + /* + * The margin used when comparing utilization. 
+diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index d752d64d4acd..eaeca4e77ead 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2855,11 +2855,12 @@ extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + + #ifdef CONFIG_SCHED_CACHE +-extern unsigned int sysctl_llc_aggr_cap; + extern unsigned int sysctl_llc_aggr_imb; + extern struct static_key_false sched_cache_present; + extern unsigned int sysctl_llc_period; + extern unsigned int sysctl_llc_old; ++int get_sched_cache_rss_scale(void); ++int get_sched_cache_cap_scale(void); + #endif + + #ifdef CONFIG_SCHED_HRTICK +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch new file mode 100644 index 0000000..051e055 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.16/RFC-PATCH-v4-28-28-sched-Add-ftrace-to-track-cache-aware-load-balance-and-hottest-CPU-changes.patch @@ -0,0 +1,307 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.12]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 47CE12797BD + for ; Sat, 9 Aug 2025 05:15:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.12 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1754716532; cv=none; b=Q3Lu9iJcgpkY3JofeQsI3NuQ1TQam6CIlO+tdvTSCRjGAjVblky3W53EIomiHy80dmktuPQdtHxgcRNWPE+j/bg5BQe6GDtHnoJUJTNFKCR/9DYjJgajvDVOAMxm+f5X8nLVN12/qTm5fIAB7ohPRpMT5XEfcEaB2rgn4WdJjlA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1754716532; c=relaxed/simple; + bh=Qnj+s91JA/iUJUPb0vlHFoRsDjRZdaL0BuPbFnDHlf4=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Id7UAlt3jf4R+5/V26lYDKIMUvQ2sWz1U1L6PxHG7qpC1Y2DBCKgalLglN7phNtyW+llMFcUZ8TM/8hPX4zlNTja13GIfazsivILfGOPcNG17Rvk+pJ6zxYEFCqME/cFa0umvr+QT0QzQ/sIQRDaZpzSMHofa4VucqncRSPHNUo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com; spf=pass smtp.mailfrom=intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=KZT0jTXM; arc=none smtp.client-ip=198.175.65.12 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="KZT0jTXM" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1754716528; x=1786252528; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Qnj+s91JA/iUJUPb0vlHFoRsDjRZdaL0BuPbFnDHlf4=; + b=KZT0jTXMrwi6hPfQPUBt2UhPb6fUrWeb/O3CDNims/uo7a2fRVzB96ni + iAcBIq7cRPl82Mb6WAlg3t8qyCFKa4+mARu2XOEG/1TlTOTPJpPcEFd23 + uyE/VkcDEWP2Pk6IHBPTbmMmpwS9xu2Sc2pif7fu4IJ95Ou4he7GApWaO + MXcgjRuqD2lXHrJW2ZCh04Xx6L3C8w5eBUkk3oAlp2wVkN4HgtgEp9ORv + Z96q81Q0Wd5WaHbBqUBeZQ2vuQf9nOsBlZZ3rd4ahEG7C3LjJGEU6HHLi + KLNOB6OypoKBQux1+HQWGMGe9cMObuKZXq8mFwxxNatSDn+GrcGALRB6K + A==; +X-CSE-ConnectionGUID: 
jbgqMP0qR/OvQDc48pzOPw== +X-CSE-MsgGUID: hEeOV4ycSKW7r3RHicCLKw== +X-IronPort-AV: E=McAfee;i="6800,10657,11515"; a="68514913" +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="68514913" +Received: from fmviesa010.fm.intel.com ([10.60.135.150]) + by orvoesa104.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 08 Aug 2025 22:15:27 -0700 +X-CSE-ConnectionGUID: RhK2nh/kRuuYh7KksFDbdw== +X-CSE-MsgGUID: nLJQ9FRIRdmfFlE8PcQEEw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.17,278,1747724400"; + d="scan'208";a="166275369" +Received: from chenyu-dev.sh.intel.com ([10.239.62.107]) + by fmviesa010.fm.intel.com with ESMTP; 08 Aug 2025 22:15:20 -0700 +From: Chen Yu +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Libo Chen , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + linux-kernel@vger.kernel.org +Subject: [RFC PATCH v4 28/28] sched: Add ftrace to track cache aware load balance and hottest CPU changes +Date: Sat, 9 Aug 2025 13:09:17 +0800 +Message-Id: <3e3622a5b2129b56741989f15a8debabec064de9.1754712565.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.25.1 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce 3 trace events: + +1. +The average time spent scanning CPUs and calculating occupancy +in each sample period. This event can be used to track the +overhead of cache-aware scheduling. + +2. +The footprint when switching to a new mm_sched_cpu (a cache-hot CPU). +This event can be used to track whether there is any abnormal +bouncing of mm_sched_cpu. + +3. +The footprint of load balancing when migrating a task between CPUs. +This event can be used to track whether cache-aware load balancing +behaves as expected. + +All these events can be used with bpftrace to gain a basic +understanding of whether cache-aware scheduling is effective. 
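+
+A rough usage sketch, assuming the three events register under the
+"sched" group in tracefs (usually mounted at /sys/kernel/tracing):
+
+    cd /sys/kernel/tracing
+    echo 1 > events/sched/sched_scan_cost/enable
+    echo 1 > events/sched/sched_cache_work/enable
+    echo 1 > events/sched/sched_attach_task/enable
+    cat trace_pipe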
+ +Suggested-by: Shrikanth Hegde +Signed-off-by: Chen Yu +--- + include/trace/events/sched.h | 93 ++++++++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 25 ++++++++-- + 2 files changed, 113 insertions(+), 5 deletions(-) + +diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h +index 4e6b2910cec3..398180c18946 100644 +--- a/include/trace/events/sched.h ++++ b/include/trace/events/sched.h +@@ -10,6 +10,99 @@ + #include + #include + ++TRACE_EVENT(sched_scan_cost, ++ ++ TP_PROTO(struct task_struct *t, u64 cost, int nr, ++ u64 old_running, u64 new_running), ++ ++ TP_ARGS(t, cost, nr, old_running, new_running), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( u64, cost ) ++ __field( int, nr ) ++ __field( u64, old_running ) ++ __field( u64, new_running ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->cost = cost; ++ __entry->nr = nr; ++ __entry->old_running = old_running; ++ __entry->new_running = new_running; ++ ), ++ ++ TP_printk("comm=%s pid=%d cost=%llu nr=%d old_r=%lld new_r=%lld", ++ __entry->comm, __entry->pid, ++ __entry->cost, __entry->nr, ++ __entry->old_running, __entry->new_running) ++); ++ ++TRACE_EVENT(sched_cache_work, ++ ++ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc, ++ int new_cpu, int new_llc), ++ ++ TP_ARGS(t, pref_cpu, pref_llc, new_cpu, new_llc), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, pref_cpu ) ++ __field( int, pref_llc ) ++ __field( int, new_cpu ) ++ __field( int, new_llc ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->pref_cpu = pref_cpu; ++ __entry->pref_llc = pref_llc; ++ __entry->new_cpu = new_cpu; ++ __entry->new_llc = new_llc; ++ ), ++ ++ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d", ++ __entry->comm, __entry->pid, ++ __entry->pref_cpu, __entry->pref_llc, ++ __entry->new_cpu, __entry->new_llc) ++); ++ ++TRACE_EVENT(sched_attach_task, ++ ++ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc, ++ int attach_cpu, int attach_llc), ++ ++ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, pref_cpu ) ++ __field( int, pref_llc ) ++ __field( int, attach_cpu ) ++ __field( int, attach_llc ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->pref_cpu = pref_cpu; ++ __entry->pref_llc = pref_llc; ++ __entry->attach_cpu = attach_cpu; ++ __entry->attach_llc = attach_llc; ++ ), ++ ++ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d", ++ __entry->comm, __entry->pid, ++ __entry->pref_cpu, __entry->pref_llc, ++ __entry->attach_cpu, __entry->attach_llc) ++); ++ + /* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 018825f04063..cb2c33ee0d92 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1467,8 +1467,9 @@ static void __no_profile task_cache_work(struct callback_head *work) + unsigned long last_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, + pref_nid = NUMA_NO_NODE, curr_cpu = smp_processor_id(), +- nr_running = 0; ++ nr_running = 0, nr_scan = 0; + cpumask_var_t cpus; ++ u64 t0, scan_cost = 0; + + WARN_ON_ONCE(work != &p->cache_work); + +@@ -1499,6 
+1500,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + pref_nid = p->numa_preferred_nid; + #endif + ++ t0 = sched_clock_cpu(curr_cpu); + scoped_guard (cpus_read_lock) { + get_scan_cpumasks(cpus, cache_cpu, + pref_nid, curr_cpu); +@@ -1521,6 +1523,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_cpu = i; + } + nr++; ++ nr_scan++; + + rcu_read_lock(); + cur = rcu_dereference(cpu_rq(i)->curr); +@@ -1529,8 +1532,8 @@ static void __no_profile task_cache_work(struct callback_head *work) + nr_running++; + rcu_read_unlock(); + +- trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", +- per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); ++ //trace_printk("(%d) occ: %ld m_occ: %ld m_cpu: %d nr: %d\n", ++ // per_cpu(sd_llc_id, i), occ, m_occ, m_cpu, nr); + } + + // a_occ /= nr; +@@ -1541,8 +1544,8 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) + last_m_a_occ = a_occ; + +- trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", +- per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); ++ //trace_printk("(%d) a_occ: %ld m_a_occ: %ld\n", ++ // per_cpu(sd_llc_id, cpu), a_occ, m_a_occ); + + for_each_cpu(i, sched_domain_span(sd)) { + /* XXX threshold ? */ +@@ -1553,12 +1556,17 @@ static void __no_profile task_cache_work(struct callback_head *work) + } + } + ++ scan_cost = sched_clock_cpu(curr_cpu) - t0; ++ + if (m_a_occ > (2 * last_m_a_occ)) { + /* avoid the bouncing of mm_sched_cpu */ ++ trace_sched_cache_work(p, mm->mm_sched_cpu, llc_id(mm->mm_sched_cpu), ++ m_a_cpu, llc_id(m_a_cpu)); + mm->mm_sched_cpu = m_a_cpu; + } + + update_avg(&mm->nr_running_avg, nr_running); ++ trace_sched_scan_cost(p, scan_cost, nr_scan, mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +@@ -10443,6 +10451,13 @@ static void attach_task(struct rq *rq, struct task_struct *p) + { + lockdep_assert_rq_held(rq); + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm) ++ trace_sched_attach_task(p, ++ p->mm->mm_sched_cpu, ++ p->mm->mm_sched_cpu != -1 ? 
llc_id(p->mm->mm_sched_cpu) : -1, ++ cpu_of(rq), llc_id(cpu_of(rq))); ++#endif + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +-- +2.25.1 + + diff --git a/sys-kernel/gentoo-sources-6.17/0002-bbr3.patch b/sys-kernel/gentoo-sources-6.17/0002-bbr3.patch new file mode 100644 index 0000000..e69de29 diff --git a/sys-kernel/gentoo-sources-6.17/0003-block.patch b/sys-kernel/gentoo-sources-6.17/0003-block.patch new file mode 100644 index 0000000..e69de29 diff --git a/sys-kernel/gentoo-sources-6.17/0005-fixes.patch.skip b/sys-kernel/gentoo-sources-6.17/0005-fixes.patch.skip new file mode 100644 index 0000000..e69de29 diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch new file mode 100644 index 0000000..2ac2c2f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch @@ -0,0 +1,654 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 46062169AD2 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206703; cv=none; b=RpWLRsxlJTzhlJSNJ6YDnnOidsJ7oCIJ0QG0EXS7VFoOFFRWiuWYlsET6M5MjOkyE+dnQih3vxbVtcm+li+EdUZBeyP5FVticeDHkmuoWPHZblewToySaE5iRFgZqZZMrF2/g7ww+IHVQ3wb1PmaWoyqrDBaIo5To0g72h92TRE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206703; c=relaxed/simple; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=QJa7XLmNRAgs2IV6jX9+J3RTiz2TA7hXn5NgC4yjWKV75coBs2eumwHZZgG2HlZqrxNZy2yyHAMM73rFnrDZIvG+RpHWxcfbJopVHrre/vMQ3HJJFjQUmhaAwWCfX+5CuF2S3mkLLbQPk1FwQMpFRQzmQi7ZRNOguwaR+/BIBvQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=fA7dEfIE; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="fA7dEfIE" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206701; x=1791742701; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + b=fA7dEfIE91ULN1jqc64owLAysrWyqWsDA5nuO1+sgcIA15Yn8yYj6iw4 + 55VPKl3g+xYXhPmGyE7a0LZvFUc9YG3ckmUpqO0pvf6oo1RJcM13mS3yi + KNsM4bbd9aFpNPTftzZGqryw94QrGirzar7JNUNOk0MJqRkziOVPLHnOi + iVfGn7SOaI4LzDDzlorOXwaeFstT3f2UVe0Cr2vAWBdxYyDop0Z+G9hqb + BhSDn+aeXU8OqAYP/xGpt3Ce8cbnDhTJhA+r5jzej1xMspSEeS1p/SQOm + slC+k3w/mm9HPugo6aL39ZyshlQHrAN4qvnJBJT/5GnR6bFHs9O0IKtHz + w==; +X-CSE-ConnectionGUID: AwkM8kCOR6yXxOyCyDBj4Q== +X-CSE-MsgGUID: 
FBEmDsF5QKC61vf0MqpBmQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339614" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339614" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:19 -0700 +X-CSE-ConnectionGUID: HGgPT3dBQFm59TiA7l3rfA== +X-CSE-MsgGUID: SlOHviQzSgGRjsbScX9f4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487181" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:19 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 01/19] sched/fair: Add infrastructure for cache-aware load balancing +Date: Sat, 11 Oct 2025 11:24:38 -0700 +Message-Id: <865b852e3fdef6561c9e0a5be9a94aec8a68cdea.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: "Peter Zijlstra (Intel)" + +Cache-aware load balancing aims to aggregate tasks with potential +shared resources into the same cache domain. This approach enhances +cache locality, thereby optimizing system performance by reducing +cache misses and improving data access efficiency. + +In the current implementation, threads within the same process are +considered as entities that potentially share resources. +Cache-aware load balancing monitors the CPU occupancy of each cache +domain for every process. Based on this monitoring, it endeavors to +migrate threads within a given process to its cache-hot domains, +with the goal of maximizing cache locality. + +It is an attempt at modelling cache affinity. While the patch series +only targets LLC, it could very well be extended to clusters (L2), +or other kind of domains grouping inside a node. + +As it stands, the mechanism only computes a CPU within the LLC that +has the highest recent runtime; this CPU is then used in the load +balance path in subsequent patches to steer toward this LLC. + +More elaborate measures could be added later in NUMA_BALANCING: for +example, migrating task A to its preferred LLC when it has spare CPU +capacity, or swapping task A with another running task B in task A’s +preferred LLC. 
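+
+As a concrete illustration of the accounting introduced below: the
+per-CPU runtime a process accumulates is decayed geometrically,
+halving every epoch (10 ms with the defaults), so runtime recorded
+three epochs ago carries only 1/8 of its original weight and recent
+execution dominates the occupancy estimate. Each LLC is scored by the
+sum of these per-CPU occupancies, and the CPU with the highest recent
+occupancy inside the winning LLC is recorded as mm_sched_cpu.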
+ +Originally-by: Peter Zijlstra (Intel) +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 11 ++ + kernel/fork.c | 6 + + kernel/sched/core.c | 6 + + kernel/sched/fair.c | 288 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 1 + + kernel/sched/sched.h | 8 ++ + 8 files changed, 368 insertions(+) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 08bc2442db93..3ca557c2f36d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -927,6 +927,11 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1017,6 +1022,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1436,6 +1452,34 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f8188b833350..d7ddb7ce6c4b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1400,6 +1400,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index e3eb63eadc87..4e625db7920a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -970,6 +970,17 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware load balance" ++ default y ++ depends on SMP ++ help ++ When enabled, the scheduler will attempt to aggregate tasks from ++ the same process onto a single Last Level Cache (LLC) domain when ++ possible. This improves cache locality by keeping tasks that share ++ resources within the same cache domain, reducing cache misses and ++ lowering data access latency. 
++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index c4ada32598bd..9cd6efe2926d 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm) + cleanup_lazy_tlbs(mm); + + WARN_ON_ONCE(mm == current->active_mm); ++ mm_destroy_sched(mm); + mm_free_pgd(mm); + mm_free_id(mm); + destroy_context(mm); +@@ -1079,6 +1080,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1088,6 +1092,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index be00629f0ba4..79d15e904d12 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4520,6 +4520,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8821,6 +8822,11 @@ void __init sched_init(void) + + rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = jiffies; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b173a059315c..a2ea002f4fd6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p) + sa->runnable_avg = sa->util_avg; + } + ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); ++ + static s64 update_se(struct rq *rq, struct sched_entity *se) + { + u64 now = rq_clock_task(rq); +@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); ++ account_mm_sched(rq, donor, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); +@@ -1193,6 +1196,289 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + return delta_exec; + } + ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. 
++ */ ++#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ ++#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ ++ ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = rq->cpu_epoch; ++ epoch = rq->cpu_epoch; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ /* ++ * The update to mm->pcpu_sched should not be reordered ++ * before initialization to mm's other fields, in case ++ * the readers may get invalid mm_sched_epoch, etc. ++ */ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; ++ /* ++ * init_task and kthreads don't having mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate its preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, ++ int pref_nid, int curr_cpu) ++{ ++#ifdef CONFIG_NUMA_BALANCING ++ /* First honor the task's preferred node. */ ++ if (pref_nid != NUMA_NO_NODE) ++ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); ++#endif ++ ++ /* Next honor the task's cache CPU if it is not included. */ ++ if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus)) ++ cpumask_or(cpus, cpus, ++ cpumask_of_node(cpu_to_node(cache_cpu))); ++ ++ /* ++ * Lastly make sure that the task's current running node is ++ * considered. ++ */ ++ if (!cpumask_test_cpu(curr_cpu, cpus)) ++ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); ++} ++ ++static void __no_profile task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ unsigned long curr_m_a_occ = 0; ++ int cpu, m_a_cpu = -1, cache_cpu, ++ pref_nid = NUMA_NO_NODE, curr_cpu; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ curr_cpu = task_cpu(p); ++ cache_cpu = mm->mm_sched_cpu; ++#ifdef CONFIG_NUMA_BALANCING ++ if (static_branch_likely(&sched_numa_balancing)) ++ pref_nid = p->numa_preferred_nid; ++#endif ++ ++ scoped_guard (cpus_read_lock) { ++ get_scan_cpumasks(cpus, cache_cpu, ++ pref_nid, curr_cpu); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, i; ++ ++ if (!sd) ++ continue; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ } ++ ++ /* ++ * Compare the accumulated occupancy of each LLC. The ++ * reason for using accumulated occupancy rather than average ++ * per CPU occupancy is that it works better in asymmetric LLC ++ * scenarios. ++ * For example, if there are 2 threads in a 4CPU LLC and 3 ++ * threads in an 8CPU LLC, it might be better to choose the one ++ * with 3 threads. However, this would not be the case if the ++ * occupancy is divided by the number of CPUs in an LLC (i.e., ++ * if average per CPU occupancy is used). ++ * Besides, NUMA balancing fault statistics behave similarly: ++ * the total number of faults per node is compared rather than ++ * the average number of faults per CPU. This strategy is also ++ * followed here. 
++ */ ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ curr_m_a_occ = a_occ; ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ if (m_a_occ > (2 * curr_m_a_occ)) { ++ /* ++ * Avoid switching mm_sched_cpu too fast. ++ * The reason to choose 2X is because: ++ * 1. It is better to keep the preferred LLC stable, ++ * rather than changing it frequently and cause migrations ++ * 2. 2X means the new preferred LLC has at least 1 more ++ * busy CPU than the old one(200% vs 100%, eg) ++ * 3. 2X is chosen based on test results, as it delivers ++ * the optimal performance gain so far. ++ */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ + /* + * Used by other classes to account runtime. + */ +@@ -13031,6 +13317,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index be9745d104f7..2ded8d3d0ecc 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1166,6 +1166,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3790,6 +3796,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + static inline +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch b/sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch new file mode 100644 index 0000000..cbf16ce --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch @@ -0,0 +1,227 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 19068204096 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=EzlLh3pSj7Y4f8RITAS280jAzGdfSil0Uvmf2s0iDBWXhjbTN9kKcwe8yCBI8vI/kpxwAU/q6SDZiBXRODyVXxt+x1ZEHGNytyNVJ+14VdLcKLUF/bWqEXXojGdMU1nZFeYor5k/Gwn2eBMXY7mjVq+req3REwzEV/z7PNxWJYU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=naTQ9gtxsiPYap1e7sRA67shhCjtvQU5+UWYPmFmFnsa1NV0CLod+8tcKlUn52BHYuXFMHk+KQi3AhpPSOC+Tysfot4R/EhnOjDucwfpslAmfKl+rwCfOrGMnq3fjOG/h3r7EnuLxz8dxpUfqriJzedrFrStvfO37iAPvvF5HVg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=LSwa/WAK; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="LSwa/WAK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + b=LSwa/WAKvGAX6RIYpQ7iNqrlvhm/Szlkb5ZlWCgbajQDsBhTiTWg/PPi + Nxj6VEs7MSoZptgkIvxX8jl3FQca3deDnRuhlinmaGbJYu3LY3ZP4p3jp + 
4+hBugKd3GkfwcLlWr+3IrP84r9gwdtMmKlDccI1G07f4s4tirTBoEDsm + gJ8uA3qrKlx1xYMf/sgz5udiByo4NeRPGdBdJ+bYBTDvNTGeTE9k4bBmi + 0OuSxEI9YhInAS8s2mr8VnpZwUVjixmAO4g6ZwRHW42PucNrjAj/v7YoU + sfJ1aDaIb4/pD7oTExOcJxChABHQZAXGQ1b9F1jBoWdX4w8mb0HwbQJ+I + A==; +X-CSE-ConnectionGUID: V6kqtIYCR06jkGZvnWCLsQ== +X-CSE-MsgGUID: XSPXCIWWQjiVjOSNzEq1Ow== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339631" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339631" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:20 -0700 +X-CSE-ConnectionGUID: wcTW2V7hQHun3H1J8na2Fw== +X-CSE-MsgGUID: zfpr8MStR5yuJxzDpmsnpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487184" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:20 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 02/19] sched/fair: Record per-LLC utilization to guide cache-aware scheduling decisions +Date: Sat, 11 Oct 2025 11:24:39 -0700 +Message-Id: <7684e7381c61a2a0d0580790340d4daa5349e48c.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system becomes busy and a process’s preferred LLC is +saturated with too many threads, tasks within that LLC migrate +frequently. These in LLC migrations introduce latency and degrade +performance. To avoid this, task aggregation should be suppressed when +the preferred LLC is overloaded, which requires a metric to indicate +LLC utilization. + +Record per LLC utilization/cpu capacity during periodic load +balancing. These statistics will be used in later patches to decide +whether tasks should be aggregated into their preferred LLC. 
+ +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/sched/topology.h | 4 ++ + kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++++++ + 2 files changed, 77 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 5263746b63e8..fa25db00fdb6 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -77,6 +77,10 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++ unsigned long capacity ____cacheline_aligned_in_smp; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2ea002f4fd6..1ebb0d99a906 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9559,6 +9559,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* Called from load balancing paths with rcu_read_lock held */ ++static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = READ_ONCE(sd_share->capacity); ++ ++ return true; ++} ++#else ++static inline bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ return false; ++} ++#endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +@@ -10529,6 +10552,55 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Record the statistics for this scheduler group for later ++ * use. These values guide load balancing on aggregating tasks ++ * to a LLC. ++ */ ++static void record_sg_llc_stats(struct lb_env *env, ++ struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* ++ * Find the child domain on env->dst_cpu. This domain ++ * is either the domain that spans this group(if the ++ * group is a local group), or the sibling domain of ++ * this group. ++ */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care about sched domains spanning a LLC */ ++ if (sd != rcu_dereference(per_cpu(sd_llc, env->dst_cpu))) ++ return; ++ ++ /* ++ * At this point we know this group spans a LLC domain. ++ * Record the statistic of this group in its corresponding ++ * shared LLC domain. ++ */ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (READ_ONCE(sd_share->util_avg) != sgs->group_util) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++ ++ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) ++ WRITE_ONCE(sd_share->capacity, sgs->group_capacity); ++} ++#else ++static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10618,6 +10690,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ record_sg_llc_stats(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch b/sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch new file mode 100644 index 0000000..eb1895b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch @@ -0,0 +1,335 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F9012652B7 + for ; Sat, 11 Oct 2025 18:18:22 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=oUXwn7ZLltUxrcsLLRQdMkG+rOj3I6N99RIlDJViVMyN84ZxeHx7+Ziq9zOEmnN6HNfk258hdIef+3nAkETeBkCnWEbZ8Lcj64n3OoXf0SrXkICA1KPwc1TZ230lpQNfogVeErSJlu4VOhrgueBPexZRP8Ng8MlzAqpdxuV0fQw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=h155j6xc5cDWdV6bfIecXus0Znq8M6zidqbVhtVjeQT/UoiHcyIrY8v1abXoVw27R0/39P2bQUH4GyYEjMOV8PSTvlLp8J+kYh4mcI1SSe5ftkudSs2ubZG59uaM4B6xXwz85tEAhPwwNkRLqFlmW7J/wyi3Ynw+ec/ie7a3Ft4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n7smfE6o; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n7smfE6o" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + b=n7smfE6oCjv1Z9pv/7dg2JDtqoMwaTw0XnoJhqh6krIk55XD846r100l + CQyKNCviKGlIlQvhs/a27sgH4IgQduwhbRn6XT0KlUibkjI+C8DxLau1W + bQGlFOBkWVF6N/GWfn6y0ss98uylK337lt84xU7aPoM+QWTzjR+VkOrKT + 0bIzxevMwLmEG4vuOleJ69vSQP6G0PZSGpGrTBTnbFEemOJQO4Ufh8Z3S + CBvnKym+IUG+WQx9TQa+cFfFXkPxhSkobYj2dyGq+CWyc4oBsOiaaIfuN + mb6/NAGjVnTGTjlIsC3a7QsDovld1JkhMvVnrniOZGCbMVHv6vrIMp6no + g==; +X-CSE-ConnectionGUID: y8Q0FIVVTeyqh+iA7G7QGw== +X-CSE-MsgGUID: NHnFhDxxRvChXLKbODkIZw== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339652" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339652" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:21 -0700 +X-CSE-ConnectionGUID: r3BrcjKDSJONY4pZr3YdUQ== +X-CSE-MsgGUID: 9FSjHRHPTQWyN3aom4KQIA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487189" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:21 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 03/19] sched/fair: Introduce helper functions to enforce LLC migration policy +Date: Sat, 11 Oct 2025 11:24:40 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware scheduling aggregates threads onto their preferred LLC, +mainly through load balancing. When the preferred LLC becomes +saturated, more threads are still placed there, increasing latency. +A mechanism is needed to limit aggregation so that the preferred LLC +does not become overloaded. + +Introduce helper functions can_migrate_llc() and +can_migrate_llc_task() to enforce the LLC migration policy: + + 1. Aggregate a task to its preferred LLC if both source and + destination LLCs are not too busy (<50% utilization, tunable), + or if doing so will not leave the preferred LLC much more + imbalanced than the non-preferred one (>20% utilization + difference, tunable, similar to imbalance_pct of the LLC domain). + 2. Allow moving a task from overloaded preferred LLC to a non preferred + LLC if this will not cause the non preferred LLC to become + too imbalanced to cause a later migration back. + 3. If both LLCs are too busy, let the generic load balance to spread + the tasks. + +This hysteresis prevents tasks from being migrated into and out of the +preferred LLC frequently (back and forth): the threshold for migrating +a task out of its preferred LLC is higher than that for migrating it +into the LLC. + +Since aggregation tends to make the preferred LLC busier than others, +the imbalance tolerance is controlled by llc_imb_pct. If set to 0, +tasks may still aggregate to the preferred LLC as long as it is +not more utilized than the source LLC, preserving the preference. 
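+
+A rough worked example with the defaults (llc_overload_pct=50,
+llc_imb_pct=20): a task may be pulled toward its preferred LLC as
+long as that LLC would stay below 50% of its capacity after the move,
+or would not end up more than 20% busier than the source LLC. In the
+other direction, a task is only pushed out of its preferred LLC if
+that LLC stays above 50% utilization and remains more than 20% busier
+than the destination after the move, so it will not simply be pulled
+straight back.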
+ +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 5 ++ + 3 files changed, 154 insertions(+) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 02e16b70a790..57bb04ebbf96 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -523,6 +523,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); ++ debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1ebb0d99a906..cd080468ddc9 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ + #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + ++__read_mostly unsigned int llc_overload_pct = 50; ++__read_mostly unsigned int llc_imb_pct = 20; ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -9560,6 +9563,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + } + + #ifdef CONFIG_SCHED_CACHE ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter llc_overload_pct determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * llc_overload_pct) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + llc_imb_pct)) ++ + /* Called from load balancing paths with rcu_read_lock held */ + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +@@ -9575,6 +9599,127 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + + return true; + } ++ ++/* ++ * Decision matrix according to the LLC utilization. To ++ * decide whether we can do task aggregation across LLC. ++ * ++ * By default, 50% is the threshold to treat the LLC as busy, ++ * and 20% is the utilization imbalance percentage to decide ++ * if the preferred LLC is busier than the non-preferred LLC. ++ * ++ * 1. moving towards the preferred LLC, dst is the preferred ++ * LLC, src is not. ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% Y Y Y N ++ * 40% Y Y Y Y ++ * 50% Y Y G G ++ * 60% Y Y G G ++ * ++ * 2. moving out of the preferred LLC, src is the preferred ++ * LLC, dst is not: ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% N N N N ++ * 40% N N N N ++ * 50% N N G G ++ * 60% Y N G G ++ * ++ * src : src_util ++ * dst : dst_util ++ * Y : Yes, migrate ++ * N : No, do not migrate ++ * G : let the Generic load balance to even the load. ++ * ++ * The intention is that if both LLCs are quite busy, cache aware ++ * load balance should not be performed, and generic load balance ++ * should take effect. 
However, if one is busy and the other is not, ++ * the preferred LLC capacity(50%) and imbalance criteria(20%) should ++ * be considered to determine whether LLC aggregation should be ++ * performed to bias the load towards the preferred LLC. ++ */ ++ ++/* migration decision, 3 states are orthogonal. */ ++enum llc_mig { ++ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ ++ mig_llc, /* Y: Do LLC preference based migration */ ++ mig_unrestricted /* G: Don't restrict generic load balance migration */ ++}; ++ ++static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_unrestricted; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_unrestricted; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * llc_imb_pct is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. ++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_llc; ++} ++ ++/* ++ * Check if task p can migrate from src_cpu to dst_cpu ++ * in terms of cache aware load balance. 
++ */ ++static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ bool to_pref; ++ int cpu; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_unrestricted; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_unrestricted; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ to_pref = true; ++ else if (cpus_share_cache(src_cpu, cpu)) ++ to_pref = false; ++ else ++ return mig_unrestricted; ++ ++ return can_migrate_llc(src_cpu, dst_cpu, ++ task_util(p), to_pref); ++} ++ + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 2ded8d3d0ecc..a52c96064b36 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2797,6 +2797,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int llc_overload_pct; ++extern unsigned int llc_imb_pct; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch b/sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch new file mode 100644 index 0000000..233f3fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD9FE27F75F + for ; Sat, 11 Oct 2025 18:18:23 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206705; cv=none; b=toy7mYgrkMShyfYM+pYJVnlk2kT96KNiv5DNY2SPeZNG+C4hUMbzxW+QMLoY5P4G0gxMEqPJZD1oRcx17kku+G6SaznXM9qHf6TbjE3y6E+5eW6mFGs9F7x17MH+po42oQIBeMuQONsrqKSl7XLcK2ag8qWKJC1Xr5w/c8efzqg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206705; c=relaxed/simple; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=BrQQpH91F+AYLu9pNsP5vrblllGBIiYSrf9Tqy9EYC4wS0n0udak+gKeFf8J19+3f0P2Q81tPIF74K0DC5ETs6YeanXYBydnXlUojA//lO1O300HBm7E4ONxjKjmsrUvcSI3JT5Le3EHo8kdx7whhv843/P3GIna7MP3njXDV14= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BHqKXCIn; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BHqKXCIn" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206704; x=1791742704; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + b=BHqKXCInpJ9FMs87LCbtbTr8sCx+I94vOdw+YhnA01VGi2y2vrviHuha + 44dYUBEYMQCSqJ0LZTT2V+2kshxkaTOgIYxGLcnue8xZcdvJE+tFA1vNK + e3l/bHsCjqNkzuXBC7xQTcdlcOk0RWIbIkbhlcUaSh6K3yuxlVHUHJcmE + r0xmWO+olPuADPa5P30u0Ohf3HcjIqBXZsxBvV5VI21iprKzNU2fqZx7i + dnB6Mbk+VkrpWYKhn8UVMBHAO40Hwj1qg7dTaTpQfAWXx8+nbbBZeHxKl + 1QcSW4+uLMzTxhbUTINvxL6mxdB/i7FkzCBGLbgZ013YwkDLFD2+4CBnX + w==; +X-CSE-ConnectionGUID: XU0Bp+klQCiSCfmyOaBeOA== +X-CSE-MsgGUID: qUdy5aE4QB+ndas2O3JrjQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339674" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339674" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:22 -0700 +X-CSE-ConnectionGUID: veyEE6PBTGirh+PomEioDQ== +X-CSE-MsgGUID: eht/GZN/S/ekMdaQtDO0ag== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487193" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:22 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 04/19] sched/fair: Introduce a static key to enable cache aware only for multi LLCs +Date: Sat, 11 Oct 2025 11:24:41 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Enable cache-aware load balancing only if at least 1 NUMA node has +more than one LLC. 
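As a rough illustration of the gating described above, the user-space sketch below models the two conditions with plain booleans: a feature switch standing in for sched_feat(SCHED_CACHE), and an "allowed" flag standing in for the static key that topology setup flips when some node contains more than one LLC. The names cache_aware_allowed, build_topology() and the example topologies are invented for the sketch and are not kernel APIs.

    /*
     * Illustrative user-space model (not kernel code) of the gating logic:
     * cache-aware balancing is considered only when the scheduler feature
     * is on AND at least one node has more than one LLC.
     */
    #include <stdbool.h>
    #include <stdio.h>

    static bool sched_feat_cache = true;   /* stand-in for sched_feat(SCHED_CACHE) */
    static bool cache_aware_allowed;       /* stand-in for the static key */

    /* Flip the "allowed" flag while scanning the topology description. */
    static void build_topology(const int llcs_per_node[], int nr_nodes)
    {
        cache_aware_allowed = false;
        for (int n = 0; n < nr_nodes; n++) {
            if (llcs_per_node[n] > 1) {
                cache_aware_allowed = true;
                return;
            }
        }
    }

    /* Equivalent of sched_cache_enabled(): both conditions must hold. */
    static bool cache_aware_enabled(void)
    {
        return sched_feat_cache && cache_aware_allowed;
    }

    int main(void)
    {
        int single_llc[] = { 1, 1 };   /* two nodes, one LLC each        */
        int multi_llc[]  = { 4, 1 };   /* node 0 split into four LLCs    */

        build_topology(single_llc, 2);
        printf("single LLC per node -> %s\n", cache_aware_enabled() ? "enabled" : "disabled");

        build_topology(multi_llc, 2);
        printf("multiple LLCs in a node -> %s\n", cache_aware_enabled() ? "enabled" : "disabled");
        return 0;
    }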
+ +Suggested-by: Libo Chen +Suggested-by: Adam Li +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 15 ++++++++++++--- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 14 ++++++++++++-- + 3 files changed, 25 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cd080468ddc9..3d643449c48c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1208,6 +1208,14 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; + ++DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); ++ ++static inline bool sched_cache_enabled(void) ++{ ++ return sched_feat(SCHED_CACHE) && ++ static_branch_likely(&sched_cache_allowed); ++} ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -1294,7 +1302,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_sched *pcpu_sched; + unsigned long epoch; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) +@@ -1330,7 +1338,7 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (!mm || !mm->pcpu_sched) +@@ -10716,7 +10724,8 @@ static void record_sg_llc_stats(struct lb_env *env, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_cache_enabled() || ++ env->idle == CPU_NEWLY_IDLE) + return; + + /* only care about sched domains spanning a LLC */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index a52c96064b36..60f1e51685ec 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2800,6 +2800,7 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern struct static_key_false sched_cache_allowed; + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6e2f54169e66..2675db980f70 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -2444,6 +2444,7 @@ static int + build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) + { + enum s_alloc alloc_state = sa_none; ++ bool has_multi_llcs = false; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; +@@ -2530,10 +2531,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. 
+ */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ } else { + imb = nr_llcs; ++ has_multi_llcs = true; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2581,6 +2584,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++#ifdef CONFIG_SCHED_CACHE ++ if (has_multi_llcs) { ++ static_branch_enable_cpuslocked(&sched_cache_allowed); ++ pr_info("Cache aware load balance enabled.\n"); ++ } ++#endif ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch b/sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch new file mode 100644 index 0000000..cd2305a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch @@ -0,0 +1,291 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8E929283153 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=l9o+r3tPneRXt3UimsPhWTyfqr4rcCBrkqPagUsuj236psyVrtVREf1eV9bh9i5x6sqiX/93/2fGTQOd3tDyAfM2x8nQDBG2tniRFTa1AjKlI5Hs36x8WGu+npNUTYaShkti1wSxrqntJys6VhwZ+aL+o6PQ3k1GyXMU2JJL3bw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=MDjhwzZYr3m7pwdhzj9TlyV526H5WJLBGHEilCqY27+WQSI1yxnPWT6k5Mm6bFKl/0I+sfGQBi/7HzzHe1S3ts6bk23EZaJB+w94GLEZKAcc8cSHQMDIbKKzGRMgBrwPnT0sZBkKxiooppSIJhtXCA86kWL70YWS1bZ1PVuSOI8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BzReY9Ll; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BzReY9Ll" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + b=BzReY9LlEh9sk7OgZDcp2VjjY3mwnRzW5hp4d8rSX40TSJQm31n7pNsD + pGDX4pGNqIL2dKhB0TWBOakqdMqoEJBGhhFnbP0SML4ddRpmP22b3hhKk + 66OBjK6EOlIiBTx96elcU0fwjNnZqBKTvf/i3IuC2HlilzxwoimPLi7ym + OqUTRkCWmlqgJ5BjvtUEaD2eb97VkiEAs6iUC5FsMQPohIZRE0ZJGIQT2 + rLWb4YevoZUYtWiZQU/yYmcq5sU7eCp84d/YBPYTw8uDxW2au989TrB9t + olL4givIBdX+ieIJw7430Yz/Es1H+8Ji46MflznNqafshDKBuL8HbpSmx + A==; +X-CSE-ConnectionGUID: xTVpDyXiQYmCxiG8vc8uKg== +X-CSE-MsgGUID: 
ouYA76mXSo+MkfJ9ZAYryA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339693" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339693" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:23 -0700 +X-CSE-ConnectionGUID: Vda9/GgFQc2uyKt8dn0epA== +X-CSE-MsgGUID: 2SFdpXMCSGKC8Z5YqgWCow== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487198" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:23 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 05/19] sched/fair: Add LLC index mapping for CPUs +Date: Sat, 11 Oct 2025 11:24:42 -0700 +Message-Id: <7d75af576986cf447a171ce11f5e8a15a692e780.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce an index mapping between CPUs and their LLCs. This provides +a continuous per LLC index needed for cache-aware load balancing in +later patches. + +The existing per_cpu llc_id usually points to the first CPU of the +LLC domain, which is sparse and unsuitable as an array index. Using +llc_id directly would waste memory. + +With the new mapping, CPUs in the same LLC share a continuous index: + + per_cpu(llc_idx, CPU=0...15) = 0 + per_cpu(llc_idx, CPU=16...31) = 1 + per_cpu(llc_idx, CPU=32...47) = 2 + ... + +The maximum number of LLCs is limited by CONFIG_NR_LLCS. If the number +of LLCs available exceeds CONFIG_NR_LLCS, the cache aware load balance +is disabled. To further save memory, this array could be converted to +dynamic allocation in the future, or the LLC index could be made NUMA +node-wide. + +As mentioned by Adam, if there is no domain with SD_SHARE_LLC, the +function update_llc_idx() should not be invoked to update the index; +otherwise, it will generate an invalid index. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/threads.h | 10 +++++++++ + init/Kconfig | 9 ++++++++ + kernel/sched/fair.c | 11 ++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 47 +++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 79 insertions(+) + +diff --git a/include/linux/threads.h b/include/linux/threads.h +index 1674a471b0b4..2c9b1adfe024 100644 +--- a/include/linux/threads.h ++++ b/include/linux/threads.h +@@ -20,6 +20,16 @@ + /* Places which use this should consider cpumask_var_t. 
*/ + #define NR_CPUS CONFIG_NR_CPUS + ++#ifndef CONFIG_NR_LLCS ++#define CONFIG_NR_LLCS 1 ++#endif ++ ++#if CONFIG_NR_LLCS > NR_CPUS ++#define NR_LLCS NR_CPUS ++#else ++#define NR_LLCS CONFIG_NR_LLCS ++#endif ++ + #define MIN_THREADS_LEFT_FOR_ROOT 4 + + /* +diff --git a/init/Kconfig b/init/Kconfig +index 4e625db7920a..6e4c96ccdda0 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -981,6 +981,15 @@ config SCHED_CACHE + resources within the same cache domain, reducing cache misses and + lowering data access latency. + ++config NR_LLCS ++ int "Maximum number of Last Level Caches" ++ range 2 1024 ++ depends on SMP && SCHED_CACHE ++ default 64 ++ help ++ This allows you to specify the maximum number of last level caches ++ this kernel will support for cache aware scheduling. ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3d643449c48c..61c129bde8b6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1224,6 +1224,17 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continuous LLC index, starting from 0. ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 60f1e51685ec..b448ad6dc51d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2039,6 +2039,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2047,6 +2048,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 2675db980f70..4bd033060f1d 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -659,6 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -668,6 +669,40 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ if (max_llcs > NR_LLCS) ++ return; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ /* ++ * A new LLC is detected, increase the index ++ * by 1. 
++ */ ++ if (idx < 0) { ++ idx = max_llcs++; ++ ++ if (max_llcs > NR_LLCS) { ++ if (static_branch_unlikely(&sched_cache_allowed)) ++ static_branch_disable_cpuslocked(&sched_cache_allowed); ++ ++ pr_warn_once("CONFIG_NR_LLCS is too small, disable cache aware load balance\n"); ++ return; ++ } ++ ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -687,6 +722,10 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + ++ /* only update the llc index for domain with SD_SHARE_LLC */ ++ if (sd) ++ update_llc_idx(cpu); ++ + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); +@@ -2452,6 +2491,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch b/sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch new file mode 100644 index 0000000..33e7efa --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A93492836B1 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=S6xTZtgG4gDit+VImk9W2UzS4qpXEGkcWHMUVoYyOSnpNNw4aucqYAXSSje8zYLjl3z3dX3Jt3ztt7bwcuxWrRrv6qxUGactOiUWUNrvSPN2VWKScV6w3ksMM6saX0NH5ZC3WBABiX0+fpwQlzvqkQFNz80/YqP8x3hbG8jBKng= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=JsV8TTkODWXWFKIKrzZGo3NxMw8hU5p/OWk4qVG3F1HoqgFqWBsu2TcQGUVWw1R9rnOAFP+1s9fHghtr+g8SHhcTCX8Srq+6rXX7gAPQLfCi2R3P+f6W+h6FG6DDQXFxrgsSAi265RFjsNyqSNVDyYiSw0j1kUou9k2jg/TFWas= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=maHNOTTa; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="maHNOTTa" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + 
references:mime-version:content-transfer-encoding; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + b=maHNOTTaUom4zOfjF9aQgzk/EHInefpcQXQBpZ407o2A6QAh7rtx4d1V + uIUh04rGM6MxEKMGQGzPbEcwmEUVnQVNQXhq0m60vo8GIlq3nI3UFHh2/ + okHOmrxdhoN3uwbNZN5d2mGAMO3ADHunEGtbLYRsJ5ffyJXYwvK9ZYj6n + ZqWJDYCygmb5LDln/D3icLbLhH8Zm6QWr4yAgVZQ73wl/I3EgDdp+pIYb + aLimiW5HUOhIlD+krR4Rg02sINFyPrZ2h5VJdZ1v01hMqilwa2zgPVcWi + tEJ0OmQs9iwf0mBA0kNnJx5l2NSvLy+2FE84H8lwtH6U/4ySfKAnmdVGc + Q==; +X-CSE-ConnectionGUID: LhZ9XN5ESr6ORNd5zvY9sA== +X-CSE-MsgGUID: UBKHEBpdQNSkGD6fqT87jQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339711" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339711" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:24 -0700 +X-CSE-ConnectionGUID: M/4LVw/6Qg626wVKqENzqw== +X-CSE-MsgGUID: hqk2hnIER+q1aJ8R3vcczQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487203" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:24 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 06/19] sched/fair: Assign preferred LLC ID to processes +Date: Sat, 11 Oct 2025 11:24:43 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache-aware scheduling enabled, each task is assigned a +preferred LLC ID. This allows quick identification of the LLC domain +where the task prefers to run, similar to numa_preferred_nid in +NUMA balancing. 
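The following stand-alone C sketch models how a task could inherit its preferred LLC ID from the process-wide preferred CPU, assuming a fixed cpu-to-LLC table in place of per_cpu(sd_llc_id). struct mm_model, struct task_model and update_preferred_llc() are illustrative stand-ins, not the actual kernel structures.

    #include <stdio.h>

    #define NR_CPUS 8

    /* Stand-in for per_cpu(sd_llc_id, cpu): the first CPU of each LLC domain. */
    static const int llc_id_of_cpu[NR_CPUS] = { 0, 0, 0, 0, 4, 4, 4, 4 };

    struct mm_model   { int mm_sched_cpu; };  /* process-wide preferred CPU, -1 if unknown */
    struct task_model { int preferred_llc; struct mm_model *mm; };

    /* Derive the task's preferred LLC from the process's preferred CPU. */
    static void update_preferred_llc(struct task_model *p)
    {
        int cpu = p->mm ? p->mm->mm_sched_cpu : -1;

        p->preferred_llc = (cpu >= 0 && cpu < NR_CPUS) ? llc_id_of_cpu[cpu] : -1;
    }

    int main(void)
    {
        struct mm_model mm = { .mm_sched_cpu = 5 };
        struct task_model p = { .preferred_llc = -1, .mm = &mm };

        update_preferred_llc(&p);
        printf("preferred_llc = %d\n", p.preferred_llc);  /* 4: the LLC containing CPU 5 */

        mm.mm_sched_cpu = -1;                /* occupancy dropped; preference cleared */
        update_preferred_llc(&p);
        printf("preferred_llc = %d\n", p.preferred_llc);  /* -1 */
        return 0;
    }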
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d7ddb7ce6c4b..8a5e4038cd5c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1402,6 +1402,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 61c129bde8b6..d6167a029c47 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1312,6 +1312,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_cache_enabled()) + return; +@@ -1342,6 +1343,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch b/sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch new file mode 100644 index 0000000..f87fefd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch @@ -0,0 +1,257 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B7966283FE1 + for ; Sat, 11 Oct 2025 18:18:25 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206707; cv=none; b=Jt9YvY3nM/0EYBih4PVmiKQ2QzO4ZDLh2TKnGqMyWerCIfIM0CWceRhOpjM2iQwiUHzLszpycQZ+UQorhwMqEi3t7Erkuc8eVsgIO7guz2r8zCqiEsDc75hJulbNVOIh4Hf5WtkLCN2FDwtJ+pKaDQzjrmQsv/RTGx24LhvBhds= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206707; c=relaxed/simple; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=PrDaFPl16+dUYVfNSWRpTD87yz4MK7/HdghB7ILX5xXggJN8vYLmcy4RQj7oE9weOCdcBzd1EZg476MST0VNTm2z3r/YGhIw0/+VWbtq1PKhfCTIEnPZWnJryrgw70ZRp0r4XDiQwz/h8bzHoZp9hMCEYHtSbHfUHW8eNSYr5z8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GvsjlkoW; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GvsjlkoW" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206706; x=1791742706; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + b=GvsjlkoWqX+zgP+tTee0MXcNRVBTPQkZKjOLBXZh33p44VICJNCiih6g + bdtLdnWwRkrJ2u2n2AVNyKIqQq+ELwCHQ1bUAIVe5B+Rq8F/WdKivkeVK + qCMdNHmRRRa8ijhdo6AEjjUZeHNS6/1dPU14KFq5zOdeXfuxJL5tGjlxb + ZtqhKFOWrFhhFPJwUw1KWb7C0rBkSGVoUeZH3ORagBu6Ud545g9bPF/M+ + p6sJSBNbnSNsdtDoZzzIKVmezgct+rLH0giyW0IcdjAUJlzYg6VsmVomk + Zm8UHf1s2hBr8fNdeC7UuXGFmty4d2atXckCM+YB8PsOqI0JwqlHCMSZ2 + A==; +X-CSE-ConnectionGUID: uKPzZGMbTiObyQydogOwGQ== +X-CSE-MsgGUID: QbxPW0yzQ4WA7VOf/APdAg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339729" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339729" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:25 -0700 +X-CSE-ConnectionGUID: GxY9AWlwTACW1S97eEsWGg== +X-CSE-MsgGUID: +oNXqS3kSkOTENG/ySm5FA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487208" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:25 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 07/19] sched/fair: Track LLC-preferred tasks per runqueue +Date: Sat, 11 Oct 2025 11:24:44 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +For each runqueue, track the number of tasks with an LLC preference +and how many of them are running on their preferred LLC. This mirrors +nr_numa_running and nr_preferred_running for NUMA balancing, and will +be used by cache-aware load balancing in later patches. 
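A minimal user-space model of the per-runqueue accounting described above is sketched below: one counter for tasks with any LLC preference and one for tasks already sitting on their preferred LLC, updated symmetrically on enqueue and dequeue. The struct and function names are invented for the example.

    #include <stdio.h>

    struct rq_model {
        unsigned int nr_llc_running;       /* tasks with any LLC preference  */
        unsigned int nr_pref_llc_running;  /* tasks already on preferred LLC */
        int llc;                           /* LLC this runqueue belongs to   */
    };

    struct task_model { int preferred_llc; };

    static void account_enqueue(struct rq_model *rq, const struct task_model *p)
    {
        rq->nr_llc_running      += (p->preferred_llc != -1);
        rq->nr_pref_llc_running += (p->preferred_llc == rq->llc);
    }

    static void account_dequeue(struct rq_model *rq, const struct task_model *p)
    {
        rq->nr_llc_running      -= (p->preferred_llc != -1);
        rq->nr_pref_llc_running -= (p->preferred_llc == rq->llc);
    }

    int main(void)
    {
        struct rq_model rq = { .llc = 1 };
        struct task_model on_pref  = { .preferred_llc = 1 };   /* prefers this LLC  */
        struct task_model off_pref = { .preferred_llc = 3 };   /* prefers another   */
        struct task_model no_pref  = { .preferred_llc = -1 };  /* no preference     */

        account_enqueue(&rq, &on_pref);
        account_enqueue(&rq, &off_pref);
        account_enqueue(&rq, &no_pref);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running);  /* 2 1 */

        account_dequeue(&rq, &off_pref);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running);  /* 1 1 */
        return 0;
    }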
+ +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 +++++++++++ + kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 +++++++ + 3 files changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 79d15e904d12..5940756e2da3 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -529,6 +529,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d6167a029c47..fd315937c0cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,6 +1235,24 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1306,6 +1324,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1347,8 +1367,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1497,6 +1522,15 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running = 0; ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1506,6 +1540,11 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} ++ ++void reset_llc_stats(struct rq *rq) {} + #endif + + /* +@@ -3999,6 +4038,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + cfs_rq->nr_queued++; +@@ -4010,9 +4050,14 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
sched_entity *se) + update_load_sub(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + cfs_rq->nr_queued--; ++ ++ /* safeguard to clear the cache aware data */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b448ad6dc51d..3ab64067acc6 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1098,6 +1098,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +@@ -1952,6 +1956,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++void reset_llc_stats(struct rq *rq); ++int task_llc(const struct task_struct *p); ++ + static inline void + queue_balance_callback(struct rq *rq, + struct balance_callback *head, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch b/sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch new file mode 100644 index 0000000..18dc0f7 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch @@ -0,0 +1,194 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8D1D3284688 + for ; Sat, 11 Oct 2025 18:18:26 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206708; cv=none; b=W6A0Asy9e3NNDRL2ti9BvFY1go+vAlduaKJd1rmOWRr4k4IHRIEpHNJhix4g/v1mdJgDI06CWQ3sQC5YxuLOry9f66mT2W5iUkNoO1AMOa7iJYVMhxygC7dgS1riRk+Xr61GHZrfTq3glOqKoHqMJR1ChGEEIDFSijs9KJo91LU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206708; c=relaxed/simple; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=uzr/dGrFdG1v5FdOJ/f9StnRIpzjJ5uOjWV+sYvWDeYE/dxtVTZG5FXWR8UqlK4jv7ZYYOlRDJRmdwLszrh1cbzNE43kw7ueGEnBAbSwzUyXo12aLw3ckNHZHHjqr9uTbTYz7GDrN3J5K862edN4cdJHoI9buyHUDzdCkXfIheE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MiTdX6Q6; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MiTdX6Q6" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206707; x=1791742707; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + b=MiTdX6Q6R/zAjqSeS2bqz6JnSO+lVjbu/CGoRS4W48TnANXSK7FbeFq8 + HIHNTysTrwhHCzP1gtYr6N2x0eFio/feVeyFBD5UytM6ahWF0SC67agMj + jWOkCg+WyPpJSmb2V4GE3mePGb9vm7kjvgiTp1tcN15ClNGhVOTqusLqF + ueDZKLr7dTfEr95oP3PXRNzKFZfqVSGN5aLDywe826XmjT29nykVCoMh+ + U9I8MAfHqzZxWLRDx+EC8+DhJZRsWw9B7dXqvyz67FsBnLG+HHYrAB479 + +0mKNo9XBbRlGAtlUlqUTEvej+mP00q1dndiGmLH/nY7e+wci1WK/1VQo + g==; +X-CSE-ConnectionGUID: e2RK1jGJT9eTlAZZ8FMWJQ== +X-CSE-MsgGUID: se6P+xZrTfOL+/m4zXf2xg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339748" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339748" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:26 -0700 +X-CSE-ConnectionGUID: Lb/G/3cTR6W6ajd8OWjDtQ== +X-CSE-MsgGUID: f0zaj3jsRd+gLA/rNNvR9A== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487214" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:26 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 08/19] sched/fair: Introduce per runqueue task LLC preference counter +Date: Sat, 11 Oct 2025 11:24:45 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned a static array where each element tracks +the number of tasks preferring a given LLC, indexed from 0 to +NR_LLCS. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to NR_LLCS + +The load balancer can use this information to identify busy runqueues +and migrate tasks to their preferred LLC domains. 
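The short sketch below models that per-LLC counter array in plain C, assuming a small fixed NR_LLCS, and shows the enqueue/dequeue bookkeeping including the underflow guard used in the diff. The helper names are illustrative only.

    #include <stdio.h>

    #define NR_LLCS 4

    struct rq_model {
        unsigned int nr_pref_llc[NR_LLCS];   /* tasks preferring each LLC index */
    };

    /* Bump the slot for the task's preferred LLC index (-1 means no preference). */
    static void llc_enqueue(struct rq_model *rq, int pref_llc_idx)
    {
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS)
            rq->nr_pref_llc[pref_llc_idx]++;
    }

    static void llc_dequeue(struct rq_model *rq, int pref_llc_idx)
    {
        /* guard against underflow, as the patch does */
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS &&
            rq->nr_pref_llc[pref_llc_idx] > 0)
            rq->nr_pref_llc[pref_llc_idx]--;
    }

    int main(void)
    {
        struct rq_model rq = { 0 };

        llc_enqueue(&rq, 3);
        llc_enqueue(&rq, 3);
        llc_enqueue(&rq, 0);
        llc_enqueue(&rq, -1);   /* no preference: nothing counted */

        printf("prefer LLC3: %u, prefer LLC0: %u\n",
               rq.nr_pref_llc[3], rq.nr_pref_llc[0]);           /* 2, 1 */

        llc_dequeue(&rq, 3);
        printf("prefer LLC3 after dequeue: %u\n", rq.nr_pref_llc[3]);   /* 1 */
        return 0;
    }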
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 35 +++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 1 + + 2 files changed, 36 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fd315937c0cf..b7a68fe7601b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,22 +1235,51 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1524,10 +1553,16 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { ++ int i = 0; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running = 0; ++ ++ for (i = 0; i < max_llcs; ++i) ++ rq->nr_pref_llc[i] = 0; ++ + rq->nr_pref_llc_running = 0; + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 3ab64067acc6..b801d32d5fba 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1101,6 +1101,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[NR_LLCS]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch b/sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch new file mode 100644 index 0000000..caf0c08 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch @@ -0,0 +1,143 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 896A92848A1 + for ; Sat, 11 Oct 2025 18:18:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206709; cv=none; b=OEtiMJ0EXsYmk/b2RpkCvrola+Tb5ZlnJVLLgRLqGiICx7t2qJcij9yw0SgiiThPPPTMrbIdFBAm4w8howvUGPAJFc0ItOZDXO+gwbi0GCrU/MRny5Tre78B7YMgEyxZMXkI05Eu0+fODpObrBBk2c09F8OXQKZ4o5hgptBzDK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206709; c=relaxed/simple; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=J6bK9CIrnn+dpoeG8RJW1aH3SE1Yc7QYj7Dgh7cqTjdsd3fsWZdu3E2SAwDjyqT5ptCJzWnqjXDoxnW3sFv/aeRC7QnnQkB9bTzAgmfskcoHsp0hZI6c042fUlYpwgsk0j6PmWc4xM8hZNNktu5sqG8t6W1tVMFc+pGngTuF0j8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n3R+hIU0; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n3R+hIU0" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206708; x=1791742708; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + b=n3R+hIU0WDMCAOT74Si47T0DHUQFpP/mOPOr4EFjzfrMTg20mocMFVue + SPJYeD3u+HI/S8DzRBSopnypgjipAk03R2jKWcm5OSqY338iFWIhO44pH + Rkbh2OZ1rpYHNaif/qBdzoG/S0GRuxE4+p6SgnYPob1i1tRz5kFPtKtWI + Em/YtXT8s7M8i1lwEkDGhNlIAeWj5yl5FVsHoShyMoDnOs/ZKpz9fa1vH + yY+/JK9y5B5Rh8CVo9sz+iLl5gL/zxPW+ETtFRKayHPWInq1R4rGuUz8D + OVUSiTUoZeUSI+4YJPz+v9iatJmNEpwFlvZeVYR4+WsdGyv8IT5qlNl3i + g==; +X-CSE-ConnectionGUID: VcC/511LSz6QngP8mD/4Fw== +X-CSE-MsgGUID: cm5ykdK+Tza9czQo0iIcIQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339767" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339767" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:27 -0700 +X-CSE-ConnectionGUID: +fnFCaxeROy1X1/2M3UOCQ== +X-CSE-MsgGUID: cAIBkdx0SvqbyNLUptq1pw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487219" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 09/19] sched/fair: Count tasks prefering each LLC in a sched group +Date: Sat, 11 Oct 2025 11:24:46 -0700 +Message-Id: <00e5f2cb6eadc3738e33858d3c4563a0775ee1c0.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, tabulate the number of tasks on each runqueue +that prefer a given destination LLC in a sched group. + +For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) +balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has +2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is +selected as the busiest source to pick tasks from. 
+ +Within a source LLC, the total number of tasks preferring a destination +LLC is computed by summing counts across all CPUs in that runqueue. For +instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring +LLC3, the total for LLC0 is 3. + +These statistics allow the load balancer to choose tasks from source +sched groups that best match their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b7a68fe7601b..cbd1e97bca4b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10399,6 +10399,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[NR_LLCS]; ++#endif + }; + + /* +@@ -10891,6 +10894,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled()) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch new file mode 100644 index 0000000..4bcffad --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch @@ -0,0 +1,187 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F13BA28505C + for ; Sat, 11 Oct 2025 18:18:28 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=EhBerRhJhQXPW7xGyw0P5bxJnRZLdUKLIQ12NKKqVw4ZWFGkcALuZ8VykNWnycAafmMkb5kBWaZT15xr3ZuPia1hqPYipqCAVEd34Wn9NgZ7h0Lqr4/FQP1HOI9Yp9naliJ5jjs5uaj5L1/4fJBsGwV0wle3JatN24KLVnEBxK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XF3a1nw/8EN0FU+PNi1yIJ/227PxHRBRy24uDZNEkqQuRuIG35Ap7GIvbGG+L1n9ZlEPV0A8eM5UvEqTGNXZktaeA+OJjX4avu9hw9uu6rqowoIWWNlLa6/0iuozmn5jhIZJJqDbWB7j1stg+x51fnwnSbNrDkb2H27S3usCnzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Loa6o7d1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Loa6o7d1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; x=1791742709; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + b=Loa6o7d1Mzs3ouslW83UWTdxmyggGuWTcpizCbNq+GcghqOrvTfXSRIV + 0EP9sedHVH3VdKCqAQHV/ZX3VHfUXCRKy9+NcdVchFLL8bKi/9buFRwhw + ZWmkcnGopsf975TA51MaL7sh2sNrOAvPuHmiA1plKNFBBesobcOlf5xbr + aZ9W/S+Mv3Ykf28JPDwOIYzvtKZi5pCgwvqz5wqJHrujBfUq//kuxX1xD + 44PevqjxkAnPNbnm/C3CdQgNXiNta5xW/ZKmACOzIkYXaOsL8kl9jvdQl + 4VJ6pV7RaGBpMqmBXGMhRqdKmN0HSByZ1kvmH46v45jRNYG2/U+7kgbrO + A==; +X-CSE-ConnectionGUID: 7OsmkTE2T2eIFyDjRKp/ig== +X-CSE-MsgGUID: oqLf97jbSIOB+8Rk4LLqqA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339788" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339788" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:28 -0700 +X-CSE-ConnectionGUID: jHLQbWxOTR2E4C2/k5j7Wg== +X-CSE-MsgGUID: sQhO8wOTQIuj4/5Og2eBgw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487222" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 10/19] sched/fair: Prioritize tasks preferring destination LLC during balancing +Date: Sat, 11 Oct 2025 11:24:47 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, first check for tasks that prefer the +destination LLC and balance them to it before others. + +Mark source sched groups containing tasks preferring non local LLCs +with the group_llc_balance flag. This ensures the load balancer later +pulls or pushes these tasks toward their preferred LLCs. 
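As a simplified model of that classification step, the sketch below flags a non-local source group for LLC balancing when it holds tasks preferring the destination LLC. dst_has_room() is a placeholder for the capacity and imbalance test that can_migrate_llc() performs in the real patch, and all names here are illustrative.

    #include <stdbool.h>
    #include <stdio.h>

    #define NR_LLCS 4

    struct sg_stats {
        unsigned int nr_pref_llc[NR_LLCS];  /* tasks in the group preferring each LLC */
        bool group_llc_balance;             /* set when LLC-aware balance should run  */
    };

    /* Placeholder for can_migrate_llc(): assume the destination always has room. */
    static bool dst_has_room(int dst_llc)
    {
        (void)dst_llc;
        return true;
    }

    /* Model of the llc_balance() check: flag a non-local source group when it
     * holds tasks that prefer the destination LLC and migration is permitted. */
    static void classify_group(struct sg_stats *sgs, int dst_llc, bool local_group)
    {
        sgs->group_llc_balance = !local_group &&
                                 sgs->nr_pref_llc[dst_llc] > 0 &&
                                 dst_has_room(dst_llc);
    }

    int main(void)
    {
        struct sg_stats remote = { .nr_pref_llc = { 0, 0, 0, 3 } };
        struct sg_stats idle   = { .nr_pref_llc = { 0, 0, 0, 0 } };

        classify_group(&remote, 3, false);
        classify_group(&idle, 3, false);

        printf("remote group llc_balance=%d, idle group llc_balance=%d\n",
               remote.group_llc_balance, idle.group_llc_balance);   /* 1, 0 */
        return 0;
    }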
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 41 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cbd1e97bca4b..af7b578eaa06 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9822,8 +9822,7 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + else + return mig_unrestricted; + +- return can_migrate_llc(src_cpu, dst_cpu, +- task_util(p), to_pref); ++ return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + + #else +@@ -10394,6 +10393,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10849,11 +10849,45 @@ static void record_sg_llc_stats(struct lb_env *env, + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); + } ++ ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. ++ */ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ can_migrate_llc(env->src_cpu, env->dst_cpu, 0, true) == mig_llc) ++ return true; ++ ++ return false; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + } ++ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} + #endif + + /** +@@ -10954,6 +10988,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + record_sg_llc_stats(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch new file mode 100644 index 0000000..ee39ef0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch @@ -0,0 +1,184 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org 
(Postfix) with ESMTPS id 6E3802857E0 + for ; Sat, 11 Oct 2025 18:18:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=t2IkYrrS4OEW0rLnZ4Ph2aLp/ob7UBcUobZQPFlHPmpcJEG5m0pUt/86mOssLKuYpjefjiUDrjFelfxhjAxq8hkNJqtOEMJPbTz+zzT3SsVZRdrqKE8v+5YoRbLqXRQPim2ll3DhWUtUyVjcOo+wuodh/CEa974mbGOLa7mTgCc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=CFlB5zhIcHUsbSOo/sD1pZdSFz7frR0zFFzgb5/20MqZiItU17WC0G8ifB7ANEAoWHl+sZ1UBTS2HXkckShm7SoSJJXvPBbw6XxQCBJK6yrElYIzS1CzXKAx7vBmkFFghPyfHOK4JpsmMAKYxqatpcWaHZwO7N1+tqHPYDwlFpo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Y9YkqrBb; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Y9YkqrBb" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; x=1791742709; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + b=Y9YkqrBbsakXirsuA3GK7ppNmtxnJk2cm0iimpzRLvMdIlTwXGPf3Jxq + CO6EwYbc/Esxx5TDgaH0h7SVW6eQY5e38xqt9oEwqeMZQtQ13URaPfC2Q + Mwk/v0qwxo5jXbC8xa2O9JpbH1ZyVCsabZmLtbPS2e8WfQbQS4lgRoeof + RbwLkRXbWC69JnwGxh3aUM7ZF9q8ziMLuIK7nYhL3utheouiHtWkbs+nW + RBMmwNo592e9Wh6g7Ht+Vdc051U+njdgUo7aZRqY6DlKoIGZaJJSG2c0W + jAF73DWLcSoTQT2Ii9M9dPOTvOCcojIDgIVpILvlasXm0wG4u+s+OJFGn + Q==; +X-CSE-ConnectionGUID: bcFBDLOoTw6TYukUkbI3wQ== +X-CSE-MsgGUID: 0WEdTBqUR0WG7HuYHYySDg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339807" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339807" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:29 -0700 +X-CSE-ConnectionGUID: teKUgYrNS8ayzrTmALf01w== +X-CSE-MsgGUID: OBuR3uU9Q8qKO64uzC8h4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487230" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:28 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 11/19] sched/fair: Identify busiest sched_group for LLC-aware load balancing +Date: Sat, 11 Oct 2025 11:24:48 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +The load balancer selects the busiest sched_group and migrates tasks +to less busy groups to distribute load across CPUs. + +With cache-aware scheduling enabled, the busiest sched_group is +the one with most tasks preferring the destination LLC. If +the group has the llc_balance flag set, cache aware load balancing is +triggered. + +Introduce the helper function update_llc_busiest() to identify the +sched_group with the most tasks preferring the destination LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++++- + 1 file changed, 38 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index af7b578eaa06..8469ec528cb1 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10877,6 +10877,23 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance needs to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +@@ -10888,6 +10905,13 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + /** +@@ -11035,6 +11059,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* ++ * If the busiest group has tasks with LLC preference, ++ * skip normal load balance. ++ */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -11942,9 +11977,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch new file mode 100644 index 0000000..e9edb7a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch @@ -0,0 +1,185 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 99E5F28642E + for ; Sat, 11 Oct 2025 18:18:30 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206712; cv=none; b=CcfwsAyp1OHHqY4mNPYPcN6bUrl09ci4+a/v8FtP9azgYQzfS6lmRwWajeweUonIlhrYSa3k3Uk+3iau8s00TJMHIq9pc69gZThbuJO24GmjHBtcGot6LsPzytIaUPaB8oNg5fj064BJxFXz948iENpfk/rfsglOKxpcJkX9wG0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206712; c=relaxed/simple; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ScMEWl2DOAQMR5u9bpXgwKEadirbrSNG1X0vBv1Qm5M7qzeQRW6zyzR/0wZ49Stn9ftQ28uc0NLCvRH6mwbydhKFD3kpg3JgxWk9NBUU+Qnt+t7g3WQ/pDx7wFSEDUiofgdlic68Cqje1J43vJo7n57s1boIMbDvvtchvPGoTXM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=WEVJOxO1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="WEVJOxO1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + b=WEVJOxO1Uy4x+GEHukYgK7cjQhJ+ZPzArevJFx6r0uwjLvVHXCsCVf0d + U5oZ9qGbRNsQ961+swsJygnl0Xp69gaKKJFDcVvaKlw28OYtLWeCcKxy5 + 4DN0Azrktm8AXYGwp3idVSw3VynSmNbW2dqVmCfWn3Np2iYv1w7hTpRfb + SetW2PMNCXc4Fk5w1ve3GEJ9Bax25e3mUvpabN2XIbAEnlZu4rHyR3ovD + 1WzBrpK45tvGmB0FKRXCfsKbMFF1KdXCgjW4lAJ2KU2k2bhxv6SPWDjA8 + 0qVm8erW2mgP7HqJHVa71uZn8ehzzZAPeMVO4wyBDdQns/j8tkr67uAC6 + w==; +X-CSE-ConnectionGUID: osVAgR9XSEi43ydURnxquA== +X-CSE-MsgGUID: sgSrXMaOTSCJRnEynSu6Vg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339827" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339827" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:30 -0700 +X-CSE-ConnectionGUID: 
U/XiMYdrQLyr4smIn6sKwQ== +X-CSE-MsgGUID: iE4re5OqR+eOwHWBOdmfKA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487233" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:29 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 12/19] sched/fair: Add migrate_llc_task migration type for cache-aware balancing +Date: Sat, 11 Oct 2025 11:24:49 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type, migrate_llc_task, to support +cache-aware load balancing. + +After identifying the busiest sched_group (having the most tasks +preferring the destination LLC), mark migrations with this type. +During load balancing, each runqueue in the busiest sched_group is +examined, and the runqueue with the highest number of tasks preferring +the destination CPU is selected as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8469ec528cb1..bec6354d7841 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9504,7 +9504,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10082,6 +10083,10 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11733,6 +11738,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12041,6 +12055,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12149,6 +12167,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < 
nr_running) { + busiest_nr = nr_running; +@@ -12331,6 +12359,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch b/sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch new file mode 100644 index 0000000..50e470a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CE0E286D56 + for ; Sat, 11 Oct 2025 18:18:31 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206713; cv=none; b=GHTSZiD43H1BP9udGQWGRTSdycj0dFbwOFNYssvdtvgDyjDEnOhEZuZ3tF7d4Oxq4KjVh/REHJdk8e5qmA0nk91pFvjTrD7ew0sadW9X2+TjejBiKi+Z4u/nZlJeGc29rI3I01ytNZfNGLLusPB2P/4mVx6bLIuv9bhIea7/KOQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206713; c=relaxed/simple; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Z43NaTPGAIlscL0L7fDhRwzngl1+8YayCbuXKnJJO/leht3IttqnVKWti2tJx4O3Ad4+Bxa7ijhsxQg7lysYNstcyC73l5FTr0P11m80kqmUiNRrC4pt99E80BCBIbFo2SatFJnTKT4Q1ux117UKVwuy6P9Rh922Z1naN6x4Wgc= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=JdkwbeJq; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="JdkwbeJq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + b=JdkwbeJqpvNLxxR/C5J1ZH6Sc5bkBzINB0NUowykgcoSMh+IrKTz9SEs + 3TI4U2WqUZ4fGfcXVpbX1N2vbaAfyQUv4dhr3bMb1WSUcBz4dSrMfVdBf + Gdlpc/LwIyV72Eyt8t+mfF176Y/vv2GuGHN9WuXsK8/fBvzDMB20NsZLB + QBg0I+M7oRSQsaiygrqnGBFHiCS3p2JbXoqghWgigPrv6u1iqo8HXxcYs + HtDa1JUkhRKqPvvWxmzbfQzJYS+Coi/HVD3eewtzP+ILLi56XMzOKLHfR + iZqHJ/1cq2a50rc7YQNpk4EmPQ7vkE0qnNCf9o39KpjsRQh5qnu3HCaul + A==; +X-CSE-ConnectionGUID: VRcX2cnOQSeMAY0e8g4K3w== +X-CSE-MsgGUID: SPoQqM3DQk6EyvXMnqQjmg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339847" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339847" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:31 -0700 +X-CSE-ConnectionGUID: 
pKVZhrKMR8K6LBYqzMOqAA== +X-CSE-MsgGUID: CK8cGt1oRtCxjPN4nS/YdA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487238" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:30 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 13/19] sched/fair: Handle moving single tasks to/from their preferred LLC +Date: Sat, 11 Oct 2025 11:24:50 -0700 +Message-Id: <231864b303906a60491bbb9eb7b2e3f083bff248.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the busiest runqueue has only one task, active balancing may be +invoked to move it. However, before migration, check whether the task +is running on its preferred LLC. + +Do not move a lone task to another LLC if it would move the task +away from its preferred LLC or cause excessive imbalance between LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 59 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index bec6354d7841..19ba9c1b9a63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9826,12 +9826,53 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return false; ++ /* ++ * All tasks prefer to stay on their current CPU. ++ * Do not pull a task from its preferred CPU if: ++ * 1. It is the only task running there; OR ++ * 2. Migrating it away from its preferred LLC would violate ++ * the cache-aware scheduling policy. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { ++ unsigned long util = 0; ++ struct task_struct *cur; ++ ++ if (env->src_rq->nr_running <= 1) ++ return true; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(env->src_rq->curr); ++ if (cur) ++ util = task_util(cur); ++ rcu_read_unlock(); ++ ++ if (can_migrate_llc(env->src_cpu, env->dst_cpu, ++ util, false) == mig_forbid) ++ return true; ++ } ++ ++ return false; ++} + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) + { + return false; + } ++ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return false; ++} + #endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+@@ -12247,6 +12288,9 @@ static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12266,7 +12310,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12711,9 +12756,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_cache_enabled()) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if no task prefers target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch b/sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch new file mode 100644 index 0000000..2839724 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch @@ -0,0 +1,201 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 417D42874EA + for ; Sat, 11 Oct 2025 18:18:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=P5dnBcm/QdLKKHwOdHn/8WuPNdfAOl/PRiR2K2uOEI4cNFkN+3QA9gv1poGLydzEv/LcejqEay5DpC4q4pFVQXAYgNISmcWGnnkZt2WJ1RNwtLhNEUFXZhx40ubXDsBOhhphD04ToZpipNp3wabmP7EXcOk+GqqMg1ATyjn68eQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=K4DMRiSFceKlJzje7FYPpzQtMciS8INZnGsYmfTeHw6oUtErbWyqEJzurxfkaj/0e2BYrqNZ34Rdy0dGMjqeQWLbOVlQosaArztC6x5+Kes0uifkkB7Pj+Ot9ll7+ydHo4UrJOvNc7oKS/beZOgPG9FPfh7UCSuuvvMEgE2IUTo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=H3CAEs3w; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="H3CAEs3w" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206712; x=1791742712; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + b=H3CAEs3wXo6bis/3Dkhtptw+Q7vtaAFDMqK8g5XXqpoTWnnoOviYRAT9 + w6Ikfty6wJNr1MlZJ1pp/FTRrzxJpmwm8JYX2yaBiDeoJDyx/agfVsZPY + MklgYKNASSHcEaoYoXP3gsqWfSwXldul6nD1Cye5tqr86XkWjK3gJK3C2 + XHWF6ABgRrpsZ6WaBAuzrKten6FRqGkbA1i+aWIRwXqoWsGPVsgAC8AT4 + v51P3tS4APRavdFpCNPn2xNzJPdUZAW7dgqXMB0AkpdRadIZ72DIu+BFu + J9oJpUAr+gFfhWThceV6xrW/Bi4Emncs3GIHURfaahEgiLmzNa/UX2/Km + w==; +X-CSE-ConnectionGUID: L4/6SpgURcKa2MOypuG0Tw== +X-CSE-MsgGUID: s8jp3cejRyWqoo8mO6QU6Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339866" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339866" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:32 -0700 +X-CSE-ConnectionGUID: IV2+5+btQ3GmLWn4UVfGIA== +X-CSE-MsgGUID: Ti8qIpzsSjywiCl630piRA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487243" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:31 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 14/19] sched/fair: Consider LLC preference when selecting tasks for load balancing +Date: Sat, 11 Oct 2025 11:24:51 -0700 +Message-Id: <26e7bfa88163e13ba1ebefbb54ecf5f42d84f884.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Currently, task selection from the busiest runqueue ignores LLC +preferences. Reorder tasks in the busiest queue to prioritize selection +as follows: + + 1. Tasks preferring the destination CPU's LLC + 2. Tasks with no LLC preference + 3. Tasks preferring an LLC different from their current one + 4. Tasks preferring the LLC they are currently on + +This improves the likelihood that tasks are migrated to their +preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 19ba9c1b9a63..0fafbfedb21d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10036,6 +10036,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_cache_enabled()) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10044,7 +10106,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10063,6 +10125,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch b/sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch new file mode 100644 index 0000000..0a36e52 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 118912877EE + for ; Sat, 11 Oct 2025 18:18:33 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=Sgvo8eIzN/unUNmW2/+OixP9udhyNkmi4AZEZzDVPWK1PLnNoYAhA0isU11HgcQC7ul1i5aP8jgG2uHE7Cy8Asrdz+Y08qynhym2Y4X0S+xgTgNOkVzp41IhyzMl092I4cMjY7ziOvFvK6idsHZ/FR3VwQydRvg8d5aWYp64rpE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=TyYasSHPSqMZlN51+4bjWq8Z7cAg9IakiA1ZSJzbhlx8KJc6/UktRCAzZaEkZtQ3d+2B5EUSEDoefcCsbcoCPxFRSCAzN4VD9lBw94R0aIvRHbenlFVxgsvkmUCy9pzg5jZh5zHq/4CLUC+EDPmK622ZE8JNMYgUcZgPpxmosck= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Lw7L05el; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Lw7L05el" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206713; x=1791742713; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + b=Lw7L05elkbwdOCxozPfNxC8qRTe1i2iYshjZC2z6ZaIHRqDa3MmTXW5p + zHG6+auYcjgaRRcY16sdCyIbi7MCQxhd1rhIdaLh0bWrCs4ImE5P1VD8f + E+1GcTkJVgNbzLAR5f6+G7KZsA/sstlz5uIOTmFm5WpAXCY87MaYrAMAn + AO+uoYvLDh1ME4/gSK2T7C+P7K4lX/jQuif20ZGD72jW5wnQNob4g08JW + Z2MLtsd0WXxmCEXIKBfa0mtDIGY2FVs5/FvLd831/0grQYgT8vo1t80Kc + spuxB5OU6NgYwRfX7rKRRiLNfth6YUS68l+iwJeWbASwMAqE6PVWIEmJu + Q==; +X-CSE-ConnectionGUID: eDbtoCrOQHyIZtGmIsjSMQ== +X-CSE-MsgGUID: +ry6w/ChQZGrUwocr7gK9A== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339887" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339887" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:33 -0700 +X-CSE-ConnectionGUID: 1LsFjRblTkmkQu9Zwyc6pQ== +X-CSE-MsgGUID: 7olPURVrSrW53T9U5Kz7mw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487247" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:32 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 15/19] sched/fair: Respect LLC preference in task migration and detach +Date: Sat, 11 Oct 2025 11:24:52 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During the final step of load balancing, can_migrate_task() now +considers a task's LLC preference before moving it out of its +preferred LLC. + +Additionally, add checks in detach_tasks() to prevent selecting tasks +that prefer their current LLC. 
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fafbfedb21d..65ff7c306a2f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9801,8 +9801,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + * Check if task p can migrate from src_cpu to dst_cpu + * in terms of cache aware load balance. + */ +-static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, +- struct task_struct *p) ++static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) + { + struct mm_struct *mm; + bool to_pref; +@@ -9969,6 +9969,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && ++ can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10227,6 +10233,20 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if the remaining tasks want ++ * to stay. We know the remaining tasks all prefer the ++ * current LLC, because after order_tasks_by_llc(), the ++ * tasks that prefer the current LLC are at the tail of ++ * the list. The inhibition of detachment is to avoid too ++ * many tasks being migrated out of the preferred LLC. ++ */ ++ if (sched_cache_enabled() && detached && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch new file mode 100644 index 0000000..88914b1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 16F11288C02 + for ; Sat, 11 Oct 2025 18:18:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206715; cv=none; b=msFA8TC41v9oEIuXxPkwmaUs9Guya5oz4k0g+kGWjFkx5t6zbq1fE/hqkiyOdPEhHS8cUTNX+aARYrbMu+YFzDRmUGhKnyOYkbiJD/UnEPwa2emEYG8RrqlU6lMxzm4wiDBJLxqnLLfKGSPXyWwXrM560Mia1tgl6K9uKsnEgFE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206715; c=relaxed/simple; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=pIYwSq6151qmo6KEbEr6KofmYMtBvZvl9VphDwsqPX3hTLP897hu66I6LFuek1xE2EdzY5hJ64po/YPEKcNn99hwknIHDQx8uamJBxPh8I2WV7/JQ8MBTxUclp3YSgTWiAJSRjNR9EBM7PkdUJqtsU69m11ei/HsbibGYzaOOwk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass 
smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TQVK1fUD; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TQVK1fUD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206714; x=1791742714; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + b=TQVK1fUDtuFQmuxj0h/H6B3W/u2cJ2GkGOiUH7Lt/dRtHWxu09UqD683 + GE9GznGGwwF/Ima7vRS1ctHwsI6Xpw4SijdVGn66soleS5/ydNjcGaSKg + ygudPZpTfNaQrBfM0sFvdqPmdg50LMShstL+8pxYWf160UzvXjzOECyon + VuIxmxxlfPMnN2wMIOyjbQiDBL/LsnnHbGArR4IFK3zGWts6KMkvPzkiR + EwWOPnHMmqriXFYLM8wcDjSverDfcRP6MlQsXXusYG7bdxJhhuwymEiBB + InFNxWr5/xEksEDfouM5jLx/TVwLUkF4o8vAQ8HbkYgDi57JrvvbuA4Mr + g==; +X-CSE-ConnectionGUID: dN0cE9kLQ3yeKYNXwwT83A== +X-CSE-MsgGUID: KDX51V55RvaEAIpyi8Kcxg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339905" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339905" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:34 -0700 +X-CSE-ConnectionGUID: SHt2rwkJR6+JML7EmRAXVw== +X-CSE-MsgGUID: 7457bVysSBes9Wezrb15EQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487250" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:33 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 16/19] sched/fair: Exclude processes with many threads from cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:53 -0700 +Message-Id: <637cdb8ab11b1b978d697ed744cc402d32443ecc.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +A performance regression was observed by Prateek when running hackbench +with many threads per process (high fd count). To avoid this, processes +with a large number of active threads are excluded from cache-aware +scheduling. + +With sched_cache enabled, record the number of active threads in each +process during the periodic task_cache_work(). While iterating over +CPUs, if the currently running task belongs to the same process as the +task that launched task_cache_work(), increment the active thread count. + +If the count exceeds the number of CPUs in the process's preferred LLC, +sched_cache will avoid aggregating too many threads into a single LLC +domain. 
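+
+In outline (a sketch of the accounting only; the hunk below is the
+actual change, and cpu_rq()/update_avg() are existing scheduler helpers):
+
+	/* count runqueues whose current task belongs to this mm */
+	nr_running = 0;
+	for_each_cpu(i, cpus) {
+		struct task_struct *cur;
+
+		rcu_read_lock();
+		cur = rcu_dereference(cpu_rq(i)->curr);
+		if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+		    cur->mm == mm)
+			nr_running++;
+		rcu_read_unlock();
+	}
+	update_avg(&mm->nr_running_avg, nr_running);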
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 14 ++++++++++++-- + 2 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 3ca557c2f36d..b307f81b2fde 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1031,6 +1031,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg ____cacheline_aligned_in_smp; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 65ff7c306a2f..79d109f8a09f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1451,12 +1451,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, +- pref_nid = NUMA_NO_NODE, curr_cpu; ++ pref_nid = NUMA_NO_NODE, curr_cpu, ++ nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1497,6 +1498,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_occ = occ; + m_cpu = i; + } ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); ++ + } + + /* +@@ -1540,6 +1549,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch b/sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch new file mode 100644 index 0000000..0bb796c --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch @@ -0,0 +1,170 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CBF228B400 + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=YyEz/CWTR29mSwIUaPFMfMePzkOh+JM5Sy6daDO5bi2qr7vVNV19xi6LQHHFuh3wAPmGhaJZO0psSS/hmmAhEm9YYTN/Jgc2pWxCyI+xWhQCLC7I/PnTVjCiCQif4wqMsrxoWCBWSb2OUxPbQQvBrskdsdNoyUkJX7OfjisrPEo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=cDrry+jPMrDILm/r9QUVZNGIrsE561nMMRjz9ay5n5LBA0g4KQ5jFwtQhbKMvroO4a5axJHedJTHbl6aSfvc0uCnQwzJq+eaxxOqXVEOWsoi3zdhUNBrxg97Vqp+GrazIyVFmuyXj145vhjyv4Ug8nfP5dYxkUNSPkfjany2j50= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass 
smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=lrCuBiww; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="lrCuBiww" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206715; x=1791742715; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + b=lrCuBiwwXTTaUUesVoUKShmqNypNMcjFctaFnNlL8Jy17kFhV1UkeZza + ZuX0GXcNA+d1mgjVrCdwx7TgVROgGBNK4U8k00nbzT6TvTcewZUk7QGtM + ze+FjZ8AcXNEy5AhOAJw/Pg8vbtTnZ1loNcqp57iteVrKQqHWUMDyfSYU + 8P+nCqWidGuZDOqQcaEjQH4wD2Jn2+QsEcLHNMZnZLw6R3C8jci7hl1aG + MGxs8mPuw6pSR4ah1MI8YVoYS5wwLulLaJK/V5D02tGg7pdRILUMNtqsB + x0389trQkin/UccLwrCAMIGVL3znx7/2JW/py3nOY6EKojcOWTOyEIt0N + Q==; +X-CSE-ConnectionGUID: WfwYlMtNQVe279pYYOUBnA== +X-CSE-MsgGUID: AjSkDrsURkOZNf5ZbyXbNQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339923" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339923" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: ezHUeA30SCiDTeB7wo76Nw== +X-CSE-MsgGUID: YeYwMr00ThmPUWDQc0+YAw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487255" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:34 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 17/19] sched/fair: Disable cache aware scheduling for processes with high thread counts +Date: Sat, 11 Oct 2025 11:24:54 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +If the number of active threads within the process +exceeds the number of Cores(divided by SMTs number) +in the LLC, do not enable cache-aware scheduling. +This is because there is a risk of cache contention +within the preferred LLC when too many threads are +present. 
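+
+As a worked example of the threshold above: with an LLC spanning
+16 CPUs and 2 SMT siblings per core there are 8 cores, so a process
+averaging more than 8 active threads is excluded. A minimal sketch,
+assuming the averaged thread count kept in mm->nr_running_avg
+(hypothetical helper, not part of the hunk below):
+
+	/*
+	 * threads * smt_siblings > llc_cpus  is equivalent to
+	 * threads > cores, e.g. 9 * 2 > 16 on an 8-core/16-CPU LLC.
+	 */
+	static bool too_many_threads(u64 avg_threads, unsigned int smt_nr,
+				     unsigned int llc_cpus)
+	{
+		return avg_threads * smt_nr > llc_cpus;
+	}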
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 79d109f8a09f..6b8eace79eee 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,18 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1385,10 +1397,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + + /* + * If this task hasn't hit task_cache_work() for a while, or it +- * has only 1 thread, invalidate its preferred state. ++ * has only 1 thread, or has too many active threads, invalidate ++ * its preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1467,6 +1481,11 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ mm->mm_sched_cpu = -1; ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9826,6 +9845,10 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_unrestricted; ++ + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch b/sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch new file mode 100644 index 0000000..b614ebc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch @@ -0,0 +1,246 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id EC76C28C03B + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=Gsl1htdC3Y7gJ6c3ywcidI/bSse8yUz6irs7/iI8KWV8rK5Ae95mMS6V4kE386ZpRZ64YVuSevPlw/gCCcGexlKVEsnpJGvjAMVnB6E3r26Sb5PQDcAwlJhgczIF0vnORN//ryXKWaGJdpyTLOi1a78IAJp76Mm0Cc1+XjF2rGQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=hT0pK7n3dH+PZ5LGb1wwP8mkt2A7mUf1PCIeydCbZfOqNSbSKOwNGkxWRp3xr4aPGGtMx1eK61Xyt7h2YGrFfvdSUCRdLGNS2BunlIUuq8SqGdxHIK829DTsOGKBUbEPWJzj/d6E4FC8xaBfUuz6ugBEq47VdX8vEtuc1XwNFis= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eyspbvXX; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eyspbvXX" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206716; x=1791742716; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + b=eyspbvXX6JZaLuPx9mP9k7AsJvdPNK3nA7Eu1n1ZjnjSeOqzlt2GEvCx + IIbDfmBwRBwDACT7YDm/5WXc6cuJLsO02ejx9sBoouGuZkUHl1/nB7J2O + i/e0/jcb0J2buciIQ3OvuzUhegT0ZaiQoJUm0tinSNJAyHv/2LoJKLT6E + 1wncP9sm103omUQyz2nIdzytwxhPLCdaTXt3R4jfGDM0HbNy1TRA5Ex3O + eiDpNNIsPslVI7J8r5viBVFuJFJIfp1atbqNY5xQ3zDqGyLEqF5FJMEHK + BGBjTx2SYuiM3sv4eOtztesROh9S4vRoc6wieYXXgBwOgrHLMjZB8S3CI + A==; +X-CSE-ConnectionGUID: 15+3n+5PQLG8KotmRvuIMw== +X-CSE-MsgGUID: Dj1GwDBDRtWs7ASTeti8MA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339940" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339940" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: O+LhKbX0QNyBYwHUAp0ttw== +X-CSE-MsgGUID: PfPvzLkATc2Ca+B9H6Dwng== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487259" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:35 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 18/19] sched/fair: Avoid cache-aware scheduling for memory-heavy processes +Date: Sat, 11 Oct 2025 11:24:55 -0700 +Message-Id: <00da49fd590b95baad0525660bda4c0ba178243d.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Prateek and Tingyin reported that memory-intensive workloads (such as +stream) can saturate memory bandwidth and caches on the preferred LLC +when sched_cache aggregates too many threads. + +To mitigate this, estimate a process's memory footprint by comparing +its RSS (anonymous and shared pages) to the size of the LLC. If RSS +exceeds the LLC size, skip cache-aware scheduling. 
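+
+A minimal sketch of that comparison (RSS taken as the MM_ANONPAGES
+plus MM_SHMEMPAGES counters; the hunk below is the actual check):
+
+	/* sketch: skip aggregation once the RSS covers the whole LLC */
+	static bool rss_exceeds_llc(unsigned long rss_pages, unsigned long llc_bytes)
+	{
+		return rss_pages * PAGE_SIZE >= llc_bytes;
+	}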
+ +Note that RSS is only an approximation of the memory footprint. +By default, the comparison is strict, but a later patch will allow +users to provide a hint to adjust this threshold. + +According to the test from Adam, some systems do not have shared L3 +but with shared L2 as clusters. In this case, the L2 becomes the LLC[1]. + +Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/ + +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + include/linux/cacheinfo.h | 21 ++++++++++------ + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 61 insertions(+), 11 deletions(-) + +diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h +index c8f4f0a0b874..82d0d59ca0e1 100644 +--- a/include/linux/cacheinfo.h ++++ b/include/linux/cacheinfo.h +@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu, + + const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); + +-/* +- * Get the cacheinfo structure for the cache associated with @cpu at +- * level @level. +- * cpuhp lock must be held. +- */ +-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level) + { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + +- lockdep_assert_cpus_held(); +- + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) +@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) + return NULL; + } + ++/* ++ * Get the cacheinfo structure for the cache associated with @cpu at ++ * level @level. ++ * cpuhp lock must be held. ++ */ ++static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++{ ++ lockdep_assert_cpus_held(); ++ ++ return _get_cpu_cacheinfo_level(cpu, level); ++} ++ + /* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6b8eace79eee..46dfcd2a01b3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,38 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cacheinfo *ci; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use _get_cpu_cacheinfo_level() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 3); ++ if (!ci) { ++ /* ++ * On system without L3 but with shared L2, ++ * L2 becomes the LLC. 
++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 2); ++ if (!ci) ++ return true; ++ } ++ ++ llc = ci->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1402,7 +1434,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1486,6 +1519,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + return; + } + ++ /* ++ * Do not check exceed_llc_nr() because ++ * the active number of threads needs to ++ * been updated anyway. ++ */ ++ if (exceed_llc_capacity(mm, curr_cpu)) ++ return; ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9845,8 +9886,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * skip cache aware load balance for single/too many threads ++ * or large footprint. ++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch b/sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch new file mode 100644 index 0000000..893d5f6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.17/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch @@ -0,0 +1,366 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 18E8128D850 + for ; Sat, 11 Oct 2025 18:18:37 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206719; cv=none; b=Xx1TJtOzMlihMYBSPUxuHxJ0Qjx1gDS60TVsBbaW2YAWG207+fLDuebhtY/m9byeKfuUMx/7RVc7mR4xE94pKemXSaF1s6z/Ug1MSbyJDL/f+gYUVN9JWyZVsl4nskC5I36GvI9Reswdcqif7FIqp4+OT03g4Ursen0Zl0KoJs4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206719; c=relaxed/simple; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=TqNPDsqjikNan+1NtjFEbAg77jx9c3inhDW4V8l0uRiJhbQOXCuc9b1G6bYocgAvzvRSIQ0C9pHEOzGrnitQnTKHR4lM01jV+sq5AGE2Z0YUwNbJ3G2iOFzcz198JhG1QAmKUE7Vocf7AQigiloGd31ZcAGpFcHlx+XOPevHRzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=iOR0vW8+; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass 
(p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="iOR0vW8+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206717; x=1791742717; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + b=iOR0vW8+BW1BG+CuQKpeekNgIJXVik0HqP3JsArGSk608O/BAqQp2/2V + NevdC5FBoGU0UJqaEBq3eyHXjM8fq6f/t4e0BsD23dpBBveuXe++OVX8Y + Aapb+EWCp+mFsFeSqc6EHn1EKVQFE1axOMUnDuAWrAcUGMdrmUl0Sqt8l + gPm1isDiRNA4VWnGAtuiefQtTbQsCK7LA3hCWV2kYbD78VwasjvY/a8Zs + eIWoDg9eon7/Ajv/YxTCU8u2KHeYWmlazBkEjZ2+x2uGykUr+ha3ebndP + Ilvnp7dapSvlsm6l5tNbjmODs4GBS1SErTGbDlGwNscJODVWeB1whKGtb + g==; +X-CSE-ConnectionGUID: iwkdIGQ9QpepiaCCmITr2A== +X-CSE-MsgGUID: vpqcAnIxSGm05xalZwxCuA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339958" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339958" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:36 -0700 +X-CSE-ConnectionGUID: l0yVaxC3RhO6SKkG+8NgJA== +X-CSE-MsgGUID: KHjGlLwMQh2OAr5o5sZaPw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487263" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:36 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 19/19] sched/fair: Add user control to adjust the tolerance of cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:56 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +With sched_cache enabled, the scheduler uses a process's RSS as a +proxy for its LLC footprint to determine if aggregating tasks on the +preferred LLC could cause cache contention. If RSS exceeds the LLC +size, aggregation is skipped. Some workloads with large RSS but small +actual memory footprints may still benefit from aggregation. Since +the kernel cannot efficiently track per-task cache usage (resctrl is +user-space only), userspace can provide a more accurate hint. + +Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let +users control how strictly RSS limits aggregation. Values range from +0 to 100: + + - 0: Cache-aware scheduling is disabled. + - 1: Strict; tasks with RSS larger than LLC size are skipped. + - 100: Aggressive; tasks are aggregated regardless of RSS. + +For example, with a 32MB L3 cache: + + - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped. + - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped + (784GB = (1 + (99 - 1) * 256) * 32MB). 
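+
+The implied RSS cutoff for a given tolerance value can be sketched as
+follows (scale factor of 256 per step, per the example above):
+
+	static u64 rss_cutoff_bytes(unsigned int tolerance, u64 llc_size)
+	{
+		if (!tolerance)
+			return 0;		/* cache-aware scheduling disabled */
+		if (tolerance == 100)
+			return U64_MAX;		/* no RSS limit */
+		return (1ULL + (tolerance - 1) * 256ULL) * llc_size;
+	}
+
+For a 32MB L3 this gives 32MB at tolerance 1 and (1 + 98 * 256) * 32MB,
+roughly 784GB, at tolerance 99, matching the figures above.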
+ +Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls +how strictly the number of active threads is considered when doing +cache aware load balance. The number of SMTs is also considered. +High SMT counts reduce the aggregation capacity, preventing excessive +task aggregation on SMT-heavy systems like Power10/Power11. + +For example, with 8 Cores/16 CPUs in a L3: + + - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped. + - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped + 785 = (1 + (99 - 1) * 8). + +Reported-by: K Prateek Nayak +Reported-by: Madadi Vineeth Reddy +Reported-by: Shrikanth Hegde +Reported-by: Tingyin Duan +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 56 ++++++++++++++++++++++++++++++-- + kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++++++++++---- + kernel/sched/sched.h | 3 ++ + 3 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 57bb04ebbf96..cfcd8b436cc5 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,50 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name) \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int percent; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &percent)) \ ++ return -EINVAL; \ ++ if (percent > 100) \ ++ return -EINVAL; \ ++ llc_##name = percent; \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", llc_##name); \ ++ return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(overload_pct); ++SCHED_CACHE_CREATE_CONTROL(imb_pct); ++SCHED_CACHE_CREATE_CONTROL(aggr_tolerance); ++#endif /* SCHED_CACHE */ ++ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -524,8 +568,16 @@ static __init int sched_init_debug(void) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_SCHED_CACHE +- debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); +- debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_overload_pct); ++ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_imb_pct); ++ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_tolerance); ++ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched, ++ &llc_epoch_period); ++ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched, ++ &llc_epoch_affinity_timeout); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 46dfcd2a01b3..f9084e2f9ef2 
100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1207,9 +1207,62 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; ++__read_mostly unsigned int llc_aggr_tolerance = 1; ++__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; ++__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; + + DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); + ++static inline int get_sched_cache_scale(int mul) ++{ ++ if (!llc_aggr_tolerance) ++ return 0; ++ ++ if (llc_aggr_tolerance == 100) ++ return INT_MAX; ++ ++ return (1 + (llc_aggr_tolerance - 1) * mul); ++} ++ ++static inline int get_sched_cache_rss_scale(void) ++{ ++ /* ++ * Suppose the L3 size is 32MB. If the ++ * llc_aggr_tolerance is 1: ++ * When the RSS is larger than 32MB, ++ * the process is regarded as exceeding ++ * the LLC capacity. If the ++ * llc_aggr_tolerance is 99: ++ * When the RSS is larger than 784GB, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ return get_sched_cache_scale(256); ++} ++ ++static inline int get_sched_cache_nr_scale(void) ++{ ++ /* ++ * Suppose the number of Cores in LLC is 8. ++ * Every core has 2 SMTs. ++ * If the llc_aggr_tolerance is 1: When the ++ * nr_running is larger than 8, the process ++ * is regarded as exceeding the LLC capacity. ++ * If the llc_aggr_tolerance is 99: ++ * When the nr_running is larger than 785, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 785 = 1 + (99 - 1) * 8 ++ */ ++ return get_sched_cache_scale(1); ++} ++ ++static inline int get_sched_cache_cap_scale(void) ++{ ++ return (llc_overload_pct / cpu_smt_num_threads); ++} ++ + static inline bool sched_cache_enabled(void) + { + return sched_feat(SCHED_CACHE) && +@@ -1245,6 +1298,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + struct cacheinfo *ci; + unsigned long rss; + unsigned int llc; ++ int scale; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1269,19 +1323,27 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ scale = get_sched_cache_rss_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { +- int smt_nr = 1; ++ int smt_nr = 1, scale; + + #ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) + smt_nr = cpumask_weight(cpu_smt_mask(cpu)); + #endif + +- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++ scale = get_sched_cache_nr_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu))); + } + + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +@@ -1370,9 +1432,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + llc_epoch_period - 1) / llc_epoch_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * llc_epoch_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1432,7 +1494,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, or has too many active 
threads, invalidate + * its preferred state. + */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout || + get_nr_threads(p) <= 1 || + exceed_llc_nr(mm, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { +@@ -9749,7 +9811,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + * (default: ~50%) + */ + #define fits_llc_capacity(util, max) \ +- ((util) * 100 < (max) * llc_overload_pct) ++ ((util) * 100 < (max) * get_sched_cache_cap_scale()) + + /* + * The margin used when comparing utilization. +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b801d32d5fba..97e8558b0530 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2810,6 +2810,9 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern unsigned int llc_aggr_tolerance; ++extern unsigned int llc_epoch_period; ++extern unsigned int llc_epoch_affinity_timeout; + extern struct static_key_false sched_cache_allowed; + #endif + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch b/sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch new file mode 100644 index 0000000..c0a6d12 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0001-amd-pstate.patch @@ -0,0 +1,120 @@ +From 8716b3a723c94c85da3c28a01ce5c23e46341562 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:40:32 +0100 +Subject: [PATCH 01/11] amd-pstate + +Signed-off-by: Peter Jung +--- + drivers/cpufreq/amd-pstate.c | 33 ++++++++++++++------------------- + 1 file changed, 14 insertions(+), 19 deletions(-) + +diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c +index 602e4fa81d6c..c45bc98721d2 100644 +--- a/drivers/cpufreq/amd-pstate.c ++++ b/drivers/cpufreq/amd-pstate.c +@@ -65,13 +65,13 @@ static const char * const amd_pstate_mode_string[] = { + [AMD_PSTATE_PASSIVE] = "passive", + [AMD_PSTATE_ACTIVE] = "active", + [AMD_PSTATE_GUIDED] = "guided", +- NULL, + }; ++static_assert(ARRAY_SIZE(amd_pstate_mode_string) == AMD_PSTATE_MAX); + + const char *amd_pstate_get_mode_string(enum amd_pstate_mode mode) + { +- if (mode < 0 || mode >= AMD_PSTATE_MAX) +- return NULL; ++ if (mode < AMD_PSTATE_UNDEFINED || mode >= AMD_PSTATE_MAX) ++ mode = AMD_PSTATE_UNDEFINED; + return amd_pstate_mode_string[mode]; + } + EXPORT_SYMBOL_GPL(amd_pstate_get_mode_string); +@@ -110,6 +110,7 @@ enum energy_perf_value_index { + EPP_INDEX_BALANCE_PERFORMANCE, + EPP_INDEX_BALANCE_POWERSAVE, + EPP_INDEX_POWERSAVE, ++ EPP_INDEX_MAX, + }; + + static const char * const energy_perf_strings[] = { +@@ -118,8 +119,8 @@ static const char * const energy_perf_strings[] = { + [EPP_INDEX_BALANCE_PERFORMANCE] = "balance_performance", + [EPP_INDEX_BALANCE_POWERSAVE] = "balance_power", + [EPP_INDEX_POWERSAVE] = "power", +- NULL + }; ++static_assert(ARRAY_SIZE(energy_perf_strings) == EPP_INDEX_MAX); + + static unsigned int epp_values[] = { + [EPP_INDEX_DEFAULT] = 0, +@@ -127,7 +128,8 @@ static unsigned int epp_values[] = { + [EPP_INDEX_BALANCE_PERFORMANCE] = AMD_CPPC_EPP_BALANCE_PERFORMANCE, + [EPP_INDEX_BALANCE_POWERSAVE] = AMD_CPPC_EPP_BALANCE_POWERSAVE, + [EPP_INDEX_POWERSAVE] = AMD_CPPC_EPP_POWERSAVE, +- }; ++}; ++static_assert(ARRAY_SIZE(epp_values) == EPP_INDEX_MAX); + + typedef int (*cppc_mode_transition_fn)(int); + +@@ -183,7 +185,7 @@ static inline int get_mode_idx_from_str(const char 
*str, size_t size) + { + int i; + +- for (i=0; i < AMD_PSTATE_MAX; i++) { ++ for (i = 0; i < AMD_PSTATE_MAX; i++) { + if (!strncmp(str, amd_pstate_mode_string[i], size)) + return i; + } +@@ -1137,16 +1139,15 @@ static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, + static ssize_t show_energy_performance_available_preferences( + struct cpufreq_policy *policy, char *buf) + { +- int i = 0; +- int offset = 0; ++ int offset = 0, i; + struct amd_cpudata *cpudata = policy->driver_data; + + if (cpudata->policy == CPUFREQ_POLICY_PERFORMANCE) + return sysfs_emit_at(buf, offset, "%s\n", + energy_perf_strings[EPP_INDEX_PERFORMANCE]); + +- while (energy_perf_strings[i] != NULL) +- offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i++]); ++ for (i = 0; i < ARRAY_SIZE(energy_perf_strings); i++) ++ offset += sysfs_emit_at(buf, offset, "%s ", energy_perf_strings[i]); + + offset += sysfs_emit_at(buf, offset, "\n"); + +@@ -1157,15 +1158,10 @@ static ssize_t store_energy_performance_preference( + struct cpufreq_policy *policy, const char *buf, size_t count) + { + struct amd_cpudata *cpudata = policy->driver_data; +- char str_preference[21]; + ssize_t ret; + u8 epp; + +- ret = sscanf(buf, "%20s", str_preference); +- if (ret != 1) +- return -EINVAL; +- +- ret = match_string(energy_perf_strings, -1, str_preference); ++ ret = sysfs_match_string(energy_perf_strings, buf); + if (ret < 0) + return -EINVAL; + +@@ -1353,9 +1349,8 @@ int amd_pstate_update_status(const char *buf, size_t size) + return -EINVAL; + + mode_idx = get_mode_idx_from_str(buf, size); +- +- if (mode_idx < 0 || mode_idx >= AMD_PSTATE_MAX) +- return -EINVAL; ++ if (mode_idx < 0) ++ return mode_idx; + + if (mode_state_machine[cppc_state][mode_idx]) { + guard(mutex)(&amd_pstate_driver_lock); +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.6/0002-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-6.18/0002-glitched-additional-timer-tick-frequencies.patch similarity index 100% rename from sys-kernel/gentoo-sources-6.6/0002-glitched-additional-timer-tick-frequencies.patch rename to sys-kernel/gentoo-sources-6.18/0002-glitched-additional-timer-tick-frequencies.patch diff --git a/sys-kernel/gentoo-sources-6.18/0004-bbr3.patch b/sys-kernel/gentoo-sources-6.18/0004-bbr3.patch new file mode 100644 index 0000000..0522c37 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0004-bbr3.patch @@ -0,0 +1,3394 @@ +From f475869a64305975245f8d0f4ab1942bacbabf5a Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:41:09 +0100 +Subject: [PATCH 04/11] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/net/tcp_ecn.h | 6 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2233 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 42 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 17 files changed, 1939 insertions(+), 554 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 20b8c6e21fef..e334b7a7aac2 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -236,7 +236,8 @@ struct tcp_sock { + tcp_usec_ts : 1, /* TSval values in usec */ + is_sack_reneg:1, /* in recovery from loss with SACK 
reneg? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ +- recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ +@@ -292,7 +293,8 @@ struct tcp_sock { + * 0x5?10 << 16 + snd_wnd in net byte order + */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? */ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index b4b886647607..0dcce6489e56 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index ab20f549b8f9..e3bcdc0be05e 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -403,6 +403,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +@@ -838,6 +840,15 @@ static inline u32 __tcp_set_rto(const struct tcp_sock *tp) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -943,6 +954,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1053,9 +1069,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1168,6 +1189,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost 
segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1190,7 +1212,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). */ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1210,10 +1236,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1224,7 +1253,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1248,8 +1279,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. 
(optional) +@@ -1315,6 +1349,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1334,6 +1376,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1346,6 +1389,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. 
+@@ -2531,7 +2589,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h +index f13e5cd2b1ac..bc5de05260eb 100644 +--- a/include/net/tcp_ecn.h ++++ b/include/net/tcp_ecn.h +@@ -583,10 +583,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -604,6 +603,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + } +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dce3113787a7..6efba4f74f6f 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 12850a277251..3b8b96692fb4 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
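As a usage note for the help text above: with CONFIG_TCP_CONG_BBR built, a single connection can opt in to BBR through the standard TCP_CONGESTION socket option, while system-wide selection goes through the net.ipv4.tcp_congestion_control sysctl. A minimal sketch; use_bbr() is an illustrative wrapper, not an API added by this patch:

#include <stdio.h>
#include <string.h>
#include <sys/socket.h>
#include <netinet/in.h>
#include <netinet/tcp.h>

/* Ask the kernel to run this socket with the "bbr" congestion control. */
static int use_bbr(int fd)
{
	static const char name[] = "bbr";

	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, name, strlen(name)) < 0) {
		perror("setsockopt(TCP_CONGESTION)");
		return -1;
	}
	return 0;
}

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);

	if (fd < 0 || use_bbr(fd) < 0)
		return 1;
	return 0;
}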
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 8a18aeca7ab0..fe4c1b143de1 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3469,6 +3469,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4226,6 +4227,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..9279be755c16 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
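To make the lo/hi operating range described above concrete, here is a standalone sketch that mirrors bbr_max_bw() and bbr_bw() as they appear later in this patch; the struct and function names are illustrative only:

#include <stdio.h>

struct bw_model {
	unsigned int bw_hi[2];	/* robust long-term bound: max recent samples */
	unsigned int bw_lo;	/* conservative short-term lower bound */
};

/* Long-term estimate: max of the two windowed bw_hi samples. */
static unsigned int model_max_bw(const struct bw_model *m)
{
	return m->bw_hi[0] > m->bw_hi[1] ? m->bw_hi[0] : m->bw_hi[1];
}

/* Bandwidth actually used: the long-term max, capped by bw_lo. */
static unsigned int model_bw(const struct bw_model *m)
{
	unsigned int hi = model_max_bw(m);

	return hi < m->bw_lo ? hi : m->bw_lo;
}

int main(void)
{
	/* After recent loss/ECN, bw_lo may sit below max(bw_hi[]): prints 900. */
	struct bw_model m = { .bw_hi = { 1000, 1200 }, .bw_lo = 900 };

	printf("%u\n", model_bw(&m));
	return 0;
}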
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,123 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return tcp_ecn_mode_any(tp) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +384,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +411,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +435,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +458,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +475,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +536,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +549,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +581,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +601,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +672,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +683,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +712,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +741,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +797,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +805,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +851,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +860,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +888,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +925,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +948,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +973,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2362,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2399,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index e4a979b75cc6..a0d7b9586e36 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -358,7 +358,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && +@@ -376,7 +376,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; +@@ -1289,7 +1289,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1654,6 +1659,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3889,7 +3905,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3906,6 +3923,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3916,6 +3934,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4042,6 +4065,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4114,7 +4138,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4139,6 +4163,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4164,7 +4189,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5891,13 +5916,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 2ec8c6f1cdcc..3e39a40867b5 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -500,6 +500,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index b94efb3050d2..3f11efcf0e98 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -347,7 +347,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1759,7 +1760,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1834,6 +1835,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2190,13 +2215,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2937,6 +2961,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -3149,6 +3174,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 2dd73a4e8e51..3d35afdbf803 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -702,6 +702,7 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/0005-block.patch b/sys-kernel/gentoo-sources-6.18/0005-block.patch new file mode 100644 index 0000000..f11165a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0005-block.patch @@ -0,0 +1,214 @@ +From fa484998399ea55d03e50fb401ee0992f4666793 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:56:28 +0100 +Subject: [PATCH 05/11] block + +Signed-off-by: Peter Jung +--- + block/bfq-iosched.c | 52 +++++++++++++++++++++++++++++++++++++++------ + block/bfq-iosched.h | 12 +++++++++-- + block/mq-deadline.c | 19 +++++++++++++---- + 3 files changed, 70 insertions(+), 13 deletions(-) + +diff --git a/block/bfq-iosched.c b/block/bfq-iosched.c +index 4a8d3d96bfe4..4c0c9e125211 100644 +--- a/block/bfq-iosched.c ++++ b/block/bfq-iosched.c +@@ -460,6 +460,21 @@ static struct bfq_io_cq *bfq_bic_lookup(struct request_queue *q) + return icq_to_bic(ioc_lookup_icq(q)); + } + ++static struct bfq_io_cq *bfq_bic_try_lookup(struct request_queue *q) ++{ ++ if (!current->io_context) ++ return NULL; ++ if (spin_trylock_irq(&q->queue_lock)) { ++ struct bfq_io_cq *icq; ++ ++ icq = icq_to_bic(ioc_lookup_icq(q)); ++ spin_unlock_irq(&q->queue_lock); ++ return icq; ++ } ++ ++ return NULL; ++} ++ + /* + * Scheduler run of queue, if there are requests pending and no one in the + * driver that will restart queueing. 
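The bfq and mq-deadline hunks that follow both convert the bio-merge path from an unconditional spin_lock to an opportunistic trylock, for the reason spelled out in the added comments: merging on this path is optional, so under contention it is cheaper to skip the attempt than to wait for the lock. As a rough, self-contained userspace sketch of that pattern (the names below are invented for illustration and are not part of the patch):

#include <pthread.h>
#include <stdbool.h>

struct merge_queue {
	pthread_mutex_t lock;	/* stands in for the scheduler's queue lock */
	/* ... merge bookkeeping would live here ... */
};

/* Optional work: attempt it only if the lock is free, otherwise give up. */
static bool try_optional_merge(struct merge_queue *q)
{
	if (pthread_mutex_trylock(&q->lock) != 0)
		return false;	/* contended: skip, the caller just queues the bio */
	/* ... look for a merge candidate while holding the lock ... */
	pthread_mutex_unlock(&q->lock);
	return true;
}

The same contention-avoidance idea appears again further down, where bfq_dispatch_request() returns early if the BFQ_DISPATCHING bit is already set, so that only one context dispatches at a time.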
+@@ -2448,11 +2463,22 @@ static bool bfq_bio_merge(struct request_queue *q, struct bio *bio, + unsigned int nr_segs) + { + struct bfq_data *bfqd = q->elevator->elevator_data; +- struct bfq_io_cq *bic = bfq_bic_lookup(q); ++ struct bfq_io_cq *bic = bfq_bic_try_lookup(q); + struct request *free = NULL; + bool ret; + +- spin_lock_irq(&bfqd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the bfqd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path are a lot slimmer, so skipping an ++ * occasional lookup that will most likely not succeed anyway should ++ * not be a problem. ++ */ ++ if (!spin_trylock_irq(&bfqd->lock)) ++ return false; + + if (bic) { + /* +@@ -5301,6 +5327,18 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + struct bfq_queue *in_serv_queue; + bool waiting_rq, idle_timer_disabled = false; + ++ /* ++ * If someone else is already dispatching, skip this one. This will ++ * defer the next dispatch event to when something completes, and could ++ * potentially lower the queue depth for contended cases. ++ * ++ * See the logic in blk_mq_do_dispatch_sched(), which loops and ++ * retries if nothing is dispatched. ++ */ ++ if (test_bit(BFQ_DISPATCHING, &bfqd->run_state) || ++ test_and_set_bit_lock(BFQ_DISPATCHING, &bfqd->run_state)) ++ return NULL; ++ + spin_lock_irq(&bfqd->lock); + + in_serv_queue = bfqd->in_service_queue; +@@ -5312,6 +5350,7 @@ static struct request *bfq_dispatch_request(struct blk_mq_hw_ctx *hctx) + waiting_rq && !bfq_bfqq_wait_request(in_serv_queue); + } + ++ clear_bit_unlock(BFQ_DISPATCHING, &bfqd->run_state); + spin_unlock_irq(&bfqd->lock); + bfq_update_dispatch_stats(hctx->queue, rq, + idle_timer_disabled ? in_serv_queue : NULL, +@@ -6233,10 +6272,9 @@ static inline void bfq_update_insert_stats(struct request_queue *q, + + static struct bfq_queue *bfq_init_rq(struct request *rq); + +-static void bfq_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void bfq_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags) + { +- struct request_queue *q = hctx->queue; + struct bfq_data *bfqd = q->elevator->elevator_data; + struct bfq_queue *bfqq; + bool idle_timer_disabled = false; +@@ -6298,7 +6336,7 @@ static void bfq_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- bfq_insert_request(hctx, rq, flags); ++ bfq_insert_request(hctx->queue, rq, flags); + } + } + +@@ -7218,6 +7256,8 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) + q->elevator = eq; + spin_unlock_irq(&q->queue_lock); + ++ spin_lock_init(&bfqd->lock); ++ + /* + * Our fallback bfqq if bfq_find_alloc_queue() runs into OOM issues.
+ * Grab a permanent reference to it, so that the normal code flow +@@ -7335,8 +7375,6 @@ static int bfq_init_queue(struct request_queue *q, struct elevator_queue *eq) + /* see comments on the definition of next field inside bfq_data */ + bfqd->actuator_load_threshold = 4; + +- spin_lock_init(&bfqd->lock); +- + /* + * The invocation of the next bfq_create_group_hierarchy + * function is the head of a chain of function calls +diff --git a/block/bfq-iosched.h b/block/bfq-iosched.h +index 34a498e6b2a5..bef03e57b0a5 100644 +--- a/block/bfq-iosched.h ++++ b/block/bfq-iosched.h +@@ -504,12 +504,22 @@ struct bfq_io_cq { + unsigned int requests; /* Number of requests this process has in flight */ + }; + ++enum { ++ BFQ_DISPATCHING = 0, ++}; ++ + /** + * struct bfq_data - per-device data structure. + * + * All the fields are protected by @lock. + */ + struct bfq_data { ++ struct { ++ spinlock_t lock; ++ } ____cacheline_aligned_in_smp; ++ ++ unsigned long run_state; ++ + /* device request queue */ + struct request_queue *queue; + /* dispatch queue */ +@@ -795,8 +805,6 @@ struct bfq_data { + /* fallback dummy bfqq for extreme OOM conditions */ + struct bfq_queue oom_bfqq; + +- spinlock_t lock; +- + /* + * bic associated with the task issuing current bio for + * merging. This and the next field are used as a support to +diff --git a/block/mq-deadline.c b/block/mq-deadline.c +index 3e3719093aec..525ce44bd14b 100644 +--- a/block/mq-deadline.c ++++ b/block/mq-deadline.c +@@ -623,7 +623,19 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + struct request *free = NULL; + bool ret; + +- spin_lock(&dd->lock); ++ /* ++ * bio merging is called for every bio queued, and it's very easy ++ * to run into contention because of that. If we fail getting ++ * the dd lock, just skip this merge attempt. For related IO, the ++ * plug will be the successful merging point. If we get here, we ++ * already failed doing the obvious merge. Chances of actually ++ * getting a merge off this path are a lot slimmer, so skipping an ++ * occasional lookup that will most likely not succeed anyway should
++ */ ++ if (!spin_trylock(&dd->lock)) ++ return false; ++ + ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); + spin_unlock(&dd->lock); + +@@ -636,10 +648,9 @@ static bool dd_bio_merge(struct request_queue *q, struct bio *bio, + /* + * add rq to rbtree and fifo + */ +-static void dd_insert_request(struct blk_mq_hw_ctx *hctx, struct request *rq, ++static void dd_insert_request(struct request_queue *q, struct request *rq, + blk_insert_t flags, struct list_head *free) + { +- struct request_queue *q = hctx->queue; + struct deadline_data *dd = q->elevator->elevator_data; + const enum dd_data_dir data_dir = rq_data_dir(rq); + u16 ioprio = req_get_ioprio(rq); +@@ -697,7 +708,7 @@ static void dd_insert_requests(struct blk_mq_hw_ctx *hctx, + + rq = list_first_entry(list, struct request, queuelist); + list_del_init(&rq->queuelist); +- dd_insert_request(hctx, rq, flags, &free); ++ dd_insert_request(q, rq, flags, &free); + } + spin_unlock(&dd->lock); + +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/0007-crypto.patch b/sys-kernel/gentoo-sources-6.18/0007-crypto.patch new file mode 100644 index 0000000..8cb7cd9 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0007-crypto.patch @@ -0,0 +1,3441 @@ +From 19c062d3d4cd46ac9095f8ef8133c0e3c01a9d4f Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:42:00 +0100 +Subject: [PATCH 07/11] crypto + +Signed-off-by: Peter Jung +--- + arch/x86/crypto/Makefile | 5 +- + arch/x86/crypto/aes-gcm-aesni-x86_64.S | 12 +- + arch/x86/crypto/aes-gcm-vaes-avx2.S | 1150 +++++++++++++++++ + ...m-avx10-x86_64.S => aes-gcm-vaes-avx512.S} | 722 +++++------ + arch/x86/crypto/aesni-intel_glue.c | 264 ++-- + drivers/md/Kconfig | 1 + + drivers/md/dm-verity-fec.c | 21 +- + drivers/md/dm-verity-fec.h | 5 +- + drivers/md/dm-verity-target.c | 203 ++- + drivers/md/dm-verity.h | 52 +- + include/linux/rhashtable.h | 70 +- + 11 files changed, 1921 insertions(+), 584 deletions(-) + create mode 100644 arch/x86/crypto/aes-gcm-vaes-avx2.S + rename arch/x86/crypto/{aes-gcm-avx10-x86_64.S => aes-gcm-vaes-avx512.S} (69%) + +diff --git a/arch/x86/crypto/Makefile b/arch/x86/crypto/Makefile +index 2d30d5d36145..6409e3009524 100644 +--- a/arch/x86/crypto/Makefile ++++ b/arch/x86/crypto/Makefile +@@ -46,8 +46,9 @@ obj-$(CONFIG_CRYPTO_AES_NI_INTEL) += aesni-intel.o + aesni-intel-y := aesni-intel_asm.o aesni-intel_glue.o + aesni-intel-$(CONFIG_64BIT) += aes-ctr-avx-x86_64.o \ + aes-gcm-aesni-x86_64.o \ +- aes-xts-avx-x86_64.o \ +- aes-gcm-avx10-x86_64.o ++ aes-gcm-vaes-avx2.o \ ++ aes-gcm-vaes-avx512.o \ ++ aes-xts-avx-x86_64.o + + obj-$(CONFIG_CRYPTO_GHASH_CLMUL_NI_INTEL) += ghash-clmulni-intel.o + ghash-clmulni-intel-y := ghash-clmulni-intel_asm.o ghash-clmulni-intel_glue.o +diff --git a/arch/x86/crypto/aes-gcm-aesni-x86_64.S b/arch/x86/crypto/aes-gcm-aesni-x86_64.S +index 45940e2883a0..7c8a8a32bd3c 100644 +--- a/arch/x86/crypto/aes-gcm-aesni-x86_64.S ++++ b/arch/x86/crypto/aes-gcm-aesni-x86_64.S +@@ -61,15 +61,15 @@ + // for the *_aesni functions or AVX for the *_aesni_avx ones. (But it seems + // there are no CPUs that support AES-NI without also PCLMULQDQ and SSE4.1.) + // +-// The design generally follows that of aes-gcm-avx10-x86_64.S, and that file is ++// The design generally follows that of aes-gcm-vaes-avx512.S, and that file is + // more thoroughly commented. This file has the following notable changes: + // + // - The vector length is fixed at 128-bit, i.e. xmm registers. This means + // there is only one AES block (and GHASH block) per register. 
+ // +-// - Without AVX512 / AVX10, only 16 SIMD registers are available instead of +-// 32. We work around this by being much more careful about using +-// registers, relying heavily on loads to load values as they are needed. ++// - Without AVX512, only 16 SIMD registers are available instead of 32. We ++// work around this by being much more careful about using registers, ++// relying heavily on loads to load values as they are needed. + // + // - Masking is not available either. We work around this by implementing + // partial block loads and stores using overlapping scalar loads and stores +@@ -90,8 +90,8 @@ + // multiplication instead of schoolbook multiplication. This saves one + // pclmulqdq instruction per block, at the cost of one 64-bit load, one + // pshufd, and 0.25 pxors per block. (This is without the three-argument +-// XOR support that would be provided by AVX512 / AVX10, which would be +-// more beneficial to schoolbook than Karatsuba.) ++// XOR support that would be provided by AVX512, which would be more ++// beneficial to schoolbook than Karatsuba.) + // + // As a rough approximation, we can assume that Karatsuba multiplication is + // faster than schoolbook multiplication in this context if one pshufd and +diff --git a/arch/x86/crypto/aes-gcm-vaes-avx2.S b/arch/x86/crypto/aes-gcm-vaes-avx2.S +new file mode 100644 +index 000000000000..5ccbd85383cd +--- /dev/null ++++ b/arch/x86/crypto/aes-gcm-vaes-avx2.S +@@ -0,0 +1,1150 @@ ++/* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ ++// ++// AES-GCM implementation for x86_64 CPUs that support the following CPU ++// features: VAES && VPCLMULQDQ && AVX2 ++// ++// Copyright 2025 Google LLC ++// ++// Author: Eric Biggers ++// ++//------------------------------------------------------------------------------ ++// ++// This file is dual-licensed, meaning that you can use it under your choice of ++// either of the following two licenses: ++// ++// Licensed under the Apache License 2.0 (the "License"). You may obtain a copy ++// of the License at ++// ++// http://www.apache.org/licenses/LICENSE-2.0 ++// ++// Unless required by applicable law or agreed to in writing, software ++// distributed under the License is distributed on an "AS IS" BASIS, ++// WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. ++// See the License for the specific language governing permissions and ++// limitations under the License. ++// ++// or ++// ++// Redistribution and use in source and binary forms, with or without ++// modification, are permitted provided that the following conditions are met: ++// ++// 1. Redistributions of source code must retain the above copyright notice, ++// this list of conditions and the following disclaimer. ++// ++// 2. Redistributions in binary form must reproduce the above copyright ++// notice, this list of conditions and the following disclaimer in the ++// documentation and/or other materials provided with the distribution. ++// ++// THIS SOFTWARE IS PROVIDED BY THE COPYRIGHT HOLDERS AND CONTRIBUTORS "AS IS" ++// AND ANY EXPRESS OR IMPLIED WARRANTIES, INCLUDING, BUT NOT LIMITED TO, THE ++// IMPLIED WARRANTIES OF MERCHANTABILITY AND FITNESS FOR A PARTICULAR PURPOSE ++// ARE DISCLAIMED. 
IN NO EVENT SHALL THE COPYRIGHT HOLDER OR CONTRIBUTORS BE ++// LIABLE FOR ANY DIRECT, INDIRECT, INCIDENTAL, SPECIAL, EXEMPLARY, OR ++// CONSEQUENTIAL DAMAGES (INCLUDING, BUT NOT LIMITED TO, PROCUREMENT OF ++// SUBSTITUTE GOODS OR SERVICES; LOSS OF USE, DATA, OR PROFITS; OR BUSINESS ++// INTERRUPTION) HOWEVER CAUSED AND ON ANY THEORY OF LIABILITY, WHETHER IN ++// CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) ++// ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE ++// POSSIBILITY OF SUCH DAMAGE. ++// ++// ----------------------------------------------------------------------------- ++// ++// This is similar to aes-gcm-vaes-avx512.S, but it uses AVX2 instead of AVX512. ++// This means it can only use 16 vector registers instead of 32, the maximum ++// vector length is 32 bytes, and some instructions such as vpternlogd and ++// masked loads/stores are unavailable. However, it is able to run on CPUs that ++// have VAES without AVX512, namely AMD Zen 3 (including "Milan" server CPUs), ++// various Intel client CPUs such as Alder Lake, and Intel Sierra Forest. ++// ++// This implementation also uses Karatsuba multiplication instead of schoolbook ++// multiplication for GHASH in its main loop. This does not help much on Intel, ++// but it improves performance by ~5% on AMD Zen 3. Other factors weighing ++// slightly in favor of Karatsuba multiplication in this implementation are the ++// lower maximum vector length (which means there are fewer key powers, so we ++// can cache the halves of each key power XOR'd together and still use less ++// memory than the AVX512 implementation), and the unavailability of the ++// vpternlogd instruction (which helped schoolbook a bit more than Karatsuba). ++ ++#include ++ ++.section .rodata ++.p2align 4 ++ ++ // The below three 16-byte values must be in the order that they are, as ++ // they are really two 32-byte tables and a 16-byte value that overlap: ++ // ++ // - The first 32-byte table begins at .Lselect_high_bytes_table. ++ // For 0 <= len <= 16, the 16-byte value at ++ // '.Lselect_high_bytes_table + len' selects the high 'len' bytes of ++ // another 16-byte value when AND'ed with it. ++ // ++ // - The second 32-byte table begins at .Lrshift_and_bswap_table. ++ // For 0 <= len <= 16, the 16-byte value at ++ // '.Lrshift_and_bswap_table + len' is a vpshufb mask that does the ++ // following operation: right-shift by '16 - len' bytes (shifting in ++ // zeroes), then reflect all 16 bytes. ++ // ++ // - The 16-byte value at .Lbswap_mask is a vpshufb mask that reflects ++ // all 16 bytes. ++.Lselect_high_bytes_table: ++ .octa 0 ++.Lrshift_and_bswap_table: ++ .octa 0xffffffffffffffffffffffffffffffff ++.Lbswap_mask: ++ .octa 0x000102030405060708090a0b0c0d0e0f ++ ++ // Sixteen 0x0f bytes. By XOR'ing an entry of .Lrshift_and_bswap_table ++ // with this, we get a mask that left-shifts by '16 - len' bytes. ++.Lfifteens: ++ .octa 0x0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f0f ++ ++ // This is the GHASH reducing polynomial without its constant term, i.e. ++ // x^128 + x^7 + x^2 + x, represented using the backwards mapping ++ // between bits and polynomial coefficients. ++ // ++ // Alternatively, it can be interpreted as the naturally-ordered ++ // representation of the polynomial x^127 + x^126 + x^121 + 1, i.e. the ++ // "reversed" GHASH reducing polynomial without its x^128 term. ++.Lgfpoly: ++ .octa 0xc2000000000000000000000000000001 ++ ++ // Same as above, but with the (1 << 64) bit set. 
++.Lgfpoly_and_internal_carrybit: ++ .octa 0xc2000000000000010000000000000001 ++ ++ // Values needed to prepare the initial vector of counter blocks. ++.Lctr_pattern: ++ .octa 0 ++ .octa 1 ++ ++ // The number of AES blocks per vector, as a 128-bit value. ++.Linc_2blocks: ++ .octa 2 ++ ++// Offsets in struct aes_gcm_key_vaes_avx2 ++#define OFFSETOF_AESKEYLEN 480 ++#define OFFSETOF_H_POWERS 512 ++#define NUM_H_POWERS 8 ++#define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) ++#define OFFSETOF_H_POWERS_XORED OFFSETOFEND_H_POWERS ++ ++.text ++ ++// Do one step of GHASH-multiplying the 128-bit lanes of \a by the 128-bit lanes ++// of \b and storing the reduced products in \dst. Uses schoolbook ++// multiplication. ++.macro _ghash_mul_step i, a, b, dst, gfpoly, t0, t1, t2 ++.if \i == 0 ++ vpclmulqdq $0x00, \a, \b, \t0 // LO = a_L * b_L ++ vpclmulqdq $0x01, \a, \b, \t1 // MI_0 = a_L * b_H ++.elseif \i == 1 ++ vpclmulqdq $0x10, \a, \b, \t2 // MI_1 = a_H * b_L ++.elseif \i == 2 ++ vpxor \t2, \t1, \t1 // MI = MI_0 + MI_1 ++.elseif \i == 3 ++ vpclmulqdq $0x01, \t0, \gfpoly, \t2 // LO_L*(x^63 + x^62 + x^57) ++.elseif \i == 4 ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++.elseif \i == 5 ++ vpxor \t0, \t1, \t1 // Fold LO into MI (part 1) ++ vpxor \t2, \t1, \t1 // Fold LO into MI (part 2) ++.elseif \i == 6 ++ vpclmulqdq $0x11, \a, \b, \dst // HI = a_H * b_H ++.elseif \i == 7 ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++.elseif \i == 8 ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++.elseif \i == 9 ++ vpxor \t1, \dst, \dst // Fold MI into HI (part 1) ++ vpxor \t0, \dst, \dst // Fold MI into HI (part 2) ++.endif ++.endm ++ ++// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and store ++// the reduced products in \dst. See _ghash_mul_step for full explanation. ++.macro _ghash_mul a, b, dst, gfpoly, t0, t1, t2 ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_mul_step \i, \a, \b, \dst, \gfpoly, \t0, \t1, \t2 ++.endr ++.endm ++ ++// GHASH-multiply the 128-bit lanes of \a by the 128-bit lanes of \b and add the ++// *unreduced* products to \lo, \mi, and \hi. ++.macro _ghash_mul_noreduce a, b, lo, mi, hi, t0 ++ vpclmulqdq $0x00, \a, \b, \t0 // a_L * b_L ++ vpxor \t0, \lo, \lo ++ vpclmulqdq $0x01, \a, \b, \t0 // a_L * b_H ++ vpxor \t0, \mi, \mi ++ vpclmulqdq $0x10, \a, \b, \t0 // a_H * b_L ++ vpxor \t0, \mi, \mi ++ vpclmulqdq $0x11, \a, \b, \t0 // a_H * b_H ++ vpxor \t0, \hi, \hi ++.endm ++ ++// Reduce the unreduced products from \lo, \mi, and \hi and store the 128-bit ++// reduced products in \hi. See _ghash_mul_step for explanation of reduction. ++.macro _ghash_reduce lo, mi, hi, gfpoly, t0 ++ vpclmulqdq $0x01, \lo, \gfpoly, \t0 ++ vpshufd $0x4e, \lo, \lo ++ vpxor \lo, \mi, \mi ++ vpxor \t0, \mi, \mi ++ vpclmulqdq $0x01, \mi, \gfpoly, \t0 ++ vpshufd $0x4e, \mi, \mi ++ vpxor \mi, \hi, \hi ++ vpxor \t0, \hi, \hi ++.endm ++ ++// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it ++// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. 
++.macro _ghash_square a, dst, gfpoly, t0, t1 ++ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L ++ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H ++ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++ vpxor \t0, \t1, \t1 // Fold LO into MI ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++ vpxor \t1, \dst, \dst // Fold MI into HI (part 1) ++ vpxor \t0, \dst, \dst // Fold MI into HI (part 2) ++.endm ++ ++// void aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); ++// ++// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and ++// initialize |key->h_powers| and |key->h_powers_xored|. ++// ++// We use h_powers[0..7] to store H^8 through H^1, and h_powers_xored[0..7] to ++// store the 64-bit halves of the key powers XOR'd together (for Karatsuba ++// multiplication) in the order 8,6,7,5,4,2,3,1. ++SYM_FUNC_START(aes_gcm_precompute_vaes_avx2) ++ ++ // Function arguments ++ .set KEY, %rdi ++ ++ // Additional local variables ++ .set POWERS_PTR, %rsi ++ .set RNDKEYLAST_PTR, %rdx ++ .set TMP0, %ymm0 ++ .set TMP0_XMM, %xmm0 ++ .set TMP1, %ymm1 ++ .set TMP1_XMM, %xmm1 ++ .set TMP2, %ymm2 ++ .set TMP2_XMM, %xmm2 ++ .set H_CUR, %ymm3 ++ .set H_CUR_XMM, %xmm3 ++ .set H_CUR2, %ymm4 ++ .set H_CUR2_XMM, %xmm4 ++ .set H_INC, %ymm5 ++ .set H_INC_XMM, %xmm5 ++ .set GFPOLY, %ymm6 ++ .set GFPOLY_XMM, %xmm6 ++ ++ // Encrypt an all-zeroes block to get the raw hash subkey. ++ movl OFFSETOF_AESKEYLEN(KEY), %eax ++ lea 6*16(KEY,%rax,4), RNDKEYLAST_PTR ++ vmovdqu (KEY), H_CUR_XMM // Zero-th round key XOR all-zeroes block ++ lea 16(KEY), %rax ++1: ++ vaesenc (%rax), H_CUR_XMM, H_CUR_XMM ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne 1b ++ vaesenclast (RNDKEYLAST_PTR), H_CUR_XMM, H_CUR_XMM ++ ++ // Reflect the bytes of the raw hash subkey. ++ vpshufb .Lbswap_mask(%rip), H_CUR_XMM, H_CUR_XMM ++ ++ // Finish preprocessing the byte-reflected hash subkey by multiplying it ++ // by x^-1 ("standard" interpretation of polynomial coefficients) or ++ // equivalently x^1 (natural interpretation). This gets the key into a ++ // format that avoids having to bit-reflect the data blocks later. ++ vpshufd $0xd3, H_CUR_XMM, TMP0_XMM ++ vpsrad $31, TMP0_XMM, TMP0_XMM ++ vpaddq H_CUR_XMM, H_CUR_XMM, H_CUR_XMM ++ vpand .Lgfpoly_and_internal_carrybit(%rip), TMP0_XMM, TMP0_XMM ++ vpxor TMP0_XMM, H_CUR_XMM, H_CUR_XMM ++ ++ // Load the gfpoly constant. ++ vbroadcasti128 .Lgfpoly(%rip), GFPOLY ++ ++ // Square H^1 to get H^2. ++ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, TMP0_XMM, TMP1_XMM ++ ++ // Create H_CUR = [H^2, H^1] and H_INC = [H^2, H^2]. ++ vinserti128 $1, H_CUR_XMM, H_INC, H_CUR ++ vinserti128 $1, H_INC_XMM, H_INC, H_INC ++ ++ // Compute H_CUR2 = [H^4, H^3]. ++ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 ++ ++ // Store [H^2, H^1] and [H^4, H^3]. ++ vmovdqu H_CUR, OFFSETOF_H_POWERS+3*32(KEY) ++ vmovdqu H_CUR2, OFFSETOF_H_POWERS+2*32(KEY) ++ ++ // For Karatsuba multiplication: compute and store the two 64-bit halves ++ // of each key power XOR'd together. Order is 4,2,3,1. ++ vpunpcklqdq H_CUR, H_CUR2, TMP0 ++ vpunpckhqdq H_CUR, H_CUR2, TMP1 ++ vpxor TMP1, TMP0, TMP0 ++ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED+32(KEY) ++ ++ // Compute and store H_CUR = [H^6, H^5] and H_CUR2 = [H^8, H^7]. 
++ _ghash_mul H_INC, H_CUR2, H_CUR, GFPOLY, TMP0, TMP1, TMP2 ++ _ghash_mul H_INC, H_CUR, H_CUR2, GFPOLY, TMP0, TMP1, TMP2 ++ vmovdqu H_CUR, OFFSETOF_H_POWERS+1*32(KEY) ++ vmovdqu H_CUR2, OFFSETOF_H_POWERS+0*32(KEY) ++ ++ // Again, compute and store the two 64-bit halves of each key power ++ // XOR'd together. Order is 8,6,7,5. ++ vpunpcklqdq H_CUR, H_CUR2, TMP0 ++ vpunpckhqdq H_CUR, H_CUR2, TMP1 ++ vpxor TMP1, TMP0, TMP0 ++ vmovdqu TMP0, OFFSETOF_H_POWERS_XORED(KEY) ++ ++ vzeroupper ++ RET ++SYM_FUNC_END(aes_gcm_precompute_vaes_avx2) ++ ++// Do one step of the GHASH update of four vectors of data blocks. ++// \i: the step to do, 0 through 9 ++// \ghashdata_ptr: pointer to the data blocks (ciphertext or AAD) ++// KEY: pointer to struct aes_gcm_key_vaes_avx2 ++// BSWAP_MASK: mask for reflecting the bytes of blocks ++// H_POW[2-1]_XORED: cached values from KEY->h_powers_xored ++// TMP[0-2]: temporary registers. TMP[1-2] must be preserved across steps. ++// LO, MI: working state for this macro that must be preserved across steps ++// GHASH_ACC: the GHASH accumulator (input/output) ++.macro _ghash_step_4x i, ghashdata_ptr ++ .set HI, GHASH_ACC # alias ++ .set HI_XMM, GHASH_ACC_XMM ++.if \i == 0 ++ // First vector ++ vmovdqu 0*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++ vmovdqu OFFSETOF_H_POWERS+0*32(KEY), TMP2 ++ vpxor GHASH_ACC, TMP1, TMP1 ++ vpclmulqdq $0x00, TMP2, TMP1, LO ++ vpclmulqdq $0x11, TMP2, TMP1, HI ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x00, H_POW2_XORED, TMP0, MI ++.elseif \i == 1 ++.elseif \i == 2 ++ // Second vector ++ vmovdqu 1*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++ vmovdqu OFFSETOF_H_POWERS+1*32(KEY), TMP2 ++ vpclmulqdq $0x00, TMP2, TMP1, TMP0 ++ vpxor TMP0, LO, LO ++ vpclmulqdq $0x11, TMP2, TMP1, TMP0 ++ vpxor TMP0, HI, HI ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x10, H_POW2_XORED, TMP0, TMP0 ++ vpxor TMP0, MI, MI ++.elseif \i == 3 ++ // Third vector ++ vmovdqu 2*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++ vmovdqu OFFSETOF_H_POWERS+2*32(KEY), TMP2 ++.elseif \i == 4 ++ vpclmulqdq $0x00, TMP2, TMP1, TMP0 ++ vpxor TMP0, LO, LO ++ vpclmulqdq $0x11, TMP2, TMP1, TMP0 ++ vpxor TMP0, HI, HI ++.elseif \i == 5 ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x00, H_POW1_XORED, TMP0, TMP0 ++ vpxor TMP0, MI, MI ++ ++ // Fourth vector ++ vmovdqu 3*32(\ghashdata_ptr), TMP1 ++ vpshufb BSWAP_MASK, TMP1, TMP1 ++.elseif \i == 6 ++ vmovdqu OFFSETOF_H_POWERS+3*32(KEY), TMP2 ++ vpclmulqdq $0x00, TMP2, TMP1, TMP0 ++ vpxor TMP0, LO, LO ++ vpclmulqdq $0x11, TMP2, TMP1, TMP0 ++ vpxor TMP0, HI, HI ++ vpunpckhqdq TMP1, TMP1, TMP0 ++ vpxor TMP1, TMP0, TMP0 ++ vpclmulqdq $0x10, H_POW1_XORED, TMP0, TMP0 ++ vpxor TMP0, MI, MI ++.elseif \i == 7 ++ // Finalize 'mi' following Karatsuba multiplication. ++ vpxor LO, MI, MI ++ vpxor HI, MI, MI ++ ++ // Fold lo into mi. ++ vbroadcasti128 .Lgfpoly(%rip), TMP2 ++ vpclmulqdq $0x01, LO, TMP2, TMP0 ++ vpshufd $0x4e, LO, LO ++ vpxor LO, MI, MI ++ vpxor TMP0, MI, MI ++.elseif \i == 8 ++ // Fold mi into hi. ++ vpclmulqdq $0x01, MI, TMP2, TMP0 ++ vpshufd $0x4e, MI, MI ++ vpxor MI, HI, HI ++ vpxor TMP0, HI, HI ++.elseif \i == 9 ++ vextracti128 $1, HI, TMP0_XMM ++ vpxor TMP0_XMM, HI_XMM, GHASH_ACC_XMM ++.endif ++.endm ++ ++// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full ++// explanation. 
++.macro _ghash_4x ghashdata_ptr ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_step_4x \i, \ghashdata_ptr ++.endr ++.endm ++ ++// Load 1 <= %ecx <= 16 bytes from the pointer \src into the xmm register \dst ++// and zeroize any remaining bytes. Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _load_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jle .Lle8\@ ++ ++ // Load 9 <= LEN <= 16 bytes. ++ vmovq (\src), \dst // Load first 8 bytes ++ mov (\src, %rcx), %rax // Load last 8 bytes ++ neg %ecx ++ shl $3, %ecx ++ shr %cl, %rax // Discard overlapping bytes ++ vpinsrq $1, %rax, \dst, \dst ++ jmp .Ldone\@ ++ ++.Lle8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Load 4 <= LEN <= 8 bytes. ++ mov (\src), %eax // Load first 4 bytes ++ mov (\src, %rcx), \tmp32 // Load last 4 bytes ++ jmp .Lcombine\@ ++ ++.Llt4\@: ++ // Load 1 <= LEN <= 3 bytes. ++ add $2, %ecx // LEN - 2 ++ movzbl (\src), %eax // Load first byte ++ jl .Lmovq\@ ++ movzwl (\src, %rcx), \tmp32 // Load last 2 bytes ++.Lcombine\@: ++ shl $3, %ecx ++ shl %cl, \tmp64 ++ or \tmp64, %rax // Combine the two parts ++.Lmovq\@: ++ vmovq %rax, \dst ++.Ldone\@: ++.endm ++ ++// Store 1 <= %ecx <= 16 bytes from the xmm register \src to the pointer \dst. ++// Clobbers %rax, %rcx, and \tmp{64,32}. ++.macro _store_partial_block src, dst, tmp64, tmp32 ++ sub $8, %ecx // LEN - 8 ++ jl .Llt8\@ ++ ++ // Store 8 <= LEN <= 16 bytes. ++ vpextrq $1, \src, %rax ++ mov %ecx, \tmp32 ++ shl $3, %ecx ++ ror %cl, %rax ++ mov %rax, (\dst, \tmp64) // Store last LEN - 8 bytes ++ vmovq \src, (\dst) // Store first 8 bytes ++ jmp .Ldone\@ ++ ++.Llt8\@: ++ add $4, %ecx // LEN - 4 ++ jl .Llt4\@ ++ ++ // Store 4 <= LEN <= 7 bytes. ++ vpextrd $1, \src, %eax ++ mov %ecx, \tmp32 ++ shl $3, %ecx ++ ror %cl, %eax ++ mov %eax, (\dst, \tmp64) // Store last LEN - 4 bytes ++ vmovd \src, (\dst) // Store first 4 bytes ++ jmp .Ldone\@ ++ ++.Llt4\@: ++ // Store 1 <= LEN <= 3 bytes. ++ vpextrb $0, \src, 0(\dst) ++ cmp $-2, %ecx // LEN - 4 == -2, i.e. LEN == 2? ++ jl .Ldone\@ ++ vpextrb $1, \src, 1(\dst) ++ je .Ldone\@ ++ vpextrb $2, \src, 2(\dst) ++.Ldone\@: ++.endm ++ ++// void aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// u8 ghash_acc[16], ++// const u8 *aad, int aadlen); ++// ++// This function processes the AAD (Additional Authenticated Data) in GCM. ++// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the ++// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all ++// zeroes. |aadlen| must be a multiple of 16, except on the last call where it ++// can be any length. The caller must do any buffering needed to ensure this. ++// ++// This handles large amounts of AAD efficiently, while also keeping overhead ++// low for small amounts which is the common case. TLS and IPsec use less than ++// one block of AAD, but (uncommonly) other use cases may use much more. ++SYM_FUNC_START(aes_gcm_aad_update_vaes_avx2) ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set GHASH_ACC_PTR, %rsi ++ .set AAD, %rdx ++ .set AADLEN, %ecx // Must be %ecx for _load_partial_block ++ .set AADLEN64, %rcx // Zero-extend AADLEN before using! ++ ++ // Additional local variables. ++ // %rax and %r8 are used as temporary registers. 
++ .set TMP0, %ymm0 ++ .set TMP0_XMM, %xmm0 ++ .set TMP1, %ymm1 ++ .set TMP1_XMM, %xmm1 ++ .set TMP2, %ymm2 ++ .set TMP2_XMM, %xmm2 ++ .set LO, %ymm3 ++ .set LO_XMM, %xmm3 ++ .set MI, %ymm4 ++ .set MI_XMM, %xmm4 ++ .set GHASH_ACC, %ymm5 ++ .set GHASH_ACC_XMM, %xmm5 ++ .set BSWAP_MASK, %ymm6 ++ .set BSWAP_MASK_XMM, %xmm6 ++ .set GFPOLY, %ymm7 ++ .set GFPOLY_XMM, %xmm7 ++ .set H_POW2_XORED, %ymm8 ++ .set H_POW1_XORED, %ymm9 ++ ++ // Load the bswap_mask and gfpoly constants. Since AADLEN is usually ++ // small, usually only 128-bit vectors will be used. So as an ++ // optimization, don't broadcast these constants to both 128-bit lanes ++ // quite yet. ++ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM ++ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM ++ ++ // Load the GHASH accumulator. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ ++ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. ++ test AADLEN, AADLEN ++ jz .Laad_done ++ cmp $16, AADLEN ++ jle .Laad_lastblock ++ ++ // AADLEN > 16, so we'll operate on full vectors. Broadcast bswap_mask ++ // and gfpoly to both 128-bit lanes. ++ vinserti128 $1, BSWAP_MASK_XMM, BSWAP_MASK, BSWAP_MASK ++ vinserti128 $1, GFPOLY_XMM, GFPOLY, GFPOLY ++ ++ // If AADLEN >= 128, update GHASH with 128 bytes of AAD at a time. ++ add $-128, AADLEN // 128 is 4 bytes, -128 is 1 byte ++ jl .Laad_loop_4x_done ++ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED ++ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED ++.Laad_loop_4x: ++ _ghash_4x AAD ++ sub $-128, AAD ++ add $-128, AADLEN ++ jge .Laad_loop_4x ++.Laad_loop_4x_done: ++ ++ // If AADLEN >= 32, update GHASH with 32 bytes of AAD at a time. ++ add $96, AADLEN ++ jl .Laad_loop_1x_done ++.Laad_loop_1x: ++ vmovdqu (AAD), TMP0 ++ vpshufb BSWAP_MASK, TMP0, TMP0 ++ vpxor TMP0, GHASH_ACC, GHASH_ACC ++ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 ++ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO ++ vextracti128 $1, GHASH_ACC, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ add $32, AAD ++ sub $32, AADLEN ++ jge .Laad_loop_1x ++.Laad_loop_1x_done: ++ add $32, AADLEN ++ // Now 0 <= AADLEN < 32. ++ ++ jz .Laad_done ++ cmp $16, AADLEN ++ jle .Laad_lastblock ++ ++.Laad_last2blocks: ++ // Update GHASH with the remaining 17 <= AADLEN <= 31 bytes of AAD. ++ mov AADLEN, AADLEN // Zero-extend AADLEN to AADLEN64. ++ vmovdqu (AAD), TMP0_XMM ++ vmovdqu -16(AAD, AADLEN64), TMP1_XMM ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ lea .Lrshift_and_bswap_table(%rip), %rax ++ vpshufb -16(%rax, AADLEN64), TMP1_XMM, TMP1_XMM ++ vinserti128 $1, TMP1_XMM, GHASH_ACC, GHASH_ACC ++ vmovdqu OFFSETOFEND_H_POWERS-32(KEY), TMP0 ++ _ghash_mul TMP0, GHASH_ACC, GHASH_ACC, GFPOLY, TMP1, TMP2, LO ++ vextracti128 $1, GHASH_ACC, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ jmp .Laad_done ++ ++.Laad_lastblock: ++ // Update GHASH with the remaining 1 <= AADLEN <= 16 bytes of AAD. ++ _load_partial_block AAD, TMP0_XMM, %r8, %r8d ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, TMP0_XMM ++ vpxor TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), TMP0_XMM ++ _ghash_mul TMP0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ ++ TMP1_XMM, TMP2_XMM, LO_XMM ++ ++.Laad_done: ++ // Store the updated GHASH accumulator back to memory. 
++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper ++ RET ++SYM_FUNC_END(aes_gcm_aad_update_vaes_avx2) ++ ++// Do one non-last round of AES encryption on the blocks in the given AESDATA ++// vectors using the round key that has been broadcast to all 128-bit lanes of ++// \round_key. ++.macro _vaesenc round_key, vecs:vararg ++.irp i, \vecs ++ vaesenc \round_key, AESDATA\i, AESDATA\i ++.endr ++.endm ++ ++// Generate counter blocks in the given AESDATA vectors, then do the zero-th AES ++// round on them. Clobbers TMP0. ++.macro _ctr_begin vecs:vararg ++ vbroadcasti128 .Linc_2blocks(%rip), TMP0 ++.irp i, \vecs ++ vpshufb BSWAP_MASK, LE_CTR, AESDATA\i ++ vpaddd TMP0, LE_CTR, LE_CTR ++.endr ++.irp i, \vecs ++ vpxor RNDKEY0, AESDATA\i, AESDATA\i ++.endr ++.endm ++ ++// Generate and encrypt counter blocks in the given AESDATA vectors, excluding ++// the last AES round. Clobbers TMP0. ++.macro _aesenc_loop vecs:vararg ++ _ctr_begin \vecs ++ lea 16(KEY), %rax ++.Laesenc_loop\@: ++ vbroadcasti128 (%rax), TMP0 ++ _vaesenc TMP0, \vecs ++ add $16, %rax ++ cmp %rax, RNDKEYLAST_PTR ++ jne .Laesenc_loop\@ ++.endm ++ ++// Finalize the keystream blocks in the given AESDATA vectors by doing the last ++// AES round, then XOR those keystream blocks with the corresponding data. ++// Reduce latency by doing the XOR before the vaesenclast, utilizing the ++// property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). Clobbers TMP0. ++.macro _aesenclast_and_xor vecs:vararg ++.irp i, \vecs ++ vpxor \i*32(SRC), RNDKEYLAST, TMP0 ++ vaesenclast TMP0, AESDATA\i, AESDATA\i ++.endr ++.irp i, \vecs ++ vmovdqu AESDATA\i, \i*32(DST) ++.endr ++.endm ++ ++// void aes_gcm_{enc,dec}_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// const u8 *src, u8 *dst, int datalen); ++// ++// This macro generates a GCM encryption or decryption update function with the ++// above prototype (with \enc selecting which one). The function computes the ++// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, ++// and writes the resulting encrypted or decrypted data to |dst|. It also ++// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext ++// bytes. ++// ++// |datalen| must be a multiple of 16, except on the last call where it can be ++// any length. The caller must do any buffering needed to ensure this. Both ++// in-place and out-of-place en/decryption are supported. ++// ++// |le_ctr| must give the current counter in little-endian format. This ++// function loads the counter from |le_ctr| and increments the loaded counter as ++// needed, but it does *not* store the updated counter back to |le_ctr|. The ++// caller must update |le_ctr| if any more data segments follow. Internally, ++// only the low 32-bit word of the counter is incremented, following the GCM ++// standard. ++.macro _aes_gcm_update enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set LE_CTR_PTR32, %esi ++ .set GHASH_ACC_PTR, %rdx ++ .set SRC, %rcx // Assumed to be %rcx. ++ // See .Ltail_xor_and_ghash_partial_vec ++ .set DST, %r8 ++ .set DATALEN, %r9d ++ .set DATALEN64, %r9 // Zero-extend DATALEN before using! ++ ++ // Additional local variables ++ ++ // %rax is used as a temporary register. LE_CTR_PTR is also available ++ // as a temporary register after the counter is loaded. 
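The _aesenclast_and_xor macro relies on the identity vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a), which holds because the last AES round is SubBytes and ShiftRows followed by a plain XOR with the round key. A small user-space check of that property using AES-NI intrinsics, for illustration only (compile with -maes):

#include <stdio.h>
#include <string.h>
#include <immintrin.h>  /* AES-NI intrinsics */

int main(void)
{
        __m128i state = _mm_set_epi32(0x00112233, 0x44556677, 0x8899aabb, 0xccddeeff);
        __m128i rk    = _mm_set_epi32(0x0f0e0d0c, 0x0b0a0908, 0x07060504, 0x03020100);
        __m128i data  = _mm_set_epi32(0x01234567, 0x89abcdef, 0xfedcba98, 0x76543210);

        /* Conventional order: finish the last AES round, then XOR the data. */
        __m128i a = _mm_xor_si128(_mm_aesenclast_si128(state, rk), data);

        /* Order used by the assembly: fold the data into the round key first,
         * so the final XOR is hidden inside the vaesenclast itself. */
        __m128i b = _mm_aesenclast_si128(state, _mm_xor_si128(rk, data));

        printf("identity holds: %d\n", !memcmp(&a, &b, sizeof(a)));
        return 0;
}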
++ ++ // AES key length in bytes ++ .set AESKEYLEN, %r10d ++ .set AESKEYLEN64, %r10 ++ ++ // Pointer to the last AES round key for the chosen AES variant ++ .set RNDKEYLAST_PTR, %r11 ++ ++ // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values ++ // using vpshufb, copied to all 128-bit lanes. ++ .set BSWAP_MASK, %ymm0 ++ .set BSWAP_MASK_XMM, %xmm0 ++ ++ // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, ++ // only the lowest 128-bit lane can be nonzero. When not fully reduced, ++ // more than one lane may be used, and they need to be XOR'd together. ++ .set GHASH_ACC, %ymm1 ++ .set GHASH_ACC_XMM, %xmm1 ++ ++ // TMP[0-2] are temporary registers. ++ .set TMP0, %ymm2 ++ .set TMP0_XMM, %xmm2 ++ .set TMP1, %ymm3 ++ .set TMP1_XMM, %xmm3 ++ .set TMP2, %ymm4 ++ .set TMP2_XMM, %xmm4 ++ ++ // LO and MI are used to accumulate unreduced GHASH products. ++ .set LO, %ymm5 ++ .set LO_XMM, %xmm5 ++ .set MI, %ymm6 ++ .set MI_XMM, %xmm6 ++ ++ // H_POW[2-1]_XORED contain cached values from KEY->h_powers_xored. The ++ // descending numbering reflects the order of the key powers. ++ .set H_POW2_XORED, %ymm7 ++ .set H_POW2_XORED_XMM, %xmm7 ++ .set H_POW1_XORED, %ymm8 ++ .set H_POW1_XORED_XMM, %xmm8 ++ ++ // RNDKEY0 caches the zero-th round key, and RNDKEYLAST the last one. ++ .set RNDKEY0, %ymm9 ++ .set RNDKEYLAST, %ymm10 ++ ++ // LE_CTR contains the next set of little-endian counter blocks. ++ .set LE_CTR, %ymm11 ++ ++ // AESDATA[0-3] hold the counter blocks that are being encrypted by AES. ++ .set AESDATA0, %ymm12 ++ .set AESDATA0_XMM, %xmm12 ++ .set AESDATA1, %ymm13 ++ .set AESDATA1_XMM, %xmm13 ++ .set AESDATA2, %ymm14 ++ .set AESDATA2_XMM, %xmm14 ++ .set AESDATA3, %ymm15 ++ .set AESDATA3_XMM, %xmm15 ++ ++.if \enc ++ .set GHASHDATA_PTR, DST ++.else ++ .set GHASHDATA_PTR, SRC ++.endif ++ ++ vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK ++ ++ // Load the GHASH accumulator and the starting counter. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ vbroadcasti128 (LE_CTR_PTR), LE_CTR ++ ++ // Load the AES key length in bytes. ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Make RNDKEYLAST_PTR point to the last AES round key. This is the ++ // round key with index 10, 12, or 14 for AES-128, AES-192, or AES-256 ++ // respectively. Then load the zero-th and last round keys. ++ lea 6*16(KEY,AESKEYLEN64,4), RNDKEYLAST_PTR ++ vbroadcasti128 (KEY), RNDKEY0 ++ vbroadcasti128 (RNDKEYLAST_PTR), RNDKEYLAST ++ ++ // Finish initializing LE_CTR by adding 1 to the second block. ++ vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR ++ ++ // If there are at least 128 bytes of data, then continue into the loop ++ // that processes 128 bytes of data at a time. Otherwise skip it. ++ add $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte ++ jl .Lcrypt_loop_4x_done\@ ++ ++ vmovdqu OFFSETOF_H_POWERS_XORED(KEY), H_POW2_XORED ++ vmovdqu OFFSETOF_H_POWERS_XORED+32(KEY), H_POW1_XORED ++ ++ // Main loop: en/decrypt and hash 4 vectors (128 bytes) at a time. ++ ++.if \enc ++ // Encrypt the first 4 vectors of plaintext blocks. ++ _aesenc_loop 0,1,2,3 ++ _aesenclast_and_xor 0,1,2,3 ++ sub $-128, SRC // 128 is 4 bytes, -128 is 1 byte ++ add $-128, DATALEN ++ jl .Lghash_last_ciphertext_4x\@ ++.endif ++ ++.align 16 ++.Lcrypt_loop_4x\@: ++ ++ // Start the AES encryption of the counter blocks. ++ _ctr_begin 0,1,2,3 ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? 
++ // AES-256 ++ vbroadcasti128 -13*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++ vbroadcasti128 -12*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++192: ++ vbroadcasti128 -11*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++ vbroadcasti128 -10*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++128: ++ ++ // Finish the AES encryption of the counter blocks in AESDATA[0-3], ++ // interleaved with the GHASH update of the ciphertext blocks. ++.irp i, 9,8,7,6,5,4,3,2,1 ++ _ghash_step_4x (9 - \i), GHASHDATA_PTR ++ vbroadcasti128 -\i*16(RNDKEYLAST_PTR), TMP0 ++ _vaesenc TMP0, 0,1,2,3 ++.endr ++ _ghash_step_4x 9, GHASHDATA_PTR ++.if \enc ++ sub $-128, DST // 128 is 4 bytes, -128 is 1 byte ++.endif ++ _aesenclast_and_xor 0,1,2,3 ++ sub $-128, SRC ++.if !\enc ++ sub $-128, DST ++.endif ++ add $-128, DATALEN ++ jge .Lcrypt_loop_4x\@ ++ ++.if \enc ++.Lghash_last_ciphertext_4x\@: ++ // Update GHASH with the last set of ciphertext blocks. ++ _ghash_4x DST ++ sub $-128, DST ++.endif ++ ++.Lcrypt_loop_4x_done\@: ++ ++ // Undo the extra subtraction by 128 and check whether data remains. ++ sub $-128, DATALEN // 128 is 4 bytes, -128 is 1 byte ++ jz .Ldone\@ ++ ++ // The data length isn't a multiple of 128 bytes. Process the remaining ++ // data of length 1 <= DATALEN < 128. ++ // ++ // Since there are enough key powers available for all remaining data, ++ // there is no need to do a GHASH reduction after each iteration. ++ // Instead, multiply each remaining block by its own key power, and only ++ // do a GHASH reduction at the very end. ++ ++ // Make POWERS_PTR point to the key powers [H^N, H^(N-1), ...] where N ++ // is the number of blocks that remain. ++ .set POWERS_PTR, LE_CTR_PTR // LE_CTR_PTR is free to be reused. ++ .set POWERS_PTR32, LE_CTR_PTR32 ++ mov DATALEN, %eax ++ neg %rax ++ and $~15, %rax // -round_up(DATALEN, 16) ++ lea OFFSETOFEND_H_POWERS(KEY,%rax), POWERS_PTR ++ ++ // Start collecting the unreduced GHASH intermediate value LO, MI, HI. ++ .set HI, H_POW2_XORED // H_POW2_XORED is free to be reused. ++ .set HI_XMM, H_POW2_XORED_XMM ++ vpxor LO_XMM, LO_XMM, LO_XMM ++ vpxor MI_XMM, MI_XMM, MI_XMM ++ vpxor HI_XMM, HI_XMM, HI_XMM ++ ++ // 1 <= DATALEN < 128. Generate 2 or 4 more vectors of keystream blocks ++ // excluding the last AES round, depending on the remaining DATALEN. ++ cmp $64, DATALEN ++ jg .Ltail_gen_4_keystream_vecs\@ ++ _aesenc_loop 0,1 ++ cmp $32, DATALEN ++ jge .Ltail_xor_and_ghash_full_vec_loop\@ ++ jmp .Ltail_xor_and_ghash_partial_vec\@ ++.Ltail_gen_4_keystream_vecs\@: ++ _aesenc_loop 0,1,2,3 ++ ++ // XOR the remaining data and accumulate the unreduced GHASH products ++ // for DATALEN >= 32, starting with one full 32-byte vector at a time. ++.Ltail_xor_and_ghash_full_vec_loop\@: ++.if \enc ++ _aesenclast_and_xor 0 ++ vpshufb BSWAP_MASK, AESDATA0, AESDATA0 ++.else ++ vmovdqu (SRC), TMP1 ++ vpxor TMP1, RNDKEYLAST, TMP0 ++ vaesenclast TMP0, AESDATA0, AESDATA0 ++ vmovdqu AESDATA0, (DST) ++ vpshufb BSWAP_MASK, TMP1, AESDATA0 ++.endif ++ // The ciphertext blocks (i.e. GHASH input data) are now in AESDATA0. 
++ vpxor GHASH_ACC, AESDATA0, AESDATA0 ++ vmovdqu (POWERS_PTR), TMP2 ++ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 ++ vmovdqa AESDATA1, AESDATA0 ++ vmovdqa AESDATA2, AESDATA1 ++ vmovdqa AESDATA3, AESDATA2 ++ vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ add $32, SRC ++ add $32, DST ++ add $32, POWERS_PTR ++ sub $32, DATALEN ++ cmp $32, DATALEN ++ jge .Ltail_xor_and_ghash_full_vec_loop\@ ++ test DATALEN, DATALEN ++ jz .Ltail_ghash_reduce\@ ++ ++.Ltail_xor_and_ghash_partial_vec\@: ++ // XOR the remaining data and accumulate the unreduced GHASH products, ++ // for 1 <= DATALEN < 32. ++ vaesenclast RNDKEYLAST, AESDATA0, AESDATA0 ++ cmp $16, DATALEN ++ jle .Ltail_xor_and_ghash_1to16bytes\@ ++ ++ // Handle 17 <= DATALEN < 32. ++ ++ // Load a vpshufb mask that will right-shift by '32 - DATALEN' bytes ++ // (shifting in zeroes), then reflect all 16 bytes. ++ lea .Lrshift_and_bswap_table(%rip), %rax ++ vmovdqu -16(%rax, DATALEN64), TMP2_XMM ++ ++ // Move the second keystream block to its own register and left-align it ++ vextracti128 $1, AESDATA0, AESDATA1_XMM ++ vpxor .Lfifteens(%rip), TMP2_XMM, TMP0_XMM ++ vpshufb TMP0_XMM, AESDATA1_XMM, AESDATA1_XMM ++ ++ // Using overlapping loads and stores, XOR the source data with the ++ // keystream and write the destination data. Then prepare the GHASH ++ // input data: the full ciphertext block and the zero-padded partial ++ // ciphertext block, both byte-reflected, in AESDATA0. ++.if \enc ++ vpxor -16(SRC, DATALEN64), AESDATA1_XMM, AESDATA1_XMM ++ vpxor (SRC), AESDATA0_XMM, AESDATA0_XMM ++ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) ++ vmovdqu AESDATA0_XMM, (DST) ++ vpshufb TMP2_XMM, AESDATA1_XMM, AESDATA1_XMM ++ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM ++.else ++ vmovdqu -16(SRC, DATALEN64), TMP1_XMM ++ vmovdqu (SRC), TMP0_XMM ++ vpxor TMP1_XMM, AESDATA1_XMM, AESDATA1_XMM ++ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM ++ vmovdqu AESDATA1_XMM, -16(DST, DATALEN64) ++ vmovdqu AESDATA0_XMM, (DST) ++ vpshufb TMP2_XMM, TMP1_XMM, AESDATA1_XMM ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM ++.endif ++ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM ++ vinserti128 $1, AESDATA1_XMM, AESDATA0, AESDATA0 ++ vmovdqu (POWERS_PTR), TMP2 ++ jmp .Ltail_ghash_last_vec\@ ++ ++.Ltail_xor_and_ghash_1to16bytes\@: ++ // Handle 1 <= DATALEN <= 16. Carefully load and store the ++ // possibly-partial block, which we mustn't access out of bounds. ++ vmovdqu (POWERS_PTR), TMP2_XMM ++ mov SRC, KEY // Free up %rcx, assuming SRC == %rcx ++ mov DATALEN, %ecx ++ _load_partial_block KEY, TMP0_XMM, POWERS_PTR, POWERS_PTR32 ++ vpxor TMP0_XMM, AESDATA0_XMM, AESDATA0_XMM ++ mov DATALEN, %ecx ++ _store_partial_block AESDATA0_XMM, DST, POWERS_PTR, POWERS_PTR32 ++.if \enc ++ lea .Lselect_high_bytes_table(%rip), %rax ++ vpshufb BSWAP_MASK_XMM, AESDATA0_XMM, AESDATA0_XMM ++ vpand (%rax, DATALEN64), AESDATA0_XMM, AESDATA0_XMM ++.else ++ vpshufb BSWAP_MASK_XMM, TMP0_XMM, AESDATA0_XMM ++.endif ++ vpxor GHASH_ACC_XMM, AESDATA0_XMM, AESDATA0_XMM ++ ++.Ltail_ghash_last_vec\@: ++ // Accumulate the unreduced GHASH products for the last 1-2 blocks. The ++ // GHASH input data is in AESDATA0. If only one block remains, then the ++ // second block in AESDATA0 is zero and does not affect the result. ++ _ghash_mul_noreduce TMP2, AESDATA0, LO, MI, HI, TMP0 ++ ++.Ltail_ghash_reduce\@: ++ // Finally, do the GHASH reduction. 
++ vbroadcasti128 .Lgfpoly(%rip), TMP0 ++ _ghash_reduce LO, MI, HI, TMP0, TMP1 ++ vextracti128 $1, HI, GHASH_ACC_XMM ++ vpxor HI_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ ++.Ldone\@: ++ // Store the updated GHASH accumulator back to memory. ++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper ++ RET ++.endm ++ ++// void aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen); ++// bool aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++// const u32 le_ctr[4], const u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen, ++// const u8 tag[16], int taglen); ++// ++// This macro generates one of the above two functions (with \enc selecting ++// which one). Both functions finish computing the GCM authentication tag by ++// updating GHASH with the lengths block and encrypting the GHASH accumulator. ++// |total_aadlen| and |total_datalen| must be the total length of the additional ++// authenticated data and the en/decrypted data in bytes, respectively. ++// ++// The encryption function then stores the full-length (16-byte) computed ++// authentication tag to |ghash_acc|. The decryption function instead loads the ++// expected authentication tag (the one that was transmitted) from the 16-byte ++// buffer |tag|, compares the first 4 <= |taglen| <= 16 bytes of it to the ++// computed tag in constant time, and returns true if and only if they match. ++.macro _aes_gcm_final enc ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set LE_CTR_PTR, %rsi ++ .set GHASH_ACC_PTR, %rdx ++ .set TOTAL_AADLEN, %rcx ++ .set TOTAL_DATALEN, %r8 ++ .set TAG, %r9 ++ .set TAGLEN, %r10d // Originally at 8(%rsp) ++ .set TAGLEN64, %r10 ++ ++ // Additional local variables. ++ // %rax and %xmm0-%xmm3 are used as temporary registers. ++ .set AESKEYLEN, %r11d ++ .set AESKEYLEN64, %r11 ++ .set GFPOLY, %xmm4 ++ .set BSWAP_MASK, %xmm5 ++ .set LE_CTR, %xmm6 ++ .set GHASH_ACC, %xmm7 ++ .set H_POW1, %xmm8 ++ ++ // Load some constants. ++ vmovdqa .Lgfpoly(%rip), GFPOLY ++ vmovdqa .Lbswap_mask(%rip), BSWAP_MASK ++ ++ // Load the AES key length in bytes. ++ movl OFFSETOF_AESKEYLEN(KEY), AESKEYLEN ++ ++ // Set up a counter block with 1 in the low 32-bit word. This is the ++ // counter that produces the ciphertext needed to encrypt the auth tag. ++ // GFPOLY has 1 in the low word, so grab the 1 from there using a blend. ++ vpblendd $0xe, (LE_CTR_PTR), GFPOLY, LE_CTR ++ ++ // Build the lengths block and XOR it with the GHASH accumulator. ++ // Although the lengths block is defined as the AAD length followed by ++ // the en/decrypted data length, both in big-endian byte order, a byte ++ // reflection of the full block is needed because of the way we compute ++ // GHASH (see _ghash_mul_step). By using little-endian values in the ++ // opposite order, we avoid having to reflect any bytes here. ++ vmovq TOTAL_DATALEN, %xmm0 ++ vpinsrq $1, TOTAL_AADLEN, %xmm0, %xmm0 ++ vpsllq $3, %xmm0, %xmm0 // Bytes to bits ++ vpxor (GHASH_ACC_PTR), %xmm0, GHASH_ACC ++ ++ // Load the first hash key power (H^1), which is stored last. ++ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1 ++ ++ // Load TAGLEN if decrypting. ++.if !\enc ++ movl 8(%rsp), TAGLEN ++.endif ++ ++ // Make %rax point to the last AES round key for the chosen AES variant. ++ lea 6*16(KEY,AESKEYLEN64,4), %rax ++ ++ // Start the AES encryption of the counter block by swapping the counter ++ // block to big-endian and XOR-ing it with the zero-th AES round key. 
++ vpshufb BSWAP_MASK, LE_CTR, %xmm0 ++ vpxor (KEY), %xmm0, %xmm0 ++ ++ // Complete the AES encryption and multiply GHASH_ACC by H^1. ++ // Interleave the AES and GHASH instructions to improve performance. ++ cmp $24, AESKEYLEN ++ jl 128f // AES-128? ++ je 192f // AES-192? ++ // AES-256 ++ vaesenc -13*16(%rax), %xmm0, %xmm0 ++ vaesenc -12*16(%rax), %xmm0, %xmm0 ++192: ++ vaesenc -11*16(%rax), %xmm0, %xmm0 ++ vaesenc -10*16(%rax), %xmm0, %xmm0 ++128: ++.irp i, 0,1,2,3,4,5,6,7,8 ++ _ghash_mul_step \i, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %xmm1, %xmm2, %xmm3 ++ vaesenc (\i-9)*16(%rax), %xmm0, %xmm0 ++.endr ++ _ghash_mul_step 9, H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ %xmm1, %xmm2, %xmm3 ++ ++ // Undo the byte reflection of the GHASH accumulator. ++ vpshufb BSWAP_MASK, GHASH_ACC, GHASH_ACC ++ ++ // Do the last AES round and XOR the resulting keystream block with the ++ // GHASH accumulator to produce the full computed authentication tag. ++ // ++ // Reduce latency by taking advantage of the property vaesenclast(key, ++ // a) ^ b == vaesenclast(key ^ b, a). I.e., XOR GHASH_ACC into the last ++ // round key, instead of XOR'ing the final AES output with GHASH_ACC. ++ // ++ // enc_final then returns the computed auth tag, while dec_final ++ // compares it with the transmitted one and returns a bool. To compare ++ // the tags, dec_final XORs them together and uses vptest to check ++ // whether the result is all-zeroes. This should be constant-time. ++ // dec_final applies the vaesenclast optimization to this additional ++ // value XOR'd too. ++.if \enc ++ vpxor (%rax), GHASH_ACC, %xmm1 ++ vaesenclast %xmm1, %xmm0, GHASH_ACC ++ vmovdqu GHASH_ACC, (GHASH_ACC_PTR) ++.else ++ vpxor (TAG), GHASH_ACC, GHASH_ACC ++ vpxor (%rax), GHASH_ACC, GHASH_ACC ++ vaesenclast GHASH_ACC, %xmm0, %xmm0 ++ lea .Lselect_high_bytes_table(%rip), %rax ++ vmovdqu (%rax, TAGLEN64), %xmm1 ++ vpshufb BSWAP_MASK, %xmm1, %xmm1 // select low bytes, not high ++ vptest %xmm1, %xmm0 ++ sete %al ++.endif ++ // No need for vzeroupper here, since only used xmm registers were used. ++ RET ++.endm ++ ++SYM_FUNC_START(aes_gcm_enc_update_vaes_avx2) ++ _aes_gcm_update 1 ++SYM_FUNC_END(aes_gcm_enc_update_vaes_avx2) ++SYM_FUNC_START(aes_gcm_dec_update_vaes_avx2) ++ _aes_gcm_update 0 ++SYM_FUNC_END(aes_gcm_dec_update_vaes_avx2) ++ ++SYM_FUNC_START(aes_gcm_enc_final_vaes_avx2) ++ _aes_gcm_final 1 ++SYM_FUNC_END(aes_gcm_enc_final_vaes_avx2) ++SYM_FUNC_START(aes_gcm_dec_final_vaes_avx2) ++ _aes_gcm_final 0 ++SYM_FUNC_END(aes_gcm_dec_final_vaes_avx2) +diff --git a/arch/x86/crypto/aes-gcm-avx10-x86_64.S b/arch/x86/crypto/aes-gcm-vaes-avx512.S +similarity index 69% +rename from arch/x86/crypto/aes-gcm-avx10-x86_64.S +rename to arch/x86/crypto/aes-gcm-vaes-avx512.S +index 02ee11083d4f..06b71314d65c 100644 +--- a/arch/x86/crypto/aes-gcm-avx10-x86_64.S ++++ b/arch/x86/crypto/aes-gcm-vaes-avx512.S +@@ -1,6 +1,7 @@ + /* SPDX-License-Identifier: Apache-2.0 OR BSD-2-Clause */ + // +-// VAES and VPCLMULQDQ optimized AES-GCM for x86_64 ++// AES-GCM implementation for x86_64 CPUs that support the following CPU ++// features: VAES && VPCLMULQDQ && AVX512BW && AVX512VL && BMI2 + // + // Copyright 2024 Google LLC + // +@@ -45,41 +46,6 @@ + // CONTRACT, STRICT LIABILITY, OR TORT (INCLUDING NEGLIGENCE OR OTHERWISE) + // ARISING IN ANY WAY OUT OF THE USE OF THIS SOFTWARE, EVEN IF ADVISED OF THE + // POSSIBILITY OF SUCH DAMAGE. 
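For the tag check in dec_final, the key point is that the comparison of the first taglen bytes is branch-free: the two tags are XOR'd, masked to taglen bytes, and tested for all-zeroes with vptest. The same idea in plain C, similar in spirit to the kernel's crypto_memneq(); the helper name here is illustrative only:

#include <stdint.h>
#include <stdio.h>

/* Compare the first taglen bytes of the computed and transmitted tags
 * without branching on secret data: accumulate the differences with OR,
 * then test the accumulator once. */
static int gcm_tag_equal(const uint8_t *computed, const uint8_t *transmitted,
                         size_t taglen)
{
        uint8_t diff = 0;
        size_t i;

        for (i = 0; i < taglen; i++)
                diff |= computed[i] ^ transmitted[i];
        return diff == 0;
}

int main(void)
{
        uint8_t want[16] = { 0xde, 0xad, 0xbe, 0xef };
        uint8_t got[16]  = { 0xde, 0xad, 0xbe, 0xef };

        printf("match: %d\n", gcm_tag_equal(got, want, 4));
        got[3] ^= 1;
        printf("match after corruption: %d\n", gcm_tag_equal(got, want, 4));
        return 0;
}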
+-// +-//------------------------------------------------------------------------------ +-// +-// This file implements AES-GCM (Galois/Counter Mode) for x86_64 CPUs that +-// support VAES (vector AES), VPCLMULQDQ (vector carryless multiplication), and +-// either AVX512 or AVX10. Some of the functions, notably the encryption and +-// decryption update functions which are the most performance-critical, are +-// provided in two variants generated from a macro: one using 256-bit vectors +-// (suffix: vaes_avx10_256) and one using 512-bit vectors (vaes_avx10_512). The +-// other, "shared" functions (vaes_avx10) use at most 256-bit vectors. +-// +-// The functions that use 512-bit vectors are intended for CPUs that support +-// 512-bit vectors *and* where using them doesn't cause significant +-// downclocking. They require the following CPU features: +-// +-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/512) +-// +-// The other functions require the following CPU features: +-// +-// VAES && VPCLMULQDQ && BMI2 && ((AVX512BW && AVX512VL) || AVX10/256) +-// +-// All functions use the "System V" ABI. The Windows ABI is not supported. +-// +-// Note that we use "avx10" in the names of the functions as a shorthand to +-// really mean "AVX10 or a certain set of AVX512 features". Due to Intel's +-// introduction of AVX512 and then its replacement by AVX10, there doesn't seem +-// to be a simple way to name things that makes sense on all CPUs. +-// +-// Note that the macros that support both 256-bit and 512-bit vectors could +-// fairly easily be changed to support 128-bit too. However, this would *not* +-// be sufficient to allow the code to run on CPUs without AVX512 or AVX10, +-// because the code heavily uses several features of these extensions other than +-// the vector length: the increase in the number of SIMD registers from 16 to +-// 32, masking support, and new instructions such as vpternlogd (which can do a +-// three-argument XOR). These features are very useful for AES-GCM. + + #include + +@@ -104,16 +70,14 @@ + .Lgfpoly_and_internal_carrybit: + .octa 0xc2000000000000010000000000000001 + +- // The below constants are used for incrementing the counter blocks. +- // ctr_pattern points to the four 128-bit values [0, 1, 2, 3]. +- // inc_2blocks and inc_4blocks point to the single 128-bit values 2 and +- // 4. Note that the same '2' is reused in ctr_pattern and inc_2blocks. ++ // Values needed to prepare the initial vector of counter blocks. + .Lctr_pattern: + .octa 0 + .octa 1 +-.Linc_2blocks: + .octa 2 + .octa 3 ++ ++ // The number of AES blocks per vector, as a 128-bit value. + .Linc_4blocks: + .octa 4 + +@@ -130,29 +94,13 @@ + // Offset to end of hash key powers array in the key struct. + // + // This is immediately followed by three zeroized padding blocks, which are +-// included so that partial vectors can be handled more easily. E.g. if VL=64 +-// and two blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most +-// padding blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. ++// included so that partial vectors can be handled more easily. E.g. if two ++// blocks remain, we load the 4 values [H^2, H^1, 0, 0]. The most padding ++// blocks needed is 3, which occurs if [H^1, 0, 0, 0] is loaded. + #define OFFSETOFEND_H_POWERS (OFFSETOF_H_POWERS + (NUM_H_POWERS * 16)) + + .text + +-// Set the vector length in bytes. This sets the VL variable and defines +-// register aliases V0-V31 that map to the ymm or zmm registers. 
+-.macro _set_veclen vl +- .set VL, \vl +-.irp i, 0,1,2,3,4,5,6,7,8,9,10,11,12,13,14,15, \ +- 16,17,18,19,20,21,22,23,24,25,26,27,28,29,30,31 +-.if VL == 32 +- .set V\i, %ymm\i +-.elseif VL == 64 +- .set V\i, %zmm\i +-.else +- .error "Unsupported vector length" +-.endif +-.endr +-.endm +- + // The _ghash_mul_step macro does one step of GHASH multiplication of the + // 128-bit lanes of \a by the corresponding 128-bit lanes of \b and storing the + // reduced products in \dst. \t0, \t1, and \t2 are temporary registers of the +@@ -312,39 +260,44 @@ + vpternlogd $0x96, \t0, \mi, \hi + .endm + +-// void aes_gcm_precompute_##suffix(struct aes_gcm_key_avx10 *key); +-// +-// Given the expanded AES key |key->aes_key|, this function derives the GHASH +-// subkey and initializes |key->ghash_key_powers| with powers of it. +-// +-// The number of key powers initialized is NUM_H_POWERS, and they are stored in +-// the order H^NUM_H_POWERS to H^1. The zeroized padding blocks after the key +-// powers themselves are also initialized. ++// This is a specialized version of _ghash_mul that computes \a * \a, i.e. it ++// squares \a. It skips computing MI = (a_L * a_H) + (a_H * a_L) = 0. ++.macro _ghash_square a, dst, gfpoly, t0, t1 ++ vpclmulqdq $0x00, \a, \a, \t0 // LO = a_L * a_L ++ vpclmulqdq $0x11, \a, \a, \dst // HI = a_H * a_H ++ vpclmulqdq $0x01, \t0, \gfpoly, \t1 // LO_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t0, \t0 // Swap halves of LO ++ vpxord \t0, \t1, \t1 // Fold LO into MI ++ vpclmulqdq $0x01, \t1, \gfpoly, \t0 // MI_L*(x^63 + x^62 + x^57) ++ vpshufd $0x4e, \t1, \t1 // Swap halves of MI ++ vpternlogd $0x96, \t0, \t1, \dst // Fold MI into HI ++.endm ++ ++// void aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); + // +-// This macro supports both VL=32 and VL=64. _set_veclen must have been invoked +-// with the desired length. In the VL=32 case, the function computes twice as +-// many key powers than are actually used by the VL=32 GCM update functions. +-// This is done to keep the key format the same regardless of vector length. +-.macro _aes_gcm_precompute ++// Given the expanded AES key |key->base.aes_key|, derive the GHASH subkey and ++// initialize |key->h_powers| and |key->padding|. ++SYM_FUNC_START(aes_gcm_precompute_vaes_avx512) + + // Function arguments + .set KEY, %rdi + +- // Additional local variables. V0-V2 and %rax are used as temporaries. ++ // Additional local variables. ++ // %zmm[0-2] and %rax are used as temporaries. + .set POWERS_PTR, %rsi + .set RNDKEYLAST_PTR, %rdx +- .set H_CUR, V3 ++ .set H_CUR, %zmm3 + .set H_CUR_YMM, %ymm3 + .set H_CUR_XMM, %xmm3 +- .set H_INC, V4 ++ .set H_INC, %zmm4 + .set H_INC_YMM, %ymm4 + .set H_INC_XMM, %xmm4 +- .set GFPOLY, V5 ++ .set GFPOLY, %zmm5 + .set GFPOLY_YMM, %ymm5 + .set GFPOLY_XMM, %xmm5 + + // Get pointer to lowest set of key powers (located at end of array). +- lea OFFSETOFEND_H_POWERS-VL(KEY), POWERS_PTR ++ lea OFFSETOFEND_H_POWERS-64(KEY), POWERS_PTR + + // Encrypt an all-zeroes block to get the raw hash subkey. + movl OFFSETOF_AESKEYLEN(KEY), %eax +@@ -363,8 +316,8 @@ + + // Zeroize the padding blocks. + vpxor %xmm0, %xmm0, %xmm0 +- vmovdqu %ymm0, VL(POWERS_PTR) +- vmovdqu %xmm0, VL+2*16(POWERS_PTR) ++ vmovdqu %ymm0, 64(POWERS_PTR) ++ vmovdqu %xmm0, 64+2*16(POWERS_PTR) + + // Finish preprocessing the first key power, H^1. 
Since this GHASH + // implementation operates directly on values with the backwards bit +@@ -397,54 +350,44 @@ + // special needs to be done to make this happen, though: H^1 * H^1 would + // end up with two factors of x^-1, but the multiplication consumes one. + // So the product H^2 ends up with the desired one factor of x^-1. +- _ghash_mul H_CUR_XMM, H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, \ +- %xmm0, %xmm1, %xmm2 ++ _ghash_square H_CUR_XMM, H_INC_XMM, GFPOLY_XMM, %xmm0, %xmm1 + + // Create H_CUR_YMM = [H^2, H^1] and H_INC_YMM = [H^2, H^2]. + vinserti128 $1, H_CUR_XMM, H_INC_YMM, H_CUR_YMM + vinserti128 $1, H_INC_XMM, H_INC_YMM, H_INC_YMM + +-.if VL == 64 + // Create H_CUR = [H^4, H^3, H^2, H^1] and H_INC = [H^4, H^4, H^4, H^4]. + _ghash_mul H_INC_YMM, H_CUR_YMM, H_INC_YMM, GFPOLY_YMM, \ + %ymm0, %ymm1, %ymm2 + vinserti64x4 $1, H_CUR_YMM, H_INC, H_CUR + vshufi64x2 $0, H_INC, H_INC, H_INC +-.endif + + // Store the lowest set of key powers. + vmovdqu8 H_CUR, (POWERS_PTR) + +- // Compute and store the remaining key powers. With VL=32, repeatedly +- // multiply [H^(i+1), H^i] by [H^2, H^2] to get [H^(i+3), H^(i+2)]. +- // With VL=64, repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by ++ // Compute and store the remaining key powers. ++ // Repeatedly multiply [H^(i+3), H^(i+2), H^(i+1), H^i] by + // [H^4, H^4, H^4, H^4] to get [H^(i+7), H^(i+6), H^(i+5), H^(i+4)]. +- mov $(NUM_H_POWERS*16/VL) - 1, %eax +-.Lprecompute_next\@: +- sub $VL, POWERS_PTR +- _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, V0, V1, V2 ++ mov $3, %eax ++.Lprecompute_next: ++ sub $64, POWERS_PTR ++ _ghash_mul H_INC, H_CUR, H_CUR, GFPOLY, %zmm0, %zmm1, %zmm2 + vmovdqu8 H_CUR, (POWERS_PTR) + dec %eax +- jnz .Lprecompute_next\@ ++ jnz .Lprecompute_next + + vzeroupper // This is needed after using ymm or zmm registers. + RET +-.endm ++SYM_FUNC_END(aes_gcm_precompute_vaes_avx512) + + // XOR together the 128-bit lanes of \src (whose low lane is \src_xmm) and store + // the result in \dst_xmm. This implicitly zeroizes the other lanes of dst. + .macro _horizontal_xor src, src_xmm, dst_xmm, t0_xmm, t1_xmm, t2_xmm + vextracti32x4 $1, \src, \t0_xmm +-.if VL == 32 +- vpxord \t0_xmm, \src_xmm, \dst_xmm +-.elseif VL == 64 + vextracti32x4 $2, \src, \t1_xmm + vextracti32x4 $3, \src, \t2_xmm + vpxord \t0_xmm, \src_xmm, \dst_xmm + vpternlogd $0x96, \t1_xmm, \t2_xmm, \dst_xmm +-.else +- .error "Unsupported vector length" +-.endif + .endm + + // Do one step of the GHASH update of the data blocks given in the vector +@@ -458,25 +401,21 @@ + // + // The GHASH update does: GHASH_ACC = H_POW4*(GHASHDATA0 + GHASH_ACC) + + // H_POW3*GHASHDATA1 + H_POW2*GHASHDATA2 + H_POW1*GHASHDATA3, where the +-// operations are vectorized operations on vectors of 16-byte blocks. E.g., +-// with VL=32 there are 2 blocks per vector and the vectorized terms correspond +-// to the following non-vectorized terms: +-// +-// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^8*(blk0 + GHASH_ACC_XMM) and H^7*(blk1 + 0) +-// H_POW3*GHASHDATA1 => H^6*blk2 and H^5*blk3 +-// H_POW2*GHASHDATA2 => H^4*blk4 and H^3*blk5 +-// H_POW1*GHASHDATA3 => H^2*blk6 and H^1*blk7 ++// operations are vectorized operations on 512-bit vectors of 128-bit blocks. ++// The vectorized terms correspond to the following non-vectorized terms: + // +-// With VL=64, we use 4 blocks/vector, H^16 through H^1, and blk0 through blk15. 
++// H_POW4*(GHASHDATA0 + GHASH_ACC) => H^16*(blk0 + GHASH_ACC_XMM), ++// H^15*(blk1 + 0), H^14*(blk2 + 0), and H^13*(blk3 + 0) ++// H_POW3*GHASHDATA1 => H^12*blk4, H^11*blk5, H^10*blk6, and H^9*blk7 ++// H_POW2*GHASHDATA2 => H^8*blk8, H^7*blk9, H^6*blk10, and H^5*blk11 ++// H_POW1*GHASHDATA3 => H^4*blk12, H^3*blk13, H^2*blk14, and H^1*blk15 + // + // More concretely, this code does: + // - Do vectorized "schoolbook" multiplications to compute the intermediate + // 256-bit product of each block and its corresponding hash key power. +-// There are 4*VL/16 of these intermediate products. +-// - Sum (XOR) the intermediate 256-bit products across vectors. This leaves +-// VL/16 256-bit intermediate values. ++// - Sum (XOR) the intermediate 256-bit products across vectors. + // - Do a vectorized reduction of these 256-bit intermediate values to +-// 128-bits each. This leaves VL/16 128-bit intermediate values. ++// 128-bits each. + // - Sum (XOR) these values and store the 128-bit result in GHASH_ACC_XMM. + // + // See _ghash_mul_step for the full explanation of the operations performed for +@@ -532,85 +471,224 @@ + .endif + .endm + +-// Do one non-last round of AES encryption on the counter blocks in V0-V3 using +-// the round key that has been broadcast to all 128-bit lanes of \round_key. ++// Update GHASH with four vectors of data blocks. See _ghash_step_4x for full ++// explanation. ++.macro _ghash_4x ++.irp i, 0,1,2,3,4,5,6,7,8,9 ++ _ghash_step_4x \i ++.endr ++.endm ++ ++// void aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// u8 ghash_acc[16], ++// const u8 *aad, int aadlen); ++// ++// This function processes the AAD (Additional Authenticated Data) in GCM. ++// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the ++// data given by |aad| and |aadlen|. On the first call, |ghash_acc| must be all ++// zeroes. |aadlen| must be a multiple of 16, except on the last call where it ++// can be any length. The caller must do any buffering needed to ensure this. ++// ++// This handles large amounts of AAD efficiently, while also keeping overhead ++// low for small amounts which is the common case. TLS and IPsec use less than ++// one block of AAD, but (uncommonly) other use cases may use much more. ++SYM_FUNC_START(aes_gcm_aad_update_vaes_avx512) ++ ++ // Function arguments ++ .set KEY, %rdi ++ .set GHASH_ACC_PTR, %rsi ++ .set AAD, %rdx ++ .set AADLEN, %ecx ++ .set AADLEN64, %rcx // Zero-extend AADLEN before using! ++ ++ // Additional local variables. ++ // %rax and %k1 are used as temporary registers. ++ .set GHASHDATA0, %zmm0 ++ .set GHASHDATA0_XMM, %xmm0 ++ .set GHASHDATA1, %zmm1 ++ .set GHASHDATA1_XMM, %xmm1 ++ .set GHASHDATA2, %zmm2 ++ .set GHASHDATA2_XMM, %xmm2 ++ .set GHASHDATA3, %zmm3 ++ .set BSWAP_MASK, %zmm4 ++ .set BSWAP_MASK_XMM, %xmm4 ++ .set GHASH_ACC, %zmm5 ++ .set GHASH_ACC_XMM, %xmm5 ++ .set H_POW4, %zmm6 ++ .set H_POW3, %zmm7 ++ .set H_POW2, %zmm8 ++ .set H_POW1, %zmm9 ++ .set H_POW1_XMM, %xmm9 ++ .set GFPOLY, %zmm10 ++ .set GFPOLY_XMM, %xmm10 ++ .set GHASHTMP0, %zmm11 ++ .set GHASHTMP1, %zmm12 ++ .set GHASHTMP2, %zmm13 ++ ++ // Load the GHASH accumulator. ++ vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM ++ ++ // Check for the common case of AADLEN <= 16, as well as AADLEN == 0. ++ cmp $16, AADLEN ++ jg .Laad_more_than_16bytes ++ test AADLEN, AADLEN ++ jz .Laad_done ++ ++ // Fast path: update GHASH with 1 <= AADLEN <= 16 bytes of AAD. 
++ vmovdqu .Lbswap_mask(%rip), BSWAP_MASK_XMM ++ vmovdqu .Lgfpoly(%rip), GFPOLY_XMM ++ mov $-1, %eax ++ bzhi AADLEN, %eax, %eax ++ kmovd %eax, %k1 ++ vmovdqu8 (AAD), GHASHDATA0_XMM{%k1}{z} ++ vmovdqu OFFSETOFEND_H_POWERS-16(KEY), H_POW1_XMM ++ vpshufb BSWAP_MASK_XMM, GHASHDATA0_XMM, GHASHDATA0_XMM ++ vpxor GHASHDATA0_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM ++ _ghash_mul H_POW1_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM, GFPOLY_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++ jmp .Laad_done ++ ++.Laad_more_than_16bytes: ++ vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK ++ vbroadcasti32x4 .Lgfpoly(%rip), GFPOLY ++ ++ // If AADLEN >= 256, update GHASH with 256 bytes of AAD at a time. ++ sub $256, AADLEN ++ jl .Laad_loop_4x_done ++ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 ++ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 ++ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 ++.Laad_loop_4x: ++ vmovdqu8 0*64(AAD), GHASHDATA0 ++ vmovdqu8 1*64(AAD), GHASHDATA1 ++ vmovdqu8 2*64(AAD), GHASHDATA2 ++ vmovdqu8 3*64(AAD), GHASHDATA3 ++ _ghash_4x ++ add $256, AAD ++ sub $256, AADLEN ++ jge .Laad_loop_4x ++.Laad_loop_4x_done: ++ ++ // If AADLEN >= 64, update GHASH with 64 bytes of AAD at a time. ++ add $192, AADLEN ++ jl .Laad_loop_1x_done ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 ++.Laad_loop_1x: ++ vmovdqu8 (AAD), GHASHDATA0 ++ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 ++ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC ++ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ GHASHDATA0, GHASHDATA1, GHASHDATA2 ++ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++ add $64, AAD ++ sub $64, AADLEN ++ jge .Laad_loop_1x ++.Laad_loop_1x_done: ++ ++ // Update GHASH with the remaining 0 <= AADLEN < 64 bytes of AAD. ++ add $64, AADLEN ++ jz .Laad_done ++ mov $-1, %rax ++ bzhi AADLEN64, %rax, %rax ++ kmovq %rax, %k1 ++ vmovdqu8 (AAD), GHASHDATA0{%k1}{z} ++ neg AADLEN64 ++ and $~15, AADLEN64 // -round_up(AADLEN, 16) ++ vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 ++ vpshufb BSWAP_MASK, GHASHDATA0, GHASHDATA0 ++ vpxord GHASHDATA0, GHASH_ACC, GHASH_ACC ++ _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ ++ GHASHDATA0, GHASHDATA1, GHASHDATA2 ++ _horizontal_xor GHASH_ACC, GHASH_ACC_XMM, GHASH_ACC_XMM, \ ++ GHASHDATA0_XMM, GHASHDATA1_XMM, GHASHDATA2_XMM ++ ++.Laad_done: ++ // Store the updated GHASH accumulator back to memory. ++ vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) ++ ++ vzeroupper // This is needed after using ymm or zmm registers. ++ RET ++SYM_FUNC_END(aes_gcm_aad_update_vaes_avx512) ++ ++// Do one non-last round of AES encryption on the blocks in %zmm[0-3] using the ++// round key that has been broadcast to all 128-bit lanes of \round_key. + .macro _vaesenc_4x round_key +- vaesenc \round_key, V0, V0 +- vaesenc \round_key, V1, V1 +- vaesenc \round_key, V2, V2 +- vaesenc \round_key, V3, V3 ++ vaesenc \round_key, %zmm0, %zmm0 ++ vaesenc \round_key, %zmm1, %zmm1 ++ vaesenc \round_key, %zmm2, %zmm2 ++ vaesenc \round_key, %zmm3, %zmm3 + .endm + + // Start the AES encryption of four vectors of counter blocks. + .macro _ctr_begin_4x + + // Increment LE_CTR four times to generate four vectors of little-endian +- // counter blocks, swap each to big-endian, and store them in V0-V3. +- vpshufb BSWAP_MASK, LE_CTR, V0 ++ // counter blocks, swap each to big-endian, and store them in %zmm[0-3]. 
++ vpshufb BSWAP_MASK, LE_CTR, %zmm0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpshufb BSWAP_MASK, LE_CTR, V1 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm1 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpshufb BSWAP_MASK, LE_CTR, V2 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm2 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpshufb BSWAP_MASK, LE_CTR, V3 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm3 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR + + // AES "round zero": XOR in the zero-th round key. +- vpxord RNDKEY0, V0, V0 +- vpxord RNDKEY0, V1, V1 +- vpxord RNDKEY0, V2, V2 +- vpxord RNDKEY0, V3, V3 ++ vpxord RNDKEY0, %zmm0, %zmm0 ++ vpxord RNDKEY0, %zmm1, %zmm1 ++ vpxord RNDKEY0, %zmm2, %zmm2 ++ vpxord RNDKEY0, %zmm3, %zmm3 + .endm + +-// Do the last AES round for four vectors of counter blocks V0-V3, XOR source +-// data with the resulting keystream, and write the result to DST and ++// Do the last AES round for four vectors of counter blocks %zmm[0-3], XOR ++// source data with the resulting keystream, and write the result to DST and + // GHASHDATA[0-3]. (Implementation differs slightly, but has the same effect.) + .macro _aesenclast_and_xor_4x + // XOR the source data with the last round key, saving the result in + // GHASHDATA[0-3]. This reduces latency by taking advantage of the + // property vaesenclast(key, a) ^ b == vaesenclast(key ^ b, a). +- vpxord 0*VL(SRC), RNDKEYLAST, GHASHDATA0 +- vpxord 1*VL(SRC), RNDKEYLAST, GHASHDATA1 +- vpxord 2*VL(SRC), RNDKEYLAST, GHASHDATA2 +- vpxord 3*VL(SRC), RNDKEYLAST, GHASHDATA3 ++ vpxord 0*64(SRC), RNDKEYLAST, GHASHDATA0 ++ vpxord 1*64(SRC), RNDKEYLAST, GHASHDATA1 ++ vpxord 2*64(SRC), RNDKEYLAST, GHASHDATA2 ++ vpxord 3*64(SRC), RNDKEYLAST, GHASHDATA3 + + // Do the last AES round. This handles the XOR with the source data + // too, as per the optimization described above. +- vaesenclast GHASHDATA0, V0, GHASHDATA0 +- vaesenclast GHASHDATA1, V1, GHASHDATA1 +- vaesenclast GHASHDATA2, V2, GHASHDATA2 +- vaesenclast GHASHDATA3, V3, GHASHDATA3 ++ vaesenclast GHASHDATA0, %zmm0, GHASHDATA0 ++ vaesenclast GHASHDATA1, %zmm1, GHASHDATA1 ++ vaesenclast GHASHDATA2, %zmm2, GHASHDATA2 ++ vaesenclast GHASHDATA3, %zmm3, GHASHDATA3 + + // Store the en/decrypted data to DST. +- vmovdqu8 GHASHDATA0, 0*VL(DST) +- vmovdqu8 GHASHDATA1, 1*VL(DST) +- vmovdqu8 GHASHDATA2, 2*VL(DST) +- vmovdqu8 GHASHDATA3, 3*VL(DST) ++ vmovdqu8 GHASHDATA0, 0*64(DST) ++ vmovdqu8 GHASHDATA1, 1*64(DST) ++ vmovdqu8 GHASHDATA2, 2*64(DST) ++ vmovdqu8 GHASHDATA3, 3*64(DST) + .endm + +-// void aes_gcm_{enc,dec}_update_##suffix(const struct aes_gcm_key_avx10 *key, +-// const u32 le_ctr[4], u8 ghash_acc[16], +-// const u8 *src, u8 *dst, int datalen); ++// void aes_gcm_{enc,dec}_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// const u8 *src, u8 *dst, int datalen); + // + // This macro generates a GCM encryption or decryption update function with the +-// above prototype (with \enc selecting which one). This macro supports both +-// VL=32 and VL=64. _set_veclen must have been invoked with the desired length. +-// +-// This function computes the next portion of the CTR keystream, XOR's it with +-// |datalen| bytes from |src|, and writes the resulting encrypted or decrypted +-// data to |dst|. It also updates the GHASH accumulator |ghash_acc| using the +-// next |datalen| ciphertext bytes. ++// above prototype (with \enc selecting which one). 
The function computes the ++// next portion of the CTR keystream, XOR's it with |datalen| bytes from |src|, ++// and writes the resulting encrypted or decrypted data to |dst|. It also ++// updates the GHASH accumulator |ghash_acc| using the next |datalen| ciphertext ++// bytes. + // + // |datalen| must be a multiple of 16, except on the last call where it can be + // any length. The caller must do any buffering needed to ensure this. Both + // in-place and out-of-place en/decryption are supported. + // +-// |le_ctr| must give the current counter in little-endian format. For a new +-// message, the low word of the counter must be 2. This function loads the +-// counter from |le_ctr| and increments the loaded counter as needed, but it +-// does *not* store the updated counter back to |le_ctr|. The caller must +-// update |le_ctr| if any more data segments follow. Internally, only the low +-// 32-bit word of the counter is incremented, following the GCM standard. ++// |le_ctr| must give the current counter in little-endian format. This ++// function loads the counter from |le_ctr| and increments the loaded counter as ++// needed, but it does *not* store the updated counter back to |le_ctr|. The ++// caller must update |le_ctr| if any more data segments follow. Internally, ++// only the low 32-bit word of the counter is incremented, following the GCM ++// standard. + .macro _aes_gcm_update enc + + // Function arguments +@@ -634,69 +712,69 @@ + // Pointer to the last AES round key for the chosen AES variant + .set RNDKEYLAST_PTR, %r11 + +- // In the main loop, V0-V3 are used as AES input and output. Elsewhere +- // they are used as temporary registers. ++ // In the main loop, %zmm[0-3] are used as AES input and output. ++ // Elsewhere they are used as temporary registers. + + // GHASHDATA[0-3] hold the ciphertext blocks and GHASH input data. +- .set GHASHDATA0, V4 ++ .set GHASHDATA0, %zmm4 + .set GHASHDATA0_XMM, %xmm4 +- .set GHASHDATA1, V5 ++ .set GHASHDATA1, %zmm5 + .set GHASHDATA1_XMM, %xmm5 +- .set GHASHDATA2, V6 ++ .set GHASHDATA2, %zmm6 + .set GHASHDATA2_XMM, %xmm6 +- .set GHASHDATA3, V7 ++ .set GHASHDATA3, %zmm7 + + // BSWAP_MASK is the shuffle mask for byte-reflecting 128-bit values + // using vpshufb, copied to all 128-bit lanes. +- .set BSWAP_MASK, V8 ++ .set BSWAP_MASK, %zmm8 + + // RNDKEY temporarily holds the next AES round key. +- .set RNDKEY, V9 ++ .set RNDKEY, %zmm9 + + // GHASH_ACC is the accumulator variable for GHASH. When fully reduced, + // only the lowest 128-bit lane can be nonzero. When not fully reduced, + // more than one lane may be used, and they need to be XOR'd together. +- .set GHASH_ACC, V10 ++ .set GHASH_ACC, %zmm10 + .set GHASH_ACC_XMM, %xmm10 + + // LE_CTR_INC is the vector of 32-bit words that need to be added to a + // vector of little-endian counter blocks to advance it forwards. +- .set LE_CTR_INC, V11 ++ .set LE_CTR_INC, %zmm11 + + // LE_CTR contains the next set of little-endian counter blocks. +- .set LE_CTR, V12 ++ .set LE_CTR, %zmm12 + + // RNDKEY0, RNDKEYLAST, and RNDKEY_M[9-1] contain cached AES round keys, + // copied to all 128-bit lanes. RNDKEY0 is the zero-th round key, + // RNDKEYLAST the last, and RNDKEY_M\i the one \i-th from the last. 
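The last-round-key pointer is derived from the key length alone (see the lea 6*16(KEY,AESKEYLEN64,4) in the AVX2 update function above). A quick check of that arithmetic, shown here only as an aside:

#include <stdio.h>

/* AES-128/192/256 expand to 10/12/14 rounds, and 6*16 + 4*keylen equals
 * 16 * nrounds for keylen = 16/24/32, so a single lea of that form lands
 * on the last round key for every supported variant. */
int main(void)
{
        int keylen;

        for (keylen = 16; keylen <= 32; keylen += 8) {
                int nrounds = keylen / 4 + 6;

                printf("keylen=%2d rounds=%2d offset=%3d = 16*%d\n",
                       keylen, nrounds, 6 * 16 + 4 * keylen, nrounds);
        }
        return 0;
}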
+- .set RNDKEY0, V13 +- .set RNDKEYLAST, V14 +- .set RNDKEY_M9, V15 +- .set RNDKEY_M8, V16 +- .set RNDKEY_M7, V17 +- .set RNDKEY_M6, V18 +- .set RNDKEY_M5, V19 +- .set RNDKEY_M4, V20 +- .set RNDKEY_M3, V21 +- .set RNDKEY_M2, V22 +- .set RNDKEY_M1, V23 ++ .set RNDKEY0, %zmm13 ++ .set RNDKEYLAST, %zmm14 ++ .set RNDKEY_M9, %zmm15 ++ .set RNDKEY_M8, %zmm16 ++ .set RNDKEY_M7, %zmm17 ++ .set RNDKEY_M6, %zmm18 ++ .set RNDKEY_M5, %zmm19 ++ .set RNDKEY_M4, %zmm20 ++ .set RNDKEY_M3, %zmm21 ++ .set RNDKEY_M2, %zmm22 ++ .set RNDKEY_M1, %zmm23 + + // GHASHTMP[0-2] are temporary variables used by _ghash_step_4x. These + // cannot coincide with anything used for AES encryption, since for + // performance reasons GHASH and AES encryption are interleaved. +- .set GHASHTMP0, V24 +- .set GHASHTMP1, V25 +- .set GHASHTMP2, V26 ++ .set GHASHTMP0, %zmm24 ++ .set GHASHTMP1, %zmm25 ++ .set GHASHTMP2, %zmm26 + +- // H_POW[4-1] contain the powers of the hash key H^(4*VL/16)...H^1. The ++ // H_POW[4-1] contain the powers of the hash key H^16...H^1. The + // descending numbering reflects the order of the key powers. +- .set H_POW4, V27 +- .set H_POW3, V28 +- .set H_POW2, V29 +- .set H_POW1, V30 ++ .set H_POW4, %zmm27 ++ .set H_POW3, %zmm28 ++ .set H_POW2, %zmm29 ++ .set H_POW1, %zmm30 + + // GFPOLY contains the .Lgfpoly constant, copied to all 128-bit lanes. +- .set GFPOLY, V31 ++ .set GFPOLY, %zmm31 + + // Load some constants. + vbroadcasti32x4 .Lbswap_mask(%rip), BSWAP_MASK +@@ -719,29 +797,23 @@ + // Finish initializing LE_CTR by adding [0, 1, ...] to its low words. + vpaddd .Lctr_pattern(%rip), LE_CTR, LE_CTR + +- // Initialize LE_CTR_INC to contain VL/16 in all 128-bit lanes. +-.if VL == 32 +- vbroadcasti32x4 .Linc_2blocks(%rip), LE_CTR_INC +-.elseif VL == 64 ++ // Load 4 into all 128-bit lanes of LE_CTR_INC. + vbroadcasti32x4 .Linc_4blocks(%rip), LE_CTR_INC +-.else +- .error "Unsupported vector length" +-.endif + +- // If there are at least 4*VL bytes of data, then continue into the loop +- // that processes 4*VL bytes of data at a time. Otherwise skip it. ++ // If there are at least 256 bytes of data, then continue into the loop ++ // that processes 256 bytes of data at a time. Otherwise skip it. + // +- // Pre-subtracting 4*VL from DATALEN saves an instruction from the main ++ // Pre-subtracting 256 from DATALEN saves an instruction from the main + // loop and also ensures that at least one write always occurs to + // DATALEN, zero-extending it and allowing DATALEN64 to be used later. +- add $-4*VL, DATALEN // shorter than 'sub 4*VL' when VL=32 ++ sub $256, DATALEN + jl .Lcrypt_loop_4x_done\@ + + // Load powers of the hash key. +- vmovdqu8 OFFSETOFEND_H_POWERS-4*VL(KEY), H_POW4 +- vmovdqu8 OFFSETOFEND_H_POWERS-3*VL(KEY), H_POW3 +- vmovdqu8 OFFSETOFEND_H_POWERS-2*VL(KEY), H_POW2 +- vmovdqu8 OFFSETOFEND_H_POWERS-1*VL(KEY), H_POW1 ++ vmovdqu8 OFFSETOFEND_H_POWERS-4*64(KEY), H_POW4 ++ vmovdqu8 OFFSETOFEND_H_POWERS-3*64(KEY), H_POW3 ++ vmovdqu8 OFFSETOFEND_H_POWERS-2*64(KEY), H_POW2 ++ vmovdqu8 OFFSETOFEND_H_POWERS-1*64(KEY), H_POW1 + + // Main loop: en/decrypt and hash 4 vectors at a time. + // +@@ -770,9 +842,9 @@ + cmp %rax, RNDKEYLAST_PTR + jne 1b + _aesenclast_and_xor_4x +- sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 +- sub $-4*VL, DST +- add $-4*VL, DATALEN ++ add $256, SRC ++ add $256, DST ++ sub $256, DATALEN + jl .Lghash_last_ciphertext_4x\@ + .endif + +@@ -786,10 +858,10 @@ + // If decrypting, load more ciphertext blocks into GHASHDATA[0-3]. 
If + // encrypting, GHASHDATA[0-3] already contain the previous ciphertext. + .if !\enc +- vmovdqu8 0*VL(SRC), GHASHDATA0 +- vmovdqu8 1*VL(SRC), GHASHDATA1 +- vmovdqu8 2*VL(SRC), GHASHDATA2 +- vmovdqu8 3*VL(SRC), GHASHDATA3 ++ vmovdqu8 0*64(SRC), GHASHDATA0 ++ vmovdqu8 1*64(SRC), GHASHDATA1 ++ vmovdqu8 2*64(SRC), GHASHDATA2 ++ vmovdqu8 3*64(SRC), GHASHDATA3 + .endif + + // Start the AES encryption of the counter blocks. +@@ -809,44 +881,44 @@ + _vaesenc_4x RNDKEY + 128: + +- // Finish the AES encryption of the counter blocks in V0-V3, interleaved +- // with the GHASH update of the ciphertext blocks in GHASHDATA[0-3]. ++ // Finish the AES encryption of the counter blocks in %zmm[0-3], ++ // interleaved with the GHASH update of the ciphertext blocks in ++ // GHASHDATA[0-3]. + .irp i, 9,8,7,6,5,4,3,2,1 + _ghash_step_4x (9 - \i) + _vaesenc_4x RNDKEY_M\i + .endr + _ghash_step_4x 9 + _aesenclast_and_xor_4x +- sub $-4*VL, SRC // shorter than 'add 4*VL' when VL=32 +- sub $-4*VL, DST +- add $-4*VL, DATALEN ++ add $256, SRC ++ add $256, DST ++ sub $256, DATALEN + jge .Lcrypt_loop_4x\@ + + .if \enc + .Lghash_last_ciphertext_4x\@: + // Update GHASH with the last set of ciphertext blocks. +-.irp i, 0,1,2,3,4,5,6,7,8,9 +- _ghash_step_4x \i +-.endr ++ _ghash_4x + .endif + + .Lcrypt_loop_4x_done\@: + +- // Undo the extra subtraction by 4*VL and check whether data remains. +- sub $-4*VL, DATALEN // shorter than 'add 4*VL' when VL=32 ++ // Undo the extra subtraction by 256 and check whether data remains. ++ add $256, DATALEN + jz .Ldone\@ + +- // The data length isn't a multiple of 4*VL. Process the remaining data +- // of length 1 <= DATALEN < 4*VL, up to one vector (VL bytes) at a time. +- // Going one vector at a time may seem inefficient compared to having +- // separate code paths for each possible number of vectors remaining. +- // However, using a loop keeps the code size down, and it performs +- // surprising well; modern CPUs will start executing the next iteration +- // before the previous one finishes and also predict the number of loop +- // iterations. For a similar reason, we roll up the AES rounds. ++ // The data length isn't a multiple of 256 bytes. Process the remaining ++ // data of length 1 <= DATALEN < 256, up to one 64-byte vector at a ++ // time. Going one vector at a time may seem inefficient compared to ++ // having separate code paths for each possible number of vectors ++ // remaining. However, using a loop keeps the code size down, and it ++ // performs surprising well; modern CPUs will start executing the next ++ // iteration before the previous one finishes and also predict the ++ // number of loop iterations. For a similar reason, we roll up the AES ++ // rounds. + // +- // On the last iteration, the remaining length may be less than VL. +- // Handle this using masking. ++ // On the last iteration, the remaining length may be less than 64 ++ // bytes. Handle this using masking. + // + // Since there are enough key powers available for all remaining data, + // there is no need to do a GHASH reduction after each iteration. +@@ -875,65 +947,60 @@ + .Lcrypt_loop_1x\@: + + // Select the appropriate mask for this iteration: all 1's if +- // DATALEN >= VL, otherwise DATALEN 1's. Do this branchlessly using the ++ // DATALEN >= 64, otherwise DATALEN 1's. Do this branchlessly using the + // bzhi instruction from BMI2. (This relies on DATALEN <= 255.) 
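bzhi takes its bit index from the low 8 bits of the operand and leaves the source untouched for indices of 64 and above, so bzhi(-1, DATALEN) yields DATALEN one bits when DATALEN < 64 and an all-ones mask otherwise, which is why the comment above requires DATALEN <= 255. A small demonstration using the BMI2 intrinsic (compile with -mbmi2):

#include <stdio.h>
#include <immintrin.h>  /* _bzhi_u64 (BMI2) */

int main(void)
{
        /* The resulting value is the byte mask fed to the masked vmovdqu8
         * loads and stores: "process min(len, 64) bytes". */
        unsigned int lens[] = { 1, 13, 48, 63, 64, 200 };
        unsigned int i;

        for (i = 0; i < sizeof(lens) / sizeof(lens[0]); i++)
                printf("len=%3u mask=%016llx\n", lens[i],
                       (unsigned long long)_bzhi_u64(~0ULL, lens[i]));
        return 0;
}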
+-.if VL < 64 +- mov $-1, %eax +- bzhi DATALEN, %eax, %eax +- kmovd %eax, %k1 +-.else + mov $-1, %rax + bzhi DATALEN64, %rax, %rax + kmovq %rax, %k1 +-.endif + + // Encrypt a vector of counter blocks. This does not need to be masked. +- vpshufb BSWAP_MASK, LE_CTR, V0 ++ vpshufb BSWAP_MASK, LE_CTR, %zmm0 + vpaddd LE_CTR_INC, LE_CTR, LE_CTR +- vpxord RNDKEY0, V0, V0 ++ vpxord RNDKEY0, %zmm0, %zmm0 + lea 16(KEY), %rax + 1: + vbroadcasti32x4 (%rax), RNDKEY +- vaesenc RNDKEY, V0, V0 ++ vaesenc RNDKEY, %zmm0, %zmm0 + add $16, %rax + cmp %rax, RNDKEYLAST_PTR + jne 1b +- vaesenclast RNDKEYLAST, V0, V0 ++ vaesenclast RNDKEYLAST, %zmm0, %zmm0 + + // XOR the data with the appropriate number of keystream bytes. +- vmovdqu8 (SRC), V1{%k1}{z} +- vpxord V1, V0, V0 +- vmovdqu8 V0, (DST){%k1} ++ vmovdqu8 (SRC), %zmm1{%k1}{z} ++ vpxord %zmm1, %zmm0, %zmm0 ++ vmovdqu8 %zmm0, (DST){%k1} + + // Update GHASH with the ciphertext block(s), without reducing. + // +- // In the case of DATALEN < VL, the ciphertext is zero-padded to VL. +- // (If decrypting, it's done by the above masked load. If encrypting, +- // it's done by the below masked register-to-register move.) Note that +- // if DATALEN <= VL - 16, there will be additional padding beyond the +- // padding of the last block specified by GHASH itself; i.e., there may +- // be whole block(s) that get processed by the GHASH multiplication and +- // reduction instructions but should not actually be included in the ++ // In the case of DATALEN < 64, the ciphertext is zero-padded to 64 ++ // bytes. (If decrypting, it's done by the above masked load. If ++ // encrypting, it's done by the below masked register-to-register move.) ++ // Note that if DATALEN <= 48, there will be additional padding beyond ++ // the padding of the last block specified by GHASH itself; i.e., there ++ // may be whole block(s) that get processed by the GHASH multiplication ++ // and reduction instructions but should not actually be included in the + // GHASH. However, any such blocks are all-zeroes, and the values that + // they're multiplied with are also all-zeroes. Therefore they just add + // 0 * 0 = 0 to the final GHASH result, which makes no difference. + vmovdqu8 (POWERS_PTR), H_POW1 + .if \enc +- vmovdqu8 V0, V1{%k1}{z} ++ vmovdqu8 %zmm0, %zmm1{%k1}{z} + .endif +- vpshufb BSWAP_MASK, V1, V0 +- vpxord GHASH_ACC, V0, V0 +- _ghash_mul_noreduce H_POW1, V0, LO, MI, HI, GHASHDATA3, V1, V2, V3 ++ vpshufb BSWAP_MASK, %zmm1, %zmm0 ++ vpxord GHASH_ACC, %zmm0, %zmm0 ++ _ghash_mul_noreduce H_POW1, %zmm0, LO, MI, HI, \ ++ GHASHDATA3, %zmm1, %zmm2, %zmm3 + vpxor GHASH_ACC_XMM, GHASH_ACC_XMM, GHASH_ACC_XMM + +- add $VL, POWERS_PTR +- add $VL, SRC +- add $VL, DST +- sub $VL, DATALEN ++ add $64, POWERS_PTR ++ add $64, SRC ++ add $64, DST ++ sub $64, DATALEN + jg .Lcrypt_loop_1x\@ + + // Finally, do the GHASH reduction. 
+- _ghash_reduce LO, MI, HI, GFPOLY, V0 ++ _ghash_reduce LO, MI, HI, GFPOLY, %zmm0 + _horizontal_xor HI, HI_XMM, GHASH_ACC_XMM, %xmm0, %xmm1, %xmm2 + + .Ldone\@: +@@ -944,14 +1011,14 @@ + RET + .endm + +-// void aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +-// const u32 le_ctr[4], u8 ghash_acc[16], +-// u64 total_aadlen, u64 total_datalen); +-// bool aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +-// const u32 le_ctr[4], +-// const u8 ghash_acc[16], +-// u64 total_aadlen, u64 total_datalen, +-// const u8 tag[16], int taglen); ++// void aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// const u32 le_ctr[4], u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen); ++// bool aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++// const u32 le_ctr[4], ++// const u8 ghash_acc[16], ++// u64 total_aadlen, u64 total_datalen, ++// const u8 tag[16], int taglen); + // + // This macro generates one of the above two functions (with \enc selecting + // which one). Both functions finish computing the GCM authentication tag by +@@ -1081,119 +1148,16 @@ + RET + .endm + +-_set_veclen 32 +-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_256) +- _aes_gcm_precompute +-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_256) +-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_256) +- _aes_gcm_update 1 +-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_256) +-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_256) +- _aes_gcm_update 0 +-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_256) +- +-_set_veclen 64 +-SYM_FUNC_START(aes_gcm_precompute_vaes_avx10_512) +- _aes_gcm_precompute +-SYM_FUNC_END(aes_gcm_precompute_vaes_avx10_512) +-SYM_FUNC_START(aes_gcm_enc_update_vaes_avx10_512) ++SYM_FUNC_START(aes_gcm_enc_update_vaes_avx512) + _aes_gcm_update 1 +-SYM_FUNC_END(aes_gcm_enc_update_vaes_avx10_512) +-SYM_FUNC_START(aes_gcm_dec_update_vaes_avx10_512) ++SYM_FUNC_END(aes_gcm_enc_update_vaes_avx512) ++SYM_FUNC_START(aes_gcm_dec_update_vaes_avx512) + _aes_gcm_update 0 +-SYM_FUNC_END(aes_gcm_dec_update_vaes_avx10_512) +- +-// void aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, +-// u8 ghash_acc[16], +-// const u8 *aad, int aadlen); +-// +-// This function processes the AAD (Additional Authenticated Data) in GCM. +-// Using the key |key|, it updates the GHASH accumulator |ghash_acc| with the +-// data given by |aad| and |aadlen|. |key->ghash_key_powers| must have been +-// initialized. On the first call, |ghash_acc| must be all zeroes. |aadlen| +-// must be a multiple of 16, except on the last call where it can be any length. +-// The caller must do any buffering needed to ensure this. +-// +-// AES-GCM is almost always used with small amounts of AAD, less than 32 bytes. +-// Therefore, for AAD processing we currently only provide this implementation +-// which uses 256-bit vectors (ymm registers) and only has a 1x-wide loop. This +-// keeps the code size down, and it enables some micro-optimizations, e.g. using +-// VEX-coded instructions instead of EVEX-coded to save some instruction bytes. +-// To optimize for large amounts of AAD, we could implement a 4x-wide loop and +-// provide a version using 512-bit vectors, but that doesn't seem to be useful. +-SYM_FUNC_START(aes_gcm_aad_update_vaes_avx10) +- +- // Function arguments +- .set KEY, %rdi +- .set GHASH_ACC_PTR, %rsi +- .set AAD, %rdx +- .set AADLEN, %ecx +- .set AADLEN64, %rcx // Zero-extend AADLEN before using! +- +- // Additional local variables. 
+- // %rax, %ymm0-%ymm3, and %k1 are used as temporary registers. +- .set BSWAP_MASK, %ymm4 +- .set GFPOLY, %ymm5 +- .set GHASH_ACC, %ymm6 +- .set GHASH_ACC_XMM, %xmm6 +- .set H_POW1, %ymm7 +- +- // Load some constants. +- vbroadcasti128 .Lbswap_mask(%rip), BSWAP_MASK +- vbroadcasti128 .Lgfpoly(%rip), GFPOLY +- +- // Load the GHASH accumulator. +- vmovdqu (GHASH_ACC_PTR), GHASH_ACC_XMM +- +- // Update GHASH with 32 bytes of AAD at a time. +- // +- // Pre-subtracting 32 from AADLEN saves an instruction from the loop and +- // also ensures that at least one write always occurs to AADLEN, +- // zero-extending it and allowing AADLEN64 to be used later. +- sub $32, AADLEN +- jl .Laad_loop_1x_done +- vmovdqu8 OFFSETOFEND_H_POWERS-32(KEY), H_POW1 // [H^2, H^1] +-.Laad_loop_1x: +- vmovdqu (AAD), %ymm0 +- vpshufb BSWAP_MASK, %ymm0, %ymm0 +- vpxor %ymm0, GHASH_ACC, GHASH_ACC +- _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ +- %ymm0, %ymm1, %ymm2 +- vextracti128 $1, GHASH_ACC, %xmm0 +- vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM +- add $32, AAD +- sub $32, AADLEN +- jge .Laad_loop_1x +-.Laad_loop_1x_done: +- add $32, AADLEN +- jz .Laad_done +- +- // Update GHASH with the remaining 1 <= AADLEN < 32 bytes of AAD. +- mov $-1, %eax +- bzhi AADLEN, %eax, %eax +- kmovd %eax, %k1 +- vmovdqu8 (AAD), %ymm0{%k1}{z} +- neg AADLEN64 +- and $~15, AADLEN64 // -round_up(AADLEN, 16) +- vmovdqu8 OFFSETOFEND_H_POWERS(KEY,AADLEN64), H_POW1 +- vpshufb BSWAP_MASK, %ymm0, %ymm0 +- vpxor %ymm0, GHASH_ACC, GHASH_ACC +- _ghash_mul H_POW1, GHASH_ACC, GHASH_ACC, GFPOLY, \ +- %ymm0, %ymm1, %ymm2 +- vextracti128 $1, GHASH_ACC, %xmm0 +- vpxor %xmm0, GHASH_ACC_XMM, GHASH_ACC_XMM +- +-.Laad_done: +- // Store the updated GHASH accumulator back to memory. +- vmovdqu GHASH_ACC_XMM, (GHASH_ACC_PTR) +- +- vzeroupper // This is needed after using ymm or zmm registers. +- RET +-SYM_FUNC_END(aes_gcm_aad_update_vaes_avx10) ++SYM_FUNC_END(aes_gcm_dec_update_vaes_avx512) + +-SYM_FUNC_START(aes_gcm_enc_final_vaes_avx10) ++SYM_FUNC_START(aes_gcm_enc_final_vaes_avx512) + _aes_gcm_final 1 +-SYM_FUNC_END(aes_gcm_enc_final_vaes_avx10) +-SYM_FUNC_START(aes_gcm_dec_final_vaes_avx10) ++SYM_FUNC_END(aes_gcm_enc_final_vaes_avx512) ++SYM_FUNC_START(aes_gcm_dec_final_vaes_avx512) + _aes_gcm_final 0 +-SYM_FUNC_END(aes_gcm_dec_final_vaes_avx10) ++SYM_FUNC_END(aes_gcm_dec_final_vaes_avx512) +diff --git a/arch/x86/crypto/aesni-intel_glue.c b/arch/x86/crypto/aesni-intel_glue.c +index d953ac470aae..bb6e2c47ffc6 100644 +--- a/arch/x86/crypto/aesni-intel_glue.c ++++ b/arch/x86/crypto/aesni-intel_glue.c +@@ -874,8 +874,38 @@ struct aes_gcm_key_aesni { + #define AES_GCM_KEY_AESNI_SIZE \ + (sizeof(struct aes_gcm_key_aesni) + (15 & ~(CRYPTO_MINALIGN - 1))) + +-/* Key struct used by the VAES + AVX10 implementations of AES-GCM */ +-struct aes_gcm_key_avx10 { ++/* Key struct used by the VAES + AVX2 implementation of AES-GCM */ ++struct aes_gcm_key_vaes_avx2 { ++ /* ++ * Common part of the key. The assembly code prefers 16-byte alignment ++ * for the round keys; we get this by them being located at the start of ++ * the struct and the whole struct being 32-byte aligned. ++ */ ++ struct aes_gcm_key base; ++ ++ /* ++ * Powers of the hash key H^8 through H^1. These are 128-bit values. ++ * They all have an extra factor of x^-1 and are byte-reversed. ++ * The assembly code prefers 32-byte alignment for this. 
++ */ ++ u64 h_powers[8][2] __aligned(32); ++ ++ /* ++ * Each entry in this array contains the two halves of an entry of ++ * h_powers XOR'd together, in the following order: ++ * H^8,H^6,H^7,H^5,H^4,H^2,H^3,H^1 i.e. indices 0,2,1,3,4,6,5,7. ++ * This is used for Karatsuba multiplication. ++ */ ++ u64 h_powers_xored[8]; ++}; ++ ++#define AES_GCM_KEY_VAES_AVX2(key) \ ++ container_of((key), struct aes_gcm_key_vaes_avx2, base) ++#define AES_GCM_KEY_VAES_AVX2_SIZE \ ++ (sizeof(struct aes_gcm_key_vaes_avx2) + (31 & ~(CRYPTO_MINALIGN - 1))) ++ ++/* Key struct used by the VAES + AVX512 implementation of AES-GCM */ ++struct aes_gcm_key_vaes_avx512 { + /* + * Common part of the key. The assembly code prefers 16-byte alignment + * for the round keys; we get this by them being located at the start of +@@ -895,10 +925,10 @@ struct aes_gcm_key_avx10 { + /* Three padding blocks required by the assembly code */ + u64 padding[3][2]; + }; +-#define AES_GCM_KEY_AVX10(key) \ +- container_of((key), struct aes_gcm_key_avx10, base) +-#define AES_GCM_KEY_AVX10_SIZE \ +- (sizeof(struct aes_gcm_key_avx10) + (63 & ~(CRYPTO_MINALIGN - 1))) ++#define AES_GCM_KEY_VAES_AVX512(key) \ ++ container_of((key), struct aes_gcm_key_vaes_avx512, base) ++#define AES_GCM_KEY_VAES_AVX512_SIZE \ ++ (sizeof(struct aes_gcm_key_vaes_avx512) + (63 & ~(CRYPTO_MINALIGN - 1))) + + /* + * These flags are passed to the AES-GCM helper functions to specify the +@@ -910,14 +940,16 @@ struct aes_gcm_key_avx10 { + #define FLAG_RFC4106 BIT(0) + #define FLAG_ENC BIT(1) + #define FLAG_AVX BIT(2) +-#define FLAG_AVX10_256 BIT(3) +-#define FLAG_AVX10_512 BIT(4) ++#define FLAG_VAES_AVX2 BIT(3) ++#define FLAG_VAES_AVX512 BIT(4) + + static inline struct aes_gcm_key * + aes_gcm_key_get(struct crypto_aead *tfm, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) ++ if (flags & FLAG_VAES_AVX512) + return PTR_ALIGN(crypto_aead_ctx(tfm), 64); ++ else if (flags & FLAG_VAES_AVX2) ++ return PTR_ALIGN(crypto_aead_ctx(tfm), 32); + else + return PTR_ALIGN(crypto_aead_ctx(tfm), 16); + } +@@ -927,26 +959,16 @@ aes_gcm_precompute_aesni(struct aes_gcm_key_aesni *key); + asmlinkage void + aes_gcm_precompute_aesni_avx(struct aes_gcm_key_aesni *key); + asmlinkage void +-aes_gcm_precompute_vaes_avx10_256(struct aes_gcm_key_avx10 *key); ++aes_gcm_precompute_vaes_avx2(struct aes_gcm_key_vaes_avx2 *key); + asmlinkage void +-aes_gcm_precompute_vaes_avx10_512(struct aes_gcm_key_avx10 *key); ++aes_gcm_precompute_vaes_avx512(struct aes_gcm_key_vaes_avx512 *key); + + static void aes_gcm_precompute(struct aes_gcm_key *key, int flags) + { +- /* +- * To make things a bit easier on the assembly side, the AVX10 +- * implementations use the same key format. Therefore, a single +- * function using 256-bit vectors would suffice here. However, it's +- * straightforward to provide a 512-bit one because of how the assembly +- * code is structured, and it works nicely because the total size of the +- * key powers is a multiple of 512 bits. So we take advantage of that. +- * +- * A similar situation applies to the AES-NI implementations. 
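
The "Karatsuba multiplication" mentioned in the h_powers_xored comment above is the usual three-multiply split of a 128-bit carry-less product. Writing the operands as 64-bit halves A = A1:A0 and B = B1:B0 (illustrative names, not the registers used by the assembly), and with ^ denoting XOR, i.e. addition in GF(2):

	A*B = (A1*B1) << 128
	    ^ ((A1 ^ A0) * (B1 ^ B0) ^ A1*B1 ^ A0*B0) << 64
	    ^ (A0*B0)

Only the middle term needs the XOR of each operand's two halves, so precomputing H_hi ^ H_lo for every stored key power (the h_powers_xored array) avoids recomputing it for each block in the main loop.
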
+- */ +- if (flags & FLAG_AVX10_512) +- aes_gcm_precompute_vaes_avx10_512(AES_GCM_KEY_AVX10(key)); +- else if (flags & FLAG_AVX10_256) +- aes_gcm_precompute_vaes_avx10_256(AES_GCM_KEY_AVX10(key)); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_precompute_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key)); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_precompute_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key)); + else if (flags & FLAG_AVX) + aes_gcm_precompute_aesni_avx(AES_GCM_KEY_AESNI(key)); + else +@@ -960,15 +982,21 @@ asmlinkage void + aes_gcm_aad_update_aesni_avx(const struct aes_gcm_key_aesni *key, + u8 ghash_acc[16], const u8 *aad, int aadlen); + asmlinkage void +-aes_gcm_aad_update_vaes_avx10(const struct aes_gcm_key_avx10 *key, +- u8 ghash_acc[16], const u8 *aad, int aadlen); ++aes_gcm_aad_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); ++asmlinkage void ++aes_gcm_aad_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ u8 ghash_acc[16], const u8 *aad, int aadlen); + + static void aes_gcm_aad_update(const struct aes_gcm_key *key, u8 ghash_acc[16], + const u8 *aad, int aadlen, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) +- aes_gcm_aad_update_vaes_avx10(AES_GCM_KEY_AVX10(key), ghash_acc, +- aad, aadlen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_aad_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ ghash_acc, aad, aadlen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_aad_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ ghash_acc, aad, aadlen); + else if (flags & FLAG_AVX) + aes_gcm_aad_update_aesni_avx(AES_GCM_KEY_AESNI(key), ghash_acc, + aad, aadlen); +@@ -986,13 +1014,13 @@ aes_gcm_enc_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_enc_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_enc_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_enc_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_enc_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + + asmlinkage void + aes_gcm_dec_update_aesni(const struct aes_gcm_key_aesni *key, +@@ -1003,13 +1031,13 @@ aes_gcm_dec_update_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_dec_update_vaes_avx10_256(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_dec_update_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + asmlinkage void +-aes_gcm_dec_update_vaes_avx10_512(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- const u8 *src, u8 *dst, int datalen); ++aes_gcm_dec_update_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ const u8 *src, u8 *dst, int datalen); + + /* __always_inline to optimize out the branches based on @flags */ + static __always_inline void +@@ -1018,14 +1046,14 @@ 
aes_gcm_update(const struct aes_gcm_key *key, + const u8 *src, u8 *dst, int datalen, int flags) + { + if (flags & FLAG_ENC) { +- if (flags & FLAG_AVX10_512) +- aes_gcm_enc_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); +- else if (flags & FLAG_AVX10_256) +- aes_gcm_enc_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_enc_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_enc_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1034,14 +1062,14 @@ aes_gcm_update(const struct aes_gcm_key *key, + aes_gcm_enc_update_aesni(AES_GCM_KEY_AESNI(key), le_ctr, + ghash_acc, src, dst, datalen); + } else { +- if (flags & FLAG_AVX10_512) +- aes_gcm_dec_update_vaes_avx10_512(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); +- else if (flags & FLAG_AVX10_256) +- aes_gcm_dec_update_vaes_avx10_256(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- src, dst, datalen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_dec_update_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_dec_update_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ src, dst, datalen); + else if (flags & FLAG_AVX) + aes_gcm_dec_update_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1062,9 +1090,13 @@ aes_gcm_enc_final_aesni_avx(const struct aes_gcm_key_aesni *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen); + asmlinkage void +-aes_gcm_enc_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], u8 ghash_acc[16], +- u64 total_aadlen, u64 total_datalen); ++aes_gcm_enc_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); ++asmlinkage void ++aes_gcm_enc_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen); + + /* __always_inline to optimize out the branches based on @flags */ + static __always_inline void +@@ -1072,10 +1104,14 @@ aes_gcm_enc_final(const struct aes_gcm_key *key, + const u32 le_ctr[4], u8 ghash_acc[16], + u64 total_aadlen, u64 total_datalen, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) +- aes_gcm_enc_final_vaes_avx10(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- total_aadlen, total_datalen); ++ if (flags & FLAG_VAES_AVX512) ++ aes_gcm_enc_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); ++ else if (flags & FLAG_VAES_AVX2) ++ aes_gcm_enc_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen); + else if (flags & FLAG_AVX) + aes_gcm_enc_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1097,10 +1133,15 @@ aes_gcm_dec_final_aesni_avx(const struct aes_gcm_key_aesni *key, + u64 total_aadlen, u64 total_datalen, + const u8 tag[16], int taglen); + asmlinkage bool __must_check +-aes_gcm_dec_final_vaes_avx10(const struct aes_gcm_key_avx10 *key, +- const u32 le_ctr[4], const u8 ghash_acc[16], +- u64 total_aadlen, u64 total_datalen, +- const u8 tag[16], int taglen); 
++aes_gcm_dec_final_vaes_avx2(const struct aes_gcm_key_vaes_avx2 *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); ++asmlinkage bool __must_check ++aes_gcm_dec_final_vaes_avx512(const struct aes_gcm_key_vaes_avx512 *key, ++ const u32 le_ctr[4], const u8 ghash_acc[16], ++ u64 total_aadlen, u64 total_datalen, ++ const u8 tag[16], int taglen); + + /* __always_inline to optimize out the branches based on @flags */ + static __always_inline bool __must_check +@@ -1108,11 +1149,16 @@ aes_gcm_dec_final(const struct aes_gcm_key *key, const u32 le_ctr[4], + u8 ghash_acc[16], u64 total_aadlen, u64 total_datalen, + u8 tag[16], int taglen, int flags) + { +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) +- return aes_gcm_dec_final_vaes_avx10(AES_GCM_KEY_AVX10(key), +- le_ctr, ghash_acc, +- total_aadlen, total_datalen, +- tag, taglen); ++ if (flags & FLAG_VAES_AVX512) ++ return aes_gcm_dec_final_vaes_avx512(AES_GCM_KEY_VAES_AVX512(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); ++ else if (flags & FLAG_VAES_AVX2) ++ return aes_gcm_dec_final_vaes_avx2(AES_GCM_KEY_VAES_AVX2(key), ++ le_ctr, ghash_acc, ++ total_aadlen, total_datalen, ++ tag, taglen); + else if (flags & FLAG_AVX) + return aes_gcm_dec_final_aesni_avx(AES_GCM_KEY_AESNI(key), + le_ctr, ghash_acc, +@@ -1195,10 +1241,14 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers) != 496); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_powers_xored) != 624); + BUILD_BUG_ON(offsetof(struct aes_gcm_key_aesni, h_times_x64) != 688); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_enc) != 0); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, base.aes_key.key_length) != 480); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, h_powers) != 512); +- BUILD_BUG_ON(offsetof(struct aes_gcm_key_avx10, padding) != 768); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_enc) != 0); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, base.aes_key.key_length) != 480); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers) != 512); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx2, h_powers_xored) != 640); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_enc) != 0); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, base.aes_key.key_length) != 480); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, h_powers) != 512); ++ BUILD_BUG_ON(offsetof(struct aes_gcm_key_vaes_avx512, padding) != 768); + + if (likely(crypto_simd_usable())) { + err = aes_check_keylen(keylen); +@@ -1231,8 +1281,9 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + gf128mul_lle(&h, (const be128 *)x_to_the_minus1); + + /* Compute the needed key powers */ +- if (flags & (FLAG_AVX10_256 | FLAG_AVX10_512)) { +- struct aes_gcm_key_avx10 *k = AES_GCM_KEY_AVX10(key); ++ if (flags & FLAG_VAES_AVX512) { ++ struct aes_gcm_key_vaes_avx512 *k = ++ AES_GCM_KEY_VAES_AVX512(key); + + for (i = ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { + k->h_powers[i][0] = be64_to_cpu(h.b); +@@ -1240,6 +1291,22 @@ static int gcm_setkey(struct crypto_aead *tfm, const u8 *raw_key, + gf128mul_lle(&h, &h1); + } + memset(k->padding, 0, sizeof(k->padding)); ++ } else if (flags & FLAG_VAES_AVX2) { ++ struct aes_gcm_key_vaes_avx2 *k = ++ AES_GCM_KEY_VAES_AVX2(key); ++ static const u8 indices[8] = { 0, 2, 1, 3, 4, 6, 5, 7 }; ++ ++ for (i = 
ARRAY_SIZE(k->h_powers) - 1; i >= 0; i--) { ++ k->h_powers[i][0] = be64_to_cpu(h.b); ++ k->h_powers[i][1] = be64_to_cpu(h.a); ++ gf128mul_lle(&h, &h1); ++ } ++ for (i = 0; i < ARRAY_SIZE(k->h_powers_xored); i++) { ++ int j = indices[i]; ++ ++ k->h_powers_xored[i] = k->h_powers[j][0] ^ ++ k->h_powers[j][1]; ++ } + } else { + struct aes_gcm_key_aesni *k = AES_GCM_KEY_AESNI(key); + +@@ -1508,15 +1575,15 @@ DEFINE_GCM_ALGS(aesni_avx, FLAG_AVX, + "generic-gcm-aesni-avx", "rfc4106-gcm-aesni-avx", + AES_GCM_KEY_AESNI_SIZE, 500); + +-/* aes_gcm_algs_vaes_avx10_256 */ +-DEFINE_GCM_ALGS(vaes_avx10_256, FLAG_AVX10_256, +- "generic-gcm-vaes-avx10_256", "rfc4106-gcm-vaes-avx10_256", +- AES_GCM_KEY_AVX10_SIZE, 700); ++/* aes_gcm_algs_vaes_avx2 */ ++DEFINE_GCM_ALGS(vaes_avx2, FLAG_VAES_AVX2, ++ "generic-gcm-vaes-avx2", "rfc4106-gcm-vaes-avx2", ++ AES_GCM_KEY_VAES_AVX2_SIZE, 600); + +-/* aes_gcm_algs_vaes_avx10_512 */ +-DEFINE_GCM_ALGS(vaes_avx10_512, FLAG_AVX10_512, +- "generic-gcm-vaes-avx10_512", "rfc4106-gcm-vaes-avx10_512", +- AES_GCM_KEY_AVX10_SIZE, 800); ++/* aes_gcm_algs_vaes_avx512 */ ++DEFINE_GCM_ALGS(vaes_avx512, FLAG_VAES_AVX512, ++ "generic-gcm-vaes-avx512", "rfc4106-gcm-vaes-avx512", ++ AES_GCM_KEY_VAES_AVX512_SIZE, 800); + + static int __init register_avx_algs(void) + { +@@ -1548,6 +1615,10 @@ static int __init register_avx_algs(void) + ARRAY_SIZE(skcipher_algs_vaes_avx2)); + if (err) + return err; ++ err = crypto_register_aeads(aes_gcm_algs_vaes_avx2, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx2)); ++ if (err) ++ return err; + + if (!boot_cpu_has(X86_FEATURE_AVX512BW) || + !boot_cpu_has(X86_FEATURE_AVX512VL) || +@@ -1556,26 +1627,21 @@ static int __init register_avx_algs(void) + XFEATURE_MASK_AVX512, NULL)) + return 0; + +- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_256, +- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_256)); +- if (err) +- return err; +- + if (boot_cpu_has(X86_FEATURE_PREFER_YMM)) { + int i; + + for (i = 0; i < ARRAY_SIZE(skcipher_algs_vaes_avx512); i++) + skcipher_algs_vaes_avx512[i].base.cra_priority = 1; +- for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512); i++) +- aes_gcm_algs_vaes_avx10_512[i].base.cra_priority = 1; ++ for (i = 0; i < ARRAY_SIZE(aes_gcm_algs_vaes_avx512); i++) ++ aes_gcm_algs_vaes_avx512[i].base.cra_priority = 1; + } + + err = crypto_register_skciphers(skcipher_algs_vaes_avx512, + ARRAY_SIZE(skcipher_algs_vaes_avx512)); + if (err) + return err; +- err = crypto_register_aeads(aes_gcm_algs_vaes_avx10_512, +- ARRAY_SIZE(aes_gcm_algs_vaes_avx10_512)); ++ err = crypto_register_aeads(aes_gcm_algs_vaes_avx512, ++ ARRAY_SIZE(aes_gcm_algs_vaes_avx512)); + if (err) + return err; + +@@ -1595,8 +1661,8 @@ static void unregister_avx_algs(void) + unregister_aeads(aes_gcm_algs_aesni_avx); + unregister_skciphers(skcipher_algs_vaes_avx2); + unregister_skciphers(skcipher_algs_vaes_avx512); +- unregister_aeads(aes_gcm_algs_vaes_avx10_256); +- unregister_aeads(aes_gcm_algs_vaes_avx10_512); ++ unregister_aeads(aes_gcm_algs_vaes_avx2); ++ unregister_aeads(aes_gcm_algs_vaes_avx512); + } + #else /* CONFIG_X86_64 */ + static struct aead_alg aes_gcm_algs_aesni[0]; +diff --git a/drivers/md/Kconfig b/drivers/md/Kconfig +index 104aa5355090..cac4926fc340 100644 +--- a/drivers/md/Kconfig ++++ b/drivers/md/Kconfig +@@ -546,6 +546,7 @@ config DM_VERITY + depends on BLK_DEV_DM + select CRYPTO + select CRYPTO_HASH ++ select CRYPTO_LIB_SHA256 + select DM_BUFIO + help + This device-mapper target creates a read-only device that +diff --git a/drivers/md/dm-verity-fec.c 
b/drivers/md/dm-verity-fec.c +index 72047b47a7a0..0c858b9ee06b 100644 +--- a/drivers/md/dm-verity-fec.c ++++ b/drivers/md/dm-verity-fec.c +@@ -188,14 +188,13 @@ static int fec_decode_bufs(struct dm_verity *v, struct dm_verity_io *io, + * Locate data block erasures using verity hashes. + */ + static int fec_is_erasure(struct dm_verity *v, struct dm_verity_io *io, +- u8 *want_digest, u8 *data) ++ const u8 *want_digest, const u8 *data) + { + if (unlikely(verity_hash(v, io, data, 1 << v->data_dev_block_bits, +- verity_io_real_digest(v, io)))) ++ io->tmp_digest))) + return 0; + +- return memcmp(verity_io_real_digest(v, io), want_digest, +- v->digest_size) != 0; ++ return memcmp(io->tmp_digest, want_digest, v->digest_size) != 0; + } + + /* +@@ -362,7 +361,7 @@ static void fec_init_bufs(struct dm_verity *v, struct dm_verity_fec_io *fio) + */ + static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + struct dm_verity_fec_io *fio, u64 rsb, u64 offset, +- bool use_erasures) ++ const u8 *want_digest, bool use_erasures) + { + int r, neras = 0; + unsigned int pos; +@@ -388,12 +387,11 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + + /* Always re-validate the corrected block against the expected hash */ + r = verity_hash(v, io, fio->output, 1 << v->data_dev_block_bits, +- verity_io_real_digest(v, io)); ++ io->tmp_digest); + if (unlikely(r < 0)) + return r; + +- if (memcmp(verity_io_real_digest(v, io), verity_io_want_digest(v, io), +- v->digest_size)) { ++ if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { + DMERR_LIMIT("%s: FEC %llu: failed to correct (%d erasures)", + v->data_dev->name, (unsigned long long)rsb, neras); + return -EILSEQ; +@@ -404,7 +402,8 @@ static int fec_decode_rsb(struct dm_verity *v, struct dm_verity_io *io, + + /* Correct errors in a block. Copies corrected block to dest. */ + int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, +- enum verity_block_type type, sector_t block, u8 *dest) ++ enum verity_block_type type, const u8 *want_digest, ++ sector_t block, u8 *dest) + { + int r; + struct dm_verity_fec_io *fio = fec_io(io); +@@ -447,9 +446,9 @@ int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, + * them first. Do a second attempt with erasures if the corruption is + * bad enough. 
+ */ +- r = fec_decode_rsb(v, io, fio, rsb, offset, false); ++ r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, false); + if (r < 0) { +- r = fec_decode_rsb(v, io, fio, rsb, offset, true); ++ r = fec_decode_rsb(v, io, fio, rsb, offset, want_digest, true); + if (r < 0) + goto done; + } +diff --git a/drivers/md/dm-verity-fec.h b/drivers/md/dm-verity-fec.h +index 09123a612953..a6689cdc489d 100644 +--- a/drivers/md/dm-verity-fec.h ++++ b/drivers/md/dm-verity-fec.h +@@ -68,8 +68,8 @@ struct dm_verity_fec_io { + extern bool verity_fec_is_enabled(struct dm_verity *v); + + extern int verity_fec_decode(struct dm_verity *v, struct dm_verity_io *io, +- enum verity_block_type type, sector_t block, +- u8 *dest); ++ enum verity_block_type type, const u8 *want_digest, ++ sector_t block, u8 *dest); + + extern unsigned int verity_fec_status_table(struct dm_verity *v, unsigned int sz, + char *result, unsigned int maxlen); +@@ -99,6 +99,7 @@ static inline bool verity_fec_is_enabled(struct dm_verity *v) + static inline int verity_fec_decode(struct dm_verity *v, + struct dm_verity_io *io, + enum verity_block_type type, ++ const u8 *want_digest, + sector_t block, u8 *dest) + { + return -EOPNOTSUPP; +diff --git a/drivers/md/dm-verity-target.c b/drivers/md/dm-verity-target.c +index 66a00a8ccb39..bf0aee73b074 100644 +--- a/drivers/md/dm-verity-target.c ++++ b/drivers/md/dm-verity-target.c +@@ -117,11 +117,25 @@ static sector_t verity_position_at_level(struct dm_verity *v, sector_t block, + int verity_hash(struct dm_verity *v, struct dm_verity_io *io, + const u8 *data, size_t len, u8 *digest) + { +- struct shash_desc *desc = &io->hash_desc; ++ struct shash_desc *desc; + int r; + ++ if (likely(v->use_sha256_lib)) { ++ struct sha256_ctx *ctx = &io->hash_ctx.sha256; ++ ++ /* ++ * Fast path using SHA-256 library. This is enabled only for ++ * verity version 1, where the salt is at the beginning. ++ */ ++ *ctx = *v->initial_hashstate.sha256; ++ sha256_update(ctx, data, len); ++ sha256_final(ctx, digest); ++ return 0; ++ } ++ ++ desc = &io->hash_ctx.shash; + desc->tfm = v->shash_tfm; +- if (unlikely(v->initial_hashstate == NULL)) { ++ if (unlikely(v->initial_hashstate.shash == NULL)) { + /* Version 0: salt at end */ + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, data, len) ?: +@@ -129,7 +143,7 @@ int verity_hash(struct dm_verity *v, struct dm_verity_io *io, + crypto_shash_final(desc, digest); + } else { + /* Version 1: salt at beginning */ +- r = crypto_shash_import(desc, v->initial_hashstate) ?: ++ r = crypto_shash_import(desc, v->initial_hashstate.shash) ?: + crypto_shash_finup(desc, data, len, digest); + } + if (unlikely(r)) +@@ -215,12 +229,12 @@ static int verity_handle_err(struct dm_verity *v, enum verity_block_type type, + * Verify hash of a metadata block pertaining to the specified data block + * ("block" argument) at a specified level ("level" argument). + * +- * On successful return, verity_io_want_digest(v, io) contains the hash value +- * for a lower tree level or for the data block (if we're at the lowest level). ++ * On successful return, want_digest contains the hash value for a lower tree ++ * level or for the data block (if we're at the lowest level). + * + * If "skip_unverified" is true, unverified buffer is skipped and 1 is returned. + * If "skip_unverified" is false, unverified buffer is hashed and verified +- * against current value of verity_io_want_digest(v, io). ++ * against current value of want_digest. 
+ */ + static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + sector_t block, int level, bool skip_unverified, +@@ -259,7 +273,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + if (IS_ERR(data)) + return r; + if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, +- hash_block, data) == 0) { ++ want_digest, hash_block, data) == 0) { + aux = dm_bufio_get_aux_data(buf); + aux->hash_verified = 1; + goto release_ok; +@@ -279,11 +293,11 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + } + + r = verity_hash(v, io, data, 1 << v->hash_dev_block_bits, +- verity_io_real_digest(v, io)); ++ io->tmp_digest); + if (unlikely(r < 0)) + goto release_ret_r; + +- if (likely(memcmp(verity_io_real_digest(v, io), want_digest, ++ if (likely(memcmp(io->tmp_digest, want_digest, + v->digest_size) == 0)) + aux->hash_verified = 1; + else if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { +@@ -294,7 +308,7 @@ static int verity_verify_level(struct dm_verity *v, struct dm_verity_io *io, + r = -EAGAIN; + goto release_ret_r; + } else if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_METADATA, +- hash_block, data) == 0) ++ want_digest, hash_block, data) == 0) + aux->hash_verified = 1; + else if (verity_handle_err(v, + DM_VERITY_BLOCK_TYPE_METADATA, +@@ -358,7 +372,8 @@ int verity_hash_for_block(struct dm_verity *v, struct dm_verity_io *io, + } + + static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, +- sector_t cur_block, u8 *dest) ++ const u8 *want_digest, sector_t cur_block, ++ u8 *dest) + { + struct page *page; + void *buffer; +@@ -382,12 +397,11 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + goto free_ret; + + r = verity_hash(v, io, buffer, 1 << v->data_dev_block_bits, +- verity_io_real_digest(v, io)); ++ io->tmp_digest); + if (unlikely(r)) + goto free_ret; + +- if (memcmp(verity_io_real_digest(v, io), +- verity_io_want_digest(v, io), v->digest_size)) { ++ if (memcmp(io->tmp_digest, want_digest, v->digest_size)) { + r = -EIO; + goto free_ret; + } +@@ -402,9 +416,13 @@ static noinline int verity_recheck(struct dm_verity *v, struct dm_verity_io *io, + + static int verity_handle_data_hash_mismatch(struct dm_verity *v, + struct dm_verity_io *io, +- struct bio *bio, sector_t blkno, +- u8 *data) ++ struct bio *bio, ++ struct pending_block *block) + { ++ const u8 *want_digest = block->want_digest; ++ sector_t blkno = block->blkno; ++ u8 *data = block->data; ++ + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { + /* + * Error handling code (FEC included) cannot be run in the +@@ -412,14 +430,14 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, + */ + return -EAGAIN; + } +- if (verity_recheck(v, io, blkno, data) == 0) { ++ if (verity_recheck(v, io, want_digest, blkno, data) == 0) { + if (v->validated_blocks) + set_bit(blkno, v->validated_blocks); + return 0; + } + #if defined(CONFIG_DM_VERITY_FEC) +- if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, blkno, +- data) == 0) ++ if (verity_fec_decode(v, io, DM_VERITY_BLOCK_TYPE_DATA, want_digest, ++ blkno, data) == 0) + return 0; + #endif + if (bio->bi_status) +@@ -433,6 +451,58 @@ static int verity_handle_data_hash_mismatch(struct dm_verity *v, + return 0; + } + ++static void verity_clear_pending_blocks(struct dm_verity_io *io) ++{ ++ int i; ++ ++ for (i = io->num_pending - 1; i >= 0; i--) { ++ kunmap_local(io->pending_blocks[i].data); ++ io->pending_blocks[i].data = NULL; 
++ } ++ io->num_pending = 0; ++} ++ ++static int verity_verify_pending_blocks(struct dm_verity *v, ++ struct dm_verity_io *io, ++ struct bio *bio) ++{ ++ const unsigned int block_size = 1 << v->data_dev_block_bits; ++ int i, r; ++ ++ if (io->num_pending == 2) { ++ /* num_pending == 2 implies that the algorithm is SHA-256 */ ++ sha256_finup_2x(v->initial_hashstate.sha256, ++ io->pending_blocks[0].data, ++ io->pending_blocks[1].data, block_size, ++ io->pending_blocks[0].real_digest, ++ io->pending_blocks[1].real_digest); ++ } else { ++ for (i = 0; i < io->num_pending; i++) { ++ r = verity_hash(v, io, io->pending_blocks[i].data, ++ block_size, ++ io->pending_blocks[i].real_digest); ++ if (unlikely(r)) ++ return r; ++ } ++ } ++ ++ for (i = 0; i < io->num_pending; i++) { ++ struct pending_block *block = &io->pending_blocks[i]; ++ ++ if (likely(memcmp(block->real_digest, block->want_digest, ++ v->digest_size) == 0)) { ++ if (v->validated_blocks) ++ set_bit(block->blkno, v->validated_blocks); ++ } else { ++ r = verity_handle_data_hash_mismatch(v, io, bio, block); ++ if (unlikely(r)) ++ return r; ++ } ++ } ++ verity_clear_pending_blocks(io); ++ return 0; ++} ++ + /* + * Verify one "dm_verity_io" structure. + */ +@@ -440,10 +510,14 @@ static int verity_verify_io(struct dm_verity_io *io) + { + struct dm_verity *v = io->v; + const unsigned int block_size = 1 << v->data_dev_block_bits; ++ const int max_pending = v->use_sha256_finup_2x ? 2 : 1; + struct bvec_iter iter_copy; + struct bvec_iter *iter; + struct bio *bio = dm_bio_from_per_bio_data(io, v->ti->per_io_data_size); + unsigned int b; ++ int r; ++ ++ io->num_pending = 0; + + if (static_branch_unlikely(&use_bh_wq_enabled) && io->in_bh) { + /* +@@ -457,21 +531,22 @@ static int verity_verify_io(struct dm_verity_io *io) + + for (b = 0; b < io->n_blocks; + b++, bio_advance_iter(bio, iter, block_size)) { +- int r; +- sector_t cur_block = io->block + b; ++ sector_t blkno = io->block + b; ++ struct pending_block *block; + bool is_zero; + struct bio_vec bv; + void *data; + + if (v->validated_blocks && bio->bi_status == BLK_STS_OK && +- likely(test_bit(cur_block, v->validated_blocks))) ++ likely(test_bit(blkno, v->validated_blocks))) + continue; + +- r = verity_hash_for_block(v, io, cur_block, +- verity_io_want_digest(v, io), ++ block = &io->pending_blocks[io->num_pending]; ++ ++ r = verity_hash_for_block(v, io, blkno, block->want_digest, + &is_zero); + if (unlikely(r < 0)) +- return r; ++ goto error; + + bv = bio_iter_iovec(bio, *iter); + if (unlikely(bv.bv_len < block_size)) { +@@ -482,7 +557,8 @@ static int verity_verify_io(struct dm_verity_io *io) + * data block size to be greater than PAGE_SIZE. 
+ */ + DMERR_LIMIT("unaligned io (data block spans pages)"); +- return -EIO; ++ r = -EIO; ++ goto error; + } + + data = bvec_kmap_local(&bv); +@@ -496,29 +572,26 @@ static int verity_verify_io(struct dm_verity_io *io) + kunmap_local(data); + continue; + } +- +- r = verity_hash(v, io, data, block_size, +- verity_io_real_digest(v, io)); +- if (unlikely(r < 0)) { +- kunmap_local(data); +- return r; ++ block->data = data; ++ block->blkno = blkno; ++ if (++io->num_pending == max_pending) { ++ r = verity_verify_pending_blocks(v, io, bio); ++ if (unlikely(r)) ++ goto error; + } ++ } + +- if (likely(memcmp(verity_io_real_digest(v, io), +- verity_io_want_digest(v, io), v->digest_size) == 0)) { +- if (v->validated_blocks) +- set_bit(cur_block, v->validated_blocks); +- kunmap_local(data); +- continue; +- } +- r = verity_handle_data_hash_mismatch(v, io, bio, cur_block, +- data); +- kunmap_local(data); ++ if (io->num_pending) { ++ r = verity_verify_pending_blocks(v, io, bio); + if (unlikely(r)) +- return r; ++ goto error; + } + + return 0; ++ ++error: ++ verity_clear_pending_blocks(io); ++ return r; + } + + /* +@@ -1004,7 +1077,7 @@ static void verity_dtr(struct dm_target *ti) + + kvfree(v->validated_blocks); + kfree(v->salt); +- kfree(v->initial_hashstate); ++ kfree(v->initial_hashstate.shash); + kfree(v->root_digest); + kfree(v->zero_digest); + verity_free_sig(v); +@@ -1069,8 +1142,7 @@ static int verity_alloc_zero_digest(struct dm_verity *v) + if (!v->zero_digest) + return r; + +- io = kmalloc(sizeof(*io) + crypto_shash_descsize(v->shash_tfm), +- GFP_KERNEL); ++ io = kmalloc(v->ti->per_io_data_size, GFP_KERNEL); + + if (!io) + return r; /* verity_dtr will free zero_digest */ +@@ -1252,11 +1324,26 @@ static int verity_setup_hash_alg(struct dm_verity *v, const char *alg_name) + } + v->shash_tfm = shash; + v->digest_size = crypto_shash_digestsize(shash); +- DMINFO("%s using \"%s\"", alg_name, crypto_shash_driver_name(shash)); + if ((1 << v->hash_dev_block_bits) < v->digest_size * 2) { + ti->error = "Digest size too big"; + return -EINVAL; + } ++ if (likely(v->version && strcmp(alg_name, "sha256") == 0)) { ++ /* ++ * Fast path: use the library API for reduced overhead and ++ * interleaved hashing support. ++ */ ++ v->use_sha256_lib = true; ++ if (sha256_finup_2x_is_optimized()) ++ v->use_sha256_finup_2x = true; ++ ti->per_io_data_size = ++ offsetofend(struct dm_verity_io, hash_ctx.sha256); ++ } else { ++ /* Fallback case: use the generic crypto API. */ ++ ti->per_io_data_size = ++ offsetofend(struct dm_verity_io, hash_ctx.shash) + ++ crypto_shash_descsize(shash); ++ } + return 0; + } + +@@ -1277,7 +1364,18 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) + return -EINVAL; + } + } +- if (v->version) { /* Version 1: salt at beginning */ ++ if (likely(v->use_sha256_lib)) { ++ /* Implies version 1: salt at beginning */ ++ v->initial_hashstate.sha256 = ++ kmalloc(sizeof(struct sha256_ctx), GFP_KERNEL); ++ if (!v->initial_hashstate.sha256) { ++ ti->error = "Cannot allocate initial hash state"; ++ return -ENOMEM; ++ } ++ sha256_init(v->initial_hashstate.sha256); ++ sha256_update(v->initial_hashstate.sha256, ++ v->salt, v->salt_size); ++ } else if (v->version) { /* Version 1: salt at beginning */ + SHASH_DESC_ON_STACK(desc, v->shash_tfm); + int r; + +@@ -1285,16 +1383,16 @@ static int verity_setup_salt_and_hashstate(struct dm_verity *v, const char *arg) + * Compute the pre-salted hash state that can be passed to + * crypto_shash_import() for each block later. 
+ */ +- v->initial_hashstate = kmalloc( ++ v->initial_hashstate.shash = kmalloc( + crypto_shash_statesize(v->shash_tfm), GFP_KERNEL); +- if (!v->initial_hashstate) { ++ if (!v->initial_hashstate.shash) { + ti->error = "Cannot allocate initial hash state"; + return -ENOMEM; + } + desc->tfm = v->shash_tfm; + r = crypto_shash_init(desc) ?: + crypto_shash_update(desc, v->salt, v->salt_size) ?: +- crypto_shash_export(desc, v->initial_hashstate); ++ crypto_shash_export(desc, v->initial_hashstate.shash); + if (r) { + ti->error = "Cannot set up initial hash state"; + return r; +@@ -1556,9 +1654,6 @@ static int verity_ctr(struct dm_target *ti, unsigned int argc, char **argv) + goto bad; + } + +- ti->per_io_data_size = sizeof(struct dm_verity_io) + +- crypto_shash_descsize(v->shash_tfm); +- + r = verity_fec_ctr(v); + if (r) + goto bad; +diff --git a/drivers/md/dm-verity.h b/drivers/md/dm-verity.h +index 6d141abd965c..f975a9e5c5d6 100644 +--- a/drivers/md/dm-verity.h ++++ b/drivers/md/dm-verity.h +@@ -16,6 +16,7 @@ + #include + #include + #include ++#include + + #define DM_VERITY_MAX_LEVELS 63 + +@@ -42,7 +43,10 @@ struct dm_verity { + struct crypto_shash *shash_tfm; + u8 *root_digest; /* digest of the root block */ + u8 *salt; /* salt: its size is salt_size */ +- u8 *initial_hashstate; /* salted initial state, if version >= 1 */ ++ union { ++ struct sha256_ctx *sha256; /* for use_sha256_lib=1 */ ++ u8 *shash; /* for use_sha256_lib=0 */ ++ } initial_hashstate; /* salted initial state, if version >= 1 */ + u8 *zero_digest; /* digest for a zero block */ + #ifdef CONFIG_SECURITY + u8 *root_digest_sig; /* signature of the root digest */ +@@ -59,6 +63,8 @@ struct dm_verity { + unsigned char version; + bool hash_failed:1; /* set if hash of any block failed */ + bool use_bh_wq:1; /* try to verify in BH wq before normal work-queue */ ++ bool use_sha256_lib:1; /* use SHA-256 library instead of generic crypto API */ ++ bool use_sha256_finup_2x:1; /* use interleaved hashing optimization */ + unsigned int digest_size; /* digest size for the current hash algorithm */ + enum verity_mode mode; /* mode for handling verification errors */ + enum verity_mode error_mode;/* mode for handling I/O errors */ +@@ -78,6 +84,13 @@ struct dm_verity { + mempool_t recheck_pool; + }; + ++struct pending_block { ++ void *data; ++ sector_t blkno; ++ u8 want_digest[HASH_MAX_DIGESTSIZE]; ++ u8 real_digest[HASH_MAX_DIGESTSIZE]; ++}; ++ + struct dm_verity_io { + struct dm_verity *v; + +@@ -94,28 +107,29 @@ struct dm_verity_io { + struct work_struct work; + struct work_struct bh_work; + +- u8 real_digest[HASH_MAX_DIGESTSIZE]; +- u8 want_digest[HASH_MAX_DIGESTSIZE]; ++ u8 tmp_digest[HASH_MAX_DIGESTSIZE]; + + /* +- * Temporary space for hashing. This is variable-length and must be at +- * the end of the struct. struct shash_desc is just the fixed part; +- * it's followed by a context of size crypto_shash_descsize(shash_tfm). ++ * This is the queue of data blocks that are pending verification. When ++ * the crypto layer supports interleaved hashing, we allow multiple ++ * blocks to be queued up in order to utilize it. This can improve ++ * performance significantly vs. sequential hashing of each block. 
+ */ +- struct shash_desc hash_desc; +-}; ++ int num_pending; ++ struct pending_block pending_blocks[2]; + +-static inline u8 *verity_io_real_digest(struct dm_verity *v, +- struct dm_verity_io *io) +-{ +- return io->real_digest; +-} +- +-static inline u8 *verity_io_want_digest(struct dm_verity *v, +- struct dm_verity_io *io) +-{ +- return io->want_digest; +-} ++ /* ++ * Temporary space for hashing. Either sha256 or shash is used, ++ * depending on the value of use_sha256_lib. If shash is used, ++ * then this field is variable-length, with total size ++ * sizeof(struct shash_desc) + crypto_shash_descsize(shash_tfm). ++ * For this reason, this field must be the end of the struct. ++ */ ++ union { ++ struct sha256_ctx sha256; ++ struct shash_desc shash; ++ } hash_ctx; ++}; + + extern int verity_hash(struct dm_verity *v, struct dm_verity_io *io, + const u8 *data, size_t len, u8 *digest); +diff --git a/include/linux/rhashtable.h b/include/linux/rhashtable.h +index 05a221ce79a6..b87a768b955c 100644 +--- a/include/linux/rhashtable.h ++++ b/include/linux/rhashtable.h +@@ -355,12 +355,25 @@ static inline void rht_unlock(struct bucket_table *tbl, + local_irq_restore(flags); + } + +-static inline struct rhash_head *__rht_ptr( +- struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt) ++enum rht_lookup_freq { ++ RHT_LOOKUP_NORMAL, ++ RHT_LOOKUP_LIKELY, ++}; ++ ++static __always_inline struct rhash_head *__rht_ptr( ++ struct rhash_lock_head *p, struct rhash_lock_head __rcu *const *bkt, ++ const enum rht_lookup_freq freq) + { +- return (struct rhash_head *) +- ((unsigned long)p & ~BIT(0) ?: +- (unsigned long)RHT_NULLS_MARKER(bkt)); ++ unsigned long p_val = (unsigned long)p & ~BIT(0); ++ ++ BUILD_BUG_ON(!__builtin_constant_p(freq)); ++ ++ if (freq == RHT_LOOKUP_LIKELY) ++ return (struct rhash_head *) ++ (likely(p_val) ? p_val : (unsigned long)RHT_NULLS_MARKER(bkt)); ++ else ++ return (struct rhash_head *) ++ (p_val ?: (unsigned long)RHT_NULLS_MARKER(bkt)); + } + + /* +@@ -370,10 +383,17 @@ static inline struct rhash_head *__rht_ptr( + * rht_ptr_exclusive() dereferences in a context where exclusive + * access is guaranteed, such as when destroying the table. + */ ++static __always_inline struct rhash_head *__rht_ptr_rcu( ++ struct rhash_lock_head __rcu *const *bkt, ++ const enum rht_lookup_freq freq) ++{ ++ return __rht_ptr(rcu_dereference(*bkt), bkt, freq); ++} ++ + static inline struct rhash_head *rht_ptr_rcu( + struct rhash_lock_head __rcu *const *bkt) + { +- return __rht_ptr(rcu_dereference_all(*bkt), bkt); ++ return __rht_ptr_rcu(bkt, RHT_LOOKUP_NORMAL); + } + + static inline struct rhash_head *rht_ptr( +@@ -381,13 +401,15 @@ static inline struct rhash_head *rht_ptr( + struct bucket_table *tbl, + unsigned int hash) + { +- return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt); ++ return __rht_ptr(rht_dereference_bucket(*bkt, tbl, hash), bkt, ++ RHT_LOOKUP_NORMAL); + } + + static inline struct rhash_head *rht_ptr_exclusive( + struct rhash_lock_head __rcu *const *bkt) + { +- return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt); ++ return __rht_ptr(rcu_dereference_protected(*bkt, 1), bkt, ++ RHT_LOOKUP_NORMAL); + } + + static inline void rht_assign_locked(struct rhash_lock_head __rcu **bkt, +@@ -588,7 +610,8 @@ static inline int rhashtable_compare(struct rhashtable_compare_arg *arg, + /* Internal function, do not use. 
*/ + static __always_inline struct rhash_head *__rhashtable_lookup( + struct rhashtable *ht, const void *key, +- const struct rhashtable_params params) ++ const struct rhashtable_params params, ++ const enum rht_lookup_freq freq) + { + struct rhashtable_compare_arg arg = { + .ht = ht, +@@ -599,12 +622,13 @@ static __always_inline struct rhash_head *__rhashtable_lookup( + struct rhash_head *he; + unsigned int hash; + ++ BUILD_BUG_ON(!__builtin_constant_p(freq)); + tbl = rht_dereference_rcu(ht->tbl, ht); + restart: + hash = rht_key_hashfn(ht, tbl, key, params); + bkt = rht_bucket(tbl, hash); + do { +- rht_for_each_rcu_from(he, rht_ptr_rcu(bkt), tbl, hash) { ++ rht_for_each_rcu_from(he, __rht_ptr_rcu(bkt, freq), tbl, hash) { + if (params.obj_cmpfn ? + params.obj_cmpfn(&arg, rht_obj(ht, he)) : + rhashtable_compare(&arg, rht_obj(ht, he))) +@@ -643,11 +667,22 @@ static __always_inline void *rhashtable_lookup( + struct rhashtable *ht, const void *key, + const struct rhashtable_params params) + { +- struct rhash_head *he = __rhashtable_lookup(ht, key, params); ++ struct rhash_head *he = __rhashtable_lookup(ht, key, params, ++ RHT_LOOKUP_NORMAL); + + return he ? rht_obj(ht, he) : NULL; + } + ++static __always_inline void *rhashtable_lookup_likely( ++ struct rhashtable *ht, const void *key, ++ const struct rhashtable_params params) ++{ ++ struct rhash_head *he = __rhashtable_lookup(ht, key, params, ++ RHT_LOOKUP_LIKELY); ++ ++ return likely(he) ? rht_obj(ht, he) : NULL; ++} ++ + /** + * rhashtable_lookup_fast - search hash table, without RCU read lock + * @ht: hash table +@@ -693,11 +728,22 @@ static __always_inline struct rhlist_head *rhltable_lookup( + struct rhltable *hlt, const void *key, + const struct rhashtable_params params) + { +- struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params); ++ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, ++ RHT_LOOKUP_NORMAL); + + return he ? container_of(he, struct rhlist_head, rhead) : NULL; + } + ++static __always_inline struct rhlist_head *rhltable_lookup_likely( ++ struct rhltable *hlt, const void *key, ++ const struct rhashtable_params params) ++{ ++ struct rhash_head *he = __rhashtable_lookup(&hlt->ht, key, params, ++ RHT_LOOKUP_LIKELY); ++ ++ return likely(he) ? container_of(he, struct rhlist_head, rhead) : NULL; ++} ++ + /* Internal function, please use rhashtable_insert_fast() instead. This + * function returns the existing element already in hashes if there is a clash, + * otherwise it returns an error via ERR_PTR(). 
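
The rhashtable.h hunks above add the biased-lookup helpers without converting any caller in this patch. A caller-side sketch of how rhashtable_lookup_likely() would be used, with made-up names (my_table, my_params, key, struct my_obj), on a path where a hit is the overwhelmingly common case:

	struct my_obj *obj;

	rcu_read_lock();
	obj = rhashtable_lookup_likely(&my_table, &key, my_params);
	if (obj)
		obj->hits++;	/* hot path stays on the straight-line code */
	rcu_read_unlock();

As with rhashtable_lookup(), the caller must hold the RCU read lock; the only difference is that the lookup is annotated as expected to succeed, so the empty-bucket and not-found paths are laid out as the unlikely branches.
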
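
Stepping further back, to the dm-verity portion of this patch: the pending_blocks queue added to struct dm_verity_io exists so that, when the SHA-256 library reports an optimized two-message implementation, two data blocks are hashed in one interleaved pass (see verity_verify_pending_blocks() above) instead of back to back. A minimal usage sketch of that library call; salt, salt_size, block_a, block_b and block_size are placeholders, not names taken from the patch:

	struct sha256_ctx salted;
	u8 digest_a[SHA256_DIGEST_SIZE], digest_b[SHA256_DIGEST_SIZE];

	sha256_init(&salted);
	sha256_update(&salted, salt, salt_size);	/* pre-salted state */

	/*
	 * One context and one length cover both messages, exactly as in the
	 * dm-verity caller; dm-verity only batches two blocks when
	 * sha256_finup_2x_is_optimized() reported an interleaved
	 * implementation at construction time.
	 */
	sha256_finup_2x(&salted, block_a, block_b, block_size,
			digest_a, digest_b);
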
+-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch b/sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch new file mode 100644 index 0000000..b0aaec7 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/0010-sched-ext.patch @@ -0,0 +1,708 @@ +From 9d35fa170b23d0aa9e7724629d55f8c2c6e38e99 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Thu, 18 Dec 2025 16:42:35 +0100 +Subject: [PATCH 10/11] sched-ext + +Signed-off-by: Peter Jung +--- + include/linux/sched/ext.h | 1 + + kernel/sched/ext.c | 69 ++++- + tools/sched_ext/include/scx/common.bpf.h | 1 + + tools/sched_ext/include/scx/compat.bpf.h | 18 ++ + tools/testing/selftests/sched_ext/Makefile | 1 + + .../selftests/sched_ext/peek_dsq.bpf.c | 251 ++++++++++++++++++ + tools/testing/selftests/sched_ext/peek_dsq.c | 224 ++++++++++++++++ + 7 files changed, 561 insertions(+), 4 deletions(-) + create mode 100644 tools/testing/selftests/sched_ext/peek_dsq.bpf.c + create mode 100644 tools/testing/selftests/sched_ext/peek_dsq.c + +diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h +index d82b7a9b0658..81478d4ae782 100644 +--- a/include/linux/sched/ext.h ++++ b/include/linux/sched/ext.h +@@ -58,6 +58,7 @@ enum scx_dsq_id_flags { + */ + struct scx_dispatch_q { + raw_spinlock_t lock; ++ struct task_struct __rcu *first_task; /* lockless peek at head */ + struct list_head list; /* tasks in dispatch order */ + struct rb_root priq; /* used to order by p->scx.dsq_vtime */ + u32 nr; +diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c +index 979484dab2d3..9acc660c350c 100644 +--- a/kernel/sched/ext.c ++++ b/kernel/sched/ext.c +@@ -965,8 +965,11 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + container_of(rbp, struct task_struct, + scx.dsq_priq); + list_add(&p->scx.dsq_list.node, &prev->scx.dsq_list.node); ++ /* first task unchanged - no update needed */ + } else { + list_add(&p->scx.dsq_list.node, &dsq->list); ++ /* not builtin and new task is at head - use fastpath */ ++ rcu_assign_pointer(dsq->first_task, p); + } + } else { + /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ +@@ -974,10 +977,19 @@ static void dispatch_enqueue(struct scx_sched *sch, struct scx_dispatch_q *dsq, + scx_error(sch, "DSQ ID 0x%016llx already had PRIQ-enqueued tasks", + dsq->id); + +- if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) ++ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) { + list_add(&p->scx.dsq_list.node, &dsq->list); +- else ++ /* new task inserted at head - use fastpath */ ++ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN)) ++ rcu_assign_pointer(dsq->first_task, p); ++ } else { ++ bool was_empty; ++ ++ was_empty = list_empty(&dsq->list); + list_add_tail(&p->scx.dsq_list.node, &dsq->list); ++ if (was_empty && !(dsq->id & SCX_DSQ_FLAG_BUILTIN)) ++ rcu_assign_pointer(dsq->first_task, p); ++ } + } + + /* seq records the order tasks are queued, used by BPF DSQ iterator */ +@@ -1034,6 +1046,13 @@ static void task_unlink_from_dsq(struct task_struct *p, + + list_del_init(&p->scx.dsq_list.node); + dsq_mod_nr(dsq, -1); ++ ++ if (!(dsq->id & SCX_DSQ_FLAG_BUILTIN) && dsq->first_task == p) { ++ struct task_struct *first_task; ++ ++ first_task = nldsq_next_task(dsq, NULL, false); ++ rcu_assign_pointer(dsq->first_task, first_task); ++ } + } + + static void dispatch_dequeue(struct rq *rq, struct task_struct *p) +@@ -4516,7 +4535,7 @@ static struct scx_sched *scx_alloc_and_add_sched(struct sched_ext_ops *ops) + return ERR_PTR(ret); + } + +-static void check_hotplug_seq(struct scx_sched *sch, ++static int 
check_hotplug_seq(struct scx_sched *sch, + const struct sched_ext_ops *ops) + { + unsigned long long global_hotplug_seq; +@@ -4533,8 +4552,11 @@ static void check_hotplug_seq(struct scx_sched *sch, + SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, + "expected hotplug seq %llu did not match actual %llu", + ops->hotplug_seq, global_hotplug_seq); ++ return -EBUSY; + } + } ++ ++ return 0; + } + + static int validate_ops(struct scx_sched *sch, const struct sched_ext_ops *ops) +@@ -4636,7 +4658,11 @@ static int scx_enable(struct sched_ext_ops *ops, struct bpf_link *link) + if (((void (**)(void))ops)[i]) + set_bit(i, sch->has_op); + +- check_hotplug_seq(sch, ops); ++ ret = check_hotplug_seq(sch, ops); ++ if (ret) { ++ cpus_read_unlock(); ++ goto err_disable; ++ } + scx_idle_update_selcpu_topology(ops); + + cpus_read_unlock(); +@@ -6183,6 +6209,40 @@ __bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) + kit->dsq = NULL; + } + ++/** ++ * scx_bpf_dsq_peek - Lockless peek at the first element. ++ * @dsq_id: DSQ to examine. ++ * ++ * Read the first element in the DSQ. This is semantically equivalent to using ++ * the DSQ iterator, but is lockfree. Of course, like any lockless operation, ++ * this provides only a point-in-time snapshot, and the contents may change ++ * by the time any subsequent locking operation reads the queue. ++ * ++ * Returns the pointer, or NULL indicates an empty queue OR internal error. ++ */ ++__bpf_kfunc struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) ++{ ++ struct scx_sched *sch; ++ struct scx_dispatch_q *dsq; ++ ++ sch = rcu_dereference(scx_root); ++ if (unlikely(!sch)) ++ return NULL; ++ ++ if (unlikely(dsq_id & SCX_DSQ_FLAG_BUILTIN)) { ++ scx_error(sch, "peek disallowed on builtin DSQ 0x%llx", dsq_id); ++ return NULL; ++ } ++ ++ dsq = find_user_dsq(sch, dsq_id); ++ if (unlikely(!dsq)) { ++ scx_error(sch, "peek on non-existent DSQ 0x%llx", dsq_id); ++ return NULL; ++ } ++ ++ return rcu_dereference(dsq->first_task); ++} ++ + __bpf_kfunc_end_defs(); + + static s32 __bstr_format(struct scx_sched *sch, u64 *data_buf, char *line_buf, +@@ -6740,6 +6800,7 @@ BTF_KFUNCS_START(scx_kfunc_ids_any) + BTF_ID_FLAGS(func, scx_bpf_kick_cpu) + BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) + BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) ++BTF_ID_FLAGS(func, scx_bpf_dsq_peek, KF_RCU_PROTECTED | KF_RET_NULL) + BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) + BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) + BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) +diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h +index 06e2551033cb..fbf3e7f9526c 100644 +--- a/tools/sched_ext/include/scx/common.bpf.h ++++ b/tools/sched_ext/include/scx/common.bpf.h +@@ -75,6 +75,7 @@ u32 scx_bpf_reenqueue_local(void) __ksym; + void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; + s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; + void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; ++struct task_struct *scx_bpf_dsq_peek(u64 dsq_id) __ksym __weak; + int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, u64 flags) __ksym __weak; + struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; + void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; +diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h +index dd9144624dc9..467a987880e1 100644 +--- a/tools/sched_ext/include/scx/compat.bpf.h ++++ 
b/tools/sched_ext/include/scx/compat.bpf.h +@@ -130,6 +130,24 @@ int bpf_cpumask_populate(struct cpumask *dst, void *src, size_t src__sz) __ksym + false; \ + }) + ++/* ++ * v6.19: Introduce lockless peek API for user DSQs. ++ * ++ * Preserve the following macro until v6.21. ++ */ ++static inline struct task_struct *__COMPAT_scx_bpf_dsq_peek(u64 dsq_id) ++{ ++ struct task_struct *p = NULL; ++ struct bpf_iter_scx_dsq it; ++ ++ if (bpf_ksym_exists(scx_bpf_dsq_peek)) ++ return scx_bpf_dsq_peek(dsq_id); ++ if (!bpf_iter_scx_dsq_new(&it, dsq_id, 0)) ++ p = bpf_iter_scx_dsq_next(&it); ++ bpf_iter_scx_dsq_destroy(&it); ++ return p; ++} ++ + /** + * __COMPAT_is_enq_cpu_selected - Test if SCX_ENQ_CPU_SELECTED is on + * in a compatible way. We will preserve this __COMPAT helper until v6.16. +diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile +index 9d9d6b4c38b0..5fe45f9c5f8f 100644 +--- a/tools/testing/selftests/sched_ext/Makefile ++++ b/tools/testing/selftests/sched_ext/Makefile +@@ -174,6 +174,7 @@ auto-test-targets := \ + minimal \ + numa \ + allowed_cpus \ ++ peek_dsq \ + prog_run \ + reload_loop \ + select_cpu_dfl \ +diff --git a/tools/testing/selftests/sched_ext/peek_dsq.bpf.c b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +new file mode 100644 +index 000000000000..a3faf5bb49d6 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/peek_dsq.bpf.c +@@ -0,0 +1,251 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * A BPF program for testing DSQ operations and peek in particular. ++ * ++ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2025 Ryan Newton ++ */ ++ ++#include ++#include ++ ++char _license[] SEC("license") = "GPL"; ++ ++UEI_DEFINE(uei); /* Error handling */ ++ ++#define MAX_SAMPLES 100 ++#define MAX_CPUS 512 ++#define DSQ_POOL_SIZE 8 ++int max_samples = MAX_SAMPLES; ++int max_cpus = MAX_CPUS; ++int dsq_pool_size = DSQ_POOL_SIZE; ++ ++/* Global variables to store test results */ ++int dsq_peek_result1 = -1; ++long dsq_inserted_pid = -1; ++int insert_test_cpu = -1; /* Set to the cpu that performs the test */ ++long dsq_peek_result2 = -1; ++long dsq_peek_result2_pid = -1; ++long dsq_peek_result2_expected = -1; ++int test_dsq_id = 1234; /* Use a simple ID like create_dsq example */ ++int real_dsq_id = 1235; /* DSQ for normal operation */ ++int enqueue_count = -1; ++int dispatch_count = -1; ++bool debug_ksym_exists; ++ ++/* DSQ pool for stress testing */ ++int dsq_pool_base_id = 2000; ++int phase1_complete = -1; ++long total_peek_attempts = -1; ++long successful_peeks = -1; ++ ++/* BPF map for sharing peek results with userspace */ ++struct { ++ __uint(type, BPF_MAP_TYPE_ARRAY); ++ __uint(max_entries, MAX_SAMPLES); ++ __type(key, u32); ++ __type(value, long); ++} peek_results SEC(".maps"); ++ ++static int get_random_dsq_id(void) ++{ ++ u64 time = bpf_ktime_get_ns(); ++ ++ return dsq_pool_base_id + (time % DSQ_POOL_SIZE); ++} ++ ++static void record_peek_result(long pid) ++{ ++ u32 slot_key; ++ long *slot_pid_ptr; ++ int ix; ++ ++ if (pid <= 0) ++ return; ++ ++ /* Find an empty slot or one with the same PID */ ++ bpf_for(ix, 0, 10) { ++ slot_key = (pid + ix) % MAX_SAMPLES; ++ slot_pid_ptr = bpf_map_lookup_elem(&peek_results, &slot_key); ++ if (!slot_pid_ptr) ++ continue; ++ ++ if (*slot_pid_ptr == -1 || *slot_pid_ptr == pid) { ++ *slot_pid_ptr = pid; ++ break; ++ } ++ } ++} ++ ++/* Scan all DSQs in the pool and try to move a task to local */ ++static int scan_dsq_pool(void) ++{ ++ struct task_struct *task; ++ 
int moved = 0; ++ int i; ++ ++ bpf_for(i, 0, DSQ_POOL_SIZE) { ++ int dsq_id = dsq_pool_base_id + i; ++ ++ total_peek_attempts++; ++ ++ task = __COMPAT_scx_bpf_dsq_peek(dsq_id); ++ if (task) { ++ successful_peeks++; ++ record_peek_result(task->pid); ++ ++ /* Try to move this task to local */ ++ if (!moved && scx_bpf_dsq_move_to_local(dsq_id) == 0) { ++ moved = 1; ++ break; ++ } ++ } ++ } ++ return moved; ++} ++ ++/* Struct_ops scheduler for testing DSQ peek operations */ ++void BPF_STRUCT_OPS(peek_dsq_enqueue, struct task_struct *p, u64 enq_flags) ++{ ++ struct task_struct *peek_result; ++ int last_insert_test_cpu, cpu; ++ ++ enqueue_count++; ++ cpu = bpf_get_smp_processor_id(); ++ last_insert_test_cpu = __sync_val_compare_and_swap(&insert_test_cpu, -1, cpu); ++ ++ /* Phase 1: Simple insert-then-peek test (only on first task) */ ++ if (last_insert_test_cpu == -1) { ++ bpf_printk("peek_dsq_enqueue beginning phase 1 peek test on cpu %d", cpu); ++ ++ /* Test 1: Peek empty DSQ - should return NULL */ ++ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); ++ dsq_peek_result1 = (long)peek_result; /* Should be 0 (NULL) */ ++ ++ /* Test 2: Insert task into test DSQ for testing in dispatch callback */ ++ dsq_inserted_pid = p->pid; ++ scx_bpf_dsq_insert(p, test_dsq_id, 0, enq_flags); ++ dsq_peek_result2_expected = (long)p; /* Expected the task we just inserted */ ++ } else if (!phase1_complete) { ++ /* Still in phase 1, use real DSQ */ ++ scx_bpf_dsq_insert(p, real_dsq_id, 0, enq_flags); ++ } else { ++ /* Phase 2: Random DSQ insertion for stress testing */ ++ int random_dsq_id = get_random_dsq_id(); ++ ++ scx_bpf_dsq_insert(p, random_dsq_id, 0, enq_flags); ++ } ++} ++ ++void BPF_STRUCT_OPS(peek_dsq_dispatch, s32 cpu, struct task_struct *prev) ++{ ++ dispatch_count++; ++ ++ /* Phase 1: Complete the simple peek test if we inserted a task but ++ * haven't tested peek yet ++ */ ++ if (insert_test_cpu == cpu && dsq_peek_result2 == -1) { ++ struct task_struct *peek_result; ++ ++ bpf_printk("peek_dsq_dispatch completing phase 1 peek test on cpu %d", cpu); ++ ++ /* Test 3: Peek DSQ after insert - should return the task we inserted */ ++ peek_result = __COMPAT_scx_bpf_dsq_peek(test_dsq_id); ++ /* Store the PID of the peeked task for comparison */ ++ dsq_peek_result2 = (long)peek_result; ++ dsq_peek_result2_pid = peek_result ? peek_result->pid : -1; ++ ++ /* Now consume the task since we've peeked at it */ ++ scx_bpf_dsq_move_to_local(test_dsq_id); ++ ++ /* Mark phase 1 as complete */ ++ phase1_complete = 1; ++ bpf_printk("Phase 1 complete, starting phase 2 stress testing"); ++ } else if (!phase1_complete) { ++ /* Still in phase 1, use real DSQ */ ++ scx_bpf_dsq_move_to_local(real_dsq_id); ++ } else { ++ /* Phase 2: Scan all DSQs in the pool and try to move a task */ ++ if (!scan_dsq_pool()) { ++ /* No tasks found in DSQ pool, fall back to real DSQ */ ++ scx_bpf_dsq_move_to_local(real_dsq_id); ++ } ++ } ++} ++ ++s32 BPF_STRUCT_OPS_SLEEPABLE(peek_dsq_init) ++{ ++ s32 err; ++ int i; ++ ++ /* Always set debug values so we can see which version we're using */ ++ debug_ksym_exists = bpf_ksym_exists(scx_bpf_dsq_peek) ? 
1 : 0; ++ ++ /* Initialize state first */ ++ insert_test_cpu = -1; ++ enqueue_count = 0; ++ dispatch_count = 0; ++ phase1_complete = 0; ++ total_peek_attempts = 0; ++ successful_peeks = 0; ++ ++ /* Create the test and real DSQs */ ++ err = scx_bpf_create_dsq(test_dsq_id, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); ++ return err; ++ } ++ err = scx_bpf_create_dsq(real_dsq_id, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ %d: %d", test_dsq_id, err); ++ return err; ++ } ++ ++ /* Create the DSQ pool for stress testing */ ++ bpf_for(i, 0, DSQ_POOL_SIZE) { ++ int dsq_id = dsq_pool_base_id + i; ++ ++ err = scx_bpf_create_dsq(dsq_id, -1); ++ if (err) { ++ scx_bpf_error("Failed to create DSQ pool entry %d: %d", dsq_id, err); ++ return err; ++ } ++ } ++ ++ /* Initialize the peek results map */ ++ bpf_for(i, 0, MAX_SAMPLES) { ++ u32 key = i; ++ long pid = -1; ++ ++ bpf_map_update_elem(&peek_results, &key, &pid, BPF_ANY); ++ } ++ ++ return 0; ++} ++ ++void BPF_STRUCT_OPS(peek_dsq_exit, struct scx_exit_info *ei) ++{ ++ int i; ++ ++ /* Destroy the primary DSQs */ ++ scx_bpf_destroy_dsq(test_dsq_id); ++ scx_bpf_destroy_dsq(real_dsq_id); ++ ++ /* Destroy the DSQ pool */ ++ bpf_for(i, 0, DSQ_POOL_SIZE) { ++ int dsq_id = dsq_pool_base_id + i; ++ ++ scx_bpf_destroy_dsq(dsq_id); ++ } ++ ++ UEI_RECORD(uei, ei); ++} ++ ++SEC(".struct_ops.link") ++struct sched_ext_ops peek_dsq_ops = { ++ .enqueue = (void *)peek_dsq_enqueue, ++ .dispatch = (void *)peek_dsq_dispatch, ++ .init = (void *)peek_dsq_init, ++ .exit = (void *)peek_dsq_exit, ++ .name = "peek_dsq", ++}; +diff --git a/tools/testing/selftests/sched_ext/peek_dsq.c b/tools/testing/selftests/sched_ext/peek_dsq.c +new file mode 100644 +index 000000000000..a717384a3224 +--- /dev/null ++++ b/tools/testing/selftests/sched_ext/peek_dsq.c +@@ -0,0 +1,224 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Test for DSQ operations including create, destroy, and peek operations. ++ * ++ * Copyright (c) 2025 Meta Platforms, Inc. and affiliates. ++ * Copyright (c) 2025 Ryan Newton ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "peek_dsq.bpf.skel.h" ++#include "scx_test.h" ++ ++#define NUM_WORKERS 4 ++ ++static bool workload_running = true; ++static pthread_t workload_threads[NUM_WORKERS]; ++ ++/** ++ * Background workload thread that sleeps and wakes rapidly to exercise ++ * the scheduler's enqueue operations and ensure DSQ operations get tested. 
++ */ ++static void *workload_thread_fn(void *arg) ++{ ++ while (workload_running) { ++ /* Sleep for a very short time to trigger scheduler activity */ ++ usleep(1000); /* 1ms sleep */ ++ /* Yield to ensure we go through the scheduler */ ++ sched_yield(); ++ } ++ return NULL; ++} ++ ++static enum scx_test_status setup(void **ctx) ++{ ++ struct peek_dsq *skel; ++ ++ skel = peek_dsq__open(); ++ SCX_FAIL_IF(!skel, "Failed to open"); ++ SCX_ENUM_INIT(skel); ++ SCX_FAIL_IF(peek_dsq__load(skel), "Failed to load skel"); ++ ++ *ctx = skel; ++ ++ return SCX_TEST_PASS; ++} ++ ++static int print_observed_pids(struct bpf_map *map, int max_samples, const char *dsq_name) ++{ ++ long count = 0; ++ ++ printf("Observed %s DSQ peek pids:\n", dsq_name); ++ for (int i = 0; i < max_samples; i++) { ++ long pid; ++ int err; ++ ++ err = bpf_map_lookup_elem(bpf_map__fd(map), &i, &pid); ++ if (err == 0) { ++ if (pid == 0) { ++ printf(" Sample %d: NULL peek\n", i); ++ } else if (pid > 0) { ++ printf(" Sample %d: pid %ld\n", i, pid); ++ count++; ++ } ++ } else { ++ printf(" Sample %d: error reading pid (err=%d)\n", i, err); ++ } ++ } ++ printf("Observed ~%ld pids in the %s DSQ(s)\n", count, dsq_name); ++ return count; ++} ++ ++static enum scx_test_status run(void *ctx) ++{ ++ struct peek_dsq *skel = ctx; ++ bool failed = false; ++ int seconds = 3; ++ int err; ++ ++ /* Enable the scheduler to test DSQ operations */ ++ printf("Enabling scheduler to test DSQ insert operations...\n"); ++ ++ struct bpf_link *link = ++ bpf_map__attach_struct_ops(skel->maps.peek_dsq_ops); ++ ++ if (!link) { ++ SCX_ERR("Failed to attach struct_ops"); ++ return SCX_TEST_FAIL; ++ } ++ ++ printf("Starting %d background workload threads...\n", NUM_WORKERS); ++ workload_running = true; ++ for (int i = 0; i < NUM_WORKERS; i++) { ++ err = pthread_create(&workload_threads[i], NULL, workload_thread_fn, NULL); ++ if (err) { ++ SCX_ERR("Failed to create workload thread %d: %s", i, strerror(err)); ++ /* Stop already created threads */ ++ workload_running = false; ++ for (int j = 0; j < i; j++) ++ pthread_join(workload_threads[j], NULL); ++ bpf_link__destroy(link); ++ return SCX_TEST_FAIL; ++ } ++ } ++ ++ printf("Waiting for enqueue events.\n"); ++ sleep(seconds); ++ while (skel->data->enqueue_count <= 0) { ++ printf("."); ++ fflush(stdout); ++ sleep(1); ++ seconds++; ++ if (seconds >= 30) { ++ printf("\n\u2717 Timeout waiting for enqueue events\n"); ++ /* Stop workload threads and cleanup */ ++ workload_running = false; ++ for (int i = 0; i < NUM_WORKERS; i++) ++ pthread_join(workload_threads[i], NULL); ++ bpf_link__destroy(link); ++ return SCX_TEST_FAIL; ++ } ++ } ++ ++ workload_running = false; ++ for (int i = 0; i < NUM_WORKERS; i++) { ++ err = pthread_join(workload_threads[i], NULL); ++ if (err) { ++ SCX_ERR("Failed to join workload thread %d: %s", i, strerror(err)); ++ bpf_link__destroy(link); ++ return SCX_TEST_FAIL; ++ } ++ } ++ printf("Background workload threads stopped.\n"); ++ ++ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_NONE)); ++ ++ /* Detach the scheduler */ ++ bpf_link__destroy(link); ++ ++ printf("Enqueue/dispatch count over %d seconds: %d / %d\n", seconds, ++ skel->data->enqueue_count, skel->data->dispatch_count); ++ printf("Debug: ksym_exists=%d\n", ++ skel->bss->debug_ksym_exists); ++ ++ /* Check DSQ insert result */ ++ printf("DSQ insert test done on cpu: %d\n", skel->data->insert_test_cpu); ++ if (skel->data->insert_test_cpu != -1) ++ printf("\u2713 DSQ insert succeeded !\n"); ++ else { ++ printf("\u2717 DSQ insert failed or 
not attempted\n"); ++ failed = true; ++ } ++ ++ /* Check DSQ peek results */ ++ printf(" DSQ peek result 1 (before insert): %d\n", ++ skel->data->dsq_peek_result1); ++ if (skel->data->dsq_peek_result1 == 0) ++ printf("\u2713 DSQ peek verification success: peek returned NULL!\n"); ++ else { ++ printf("\u2717 DSQ peek verification failed\n"); ++ failed = true; ++ } ++ ++ printf(" DSQ peek result 2 (after insert): %ld\n", ++ skel->data->dsq_peek_result2); ++ printf(" DSQ peek result 2, expected: %ld\n", ++ skel->data->dsq_peek_result2_expected); ++ if (skel->data->dsq_peek_result2 == ++ skel->data->dsq_peek_result2_expected) ++ printf("\u2713 DSQ peek verification success: peek returned the inserted task!\n"); ++ else { ++ printf("\u2717 DSQ peek verification failed\n"); ++ failed = true; ++ } ++ ++ printf(" Inserted test task -> pid: %ld\n", skel->data->dsq_inserted_pid); ++ printf(" DSQ peek result 2 -> pid: %ld\n", skel->data->dsq_peek_result2_pid); ++ ++ int pid_count; ++ ++ pid_count = print_observed_pids(skel->maps.peek_results, ++ skel->data->max_samples, "DSQ pool"); ++ printf("Total non-null peek observations: %ld out of %ld\n", ++ skel->data->successful_peeks, skel->data->total_peek_attempts); ++ ++ if (skel->bss->debug_ksym_exists && pid_count == 0) { ++ printf("\u2717 DSQ pool test failed: no successful peeks in native mode\n"); ++ failed = true; ++ } ++ if (skel->bss->debug_ksym_exists && pid_count > 0) ++ printf("\u2713 DSQ pool test success: observed successful peeks in native mode\n"); ++ ++ if (failed) ++ return SCX_TEST_FAIL; ++ else ++ return SCX_TEST_PASS; ++} ++ ++static void cleanup(void *ctx) ++{ ++ struct peek_dsq *skel = ctx; ++ ++ if (workload_running) { ++ workload_running = false; ++ for (int i = 0; i < NUM_WORKERS; i++) ++ pthread_join(workload_threads[i], NULL); ++ } ++ ++ peek_dsq__destroy(skel); ++} ++ ++struct scx_test peek_dsq = { ++ .name = "peek_dsq", ++ .description = ++ "Test DSQ create/destroy operations and future peek functionality", ++ .setup = setup, ++ .run = run, ++ .cleanup = cleanup, ++}; ++REGISTER_SCX_TEST(&peek_dsq) +-- +2.52.0 + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip new file mode 100644 index 0000000..2ac2c2f --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-01-19-sched-fair-Add-infrastructure-for-cache-aware-load-balancing.patch.skip @@ -0,0 +1,654 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 46062169AD2 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206703; cv=none; b=RpWLRsxlJTzhlJSNJ6YDnnOidsJ7oCIJ0QG0EXS7VFoOFFRWiuWYlsET6M5MjOkyE+dnQih3vxbVtcm+li+EdUZBeyP5FVticeDHkmuoWPHZblewToySaE5iRFgZqZZMrF2/g7ww+IHVQ3wb1PmaWoyqrDBaIo5To0g72h92TRE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206703; c=relaxed/simple; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; 
b=QJa7XLmNRAgs2IV6jX9+J3RTiz2TA7hXn5NgC4yjWKV75coBs2eumwHZZgG2HlZqrxNZy2yyHAMM73rFnrDZIvG+RpHWxcfbJopVHrre/vMQ3HJJFjQUmhaAwWCfX+5CuF2S3mkLLbQPk1FwQMpFRQzmQi7ZRNOguwaR+/BIBvQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=fA7dEfIE; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="fA7dEfIE" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206701; x=1791742701; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=eCGUZmunSjVOsoqwxe8kKF4T+jrOyKsftgZkbuwe1Jo=; + b=fA7dEfIE91ULN1jqc64owLAysrWyqWsDA5nuO1+sgcIA15Yn8yYj6iw4 + 55VPKl3g+xYXhPmGyE7a0LZvFUc9YG3ckmUpqO0pvf6oo1RJcM13mS3yi + KNsM4bbd9aFpNPTftzZGqryw94QrGirzar7JNUNOk0MJqRkziOVPLHnOi + iVfGn7SOaI4LzDDzlorOXwaeFstT3f2UVe0Cr2vAWBdxYyDop0Z+G9hqb + BhSDn+aeXU8OqAYP/xGpt3Ce8cbnDhTJhA+r5jzej1xMspSEeS1p/SQOm + slC+k3w/mm9HPugo6aL39ZyshlQHrAN4qvnJBJT/5GnR6bFHs9O0IKtHz + w==; +X-CSE-ConnectionGUID: AwkM8kCOR6yXxOyCyDBj4Q== +X-CSE-MsgGUID: FBEmDsF5QKC61vf0MqpBmQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339614" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339614" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:19 -0700 +X-CSE-ConnectionGUID: HGgPT3dBQFm59TiA7l3rfA== +X-CSE-MsgGUID: SlOHviQzSgGRjsbScX9f4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487181" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:19 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 01/19] sched/fair: Add infrastructure for cache-aware load balancing +Date: Sat, 11 Oct 2025 11:24:38 -0700 +Message-Id: <865b852e3fdef6561c9e0a5be9a94aec8a68cdea.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: "Peter Zijlstra (Intel)" + +Cache-aware load balancing aims to aggregate tasks with potential +shared resources into the same cache domain. This approach enhances +cache locality, thereby optimizing system performance by reducing +cache misses and improving data access efficiency. + +In the current implementation, threads within the same process are +considered as entities that potentially share resources. 
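For orientation, a user-space toy model of the per-process occupancy bookkeeping described in the remainder of this changelog (and implemented in the fair.c hunks further below); struct toy_mm, NR_LLC and the helper names are illustrative stand-ins, not identifiers from this patch:

#include <stdio.h>

#define NR_LLC 4

struct toy_mm {
	unsigned long long runtime[NR_LLC];	/* decayed per-LLC runtime of the process */
	int preferred_llc;			/* -1 when no preference */
};

/* Once per epoch, halve every LLC's accumulated runtime (geometric decay). */
static void toy_epoch_decay(struct toy_mm *mm)
{
	for (int i = 0; i < NR_LLC; i++)
		mm->runtime[i] >>= 1;
}

/* Charge a thread's execution time to the LLC it ran on. */
static void toy_account(struct toy_mm *mm, int llc, unsigned long long delta)
{
	mm->runtime[llc] += delta;
}

/* Prefer the LLC where the process has recently run the most. */
static void toy_pick_preferred(struct toy_mm *mm)
{
	unsigned long long best = 0;

	mm->preferred_llc = -1;
	for (int i = 0; i < NR_LLC; i++) {
		if (mm->runtime[i] > best) {
			best = mm->runtime[i];
			mm->preferred_llc = i;
		}
	}
}

int main(void)
{
	struct toy_mm mm = { .preferred_llc = -1 };

	toy_account(&mm, 2, 800);	/* most threads ran on LLC 2 */
	toy_account(&mm, 0, 100);
	toy_epoch_decay(&mm);
	toy_pick_preferred(&mm);
	printf("preferred LLC: %d\n", mm.preferred_llc);	/* prints 2 */
	return 0;
}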
+Cache-aware load balancing monitors the CPU occupancy of each cache +domain for every process. Based on this monitoring, it endeavors to +migrate threads within a given process to its cache-hot domains, +with the goal of maximizing cache locality. + +It is an attempt at modelling cache affinity. While the patch series +only targets LLC, it could very well be extended to clusters (L2), +or other kind of domains grouping inside a node. + +As it stands, the mechanism only computes a CPU within the LLC that +has the highest recent runtime; this CPU is then used in the load +balance path in subsequent patches to steer toward this LLC. + +More elaborate measures could be added later in NUMA_BALANCING: for +example, migrating task A to its preferred LLC when it has spare CPU +capacity, or swapping task A with another running task B in task A’s +preferred LLC. + +Originally-by: Peter Zijlstra (Intel) +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 44 ++++++ + include/linux/sched.h | 4 + + init/Kconfig | 11 ++ + kernel/fork.c | 6 + + kernel/sched/core.c | 6 + + kernel/sched/fair.c | 288 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/features.h | 1 + + kernel/sched/sched.h | 8 ++ + 8 files changed, 368 insertions(+) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 08bc2442db93..3ca557c2f36d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -927,6 +927,11 @@ struct mm_cid { + }; + #endif + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1017,6 +1022,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1436,6 +1452,34 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index f8188b833350..d7ddb7ce6c4b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1400,6 +1400,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +diff --git a/init/Kconfig b/init/Kconfig +index e3eb63eadc87..4e625db7920a 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -970,6 +970,17 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware load balance" ++ default y ++ depends on SMP ++ help ++ When enabled, the scheduler will attempt to aggregate tasks from ++ the same process onto a single Last Level Cache (LLC) domain when ++ possible. This improves cache locality by keeping tasks that share ++ resources within the same cache domain, reducing cache misses and ++ lowering data access latency. ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index c4ada32598bd..9cd6efe2926d 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm) + cleanup_lazy_tlbs(mm); + + WARN_ON_ONCE(mm == current->active_mm); ++ mm_destroy_sched(mm); + mm_free_pgd(mm); + mm_free_id(mm); + destroy_context(mm); +@@ -1079,6 +1080,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1088,6 +1092,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index be00629f0ba4..79d15e904d12 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4520,6 +4520,7 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8821,6 +8822,11 @@ void __init sched_init(void) + + rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = jiffies; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b173a059315c..a2ea002f4fd6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p) + sa->runnable_avg = 
sa->util_avg; + } + ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); ++ + static s64 update_se(struct rq *rq, struct sched_entity *se) + { + u64 now = rq_clock_task(rq); +@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); ++ account_mm_sched(rq, donor, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); +@@ -1193,6 +1196,289 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + return delta_exec; + } + ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. ++ */ ++#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ ++#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ ++ ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = rq->cpu_epoch; ++ epoch = rq->cpu_epoch; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ /* ++ * The update to mm->pcpu_sched should not be reordered ++ * before initialization to mm's other fields, in case ++ * the readers may get invalid mm_sched_epoch, etc. ++ */ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. 
++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; ++ /* ++ * init_task and kthreads don't having mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate its preferred state. ++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!sched_feat(SCHED_CACHE)) ++ return; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ if (mm->mm_sched_epoch == rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, ++ int pref_nid, int curr_cpu) ++{ ++#ifdef CONFIG_NUMA_BALANCING ++ /* First honor the task's preferred node. */ ++ if (pref_nid != NUMA_NO_NODE) ++ cpumask_or(cpus, cpus, cpumask_of_node(pref_nid)); ++#endif ++ ++ /* Next honor the task's cache CPU if it is not included. */ ++ if (cache_cpu != -1 && !cpumask_test_cpu(cache_cpu, cpus)) ++ cpumask_or(cpus, cpus, ++ cpumask_of_node(cpu_to_node(cache_cpu))); ++ ++ /* ++ * Lastly make sure that the task's current running node is ++ * considered. ++ */ ++ if (!cpumask_test_cpu(curr_cpu, cpus)) ++ cpumask_or(cpus, cpus, cpumask_of_node(cpu_to_node(curr_cpu))); ++} ++ ++static void __no_profile task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ unsigned long curr_m_a_occ = 0; ++ int cpu, m_a_cpu = -1, cache_cpu, ++ pref_nid = NUMA_NO_NODE, curr_cpu; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ curr_cpu = task_cpu(p); ++ cache_cpu = mm->mm_sched_cpu; ++#ifdef CONFIG_NUMA_BALANCING ++ if (static_branch_likely(&sched_numa_balancing)) ++ pref_nid = p->numa_preferred_nid; ++#endif ++ ++ scoped_guard (cpus_read_lock) { ++ get_scan_cpumasks(cpus, cache_cpu, ++ pref_nid, curr_cpu); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, i; ++ ++ if (!sd) ++ continue; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ } ++ ++ /* ++ * Compare the accumulated occupancy of each LLC. 
The ++ * reason for using accumulated occupancy rather than average ++ * per CPU occupancy is that it works better in asymmetric LLC ++ * scenarios. ++ * For example, if there are 2 threads in a 4CPU LLC and 3 ++ * threads in an 8CPU LLC, it might be better to choose the one ++ * with 3 threads. However, this would not be the case if the ++ * occupancy is divided by the number of CPUs in an LLC (i.e., ++ * if average per CPU occupancy is used). ++ * Besides, NUMA balancing fault statistics behave similarly: ++ * the total number of faults per node is compared rather than ++ * the average number of faults per CPU. This strategy is also ++ * followed here. ++ */ ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ curr_m_a_occ = a_occ; ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ if (m_a_occ > (2 * curr_m_a_occ)) { ++ /* ++ * Avoid switching mm_sched_cpu too fast. ++ * The reason to choose 2X is because: ++ * 1. It is better to keep the preferred LLC stable, ++ * rather than changing it frequently and cause migrations ++ * 2. 2X means the new preferred LLC has at least 1 more ++ * busy CPU than the old one(200% vs 100%, eg) ++ * 3. 2X is chosen based on test results, as it delivers ++ * the optimal performance gain so far. ++ */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ + /* + * Used by other classes to account runtime. + */ +@@ -13031,6 +13317,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index 3c12d9f93331..d2af7bfd36bf 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -87,6 +87,7 @@ SCHED_FEAT(TTWU_QUEUE, true) + */ + SCHED_FEAT(SIS_UTIL, true) + ++SCHED_FEAT(SCHED_CACHE, true) + /* + * Issue a WARN when we do multiple update_rq_clock() calls + * in a single rq->lock section. 
Default disabled because the +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index be9745d104f7..2ded8d3d0ecc 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1166,6 +1166,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3790,6 +3796,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + static inline +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip new file mode 100644 index 0000000..cbf16ce --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-02-19-sched-fair-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip @@ -0,0 +1,227 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 19068204096 + for ; Sat, 11 Oct 2025 18:18:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=EzlLh3pSj7Y4f8RITAS280jAzGdfSil0Uvmf2s0iDBWXhjbTN9kKcwe8yCBI8vI/kpxwAU/q6SDZiBXRODyVXxt+x1ZEHGNytyNVJ+14VdLcKLUF/bWqEXXojGdMU1nZFeYor5k/Gwn2eBMXY7mjVq+req3REwzEV/z7PNxWJYU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=naTQ9gtxsiPYap1e7sRA67shhCjtvQU5+UWYPmFmFnsa1NV0CLod+8tcKlUn52BHYuXFMHk+KQi3AhpPSOC+Tysfot4R/EhnOjDucwfpslAmfKl+rwCfOrGMnq3fjOG/h3r7EnuLxz8dxpUfqriJzedrFrStvfO37iAPvvF5HVg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=LSwa/WAK; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="LSwa/WAK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=BGRV8Sqvoh/cH0/obDDFWGIX+d3J6kT5RHYq4DeXAFQ=; + b=LSwa/WAKvGAX6RIYpQ7iNqrlvhm/Szlkb5ZlWCgbajQDsBhTiTWg/PPi + Nxj6VEs7MSoZptgkIvxX8jl3FQca3deDnRuhlinmaGbJYu3LY3ZP4p3jp + 
4+hBugKd3GkfwcLlWr+3IrP84r9gwdtMmKlDccI1G07f4s4tirTBoEDsm + gJ8uA3qrKlx1xYMf/sgz5udiByo4NeRPGdBdJ+bYBTDvNTGeTE9k4bBmi + 0OuSxEI9YhInAS8s2mr8VnpZwUVjixmAO4g6ZwRHW42PucNrjAj/v7YoU + sfJ1aDaIb4/pD7oTExOcJxChABHQZAXGQ1b9F1jBoWdX4w8mb0HwbQJ+I + A==; +X-CSE-ConnectionGUID: V6kqtIYCR06jkGZvnWCLsQ== +X-CSE-MsgGUID: XSPXCIWWQjiVjOSNzEq1Ow== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339631" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339631" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:20 -0700 +X-CSE-ConnectionGUID: wcTW2V7hQHun3H1J8na2Fw== +X-CSE-MsgGUID: zfpr8MStR5yuJxzDpmsnpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487184" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:20 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 02/19] sched/fair: Record per-LLC utilization to guide cache-aware scheduling decisions +Date: Sat, 11 Oct 2025 11:24:39 -0700 +Message-Id: <7684e7381c61a2a0d0580790340d4daa5349e48c.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system becomes busy and a process’s preferred LLC is +saturated with too many threads, tasks within that LLC migrate +frequently. These in LLC migrations introduce latency and degrade +performance. To avoid this, task aggregation should be suppressed when +the preferred LLC is overloaded, which requires a metric to indicate +LLC utilization. + +Record per LLC utilization/cpu capacity during periodic load +balancing. These statistics will be used in later patches to decide +whether tasks should be aggregated into their preferred LLC. 
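As a rough sketch of how these recorded statistics are meant to be consumed by the later patches in this series (which introduce a ~50% "busy LLC" threshold), restated in user space; struct llc_stats and llc_is_busy() are illustrative names only:

#include <stdbool.h>
#include <stdio.h>

struct llc_stats {
	unsigned long util;	/* aggregated group_util of the LLC */
	unsigned long cap;	/* aggregated CPU capacity of the LLC */
};

/* Treat an LLC as busy once its utilization crosses half of its capacity. */
static bool llc_is_busy(const struct llc_stats *s)
{
	return s->util * 100 >= s->cap * 50;
}

int main(void)
{
	struct llc_stats llc = { .util = 700, .cap = 1024 };	/* ~68% utilized */

	printf("busy: %d\n", llc_is_busy(&llc));	/* prints 1 */
	return 0;
}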
+ +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/sched/topology.h | 4 ++ + kernel/sched/fair.c | 73 ++++++++++++++++++++++++++++++++++ + 2 files changed, 77 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 5263746b63e8..fa25db00fdb6 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -77,6 +77,10 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++ unsigned long capacity ____cacheline_aligned_in_smp; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2ea002f4fd6..1ebb0d99a906 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9559,6 +9559,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* Called from load balancing paths with rcu_read_lock held */ ++static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = READ_ONCE(sd_share->capacity); ++ ++ return true; ++} ++#else ++static inline bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ return false; ++} ++#endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +@@ -10529,6 +10552,55 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Record the statistics for this scheduler group for later ++ * use. These values guide load balancing on aggregating tasks ++ * to a LLC. ++ */ ++static void record_sg_llc_stats(struct lb_env *env, ++ struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ /* ++ * Find the child domain on env->dst_cpu. This domain ++ * is either the domain that spans this group(if the ++ * group is a local group), or the sibling domain of ++ * this group. ++ */ ++ struct sched_domain *sd = env->sd->child; ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* only care about sched domains spanning a LLC */ ++ if (sd != rcu_dereference(per_cpu(sd_llc, env->dst_cpu))) ++ return; ++ ++ /* ++ * At this point we know this group spans a LLC domain. ++ * Record the statistic of this group in its corresponding ++ * shared LLC domain. ++ */ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (READ_ONCE(sd_share->util_avg) != sgs->group_util) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++ ++ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) ++ WRITE_ONCE(sd_share->capacity, sgs->group_capacity); ++} ++#else ++static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10618,6 +10690,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ record_sg_llc_stats(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip new file mode 100644 index 0000000..eb1895b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-03-19-sched-fair-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip @@ -0,0 +1,335 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F9012652B7 + for ; Sat, 11 Oct 2025 18:18:22 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206704; cv=none; b=oUXwn7ZLltUxrcsLLRQdMkG+rOj3I6N99RIlDJViVMyN84ZxeHx7+Ziq9zOEmnN6HNfk258hdIef+3nAkETeBkCnWEbZ8Lcj64n3OoXf0SrXkICA1KPwc1TZ230lpQNfogVeErSJlu4VOhrgueBPexZRP8Ng8MlzAqpdxuV0fQw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206704; c=relaxed/simple; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=h155j6xc5cDWdV6bfIecXus0Znq8M6zidqbVhtVjeQT/UoiHcyIrY8v1abXoVw27R0/39P2bQUH4GyYEjMOV8PSTvlLp8J+kYh4mcI1SSe5ftkudSs2ubZG59uaM4B6xXwz85tEAhPwwNkRLqFlmW7J/wyi3Ynw+ec/ie7a3Ft4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n7smfE6o; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n7smfE6o" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206702; x=1791742702; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ogKsGwPqpnqTItrkOZHQAqJw6k94DPs+hyTioL4d/Ig=; + b=n7smfE6oCjv1Z9pv/7dg2JDtqoMwaTw0XnoJhqh6krIk55XD846r100l + CQyKNCviKGlIlQvhs/a27sgH4IgQduwhbRn6XT0KlUibkjI+C8DxLau1W + bQGlFOBkWVF6N/GWfn6y0ss98uylK337lt84xU7aPoM+QWTzjR+VkOrKT + 0bIzxevMwLmEG4vuOleJ69vSQP6G0PZSGpGrTBTnbFEemOJQO4Ufh8Z3S + CBvnKym+IUG+WQx9TQa+cFfFXkPxhSkobYj2dyGq+CWyc4oBsOiaaIfuN + mb6/NAGjVnTGTjlIsC3a7QsDovld1JkhMvVnrniOZGCbMVHv6vrIMp6no + g==; +X-CSE-ConnectionGUID: y8Q0FIVVTeyqh+iA7G7QGw== +X-CSE-MsgGUID: NHnFhDxxRvChXLKbODkIZw== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339652" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339652" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with 
ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:21 -0700 +X-CSE-ConnectionGUID: r3BrcjKDSJONY4pZr3YdUQ== +X-CSE-MsgGUID: 9FSjHRHPTQWyN3aom4KQIA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487189" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:21 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 03/19] sched/fair: Introduce helper functions to enforce LLC migration policy +Date: Sat, 11 Oct 2025 11:24:40 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware scheduling aggregates threads onto their preferred LLC, +mainly through load balancing. When the preferred LLC becomes +saturated, more threads are still placed there, increasing latency. +A mechanism is needed to limit aggregation so that the preferred LLC +does not become overloaded. + +Introduce helper functions can_migrate_llc() and +can_migrate_llc_task() to enforce the LLC migration policy: + + 1. Aggregate a task to its preferred LLC if both source and + destination LLCs are not too busy (<50% utilization, tunable), + or if doing so will not leave the preferred LLC much more + imbalanced than the non-preferred one (>20% utilization + difference, tunable, similar to imbalance_pct of the LLC domain). + 2. Allow moving a task from overloaded preferred LLC to a non preferred + LLC if this will not cause the non preferred LLC to become + too imbalanced to cause a later migration back. + 3. If both LLCs are too busy, let the generic load balance to spread + the tasks. + +This hysteresis prevents tasks from being migrated into and out of the +preferred LLC frequently (back and forth): the threshold for migrating +a task out of its preferred LLC is higher than that for migrating it +into the LLC. + +Since aggregation tends to make the preferred LLC busier than others, +the imbalance tolerance is controlled by llc_imb_pct. If set to 0, +tasks may still aggregate to the preferred LLC as long as it is +not more utilized than the source LLC, preserving the preference. 
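A short worked example of the default thresholds described above (50% overload, 20% imbalance), restated as stand-alone helpers with made-up numbers; the real fits_llc_capacity()/util_greater() macros and can_migrate_llc() live in the fair.c hunk below:

#include <stdbool.h>
#include <stdio.h>

/* Below the 50% mark the LLC is not considered busy. */
static bool fits_llc(unsigned long util, unsigned long cap)
{
	return util * 100 < cap * 50;
}

/* True when 'a' exceeds 'b' by more than the 20% imbalance allowance. */
static bool much_busier(unsigned long a, unsigned long b)
{
	return a * 100 > b * (100 + 20);
}

int main(void)
{
	unsigned long cap = 1024;	/* capacity of one LLC */

	/* Preferred dst at ~40%: still fits, so aggregation is allowed. */
	printf("aggregate: %d\n", fits_llc(410, cap));

	/* Preferred dst at ~60% vs src at ~30%: busy and >20% above src, forbid. */
	printf("forbid:    %d\n", !fits_llc(614, cap) && much_busier(614, 307));
	return 0;
}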
+ +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 4 ++ + kernel/sched/fair.c | 145 +++++++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 5 ++ + 3 files changed, 154 insertions(+) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 02e16b70a790..57bb04ebbf96 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -523,6 +523,10 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); ++ debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++#endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 1ebb0d99a906..cd080468ddc9 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ + #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + ++__read_mostly unsigned int llc_overload_pct = 50; ++__read_mostly unsigned int llc_imb_pct = 20; ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -9560,6 +9563,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + } + + #ifdef CONFIG_SCHED_CACHE ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter llc_overload_pct determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * llc_overload_pct) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + llc_imb_pct)) ++ + /* Called from load balancing paths with rcu_read_lock held */ + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +@@ -9575,6 +9599,127 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + + return true; + } ++ ++/* ++ * Decision matrix according to the LLC utilization. To ++ * decide whether we can do task aggregation across LLC. ++ * ++ * By default, 50% is the threshold to treat the LLC as busy, ++ * and 20% is the utilization imbalance percentage to decide ++ * if the preferred LLC is busier than the non-preferred LLC. ++ * ++ * 1. moving towards the preferred LLC, dst is the preferred ++ * LLC, src is not. ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% Y Y Y N ++ * 40% Y Y Y Y ++ * 50% Y Y G G ++ * 60% Y Y G G ++ * ++ * 2. moving out of the preferred LLC, src is the preferred ++ * LLC, dst is not: ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% N N N N ++ * 40% N N N N ++ * 50% N N G G ++ * 60% Y N G G ++ * ++ * src : src_util ++ * dst : dst_util ++ * Y : Yes, migrate ++ * N : No, do not migrate ++ * G : let the Generic load balance to even the load. ++ * ++ * The intention is that if both LLCs are quite busy, cache aware ++ * load balance should not be performed, and generic load balance ++ * should take effect. 
However, if one is busy and the other is not, ++ * the preferred LLC capacity(50%) and imbalance criteria(20%) should ++ * be considered to determine whether LLC aggregation should be ++ * performed to bias the load towards the preferred LLC. ++ */ ++ ++/* migration decision, 3 states are orthogonal. */ ++enum llc_mig { ++ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ ++ mig_llc, /* Y: Do LLC preference based migration */ ++ mig_unrestricted /* G: Don't restrict generic load balance migration */ ++}; ++ ++static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_unrestricted; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_unrestricted; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * llc_imb_pct is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. ++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_llc; ++} ++ ++/* ++ * Check if task p can migrate from src_cpu to dst_cpu ++ * in terms of cache aware load balance. 
++ */ ++static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ bool to_pref; ++ int cpu; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_unrestricted; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_unrestricted; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ to_pref = true; ++ else if (cpus_share_cache(src_cpu, cpu)) ++ to_pref = false; ++ else ++ return mig_unrestricted; ++ ++ return can_migrate_llc(src_cpu, dst_cpu, ++ task_util(p), to_pref); ++} ++ + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 2ded8d3d0ecc..a52c96064b36 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2797,6 +2797,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int llc_overload_pct; ++extern unsigned int llc_imb_pct; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip new file mode 100644 index 0000000..233f3fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-04-19-sched-fair-Introduce-a-static-key-to-enable-cache-aware-only-for-multi-LLCs.patch.skip @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id BD9FE27F75F + for ; Sat, 11 Oct 2025 18:18:23 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206705; cv=none; b=toy7mYgrkMShyfYM+pYJVnlk2kT96KNiv5DNY2SPeZNG+C4hUMbzxW+QMLoY5P4G0gxMEqPJZD1oRcx17kku+G6SaznXM9qHf6TbjE3y6E+5eW6mFGs9F7x17MH+po42oQIBeMuQONsrqKSl7XLcK2ag8qWKJC1Xr5w/c8efzqg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206705; c=relaxed/simple; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=BrQQpH91F+AYLu9pNsP5vrblllGBIiYSrf9Tqy9EYC4wS0n0udak+gKeFf8J19+3f0P2Q81tPIF74K0DC5ETs6YeanXYBydnXlUojA//lO1O300HBm7E4ONxjKjmsrUvcSI3JT5Le3EHo8kdx7whhv843/P3GIna7MP3njXDV14= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BHqKXCIn; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BHqKXCIn" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206704; x=1791742704; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=DdyW/r2KQaOAUhZji+A8n5cKTc9SCv7SgRP3P8o/I+A=; + b=BHqKXCInpJ9FMs87LCbtbTr8sCx+I94vOdw+YhnA01VGi2y2vrviHuha + 44dYUBEYMQCSqJ0LZTT2V+2kshxkaTOgIYxGLcnue8xZcdvJE+tFA1vNK + e3l/bHsCjqNkzuXBC7xQTcdlcOk0RWIbIkbhlcUaSh6K3yuxlVHUHJcmE + r0xmWO+olPuADPa5P30u0Ohf3HcjIqBXZsxBvV5VI21iprKzNU2fqZx7i + dnB6Mbk+VkrpWYKhn8UVMBHAO40Hwj1qg7dTaTpQfAWXx8+nbbBZeHxKl + 1QcSW4+uLMzTxhbUTINvxL6mxdB/i7FkzCBGLbgZ013YwkDLFD2+4CBnX + w==; +X-CSE-ConnectionGUID: XU0Bp+klQCiSCfmyOaBeOA== +X-CSE-MsgGUID: qUdy5aE4QB+ndas2O3JrjQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339674" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339674" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:22 -0700 +X-CSE-ConnectionGUID: veyEE6PBTGirh+PomEioDQ== +X-CSE-MsgGUID: eht/GZN/S/ekMdaQtDO0ag== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487193" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:22 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 04/19] sched/fair: Introduce a static key to enable cache aware only for multi LLCs +Date: Sat, 11 Oct 2025 11:24:41 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Enable cache-aware load balancing only if at least 1 NUMA node has +more than one LLC. 
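A minimal sketch of that enabling condition, assuming a per-node LLC count is already known; llcs_in_node() and the fixed node count are illustrative stand-ins for the sched-domain walk performed in the topology.c hunk below:

#include <stdbool.h>
#include <stdio.h>

/* Made-up topology: node 0 has 4 LLCs, every other node has 1. */
static int llcs_in_node(int node)
{
	return node == 0 ? 4 : 1;
}

/* Cache-aware balancing is worthwhile only if some node spans multiple LLCs. */
static bool sched_cache_allowed(int nr_nodes)
{
	for (int node = 0; node < nr_nodes; node++)
		if (llcs_in_node(node) > 1)
			return true;
	return false;
}

int main(void)
{
	printf("enabled: %d\n", sched_cache_allowed(2));	/* prints 1 */
	return 0;
}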
+ +Suggested-by: Libo Chen +Suggested-by: Adam Li +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 15 ++++++++++++--- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 14 ++++++++++++-- + 3 files changed, 25 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cd080468ddc9..3d643449c48c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1208,6 +1208,14 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; + ++DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); ++ ++static inline bool sched_cache_enabled(void) ++{ ++ return sched_feat(SCHED_CACHE) && ++ static_branch_likely(&sched_cache_allowed); ++} ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -1294,7 +1302,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_sched *pcpu_sched; + unsigned long epoch; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (p->sched_class != &fair_sched_class) +@@ -1330,7 +1338,7 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + struct callback_head *work = &p->cache_work; + struct mm_struct *mm = p->mm; + +- if (!sched_feat(SCHED_CACHE)) ++ if (!sched_cache_enabled()) + return; + + if (!mm || !mm->pcpu_sched) +@@ -10716,7 +10724,8 @@ static void record_sg_llc_stats(struct lb_env *env, + struct sched_domain *sd = env->sd->child; + struct sched_domain_shared *sd_share; + +- if (!sched_feat(SCHED_CACHE) || env->idle == CPU_NEWLY_IDLE) ++ if (!sched_cache_enabled() || ++ env->idle == CPU_NEWLY_IDLE) + return; + + /* only care about sched domains spanning a LLC */ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index a52c96064b36..60f1e51685ec 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2800,6 +2800,7 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern struct static_key_false sched_cache_allowed; + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 6e2f54169e66..2675db980f70 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -2444,6 +2444,7 @@ static int + build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) + { + enum s_alloc alloc_state = sa_none; ++ bool has_multi_llcs = false; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; +@@ -2530,10 +2531,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. 
+ */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ } else { + imb = nr_llcs; ++ has_multi_llcs = true; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2581,6 +2584,13 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++#ifdef CONFIG_SCHED_CACHE ++ if (has_multi_llcs) { ++ static_branch_enable_cpuslocked(&sched_cache_allowed); ++ pr_info("Cache aware load balance enabled.\n"); ++ } ++#endif ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip new file mode 100644 index 0000000..cd2305a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-05-19-sched-fair-Add-LLC-index-mapping-for-CPUs.patch.skip @@ -0,0 +1,291 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8E929283153 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=l9o+r3tPneRXt3UimsPhWTyfqr4rcCBrkqPagUsuj236psyVrtVREf1eV9bh9i5x6sqiX/93/2fGTQOd3tDyAfM2x8nQDBG2tniRFTa1AjKlI5Hs36x8WGu+npNUTYaShkti1wSxrqntJys6VhwZ+aL+o6PQ3k1GyXMU2JJL3bw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=MDjhwzZYr3m7pwdhzj9TlyV526H5WJLBGHEilCqY27+WQSI1yxnPWT6k5Mm6bFKl/0I+sfGQBi/7HzzHe1S3ts6bk23EZaJB+w94GLEZKAcc8cSHQMDIbKKzGRMgBrwPnT0sZBkKxiooppSIJhtXCA86kWL70YWS1bZ1PVuSOI8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=BzReY9Ll; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="BzReY9Ll" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=KmODaaWe2UFjj11ibL17qZDBWmMYCsJpeBqEebS+qwU=; + b=BzReY9LlEh9sk7OgZDcp2VjjY3mwnRzW5hp4d8rSX40TSJQm31n7pNsD + pGDX4pGNqIL2dKhB0TWBOakqdMqoEJBGhhFnbP0SML4ddRpmP22b3hhKk + 66OBjK6EOlIiBTx96elcU0fwjNnZqBKTvf/i3IuC2HlilzxwoimPLi7ym + OqUTRkCWmlqgJ5BjvtUEaD2eb97VkiEAs6iUC5FsMQPohIZRE0ZJGIQT2 + rLWb4YevoZUYtWiZQU/yYmcq5sU7eCp84d/YBPYTw8uDxW2au989TrB9t + olL4givIBdX+ieIJw7430Yz/Es1H+8Ji46MflznNqafshDKBuL8HbpSmx + A==; +X-CSE-ConnectionGUID: xTVpDyXiQYmCxiG8vc8uKg== +X-CSE-MsgGUID: 
ouYA76mXSo+MkfJ9ZAYryA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339693" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339693" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:23 -0700 +X-CSE-ConnectionGUID: Vda9/GgFQc2uyKt8dn0epA== +X-CSE-MsgGUID: 2SFdpXMCSGKC8Z5YqgWCow== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487198" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:23 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 05/19] sched/fair: Add LLC index mapping for CPUs +Date: Sat, 11 Oct 2025 11:24:42 -0700 +Message-Id: <7d75af576986cf447a171ce11f5e8a15a692e780.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce an index mapping between CPUs and their LLCs. This provides +a continuous per LLC index needed for cache-aware load balancing in +later patches. + +The existing per_cpu llc_id usually points to the first CPU of the +LLC domain, which is sparse and unsuitable as an array index. Using +llc_id directly would waste memory. + +With the new mapping, CPUs in the same LLC share a continuous index: + + per_cpu(llc_idx, CPU=0...15) = 0 + per_cpu(llc_idx, CPU=16...31) = 1 + per_cpu(llc_idx, CPU=32...47) = 2 + ... + +The maximum number of LLCs is limited by CONFIG_NR_LLCS. If the number +of LLCs available exceeds CONFIG_NR_LLCS, the cache aware load balance +is disabled. To further save memory, this array could be converted to +dynamic allocation in the future, or the LLC index could be made NUMA +node-wide. + +As mentioned by Adam, if there is no domain with SD_SHARE_LLC, the +function update_llc_idx() should not be invoked to update the index; +otherwise, it will generate an invalid index. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/threads.h | 10 +++++++++ + init/Kconfig | 9 ++++++++ + kernel/sched/fair.c | 11 ++++++++++ + kernel/sched/sched.h | 2 ++ + kernel/sched/topology.c | 47 +++++++++++++++++++++++++++++++++++++++++ + 5 files changed, 79 insertions(+) + +diff --git a/include/linux/threads.h b/include/linux/threads.h +index 1674a471b0b4..2c9b1adfe024 100644 +--- a/include/linux/threads.h ++++ b/include/linux/threads.h +@@ -20,6 +20,16 @@ + /* Places which use this should consider cpumask_var_t. 
*/ + #define NR_CPUS CONFIG_NR_CPUS + ++#ifndef CONFIG_NR_LLCS ++#define CONFIG_NR_LLCS 1 ++#endif ++ ++#if CONFIG_NR_LLCS > NR_CPUS ++#define NR_LLCS NR_CPUS ++#else ++#define NR_LLCS CONFIG_NR_LLCS ++#endif ++ + #define MIN_THREADS_LEFT_FOR_ROOT 4 + + /* +diff --git a/init/Kconfig b/init/Kconfig +index 4e625db7920a..6e4c96ccdda0 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -981,6 +981,15 @@ config SCHED_CACHE + resources within the same cache domain, reducing cache misses and + lowering data access latency. + ++config NR_LLCS ++ int "Maximum number of Last Level Caches" ++ range 2 1024 ++ depends on SMP && SCHED_CACHE ++ default 64 ++ help ++ This allows you to specify the maximum number of last level caches ++ this kernel will support for cache aware scheduling. ++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 3d643449c48c..61c129bde8b6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1224,6 +1224,17 @@ static int llc_id(int cpu) + return per_cpu(sd_llc_id, cpu); + } + ++/* ++ * continuous LLC index, starting from 0. ++ */ ++static inline int llc_idx(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_idx, cpu); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 60f1e51685ec..b448ad6dc51d 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2039,6 +2039,7 @@ static inline struct sched_domain *lowest_flag_domain(int cpu, int flag) + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DECLARE_PER_CPU(int, sd_llc_size); + DECLARE_PER_CPU(int, sd_llc_id); ++DECLARE_PER_CPU(int, sd_llc_idx); + DECLARE_PER_CPU(int, sd_share_id); + DECLARE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DECLARE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -2047,6 +2048,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 2675db980f70..4bd033060f1d 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -659,6 +659,7 @@ static void destroy_sched_domains(struct sched_domain *sd) + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_llc); + DEFINE_PER_CPU(int, sd_llc_size); + DEFINE_PER_CPU(int, sd_llc_id); ++DEFINE_PER_CPU(int, sd_llc_idx); + DEFINE_PER_CPU(int, sd_share_id); + DEFINE_PER_CPU(struct sched_domain_shared __rcu *, sd_llc_shared); + DEFINE_PER_CPU(struct sched_domain __rcu *, sd_numa); +@@ -668,6 +669,40 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++int max_llcs = -1; ++ ++static void update_llc_idx(int cpu) ++{ ++#ifdef CONFIG_SCHED_CACHE ++ int idx = -1, llc_id = -1; ++ ++ if (max_llcs > NR_LLCS) ++ return; ++ ++ llc_id = per_cpu(sd_llc_id, cpu); ++ idx = per_cpu(sd_llc_idx, llc_id); ++ ++ /* ++ * A new LLC is detected, increase the index ++ * by 1. 
++ */ ++ if (idx < 0) { ++ idx = max_llcs++; ++ ++ if (max_llcs > NR_LLCS) { ++ if (static_branch_unlikely(&sched_cache_allowed)) ++ static_branch_disable_cpuslocked(&sched_cache_allowed); ++ ++ pr_warn_once("CONFIG_NR_LLCS is too small, disable cache aware load balance\n"); ++ return; ++ } ++ ++ per_cpu(sd_llc_idx, llc_id) = idx; ++ } ++ per_cpu(sd_llc_idx, cpu) = idx; ++#endif ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -687,6 +722,10 @@ static void update_top_cache_domain(int cpu) + per_cpu(sd_llc_id, cpu) = id; + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + ++ /* only update the llc index for domain with SD_SHARE_LLC */ ++ if (sd) ++ update_llc_idx(cpu); ++ + sd = lowest_flag_domain(cpu, SD_CLUSTER); + if (sd) + id = cpumask_first(sched_domain_span(sd)); +@@ -2452,6 +2491,14 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++#ifdef CONFIG_SCHED_CACHE ++ if (max_llcs < 0) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_idx, i) = -1; ++ max_llcs = 0; ++ } ++#endif ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip new file mode 100644 index 0000000..33e7efa --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-06-19-sched-fair-Assign-preferred-LLC-ID-to-processes.patch.skip @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id A93492836B1 + for ; Sat, 11 Oct 2025 18:18:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206706; cv=none; b=S6xTZtgG4gDit+VImk9W2UzS4qpXEGkcWHMUVoYyOSnpNNw4aucqYAXSSje8zYLjl3z3dX3Jt3ztt7bwcuxWrRrv6qxUGactOiUWUNrvSPN2VWKScV6w3ksMM6saX0NH5ZC3WBABiX0+fpwQlzvqkQFNz80/YqP8x3hbG8jBKng= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206706; c=relaxed/simple; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=JsV8TTkODWXWFKIKrzZGo3NxMw8hU5p/OWk4qVG3F1HoqgFqWBsu2TcQGUVWw1R9rnOAFP+1s9fHghtr+g8SHhcTCX8Srq+6rXX7gAPQLfCi2R3P+f6W+h6FG6DDQXFxrgsSAi265RFjsNyqSNVDyYiSw0j1kUou9k2jg/TFWas= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=maHNOTTa; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="maHNOTTa" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206705; x=1791742705; + h=from:to:cc:subject:date:message-id:in-reply-to: + 
references:mime-version:content-transfer-encoding; + bh=9oov8ViGgsZaxAZzpTlsnaOcdJ/Jv8NLa7EsoSw2oPE=; + b=maHNOTTaUom4zOfjF9aQgzk/EHInefpcQXQBpZ407o2A6QAh7rtx4d1V + uIUh04rGM6MxEKMGQGzPbEcwmEUVnQVNQXhq0m60vo8GIlq3nI3UFHh2/ + okHOmrxdhoN3uwbNZN5d2mGAMO3ADHunEGtbLYRsJ5ffyJXYwvK9ZYj6n + ZqWJDYCygmb5LDln/D3icLbLhH8Zm6QWr4yAgVZQ73wl/I3EgDdp+pIYb + aLimiW5HUOhIlD+krR4Rg02sINFyPrZ2h5VJdZ1v01hMqilwa2zgPVcWi + tEJ0OmQs9iwf0mBA0kNnJx5l2NSvLy+2FE84H8lwtH6U/4ySfKAnmdVGc + Q==; +X-CSE-ConnectionGUID: LhZ9XN5ESr6ORNd5zvY9sA== +X-CSE-MsgGUID: UBKHEBpdQNSkGD6fqT87jQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339711" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339711" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:24 -0700 +X-CSE-ConnectionGUID: M/4LVw/6Qg626wVKqENzqw== +X-CSE-MsgGUID: hqk2hnIER+q1aJ8R3vcczQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487203" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:24 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 06/19] sched/fair: Assign preferred LLC ID to processes +Date: Sat, 11 Oct 2025 11:24:43 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache-aware scheduling enabled, each task is assigned a +preferred LLC ID. This allows quick identification of the LLC domain +where the task prefers to run, similar to numa_preferred_nid in +NUMA balancing. 
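[Editor's aside — illustrative sketch, not part of the quoted patch.] To make the preferred-LLC assignment concrete: the field starts at -1 and is refreshed from the process-wide hottest CPU recorded in mm_sched_cpu, as the account_mm_sched() hunk below does. The struct layout and llc_of() here are simplified assumptions for illustration:

/* Userspace model: derive a task's preferred LLC from its process's
 * hottest CPU, mirroring the account_mm_sched() change below. */
#include <stdio.h>

struct mm_sched { int mm_sched_cpu; };                 /* hottest CPU, -1 if none */
struct task     { struct mm_sched *mm; int preferred_llc; };

static int llc_of(int cpu)                             /* stand-in for per_cpu(sd_llc_id, cpu) */
{
        return cpu < 0 ? -1 : cpu / 8;                 /* assume 8 CPUs per LLC */
}

static void update_preferred_llc(struct task *p)
{
        int llc = llc_of(p->mm->mm_sched_cpu);

        if (p->preferred_llc != llc)
                p->preferred_llc = llc;
}

int main(void)
{
        struct mm_sched mm = { .mm_sched_cpu = 19 };
        struct task p = { .mm = &mm, .preferred_llc = -1 };

        update_preferred_llc(&p);
        printf("preferred LLC = %d\n", p.preferred_llc); /* 2 with 8 CPUs per LLC */
        return 0;
}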
+ +Signed-off-by: Tim Chen +--- + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 7 +++++++ + 3 files changed, 11 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index d7ddb7ce6c4b..8a5e4038cd5c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1402,6 +1402,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index e557f622bd90..5fffbe766f57 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -188,6 +188,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 61c129bde8b6..d6167a029c47 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1312,6 +1312,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_cache_enabled()) + return; +@@ -1342,6 +1343,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } ++ ++ if (mm->mm_sched_cpu != -1) ++ mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip new file mode 100644 index 0000000..f87fefd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-07-19-sched-fair-Track-LLC-preferred-tasks-per-runqueue.patch.skip @@ -0,0 +1,257 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B7966283FE1 + for ; Sat, 11 Oct 2025 18:18:25 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206707; cv=none; b=Jt9YvY3nM/0EYBih4PVmiKQ2QzO4ZDLh2TKnGqMyWerCIfIM0CWceRhOpjM2iQwiUHzLszpycQZ+UQorhwMqEi3t7Erkuc8eVsgIO7guz2r8zCqiEsDc75hJulbNVOIh4Hf5WtkLCN2FDwtJ+pKaDQzjrmQsv/RTGx24LhvBhds= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206707; c=relaxed/simple; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=PrDaFPl16+dUYVfNSWRpTD87yz4MK7/HdghB7ILX5xXggJN8vYLmcy4RQj7oE9weOCdcBzd1EZg476MST0VNTm2z3r/YGhIw0/+VWbtq1PKhfCTIEnPZWnJryrgw70ZRp0r4XDiQwz/h8bzHoZp9hMCEYHtSbHfUHW8eNSYr5z8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=GvsjlkoW; arc=none smtp.client-ip=198.175.65.17 
+Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="GvsjlkoW" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206706; x=1791742706; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=42DlMZ/oW4HLhFoIJCetdcfblbinqNDtbjQrvZGBme0=; + b=GvsjlkoWqX+zgP+tTee0MXcNRVBTPQkZKjOLBXZh33p44VICJNCiih6g + bdtLdnWwRkrJ2u2n2AVNyKIqQq+ELwCHQ1bUAIVe5B+Rq8F/WdKivkeVK + qCMdNHmRRRa8ijhdo6AEjjUZeHNS6/1dPU14KFq5zOdeXfuxJL5tGjlxb + ZtqhKFOWrFhhFPJwUw1KWb7C0rBkSGVoUeZH3ORagBu6Ud545g9bPF/M+ + p6sJSBNbnSNsdtDoZzzIKVmezgct+rLH0giyW0IcdjAUJlzYg6VsmVomk + Zm8UHf1s2hBr8fNdeC7UuXGFmty4d2atXckCM+YB8PsOqI0JwqlHCMSZ2 + A==; +X-CSE-ConnectionGUID: uKPzZGMbTiObyQydogOwGQ== +X-CSE-MsgGUID: QbxPW0yzQ4WA7VOf/APdAg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339729" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339729" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:25 -0700 +X-CSE-ConnectionGUID: GxY9AWlwTACW1S97eEsWGg== +X-CSE-MsgGUID: +oNXqS3kSkOTENG/ySm5FA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487208" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:25 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 07/19] sched/fair: Track LLC-preferred tasks per runqueue +Date: Sat, 11 Oct 2025 11:24:44 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +For each runqueue, track the number of tasks with an LLC preference +and how many of them are running on their preferred LLC. This mirrors +nr_numa_running and nr_preferred_running for NUMA balancing, and will +be used by cache-aware load balancing in later patches. 
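[Editor's aside — illustrative only, not part of the quoted patch.] A minimal model of the two per-runqueue counters this patch introduces, with simplified task/rq structures assumed for the sketch: nr_llc_running counts queued tasks that have any LLC preference, nr_pref_llc_running those already sitting in their preferred LLC.

/* Userspace model of the enqueue/dequeue accounting added below. */
#include <stdio.h>

struct rq   { int llc; unsigned int nr_llc_running, nr_pref_llc_running; };
struct task { int preferred_llc; };

static void account_llc_enqueue(struct rq *rq, struct task *p)
{
        rq->nr_llc_running      += (p->preferred_llc != -1);
        rq->nr_pref_llc_running += (p->preferred_llc == rq->llc);
}

static void account_llc_dequeue(struct rq *rq, struct task *p)
{
        rq->nr_llc_running      -= (p->preferred_llc != -1);
        rq->nr_pref_llc_running -= (p->preferred_llc == rq->llc);
}

int main(void)
{
        struct rq rq = { .llc = 1 };
        struct task a = { .preferred_llc = 1 }, b = { .preferred_llc = 3 };

        account_llc_enqueue(&rq, &a);
        account_llc_enqueue(&rq, &b);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running); /* 2 1 */
        account_llc_dequeue(&rq, &b);
        printf("llc=%u pref=%u\n", rq.nr_llc_running, rq.nr_pref_llc_running); /* 1 1 */
        return 0;
}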
+ +Signed-off-by: Tim Chen +--- + kernel/sched/core.c | 12 +++++++++++ + kernel/sched/fair.c | 47 +++++++++++++++++++++++++++++++++++++++++++- + kernel/sched/sched.h | 7 +++++++ + 3 files changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 79d15e904d12..5940756e2da3 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -529,6 +529,18 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++#ifdef CONFIG_SMP ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++#else ++int task_llc(const struct task_struct *p) ++{ ++ return 0; ++} ++#endif ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d6167a029c47..fd315937c0cf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,6 +1235,24 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running += (p->preferred_llc != -1); ++ rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running -= (p->preferred_llc != -1); ++ rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1306,6 +1324,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1347,8 +1367,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm_sched_llc = per_cpu(sd_llc_id, mm->mm_sched_cpu); + +- if (p->preferred_llc != mm_sched_llc) ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1497,6 +1522,15 @@ void init_sched_mm(struct task_struct *p) + work->next = work; + } + ++void reset_llc_stats(struct rq *rq) ++{ ++ if (!sched_cache_enabled()) ++ return; ++ ++ rq->nr_llc_running = 0; ++ rq->nr_pref_llc_running = 0; ++} ++ + #else + + static inline void account_mm_sched(struct rq *rq, struct task_struct *p, +@@ -1506,6 +1540,11 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} ++ ++void reset_llc_stats(struct rq *rq) {} + #endif + + /* +@@ -3999,6 +4038,7 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + struct rq *rq = rq_of(cfs_rq); + + account_numa_enqueue(rq, task_of(se)); ++ account_llc_enqueue(rq, task_of(se)); + list_add(&se->group_node, &rq->cfs_tasks); + } + cfs_rq->nr_queued++; +@@ -4010,9 +4050,14 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct 
sched_entity *se) + update_load_sub(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { + account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ account_llc_dequeue(rq_of(cfs_rq), task_of(se)); + list_del_init(&se->group_node); + } + cfs_rq->nr_queued--; ++ ++ /* safeguard to clear the cache aware data */ ++ if (!parent_entity(se) && !cfs_rq->nr_queued) ++ reset_llc_stats(rq_of(cfs_rq)); + } + + /* +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b448ad6dc51d..3ab64067acc6 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1098,6 +1098,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +@@ -1952,6 +1956,9 @@ init_numa_balancing(unsigned long clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++void reset_llc_stats(struct rq *rq); ++int task_llc(const struct task_struct *p); ++ + static inline void + queue_balance_callback(struct rq *rq, + struct balance_callback *head, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip new file mode 100644 index 0000000..18dc0f7 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-08-19-sched-fair-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip @@ -0,0 +1,194 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8D1D3284688 + for ; Sat, 11 Oct 2025 18:18:26 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206708; cv=none; b=W6A0Asy9e3NNDRL2ti9BvFY1go+vAlduaKJd1rmOWRr4k4IHRIEpHNJhix4g/v1mdJgDI06CWQ3sQC5YxuLOry9f66mT2W5iUkNoO1AMOa7iJYVMhxygC7dgS1riRk+Xr61GHZrfTq3glOqKoHqMJR1ChGEEIDFSijs9KJo91LU= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206708; c=relaxed/simple; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=uzr/dGrFdG1v5FdOJ/f9StnRIpzjJ5uOjWV+sYvWDeYE/dxtVTZG5FXWR8UqlK4jv7ZYYOlRDJRmdwLszrh1cbzNE43kw7ueGEnBAbSwzUyXo12aLw3ckNHZHHjqr9uTbTYz7GDrN3J5K862edN4cdJHoI9buyHUDzdCkXfIheE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MiTdX6Q6; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MiTdX6Q6" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206707; x=1791742707; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=qUQDFYZ38LNpr9WfzaoIX7ySGKszvby265gkxQF4WK0=; + b=MiTdX6Q6R/zAjqSeS2bqz6JnSO+lVjbu/CGoRS4W48TnANXSK7FbeFq8 + HIHNTysTrwhHCzP1gtYr6N2x0eFio/feVeyFBD5UytM6ahWF0SC67agMj + jWOkCg+WyPpJSmb2V4GE3mePGb9vm7kjvgiTp1tcN15ClNGhVOTqusLqF + ueDZKLr7dTfEr95oP3PXRNzKFZfqVSGN5aLDywe826XmjT29nykVCoMh+ + U9I8MAfHqzZxWLRDx+EC8+DhJZRsWw9B7dXqvyz67FsBnLG+HHYrAB479 + +0mKNo9XBbRlGAtlUlqUTEvej+mP00q1dndiGmLH/nY7e+wci1WK/1VQo + g==; +X-CSE-ConnectionGUID: e2RK1jGJT9eTlAZZ8FMWJQ== +X-CSE-MsgGUID: se6P+xZrTfOL+/m4zXf2xg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339748" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339748" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:26 -0700 +X-CSE-ConnectionGUID: Lb/G/3cTR6W6ajd8OWjDtQ== +X-CSE-MsgGUID: f0zaj3jsRd+gLA/rNNvR9A== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487214" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:26 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 08/19] sched/fair: Introduce per runqueue task LLC preference counter +Date: Sat, 11 Oct 2025 11:24:45 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned a static array where each element tracks +the number of tasks preferring a given LLC, indexed from 0 to +NR_LLCS. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3 (indexed from 0 to NR_LLCS + +The load balancer can use this information to identify busy runqueues +and migrate tasks to their preferred LLC domains. 
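[Editor's aside — illustrative only, not part of the quoted patch.] A compact model of the per-runqueue nr_pref_llc[] array described above, using a hypothetical NR_LLCS of 8: each element counts queued tasks whose preferred LLC maps to that index, and dequeue guards against the counter going negative, mirroring the hunk below.

/* Userspace model of the per-LLC preference counters added below. */
#include <stdio.h>

#define NR_LLCS 8

struct rq { unsigned int nr_pref_llc[NR_LLCS]; };

static void account_enqueue(struct rq *rq, int pref_llc_idx)
{
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS)
                rq->nr_pref_llc[pref_llc_idx]++;
}

static void account_dequeue(struct rq *rq, int pref_llc_idx)
{
        if (pref_llc_idx >= 0 && pref_llc_idx < NR_LLCS &&
            rq->nr_pref_llc[pref_llc_idx] > 0)           /* avoid negative counter */
                rq->nr_pref_llc[pref_llc_idx]--;
}

int main(void)
{
        struct rq rq = { 0 };

        account_enqueue(&rq, 3);
        account_enqueue(&rq, 3);
        account_enqueue(&rq, 1);
        printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 2 */
        account_dequeue(&rq, 3);
        printf("tasks preferring LLC3: %u\n", rq.nr_pref_llc[3]); /* 1 */
        return 0;
}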
+ +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 35 +++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 1 + + 2 files changed, 36 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index fd315937c0cf..b7a68fe7601b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1235,22 +1235,51 @@ static inline int llc_idx(int cpu) + return per_cpu(sd_llc_idx, cpu); + } + ++static inline int pref_llc_idx(struct task_struct *p) ++{ ++ return llc_idx(p->preferred_llc); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running += (p->preferred_llc != -1); + rq->nr_pref_llc_running += (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ ++rq->nr_pref_llc[pref_llc]; + } + + static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + { ++ int pref_llc; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running -= (p->preferred_llc != -1); + rq->nr_pref_llc_running -= (p->preferred_llc == task_llc(p)); ++ ++ if (p->preferred_llc < 0) ++ return; ++ ++ pref_llc = pref_llc_idx(p); ++ if (pref_llc < 0) ++ return; ++ ++ /* avoid negative counter */ ++ if (rq->nr_pref_llc[pref_llc] > 0) ++ --rq->nr_pref_llc[pref_llc]; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +@@ -1524,10 +1553,16 @@ void init_sched_mm(struct task_struct *p) + + void reset_llc_stats(struct rq *rq) + { ++ int i = 0; ++ + if (!sched_cache_enabled()) + return; + + rq->nr_llc_running = 0; ++ ++ for (i = 0; i < max_llcs; ++i) ++ rq->nr_pref_llc[i] = 0; ++ + rq->nr_pref_llc_running = 0; + } + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 3ab64067acc6..b801d32d5fba 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1101,6 +1101,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int nr_pref_llc[NR_LLCS]; + #endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip new file mode 100644 index 0000000..caf0c08 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-09-19-sched-fair-Count-tasks-prefering-each-LLC-in-a-sched-group.patch.skip @@ -0,0 +1,143 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 896A92848A1 + for ; Sat, 11 Oct 2025 18:18:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206709; cv=none; b=OEtiMJ0EXsYmk/b2RpkCvrola+Tb5ZlnJVLLgRLqGiICx7t2qJcij9yw0SgiiThPPPTMrbIdFBAm4w8howvUGPAJFc0ItOZDXO+gwbi0GCrU/MRny5Tre78B7YMgEyxZMXkI05Eu0+fODpObrBBk2c09F8OXQKZ4o5hgptBzDK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206709; c=relaxed/simple; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + 
MIME-Version; b=J6bK9CIrnn+dpoeG8RJW1aH3SE1Yc7QYj7Dgh7cqTjdsd3fsWZdu3E2SAwDjyqT5ptCJzWnqjXDoxnW3sFv/aeRC7QnnQkB9bTzAgmfskcoHsp0hZI6c042fUlYpwgsk0j6PmWc4xM8hZNNktu5sqG8t6W1tVMFc+pGngTuF0j8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=n3R+hIU0; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="n3R+hIU0" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206708; x=1791742708; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=PKtlvEc8jJzYEmRgIquRSV3KaK94Gb12wRaccvthO/I=; + b=n3R+hIU0WDMCAOT74Si47T0DHUQFpP/mOPOr4EFjzfrMTg20mocMFVue + SPJYeD3u+HI/S8DzRBSopnypgjipAk03R2jKWcm5OSqY338iFWIhO44pH + Rkbh2OZ1rpYHNaif/qBdzoG/S0GRuxE4+p6SgnYPob1i1tRz5kFPtKtWI + Em/YtXT8s7M8i1lwEkDGhNlIAeWj5yl5FVsHoShyMoDnOs/ZKpz9fa1vH + yY+/JK9y5B5Rh8CVo9sz+iLl5gL/zxPW+ETtFRKayHPWInq1R4rGuUz8D + OVUSiTUoZeUSI+4YJPz+v9iatJmNEpwFlvZeVYR4+WsdGyv8IT5qlNl3i + g==; +X-CSE-ConnectionGUID: VcC/511LSz6QngP8mD/4Fw== +X-CSE-MsgGUID: cm5ykdK+Tza9czQo0iIcIQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339767" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339767" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:27 -0700 +X-CSE-ConnectionGUID: +fnFCaxeROy1X1/2M3UOCQ== +X-CSE-MsgGUID: cAIBkdx0SvqbyNLUptq1pw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487219" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 09/19] sched/fair: Count tasks prefering each LLC in a sched group +Date: Sat, 11 Oct 2025 11:24:46 -0700 +Message-Id: <00e5f2cb6eadc3738e33858d3c4563a0775ee1c0.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, tabulate the number of tasks on each runqueue +that prefer a given destination LLC in a sched group. + +For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) +balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has +2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is +selected as the busiest source to pick tasks from. 
+ +Within a source LLC, the total number of tasks preferring a destination +LLC is computed by summing counts across all CPUs in that runqueue. For +instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring +LLC3, the total for LLC0 is 3. + +These statistics allow the load balancer to choose tasks from source +sched groups that best match their preferred LLCs. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 11 +++++++++++ + 1 file changed, 11 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b7a68fe7601b..cbd1e97bca4b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10399,6 +10399,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc[NR_LLCS]; ++#endif + }; + + /* +@@ -10891,6 +10894,14 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled()) { ++ int j; ++ ++ for (j = 0; j < max_llcs; ++j) ++ sgs->nr_pref_llc[j] += rq->nr_pref_llc[j]; ++ } ++#endif + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip new file mode 100644 index 0000000..4bcffad --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-10-19-sched-fair-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip @@ -0,0 +1,187 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F13BA28505C + for ; Sat, 11 Oct 2025 18:18:28 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=EhBerRhJhQXPW7xGyw0P5bxJnRZLdUKLIQ12NKKqVw4ZWFGkcALuZ8VykNWnycAafmMkb5kBWaZT15xr3ZuPia1hqPYipqCAVEd34Wn9NgZ7h0Lqr4/FQP1HOI9Yp9naliJ5jjs5uaj5L1/4fJBsGwV0wle3JatN24KLVnEBxK8= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=XF3a1nw/8EN0FU+PNi1yIJ/227PxHRBRy24uDZNEkqQuRuIG35Ap7GIvbGG+L1n9ZlEPV0A8eM5UvEqTGNXZktaeA+OJjX4avu9hw9uu6rqowoIWWNlLa6/0iuozmn5jhIZJJqDbWB7j1stg+x51fnwnSbNrDkb2H27S3usCnzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Loa6o7d1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Loa6o7d1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; 
x=1791742709; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=QoFubbb8wiPjhz5y3pWF+17tV+P2bIxeqL8wpFi1nfk=; + b=Loa6o7d1Mzs3ouslW83UWTdxmyggGuWTcpizCbNq+GcghqOrvTfXSRIV + 0EP9sedHVH3VdKCqAQHV/ZX3VHfUXCRKy9+NcdVchFLL8bKi/9buFRwhw + ZWmkcnGopsf975TA51MaL7sh2sNrOAvPuHmiA1plKNFBBesobcOlf5xbr + aZ9W/S+Mv3Ykf28JPDwOIYzvtKZi5pCgwvqz5wqJHrujBfUq//kuxX1xD + 44PevqjxkAnPNbnm/C3CdQgNXiNta5xW/ZKmACOzIkYXaOsL8kl9jvdQl + 4VJ6pV7RaGBpMqmBXGMhRqdKmN0HSByZ1kvmH46v45jRNYG2/U+7kgbrO + A==; +X-CSE-ConnectionGUID: 7OsmkTE2T2eIFyDjRKp/ig== +X-CSE-MsgGUID: oqLf97jbSIOB+8Rk4LLqqA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339788" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339788" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:28 -0700 +X-CSE-ConnectionGUID: jHLQbWxOTR2E4C2/k5j7Wg== +X-CSE-MsgGUID: sQhO8wOTQIuj4/5Og2eBgw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487222" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:27 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 10/19] sched/fair: Prioritize tasks preferring destination LLC during balancing +Date: Sat, 11 Oct 2025 11:24:47 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, first check for tasks that prefer the +destination LLC and balance them to it before others. + +Mark source sched groups containing tasks preferring non local LLCs +with the group_llc_balance flag. This ensures the load balancer later +pulls or pushes these tasks toward their preferred LLCs. 
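[Editor's aside — illustrative only, not part of the quoted patch series.] Combined with the per-group nr_pref_llc[] statistics from the preceding patches, the selection this enables can be modelled as: among candidate source groups, prefer the one holding the most tasks that want the destination LLC. The sketch below is a simplified stand-in under that assumption, not the kernel's actual sched-group statistics code:

/* Userspace model: pick the source group with the most tasks preferring
 * the destination LLC, the group that would be flagged for llc balance. */
#include <stdio.h>

#define NR_LLCS 4

struct sg_stats { unsigned int nr_pref_llc[NR_LLCS]; };

static int pick_busiest_for_llc(const struct sg_stats *sg, int nr_groups, int dst_llc)
{
        int best = -1;
        unsigned int best_cnt = 0;

        for (int i = 0; i < nr_groups; i++) {
                if (sg[i].nr_pref_llc[dst_llc] > best_cnt) {
                        best_cnt = sg[i].nr_pref_llc[dst_llc];
                        best = i;       /* group with most tasks wanting dst_llc */
                }
        }
        return best;
}

int main(void)
{
        /* groups LLC0..LLC2 as sources, balancing toward LLC3 */
        struct sg_stats groups[3] = {
                { .nr_pref_llc = { 0, 0, 0, 3 } },   /* LLC0: 3 tasks prefer LLC3 */
                { .nr_pref_llc = { 0, 0, 0, 2 } },   /* LLC1: 2 tasks prefer LLC3 */
                { .nr_pref_llc = { 0, 0, 0, 1 } },   /* LLC2: 1 task prefers LLC3 */
        };

        printf("busiest source group: LLC%d\n", pick_busiest_for_llc(groups, 3, 3));
        return 0;
}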
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 43 +++++++++++++++++++++++++++++++++++++++++-- + 1 file changed, 41 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cbd1e97bca4b..af7b578eaa06 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9822,8 +9822,7 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + else + return mig_unrestricted; + +- return can_migrate_llc(src_cpu, dst_cpu, +- task_util(p), to_pref); ++ return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + + #else +@@ -10394,6 +10393,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10849,11 +10849,45 @@ static void record_sg_llc_stats(struct lb_env *env, + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); + } ++ ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. ++ */ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain *child = env->sd->child; ++ int llc; ++ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ /* only care about task migration among LLCs */ ++ if (child && !(child->flags & SD_SHARE_LLC)) ++ return false; ++ ++ llc = llc_idx(env->dst_cpu); ++ if (sgs->nr_pref_llc[llc] > 0 && ++ can_migrate_llc(env->src_cpu, env->dst_cpu, 0, true) == mig_llc) ++ return true; ++ ++ return false; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + } ++ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} + #endif + + /** +@@ -10954,6 +10988,11 @@ static inline void update_sg_lb_stats(struct lb_env *env, + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + + record_sg_llc_stats(env, sgs, group); ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (!local_group && llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; ++ + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip new file mode 100644 index 0000000..ee39ef0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-11-19-sched-fair-Identify-busiest-sched_group-for-LLC-aware-load-balancing.patch.skip @@ -0,0 +1,184 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by 
smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E3802857E0 + for ; Sat, 11 Oct 2025 18:18:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206711; cv=none; b=t2IkYrrS4OEW0rLnZ4Ph2aLp/ob7UBcUobZQPFlHPmpcJEG5m0pUt/86mOssLKuYpjefjiUDrjFelfxhjAxq8hkNJqtOEMJPbTz+zzT3SsVZRdrqKE8v+5YoRbLqXRQPim2ll3DhWUtUyVjcOo+wuodh/CEa974mbGOLa7mTgCc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206711; c=relaxed/simple; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=CFlB5zhIcHUsbSOo/sD1pZdSFz7frR0zFFzgb5/20MqZiItU17WC0G8ifB7ANEAoWHl+sZ1UBTS2HXkckShm7SoSJJXvPBbw6XxQCBJK6yrElYIzS1CzXKAx7vBmkFFghPyfHOK4JpsmMAKYxqatpcWaHZwO7N1+tqHPYDwlFpo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Y9YkqrBb; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Y9YkqrBb" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206709; x=1791742709; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=XiIsNrTg0GfmfpcWJwni6hIdWkEEq9nbQ2y28gcjQcw=; + b=Y9YkqrBbsakXirsuA3GK7ppNmtxnJk2cm0iimpzRLvMdIlTwXGPf3Jxq + CO6EwYbc/Esxx5TDgaH0h7SVW6eQY5e38xqt9oEwqeMZQtQ13URaPfC2Q + Mwk/v0qwxo5jXbC8xa2O9JpbH1ZyVCsabZmLtbPS2e8WfQbQS4lgRoeof + RbwLkRXbWC69JnwGxh3aUM7ZF9q8ziMLuIK7nYhL3utheouiHtWkbs+nW + RBMmwNo592e9Wh6g7Ht+Vdc051U+njdgUo7aZRqY6DlKoIGZaJJSG2c0W + jAF73DWLcSoTQT2Ii9M9dPOTvOCcojIDgIVpILvlasXm0wG4u+s+OJFGn + Q==; +X-CSE-ConnectionGUID: bcFBDLOoTw6TYukUkbI3wQ== +X-CSE-MsgGUID: 0WEdTBqUR0WG7HuYHYySDg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339807" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339807" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:29 -0700 +X-CSE-ConnectionGUID: teKUgYrNS8ayzrTmALf01w== +X-CSE-MsgGUID: OBuR3uU9Q8qKO64uzC8h4Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487230" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:28 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 11/19] sched/fair: Identify busiest sched_group for LLC-aware load balancing +Date: Sat, 11 Oct 2025 11:24:48 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +The load balancer selects the busiest sched_group and migrates tasks +to less busy groups to distribute load across CPUs. + +With cache-aware scheduling enabled, the busiest sched_group is +the one with most tasks preferring the destination LLC. If +the group has the llc_balance flag set, cache aware load balancing is +triggered. + +Introduce the helper function update_llc_busiest() to identify the +sched_group with the most tasks preferring the destination LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 39 ++++++++++++++++++++++++++++++++++++++- + 1 file changed, 38 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index af7b578eaa06..8469ec528cb1 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10877,6 +10877,23 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ int idx; ++ ++ /* Only the candidate with llc_balance needs to be taken care of */ ++ if (!sgs->group_llc_balance) ++ return false; ++ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. ++ */ ++ idx = llc_idx(env->dst_cpu); ++ return sgs->nr_pref_llc[idx] > busiest->nr_pref_llc[idx]; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) +@@ -10888,6 +10905,13 @@ static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, + { + return false; + } ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + /** +@@ -11035,6 +11059,17 @@ static bool update_sd_pick_busiest(struct lb_env *env, + sds->local_stat.group_type != group_has_spare)) + return false; + ++ /* deal with prefer LLC load balance, if failed, fall into normal load balance */ ++ if (update_llc_busiest(env, busiest, sgs)) ++ return true; ++ ++ /* ++ * If the busiest group has tasks with LLC preference, ++ * skip normal load balance. ++ */ ++ if (busiest->group_llc_balance) ++ return false; ++ + if (sgs->group_type > busiest->group_type) + return true; + +@@ -11942,9 +11977,11 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + /* + * Try to move all excess tasks to a sibling domain of the busiest + * group's child domain. ++ * Also do so if we can move some tasks that prefer the local LLC. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip new file mode 100644 index 0000000..e9edb7a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-12-19-sched-fair-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip @@ -0,0 +1,185 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 99E5F28642E + for ; Sat, 11 Oct 2025 18:18:30 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206712; cv=none; b=CcfwsAyp1OHHqY4mNPYPcN6bUrl09ci4+a/v8FtP9azgYQzfS6lmRwWajeweUonIlhrYSa3k3Uk+3iau8s00TJMHIq9pc69gZThbuJO24GmjHBtcGot6LsPzytIaUPaB8oNg5fj064BJxFXz948iENpfk/rfsglOKxpcJkX9wG0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206712; c=relaxed/simple; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ScMEWl2DOAQMR5u9bpXgwKEadirbrSNG1X0vBv1Qm5M7qzeQRW6zyzR/0wZ49Stn9ftQ28uc0NLCvRH6mwbydhKFD3kpg3JgxWk9NBUU+Qnt+t7g3WQ/pDx7wFSEDUiofgdlic68Cqje1J43vJo7n57s1boIMbDvvtchvPGoTXM= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=WEVJOxO1; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="WEVJOxO1" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=y1sB/ng56N+mvnxojmgS/eclR6zFHdcgY5tqVpcDUNQ=; + b=WEVJOxO1Uy4x+GEHukYgK7cjQhJ+ZPzArevJFx6r0uwjLvVHXCsCVf0d + U5oZ9qGbRNsQ961+swsJygnl0Xp69gaKKJFDcVvaKlw28OYtLWeCcKxy5 + 4DN0Azrktm8AXYGwp3idVSw3VynSmNbW2dqVmCfWn3Np2iYv1w7hTpRfb + SetW2PMNCXc4Fk5w1ve3GEJ9Bax25e3mUvpabN2XIbAEnlZu4rHyR3ovD + 1WzBrpK45tvGmB0FKRXCfsKbMFF1KdXCgjW4lAJ2KU2k2bhxv6SPWDjA8 + 0qVm8erW2mgP7HqJHVa71uZn8ehzzZAPeMVO4wyBDdQns/j8tkr67uAC6 + w==; +X-CSE-ConnectionGUID: osVAgR9XSEi43ydURnxquA== +X-CSE-MsgGUID: sgSrXMaOTSCJRnEynSu6Vg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339827" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339827" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:30 -0700 
+X-CSE-ConnectionGUID: U/XiMYdrQLyr4smIn6sKwQ== +X-CSE-MsgGUID: iE4re5OqR+eOwHWBOdmfKA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487233" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:29 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 12/19] sched/fair: Add migrate_llc_task migration type for cache-aware balancing +Date: Sat, 11 Oct 2025 11:24:49 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type, migrate_llc_task, to support +cache-aware load balancing. + +After identifying the busiest sched_group (having the most tasks +preferring the destination LLC), mark migrations with this type. +During load balancing, each runqueue in the busiest sched_group is +examined, and the runqueue with the highest number of tasks preferring +the destination CPU is selected as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 8469ec528cb1..bec6354d7841 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9504,7 +9504,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10082,6 +10083,10 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11733,6 +11738,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12041,6 +12055,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12149,6 +12167,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_idx(env->dst_cpu); ++ if (!cpus_share_cache(env->dst_cpu, rq->cpu) && ++ busiest_pref_llc < rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + 
if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12331,6 +12359,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip new file mode 100644 index 0000000..50e470a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-13-19-sched-fair-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip @@ -0,0 +1,208 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CE0E286D56 + for ; Sat, 11 Oct 2025 18:18:31 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206713; cv=none; b=GHTSZiD43H1BP9udGQWGRTSdycj0dFbwOFNYssvdtvgDyjDEnOhEZuZ3tF7d4Oxq4KjVh/REHJdk8e5qmA0nk91pFvjTrD7ew0sadW9X2+TjejBiKi+Z4u/nZlJeGc29rI3I01ytNZfNGLLusPB2P/4mVx6bLIuv9bhIea7/KOQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206713; c=relaxed/simple; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Z43NaTPGAIlscL0L7fDhRwzngl1+8YayCbuXKnJJO/leht3IttqnVKWti2tJx4O3Ad4+Bxa7ijhsxQg7lysYNstcyC73l5FTr0P11m80kqmUiNRrC4pt99E80BCBIbFo2SatFJnTKT4Q1ux117UKVwuy6P9Rh922Z1naN6x4Wgc= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=JdkwbeJq; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="JdkwbeJq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206711; x=1791742711; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=4nb/OF/m6vG5cWGZuJFatpxTHqyXyCOJlbLckp109KU=; + b=JdkwbeJqpvNLxxR/C5J1ZH6Sc5bkBzINB0NUowykgcoSMh+IrKTz9SEs + 3TI4U2WqUZ4fGfcXVpbX1N2vbaAfyQUv4dhr3bMb1WSUcBz4dSrMfVdBf + Gdlpc/LwIyV72Eyt8t+mfF176Y/vv2GuGHN9WuXsK8/fBvzDMB20NsZLB + QBg0I+M7oRSQsaiygrqnGBFHiCS3p2JbXoqghWgigPrv6u1iqo8HXxcYs + HtDa1JUkhRKqPvvWxmzbfQzJYS+Coi/HVD3eewtzP+ILLi56XMzOKLHfR + iZqHJ/1cq2a50rc7YQNpk4EmPQ7vkE0qnNCf9o39KpjsRQh5qnu3HCaul + A==; +X-CSE-ConnectionGUID: VRcX2cnOQSeMAY0e8g4K3w== +X-CSE-MsgGUID: SPoQqM3DQk6EyvXMnqQjmg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339847" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339847" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:31 -0700 
+X-CSE-ConnectionGUID: pKVZhrKMR8K6LBYqzMOqAA== +X-CSE-MsgGUID: CK8cGt1oRtCxjPN4nS/YdA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487238" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:30 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 13/19] sched/fair: Handle moving single tasks to/from their preferred LLC +Date: Sat, 11 Oct 2025 11:24:50 -0700 +Message-Id: <231864b303906a60491bbb9eb7b2e3f083bff248.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the busiest runqueue has only one task, active balancing may be +invoked to move it. However, before migration, check whether the task +is running on its preferred LLC. + +Do not move a lone task to another LLC if it would move the task +away from its preferred LLC or cause excessive imbalance between LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 62 ++++++++++++++++++++++++++++++++++++++++++--- + 1 file changed, 59 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index bec6354d7841..19ba9c1b9a63 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9826,12 +9826,53 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + return can_migrate_llc(src_cpu, dst_cpu, task_util(p), to_pref); + } + ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return false; ++ /* ++ * All tasks prefer to stay on their current CPU. ++ * Do not pull a task from its preferred CPU if: ++ * 1. It is the only task running there; OR ++ * 2. Migrating it away from its preferred LLC would violate ++ * the cache-aware scheduling policy. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { ++ unsigned long util = 0; ++ struct task_struct *cur; ++ ++ if (env->src_rq->nr_running <= 1) ++ return true; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(env->src_rq->curr); ++ if (cur) ++ util = task_util(cur); ++ rcu_read_unlock(); ++ ++ if (can_migrate_llc(env->src_cpu, env->dst_cpu, ++ util, false) == mig_forbid) ++ return true; ++ } ++ ++ return false; ++} + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) + { + return false; + } ++ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return false; ++} + #endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+@@ -12247,6 +12288,9 @@ static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12266,7 +12310,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +@@ -12711,9 +12756,20 @@ static int active_load_balance_cpu_stop(void *data) + goto out_unlock; + + /* Is there any task to move? */ +- if (busiest_rq->nr_running <= 1) +- goto out_unlock; ++ if (busiest_rq->nr_running <= 1) { ++#ifdef CONFIG_SCHED_CACHE ++ int llc = llc_idx(target_cpu); + ++ if (!sched_cache_enabled()) ++ goto out_unlock; ++ ++ if (llc < 0) ++ goto out_unlock; ++ /* don't migrate if no task prefers target */ ++ if (busiest_rq->nr_pref_llc[llc] < 1) ++#endif ++ goto out_unlock; ++ } + /* + * This condition is "impossible", if it occurs + * we need to fix it. Originally reported by +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip new file mode 100644 index 0000000..2839724 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-14-19-sched-fair-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip @@ -0,0 +1,201 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 417D42874EA + for ; Sat, 11 Oct 2025 18:18:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=P5dnBcm/QdLKKHwOdHn/8WuPNdfAOl/PRiR2K2uOEI4cNFkN+3QA9gv1poGLydzEv/LcejqEay5DpC4q4pFVQXAYgNISmcWGnnkZt2WJ1RNwtLhNEUFXZhx40ubXDsBOhhphD04ToZpipNp3wabmP7EXcOk+GqqMg1ATyjn68eQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=K4DMRiSFceKlJzje7FYPpzQtMciS8INZnGsYmfTeHw6oUtErbWyqEJzurxfkaj/0e2BYrqNZ34Rdy0dGMjqeQWLbOVlQosaArztC6x5+Kes0uifkkB7Pj+Ot9ll7+ydHo4UrJOvNc7oKS/beZOgPG9FPfh7UCSuuvvMEgE2IUTo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=H3CAEs3w; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="H3CAEs3w" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206712; x=1791742712; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=7mAc5fCb/Yw4KmiNv5+1hXXuEie+xn7lqzFvEVfM5lI=; + b=H3CAEs3wXo6bis/3Dkhtptw+Q7vtaAFDMqK8g5XXqpoTWnnoOviYRAT9 + w6Ikfty6wJNr1MlZJ1pp/FTRrzxJpmwm8JYX2yaBiDeoJDyx/agfVsZPY + MklgYKNASSHcEaoYoXP3gsqWfSwXldul6nD1Cye5tqr86XkWjK3gJK3C2 + XHWF6ABgRrpsZ6WaBAuzrKten6FRqGkbA1i+aWIRwXqoWsGPVsgAC8AT4 + v51P3tS4APRavdFpCNPn2xNzJPdUZAW7dgqXMB0AkpdRadIZ72DIu+BFu + J9oJpUAr+gFfhWThceV6xrW/Bi4Emncs3GIHURfaahEgiLmzNa/UX2/Km + w==; +X-CSE-ConnectionGUID: L4/6SpgURcKa2MOypuG0Tw== +X-CSE-MsgGUID: s8jp3cejRyWqoo8mO6QU6Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339866" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339866" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:32 -0700 +X-CSE-ConnectionGUID: IV2+5+btQ3GmLWn4UVfGIA== +X-CSE-MsgGUID: Ti8qIpzsSjywiCl630piRA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487243" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:31 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 14/19] sched/fair: Consider LLC preference when selecting tasks for load balancing +Date: Sat, 11 Oct 2025 11:24:51 -0700 +Message-Id: <26e7bfa88163e13ba1ebefbb54ecf5f42d84f884.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Currently, task selection from the busiest runqueue ignores LLC +preferences. Reorder tasks in the busiest queue to prioritize selection +as follows: + + 1. Tasks preferring the destination CPU's LLC + 2. Tasks with no LLC preference + 3. Tasks preferring an LLC different from their current one + 4. Tasks preferring the LLC they are currently on + +This improves the likelihood that tasks are migrated to their +preferred LLC. + +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 19ba9c1b9a63..0fafbfedb21d 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10036,6 +10036,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_cache_enabled()) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10044,7 +10106,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10063,6 +10125,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip new file mode 100644 index 0000000..0a36e52 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-15-19-sched-fair-Respect-LLC-preference-in-task-migration-and-detach.patch.skip @@ -0,0 +1,156 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 118912877EE + for ; Sat, 11 Oct 2025 18:18:33 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206714; cv=none; b=Sgvo8eIzN/unUNmW2/+OixP9udhyNkmi4AZEZzDVPWK1PLnNoYAhA0isU11HgcQC7ul1i5aP8jgG2uHE7Cy8Asrdz+Y08qynhym2Y4X0S+xgTgNOkVzp41IhyzMl092I4cMjY7ziOvFvK6idsHZ/FR3VwQydRvg8d5aWYp64rpE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206714; c=relaxed/simple; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=TyYasSHPSqMZlN51+4bjWq8Z7cAg9IakiA1ZSJzbhlx8KJc6/UktRCAzZaEkZtQ3d+2B5EUSEDoefcCsbcoCPxFRSCAzN4VD9lBw94R0aIvRHbenlFVxgsvkmUCy9pzg5jZh5zHq/4CLUC+EDPmK622ZE8JNMYgUcZgPpxmosck= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Lw7L05el; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Lw7L05el" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206713; x=1791742713; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=HpgwI4hDixtTD/XOc2H/Ob5dO6FbxoYzkh6tbeLAGPw=; + b=Lw7L05elkbwdOCxozPfNxC8qRTe1i2iYshjZC2z6ZaIHRqDa3MmTXW5p + zHG6+auYcjgaRRcY16sdCyIbi7MCQxhd1rhIdaLh0bWrCs4ImE5P1VD8f + E+1GcTkJVgNbzLAR5f6+G7KZsA/sstlz5uIOTmFm5WpAXCY87MaYrAMAn + AO+uoYvLDh1ME4/gSK2T7C+P7K4lX/jQuif20ZGD72jW5wnQNob4g08JW + Z2MLtsd0WXxmCEXIKBfa0mtDIGY2FVs5/FvLd831/0grQYgT8vo1t80Kc + spuxB5OU6NgYwRfX7rKRRiLNfth6YUS68l+iwJeWbASwMAqE6PVWIEmJu + Q==; +X-CSE-ConnectionGUID: eDbtoCrOQHyIZtGmIsjSMQ== +X-CSE-MsgGUID: +ry6w/ChQZGrUwocr7gK9A== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339887" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339887" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:33 -0700 +X-CSE-ConnectionGUID: 1LsFjRblTkmkQu9Zwyc6pQ== +X-CSE-MsgGUID: 7olPURVrSrW53T9U5Kz7mw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487247" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:32 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Tim Chen , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 15/19] sched/fair: Respect LLC preference in task migration and detach +Date: Sat, 11 Oct 2025 11:24:52 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During the final step of load balancing, can_migrate_task() now +considers a task's LLC preference before moving it out of its +preferred LLC. + +Additionally, add checks in detach_tasks() to prevent selecting tasks +that prefer their current LLC. 
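+
+As a rough illustration (a commented extract of the two hunks below, not
+additional kernel code): migration is vetoed when the cache-aware policy
+forbids it, and detach_tasks() stops once only "stay here" tasks remain
+at the tail of the ordered list:
+
+    /* in can_migrate_task() */
+    if (sched_cache_enabled() &&
+        can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid)
+        return 0;                       /* do not move this task */
+
+    /* in detach_tasks(), after at least one task was detached */
+    if (sched_cache_enabled() && detached && p->preferred_llc != -1 &&
+        llc_id(env->src_cpu) == p->preferred_llc)
+        break;                          /* the rest prefer their current LLC */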
+ +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 24 ++++++++++++++++++++++-- + 1 file changed, 22 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0fafbfedb21d..65ff7c306a2f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9801,8 +9801,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + * Check if task p can migrate from src_cpu to dst_cpu + * in terms of cache aware load balance. + */ +-static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, +- struct task_struct *p) ++static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) + { + struct mm_struct *mm; + bool to_pref; +@@ -9969,6 +9969,12 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && ++ can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10227,6 +10233,20 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + break; + ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Don't detach more tasks if the remaining tasks want ++ * to stay. We know the remaining tasks all prefer the ++ * current LLC, because after order_tasks_by_llc(), the ++ * tasks that prefer the current LLC are at the tail of ++ * the list. The inhibition of detachment is to avoid too ++ * many tasks being migrated out of the preferred LLC. ++ */ ++ if (sched_cache_enabled() && detached && p->preferred_llc != -1 && ++ llc_id(env->src_cpu) == p->preferred_llc) ++ break; ++#endif ++ + continue; + next: + if (p->sched_task_hot) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..88914b1 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-16-19-sched-fair-Exclude-processes-with-many-threads-from-cache-aware-scheduling.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 16F11288C02 + for ; Sat, 11 Oct 2025 18:18:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206715; cv=none; b=msFA8TC41v9oEIuXxPkwmaUs9Guya5oz4k0g+kGWjFkx5t6zbq1fE/hqkiyOdPEhHS8cUTNX+aARYrbMu+YFzDRmUGhKnyOYkbiJD/UnEPwa2emEYG8RrqlU6lMxzm4wiDBJLxqnLLfKGSPXyWwXrM560Mia1tgl6K9uKsnEgFE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206715; c=relaxed/simple; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=pIYwSq6151qmo6KEbEr6KofmYMtBvZvl9VphDwsqPX3hTLP897hu66I6LFuek1xE2EdzY5hJ64po/YPEKcNn99hwknIHDQx8uamJBxPh8I2WV7/JQ8MBTxUclp3YSgTWiAJSRjNR9EBM7PkdUJqtsU69m11ei/HsbibGYzaOOwk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) 
header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=TQVK1fUD; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="TQVK1fUD" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206714; x=1791742714; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Tofl2LDuzdO5QbpLjDZ3W55iV9tdiYron5fWReifyPw=; + b=TQVK1fUDtuFQmuxj0h/H6B3W/u2cJ2GkGOiUH7Lt/dRtHWxu09UqD683 + GE9GznGGwwF/Ima7vRS1ctHwsI6Xpw4SijdVGn66soleS5/ydNjcGaSKg + ygudPZpTfNaQrBfM0sFvdqPmdg50LMShstL+8pxYWf160UzvXjzOECyon + VuIxmxxlfPMnN2wMIOyjbQiDBL/LsnnHbGArR4IFK3zGWts6KMkvPzkiR + EwWOPnHMmqriXFYLM8wcDjSverDfcRP6MlQsXXusYG7bdxJhhuwymEiBB + InFNxWr5/xEksEDfouM5jLx/TVwLUkF4o8vAQ8HbkYgDi57JrvvbuA4Mr + g==; +X-CSE-ConnectionGUID: dN0cE9kLQ3yeKYNXwwT83A== +X-CSE-MsgGUID: KDX51V55RvaEAIpyi8Kcxg== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339905" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339905" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:34 -0700 +X-CSE-ConnectionGUID: SHt2rwkJR6+JML7EmRAXVw== +X-CSE-MsgGUID: 7457bVysSBes9Wezrb15EQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487250" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:33 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 16/19] sched/fair: Exclude processes with many threads from cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:53 -0700 +Message-Id: <637cdb8ab11b1b978d697ed744cc402d32443ecc.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +A performance regression was observed by Prateek when running hackbench +with many threads per process (high fd count). To avoid this, processes +with a large number of active threads are excluded from cache-aware +scheduling. + +With sched_cache enabled, record the number of active threads in each +process during the periodic task_cache_work(). While iterating over +CPUs, if the currently running task belongs to the same process as the +task that launched task_cache_work(), increment the active thread count. + +If the count exceeds the number of CPUs in the process's preferred LLC, +sched_cache will avoid aggregating too many threads into a single LLC +domain. 
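+
+As a rough illustration (a commented extract of the hunk below, with no
+new kernel code), the count is taken while task_cache_work() scans the
+candidate CPUs:
+
+    rcu_read_lock();
+    cur = rcu_dereference(cpu_rq(i)->curr);      /* task running on CPU i */
+    if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) &&
+        cur->mm == mm)                           /* same process as caller */
+        nr_running++;
+    rcu_read_unlock();
+    ...
+    update_avg(&mm->nr_running_avg, nr_running); /* smoothed per-mm count */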
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 14 ++++++++++++-- + 2 files changed, 13 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 3ca557c2f36d..b307f81b2fde 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1031,6 +1031,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg ____cacheline_aligned_in_smp; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 65ff7c306a2f..79d109f8a09f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1451,12 +1451,13 @@ static void get_scan_cpumasks(cpumask_var_t cpus, int cache_cpu, + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; + int cpu, m_a_cpu = -1, cache_cpu, +- pref_nid = NUMA_NO_NODE, curr_cpu; ++ pref_nid = NUMA_NO_NODE, curr_cpu, ++ nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1497,6 +1498,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_occ = occ; + m_cpu = i; + } ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); ++ + } + + /* +@@ -1540,6 +1549,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip new file mode 100644 index 0000000..0bb796c --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-17-19-sched-fair-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip @@ -0,0 +1,170 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 4CBF228B400 + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=YyEz/CWTR29mSwIUaPFMfMePzkOh+JM5Sy6daDO5bi2qr7vVNV19xi6LQHHFuh3wAPmGhaJZO0psSS/hmmAhEm9YYTN/Jgc2pWxCyI+xWhQCLC7I/PnTVjCiCQif4wqMsrxoWCBWSb2OUxPbQQvBrskdsdNoyUkJX7OfjisrPEo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=cDrry+jPMrDILm/r9QUVZNGIrsE561nMMRjz9ay5n5LBA0g4KQ5jFwtQhbKMvroO4a5axJHedJTHbl6aSfvc0uCnQwzJq+eaxxOqXVEOWsoi3zdhUNBrxg97Vqp+GrazIyVFmuyXj145vhjyv4Ug8nfP5dYxkUNSPkfjany2j50= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; 
spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=lrCuBiww; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="lrCuBiww" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206715; x=1791742715; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=1ZlncHncPiFtSdZrBk62iQ7LoAdWu/umRn9XHDFyiec=; + b=lrCuBiwwXTTaUUesVoUKShmqNypNMcjFctaFnNlL8Jy17kFhV1UkeZza + ZuX0GXcNA+d1mgjVrCdwx7TgVROgGBNK4U8k00nbzT6TvTcewZUk7QGtM + ze+FjZ8AcXNEy5AhOAJw/Pg8vbtTnZ1loNcqp57iteVrKQqHWUMDyfSYU + 8P+nCqWidGuZDOqQcaEjQH4wD2Jn2+QsEcLHNMZnZLw6R3C8jci7hl1aG + MGxs8mPuw6pSR4ah1MI8YVoYS5wwLulLaJK/V5D02tGg7pdRILUMNtqsB + x0389trQkin/UccLwrCAMIGVL3znx7/2JW/py3nOY6EKojcOWTOyEIt0N + Q==; +X-CSE-ConnectionGUID: WfwYlMtNQVe279pYYOUBnA== +X-CSE-MsgGUID: AjSkDrsURkOZNf5ZbyXbNQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339923" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339923" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: ezHUeA30SCiDTeB7wo76Nw== +X-CSE-MsgGUID: YeYwMr00ThmPUWDQc0+YAw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487255" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:34 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 17/19] sched/fair: Disable cache aware scheduling for processes with high thread counts +Date: Sat, 11 Oct 2025 11:24:54 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +If the number of active threads within the process +exceeds the number of Cores(divided by SMTs number) +in the LLC, do not enable cache-aware scheduling. +This is because there is a risk of cache contention +within the preferred LLC when too many threads are +present. 
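+
+As a rough illustration (this mirrors the exceed_llc_nr() helper in the
+hunk below; the figures are an example, not taken from the patch):
+
+    smt_nr = 1;
+    if (sched_smt_active())
+        smt_nr = cpumask_weight(cpu_smt_mask(cpu));   /* e.g. 2 on SMT2 */
+
+    /* e.g. an LLC with 16 CPUs and SMT2: more than 8 active threads on
+     * average excludes the process from cache-aware aggregation */
+    exceeds = (mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu);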
+ +Reported-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + kernel/sched/fair.c | 27 +++++++++++++++++++++++++-- + 1 file changed, 25 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 79d109f8a09f..6b8eace79eee 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,18 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1385,10 +1397,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + + /* + * If this task hasn't hit task_cache_work() for a while, or it +- * has only 1 thread, invalidate its preferred state. ++ * has only 1 thread, or has too many active threads, invalidate ++ * its preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1467,6 +1481,11 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ mm->mm_sched_cpu = -1; ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9826,6 +9845,10 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_unrestricted; ++ + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip new file mode 100644 index 0000000..b614ebc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-18-19-sched-fair-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip @@ -0,0 +1,246 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id EC76C28C03B + for ; Sat, 11 Oct 2025 18:18:35 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206717; cv=none; b=Gsl1htdC3Y7gJ6c3ywcidI/bSse8yUz6irs7/iI8KWV8rK5Ae95mMS6V4kE386ZpRZ64YVuSevPlw/gCCcGexlKVEsnpJGvjAMVnB6E3r26Sb5PQDcAwlJhgczIF0vnORN//ryXKWaGJdpyTLOi1a78IAJp76Mm0Cc1+XjF2rGQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206717; c=relaxed/simple; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=hT0pK7n3dH+PZ5LGb1wwP8mkt2A7mUf1PCIeydCbZfOqNSbSKOwNGkxWRp3xr4aPGGtMx1eK61Xyt7h2YGrFfvdSUCRdLGNS2BunlIUuq8SqGdxHIK829DTsOGKBUbEPWJzj/d6E4FC8xaBfUuz6ugBEq47VdX8vEtuc1XwNFis= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=eyspbvXX; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="eyspbvXX" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206716; x=1791742716; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=RPMOV8sl+NtxUoril7y0k9+l4VyTXFWW/dE0ALAKhro=; + b=eyspbvXX6JZaLuPx9mP9k7AsJvdPNK3nA7Eu1n1ZjnjSeOqzlt2GEvCx + IIbDfmBwRBwDACT7YDm/5WXc6cuJLsO02ejx9sBoouGuZkUHl1/nB7J2O + i/e0/jcb0J2buciIQ3OvuzUhegT0ZaiQoJUm0tinSNJAyHv/2LoJKLT6E + 1wncP9sm103omUQyz2nIdzytwxhPLCdaTXt3R4jfGDM0HbNy1TRA5Ex3O + eiDpNNIsPslVI7J8r5viBVFuJFJIfp1atbqNY5xQ3zDqGyLEqF5FJMEHK + BGBjTx2SYuiM3sv4eOtztesROh9S4vRoc6wieYXXgBwOgrHLMjZB8S3CI + A==; +X-CSE-ConnectionGUID: 15+3n+5PQLG8KotmRvuIMw== +X-CSE-MsgGUID: Dj1GwDBDRtWs7ASTeti8MA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339940" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339940" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:35 -0700 +X-CSE-ConnectionGUID: O+LhKbX0QNyBYwHUAp0ttw== +X-CSE-MsgGUID: PfPvzLkATc2Ca+B9H6Dwng== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487259" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:35 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 18/19] sched/fair: Avoid cache-aware scheduling for memory-heavy processes +Date: Sat, 11 Oct 2025 11:24:55 -0700 +Message-Id: <00da49fd590b95baad0525660bda4c0ba178243d.1760206683.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Prateek and Tingyin reported that memory-intensive workloads (such as +stream) can saturate memory bandwidth and caches on the preferred LLC +when sched_cache aggregates too many threads. + +To mitigate this, estimate a process's memory footprint by comparing +its RSS (anonymous and shared pages) to the size of the LLC. If RSS +exceeds the LLC size, skip cache-aware scheduling. 
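+
+As a rough illustration (this mirrors the exceed_llc_capacity() helper
+added below; the 32MB LLC size is only an example):
+
+    rss = get_mm_counter(mm, MM_ANONPAGES) +
+          get_mm_counter(mm, MM_SHMEMPAGES);      /* resident pages */
+
+    /* e.g. a 32MB L3: rss * PAGE_SIZE above 32MB disables aggregation */
+    exceeds = (ci->size <= rss * PAGE_SIZE);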
+ +Note that RSS is only an approximation of the memory footprint. +By default, the comparison is strict, but a later patch will allow +users to provide a hint to adjust this threshold. + +According to the test from Adam, some systems do not have shared L3 +but with shared L2 as clusters. In this case, the L2 becomes the LLC[1]. + +Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/ + +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + include/linux/cacheinfo.h | 21 ++++++++++------ + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++--- + 2 files changed, 61 insertions(+), 11 deletions(-) + +diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h +index c8f4f0a0b874..82d0d59ca0e1 100644 +--- a/include/linux/cacheinfo.h ++++ b/include/linux/cacheinfo.h +@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu, + + const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); + +-/* +- * Get the cacheinfo structure for the cache associated with @cpu at +- * level @level. +- * cpuhp lock must be held. +- */ +-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level) + { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + +- lockdep_assert_cpus_held(); +- + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) +@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) + return NULL; + } + ++/* ++ * Get the cacheinfo structure for the cache associated with @cpu at ++ * level @level. ++ * cpuhp lock must be held. ++ */ ++static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++{ ++ lockdep_assert_cpus_held(); ++ ++ return _get_cpu_cacheinfo_level(cpu, level); ++} ++ + /* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6b8eace79eee..46dfcd2a01b3 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1240,6 +1240,38 @@ static inline int pref_llc_idx(struct task_struct *p) + return llc_idx(p->preferred_llc); + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cacheinfo *ci; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use _get_cpu_cacheinfo_level() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 3); ++ if (!ci) { ++ /* ++ * On system without L3 but with shared L2, ++ * L2 becomes the LLC. 
++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 2); ++ if (!ci) ++ return true; ++ } ++ ++ llc = ci->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1402,7 +1434,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1486,6 +1519,14 @@ static void __no_profile task_cache_work(struct callback_head *work) + return; + } + ++ /* ++ * Do not check exceed_llc_nr() because ++ * the active number of threads needs to ++ * been updated anyway. ++ */ ++ if (exceed_llc_capacity(mm, curr_cpu)) ++ return; ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9845,8 +9886,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * skip cache aware load balance for single/too many threads ++ * or large footprint. ++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..893d5f6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-19-19-sched-fair-Add-user-control-to-adjust-the-tolerance-of-cache-aware-scheduling.patch.skip @@ -0,0 +1,366 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.17]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 18E8128D850 + for ; Sat, 11 Oct 2025 18:18:37 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.17 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1760206719; cv=none; b=Xx1TJtOzMlihMYBSPUxuHxJ0Qjx1gDS60TVsBbaW2YAWG207+fLDuebhtY/m9byeKfuUMx/7RVc7mR4xE94pKemXSaF1s6z/Ug1MSbyJDL/f+gYUVN9JWyZVsl4nskC5I36GvI9Reswdcqif7FIqp4+OT03g4Ursen0Zl0KoJs4= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1760206719; c=relaxed/simple; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=TqNPDsqjikNan+1NtjFEbAg77jx9c3inhDW4V8l0uRiJhbQOXCuc9b1G6bYocgAvzvRSIQ0C9pHEOzGrnitQnTKHR4lM01jV+sq5AGE2Z0YUwNbJ3G2iOFzcz198JhG1QAmKUE7Vocf7AQigiloGd31ZcAGpFcHlx+XOPevHRzQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=iOR0vW8+; arc=none smtp.client-ip=198.175.65.17 +Authentication-Results: 
smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="iOR0vW8+" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1760206717; x=1791742717; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=6DqahwvJ4pGTYe3R6NSaO1UYdqKyQR0MiqAECtQawPE=; + b=iOR0vW8+BW1BG+CuQKpeekNgIJXVik0HqP3JsArGSk608O/BAqQp2/2V + NevdC5FBoGU0UJqaEBq3eyHXjM8fq6f/t4e0BsD23dpBBveuXe++OVX8Y + Aapb+EWCp+mFsFeSqc6EHn1EKVQFE1axOMUnDuAWrAcUGMdrmUl0Sqt8l + gPm1isDiRNA4VWnGAtuiefQtTbQsCK7LA3hCWV2kYbD78VwasjvY/a8Zs + eIWoDg9eon7/Ajv/YxTCU8u2KHeYWmlazBkEjZ2+x2uGykUr+ha3ebndP + Ilvnp7dapSvlsm6l5tNbjmODs4GBS1SErTGbDlGwNscJODVWeB1whKGtb + g==; +X-CSE-ConnectionGUID: iwkdIGQ9QpepiaCCmITr2A== +X-CSE-MsgGUID: vpqcAnIxSGm05xalZwxCuA== +X-IronPort-AV: E=McAfee;i="6800,10657,11531"; a="62339958" +X-IronPort-AV: E=Sophos;i="6.17,312,1747724400"; + d="scan'208";a="62339958" +Received: from orviesa004.jf.intel.com ([10.64.159.144]) + by orvoesa109.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 11 Oct 2025 11:18:36 -0700 +X-CSE-ConnectionGUID: l0yVaxC3RhO6SKkG+8NgJA== +X-CSE-MsgGUID: KHjGlLwMQh2OAr5o5sZaPw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.19,221,1754982000"; + d="scan'208";a="185487263" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by orviesa004.jf.intel.com with ESMTP; 11 Oct 2025 11:18:36 -0700 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" +Cc: Chen Yu , + Vincent Guittot , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Libo Chen , + Adam Li , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH 19/19] sched/fair: Add user control to adjust the tolerance of cache-aware scheduling +Date: Sat, 11 Oct 2025 11:24:56 -0700 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +With sched_cache enabled, the scheduler uses a process's RSS as a +proxy for its LLC footprint to determine if aggregating tasks on the +preferred LLC could cause cache contention. If RSS exceeds the LLC +size, aggregation is skipped. Some workloads with large RSS but small +actual memory footprints may still benefit from aggregation. Since +the kernel cannot efficiently track per-task cache usage (resctrl is +user-space only), userspace can provide a more accurate hint. + +Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let +users control how strictly RSS limits aggregation. Values range from +0 to 100: + + - 0: Cache-aware scheduling is disabled. + - 1: Strict; tasks with RSS larger than LLC size are skipped. + - 100: Aggressive; tasks are aggregated regardless of RSS. + +For example, with a 32MB L3 cache: + + - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped. 
+ - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped + (784GB = (1 + (99 - 1) * 256) * 32MB). + +Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls +how strictly the number of active threads is considered when doing +cache aware load balance. The number of SMTs is also considered. +High SMT counts reduce the aggregation capacity, preventing excessive +task aggregation on SMT-heavy systems like Power10/Power11. + +For example, with 8 Cores/16 CPUs in a L3: + + - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped. + - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped + 785 = (1 + (99 - 1) * 8). + +Reported-by: K Prateek Nayak +Reported-by: Madadi Vineeth Reddy +Reported-by: Shrikanth Hegde +Reported-by: Tingyin Duan +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + kernel/sched/debug.c | 56 ++++++++++++++++++++++++++++++-- + kernel/sched/fair.c | 76 ++++++++++++++++++++++++++++++++++++++++---- + kernel/sched/sched.h | 3 ++ + 3 files changed, 126 insertions(+), 9 deletions(-) + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 57bb04ebbf96..cfcd8b436cc5 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,50 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name) \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int percent; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &percent)) \ ++ return -EINVAL; \ ++ if (percent > 100) \ ++ return -EINVAL; \ ++ llc_##name = percent; \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", llc_##name); \ ++ return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(overload_pct); ++SCHED_CACHE_CREATE_CONTROL(imb_pct); ++SCHED_CACHE_CREATE_CONTROL(aggr_tolerance); ++#endif /* SCHED_CACHE */ ++ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -524,8 +568,16 @@ static __init int sched_init_debug(void) + #endif /* CONFIG_NUMA_BALANCING */ + + #ifdef CONFIG_SCHED_CACHE +- debugfs_create_u32("llc_overload_pct", 0644, debugfs_sched, &llc_overload_pct); +- debugfs_create_u32("llc_imb_pct", 0644, debugfs_sched, &llc_imb_pct); ++ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_overload_pct); ++ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_imb_pct); ++ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_tolerance); ++ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched, ++ &llc_epoch_period); ++ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched, ++ &llc_epoch_affinity_timeout); + #endif + debugfs_create_file("debug", 0444, debugfs_sched, NULL, 
&sched_debug_fops); + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 46dfcd2a01b3..f9084e2f9ef2 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1207,9 +1207,62 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; ++__read_mostly unsigned int llc_aggr_tolerance = 1; ++__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; ++__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; + + DEFINE_STATIC_KEY_FALSE(sched_cache_allowed); + ++static inline int get_sched_cache_scale(int mul) ++{ ++ if (!llc_aggr_tolerance) ++ return 0; ++ ++ if (llc_aggr_tolerance == 100) ++ return INT_MAX; ++ ++ return (1 + (llc_aggr_tolerance - 1) * mul); ++} ++ ++static inline int get_sched_cache_rss_scale(void) ++{ ++ /* ++ * Suppose the L3 size is 32MB. If the ++ * llc_aggr_tolerance is 1: ++ * When the RSS is larger than 32MB, ++ * the process is regarded as exceeding ++ * the LLC capacity. If the ++ * llc_aggr_tolerance is 99: ++ * When the RSS is larger than 784GB, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ return get_sched_cache_scale(256); ++} ++ ++static inline int get_sched_cache_nr_scale(void) ++{ ++ /* ++ * Suppose the number of Cores in LLC is 8. ++ * Every core has 2 SMTs. ++ * If the llc_aggr_tolerance is 1: When the ++ * nr_running is larger than 8, the process ++ * is regarded as exceeding the LLC capacity. ++ * If the llc_aggr_tolerance is 99: ++ * When the nr_running is larger than 785, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 785 = 1 + (99 - 1) * 8 ++ */ ++ return get_sched_cache_scale(1); ++} ++ ++static inline int get_sched_cache_cap_scale(void) ++{ ++ return (llc_overload_pct / cpu_smt_num_threads); ++} ++ + static inline bool sched_cache_enabled(void) + { + return sched_feat(SCHED_CACHE) && +@@ -1245,6 +1298,7 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + struct cacheinfo *ci; + unsigned long rss; + unsigned int llc; ++ int scale; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1269,19 +1323,27 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ scale = get_sched_cache_rss_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { +- int smt_nr = 1; ++ int smt_nr = 1, scale; + + #ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) + smt_nr = cpumask_weight(cpu_smt_mask(cpu)); + #endif + +- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++ scale = get_sched_cache_nr_scale(); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu))); + } + + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +@@ -1370,9 +1432,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + llc_epoch_period - 1) / llc_epoch_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * llc_epoch_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1432,7 +1494,7 @@ void 
account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, or has too many active threads, invalidate + * its preferred state. + */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout || + get_nr_threads(p) <= 1 || + exceed_llc_nr(mm, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { +@@ -9749,7 +9811,7 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + * (default: ~50%) + */ + #define fits_llc_capacity(util, max) \ +- ((util) * 100 < (max) * llc_overload_pct) ++ ((util) * 100 < (max) * get_sched_cache_cap_scale()) + + /* + * The margin used when comparing utilization. +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index b801d32d5fba..97e8558b0530 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2810,6 +2810,9 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern unsigned int llc_aggr_tolerance; ++extern unsigned int llc_epoch_period; ++extern unsigned int llc_epoch_affinity_timeout; + extern struct static_key_false sched_cache_allowed; + #endif + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip new file mode 100644 index 0000000..30b2f3a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-01-23-sched-cache-Introduce-infrastructure-for-cache-aware-load-balancing.patch.skip @@ -0,0 +1,637 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3E1062EA481 + for ; Wed, 3 Dec 2025 23:01:19 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802883; cv=none; b=UKLk6Rg4Ag2RrVZTM6q83e57jrtOabhFLy87jTKdCORkErT5oscdmGvQFuZ8uzk4JddS6cPh1pfkZIjrorb34GjVrTfhTnjF3Ev1eA9P3f9SHm6a8HG5wxWf/yS25iz0NQWmXUw8INvgj0a9A56o6dRBuDjYNgK/XPE8bAKiBUg= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802883; c=relaxed/simple; + bh=6xbRUXX8feoSk8bOjg/vcAGiqy4i78lNKWOOyysMsTg=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ezTAzjYx2Rp52iZO2WWYVcoqrFo5k7CxRy+shLmmCt9X8OAnGBmN2eYuhkz/I7t0LW4rAjnmLXBSt4s5lKDI7cjNxUO/rV3B0EWqv13ojuB5QKkGvUXb3YGE9U0EUSc8TdruI55O35k40Uh0lNID1k89G7Dxb8VJ6Ckm0RWpbqE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=hXZ7RTSy; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="hXZ7RTSy" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + 
d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802880; x=1796338880; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=6xbRUXX8feoSk8bOjg/vcAGiqy4i78lNKWOOyysMsTg=; + b=hXZ7RTSym9fS1Xvrd9iY6zdRxiZpXzgeaEnDkbWt4E9kikaWOOGcUivi + QbmpWan09GqoanGn0S6Vft9B7BxCpebF9EW9KXpkUelSttyWWDfdj3/y5 + FTK7BCv2Ykd5RjEGqBmouxnoYSthhh0M052SACkie+UXmvYxcT/sOQCCX + HOsATO8B6T2nuON/L4dyuLl54HqVuf+JcbMOZ0ABnQ6ZFHGM/cCwqCXcJ + AmUI07y2Khz2g6thC1D3WG4YXreJSp+sT28iidXrCmaZBan6+WI286Msl + K0/hGg9Y68V2FBcOV+wIiAuy+MY5XGtKxf7nZIp0LSDOwP7fiuTJEB8U9 + g==; +X-CSE-ConnectionGUID: cpmnVUlITmyoLapwFs/Now== +X-CSE-MsgGUID: 1fVK363gQBOq9Aw6XgwrKg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136182" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136182" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:18 -0800 +X-CSE-ConnectionGUID: d8cQS9oyRh+diLyesP0AjA== +X-CSE-MsgGUID: JRVFNz3/S1eHusPWEwHUNA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763734" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:18 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 01/23] sched/cache: Introduce infrastructure for cache-aware load balancing +Date: Wed, 3 Dec 2025 15:07:20 -0800 +Message-Id: <06f0d7edbc3185ec730b50b3b00d87ace44169b3.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: "Peter Zijlstra (Intel)" + +Adds infrastructure to enable cache-aware load balancing, +which improves cache locality by grouping tasks that share resources +within the same cache domain. This reduces cache misses and improves +overall data access efficiency. + +In this initial implementation, threads belonging to the same process +are treated as entities that likely share working sets. The mechanism +tracks per-process CPU occupancy across cache domains and attempts to +migrate threads toward cache-hot domains where their process already +has active threads, thereby enhancing locality. + +This provides a basic model for cache affinity. While the current code +targets the last-level cache (LLC), the approach could be extended to +other domain types such as clusters (L2) or node-internal groupings. + +At present, the mechanism selects the CPU within an LLC that has the +highest recent runtime. Subsequent patches in this series will use this +information in the load-balancing path to guide task placement toward +preferred LLCs. + +In the future, more advanced policies could be integrated through NUMA +balancing-for example, migrating a task to its preferred LLC when spare +capacity exists, or swapping tasks across LLCs to improve cache affinity. 
+Grouping of tasks could also be generalized from that of a process +to be that of a NUMA group, or be user configurable. + +Originally-by: Peter Zijlstra (Intel) +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Restore the original CPU scan to cover all online CPUs, + rather than scanning within the preferred NUMA node. + (Peter Zijlstra) + + Use rq->curr instead of rq->donor. (K Prateek Nayak) + + Minor fix in task_tick_cache() to use + if (mm->mm_sched_epoch >= rq->cpu_epoch) + to avoid mm_sched_epoch going backwards. + + include/linux/mm_types.h | 44 +++++++ + include/linux/sched.h | 11 ++ + init/Kconfig | 11 ++ + kernel/fork.c | 6 + + kernel/sched/core.c | 6 + + kernel/sched/fair.c | 258 +++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 8 ++ + 7 files changed, 344 insertions(+) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 90e5790c318f..1ea16ef90566 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -939,6 +939,11 @@ typedef struct { + DECLARE_BITMAP(__mm_flags, NUM_MM_FLAG_BITS); + } __private mm_flags_t; + ++struct mm_sched { ++ u64 runtime; ++ unsigned long epoch; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1029,6 +1034,17 @@ struct mm_struct { + */ + raw_spinlock_t cpus_allowed_lock; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ /* ++ * Track per-cpu-per-process occupancy as a proxy for cache residency. ++ * See account_mm_sched() and ... ++ */ ++ struct mm_sched __percpu *pcpu_sched; ++ raw_spinlock_t mm_sched_lock; ++ unsigned long mm_sched_epoch; ++ int mm_sched_cpu; ++#endif ++ + #ifdef CONFIG_MMU + atomic_long_t pgtables_bytes; /* size of all page tables */ + #endif +@@ -1487,6 +1503,34 @@ static inline unsigned int mm_cid_size(void) + static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumask *cpumask) { } + #endif /* CONFIG_SCHED_MM_CID */ + ++#ifdef CONFIG_SCHED_CACHE ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++ ++static inline int mm_alloc_sched_noprof(struct mm_struct *mm) ++{ ++ struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ ++ if (!pcpu_sched) ++ return -ENOMEM; ++ ++ mm_init_sched(mm, pcpu_sched); ++ return 0; ++} ++ ++#define mm_alloc_sched(...) 
alloc_hooks(mm_alloc_sched_noprof(__VA_ARGS__)) ++ ++static inline void mm_destroy_sched(struct mm_struct *mm) ++{ ++ free_percpu(mm->pcpu_sched); ++ mm->pcpu_sched = NULL; ++} ++#else /* !CONFIG_SCHED_CACHE */ ++ ++static inline int mm_alloc_sched(struct mm_struct *mm) { return 0; } ++static inline void mm_destroy_sched(struct mm_struct *mm) { } ++ ++#endif /* CONFIG_SCHED_CACHE */ ++ + struct mmu_gather; + extern void tlb_gather_mmu(struct mmu_gather *tlb, struct mm_struct *mm); + extern void tlb_gather_mmu_fullmm(struct mmu_gather *tlb, struct mm_struct *mm); +diff --git a/include/linux/sched.h b/include/linux/sched.h +index b469878de25c..278b529c91df 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1406,6 +1406,10 @@ struct task_struct { + unsigned long numa_pages_migrated; + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ struct callback_head cache_work; ++#endif ++ + #ifdef CONFIG_RSEQ + struct rseq __user *rseq; + u32 rseq_len; +@@ -2428,4 +2432,11 @@ extern void migrate_enable(void); + + DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + ++#ifdef CONFIG_SCHED_CACHE ++static inline bool sched_cache_enabled(void) ++{ ++ return false; ++} ++#endif ++ + #endif +diff --git a/init/Kconfig b/init/Kconfig +index cab3ad28ca49..88556ef8cfd1 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -983,6 +983,17 @@ config NUMA_BALANCING + + This system will be inactive on UMA systems. + ++config SCHED_CACHE ++ bool "Cache aware load balance" ++ default y ++ depends on SMP ++ help ++ When enabled, the scheduler will attempt to aggregate tasks from ++ the same process onto a single Last Level Cache (LLC) domain when ++ possible. This improves cache locality by keeping tasks that share ++ resources within the same cache domain, reducing cache misses and ++ lowering data access latency. 
++ + config NUMA_BALANCING_DEFAULT_ENABLED + bool "Automatically enable NUMA aware memory/task placement" + default y +diff --git a/kernel/fork.c b/kernel/fork.c +index 3da0f08615a9..aae5053d1e30 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -680,6 +680,7 @@ void __mmdrop(struct mm_struct *mm) + cleanup_lazy_tlbs(mm); + + WARN_ON_ONCE(mm == current->active_mm); ++ mm_destroy_sched(mm); + mm_free_pgd(mm); + mm_free_id(mm); + destroy_context(mm); +@@ -1083,6 +1084,9 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + if (mm_alloc_cid(mm, p)) + goto fail_cid; + ++ if (mm_alloc_sched(mm)) ++ goto fail_sched; ++ + if (percpu_counter_init_many(mm->rss_stat, 0, GFP_KERNEL_ACCOUNT, + NR_MM_COUNTERS)) + goto fail_pcpu; +@@ -1092,6 +1096,8 @@ static struct mm_struct *mm_init(struct mm_struct *mm, struct task_struct *p, + return mm; + + fail_pcpu: ++ mm_destroy_sched(mm); ++fail_sched: + mm_destroy_cid(mm); + fail_cid: + destroy_context(mm); +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index f754a60de848..e8bdf03a4b7f 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4488,6 +4488,7 @@ static void __sched_fork(u64 clone_flags, struct task_struct *p) + p->wake_entry.u_flags = CSD_TYPE_TTWU; + p->migration_pending = NULL; + init_sched_mm_cid(p); ++ init_sched_mm(p); + } + + DEFINE_STATIC_KEY_FALSE(sched_numa_balancing); +@@ -8791,6 +8792,11 @@ void __init sched_init(void) + + rq->core_cookie = 0UL; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spin_lock_init(&rq->cpu_epoch_lock); ++ rq->cpu_epoch_next = jiffies; ++#endif ++ + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); + } + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 5b752324270b..cb82f558dc5b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1152,6 +1152,8 @@ void post_init_entity_util_avg(struct task_struct *p) + sa->runnable_avg = sa->util_avg; + } + ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec); ++ + static s64 update_se(struct rq *rq, struct sched_entity *se) + { + u64 now = rq_clock_task(rq); +@@ -1174,6 +1176,7 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + trace_sched_stat_runtime(running, delta_exec); + account_group_exec_runtime(running, delta_exec); ++ account_mm_sched(rq, running, delta_exec); + + /* cgroup time is always accounted against the donor */ + cgroup_account_cputime(donor, delta_exec); +@@ -1193,6 +1196,259 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + return delta_exec; + } + ++#ifdef CONFIG_SCHED_CACHE ++ ++/* ++ * XXX numbers come from a place the sun don't shine -- probably wants to be SD ++ * tunable or so. 
++ */ ++#define EPOCH_PERIOD (HZ / 100) /* 10 ms */ ++#define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ ++ ++static int llc_id(int cpu) ++{ ++ if (cpu < 0) ++ return -1; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++{ ++ unsigned long epoch; ++ int i; ++ ++ for_each_possible_cpu(i) { ++ struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct rq *rq = cpu_rq(i); ++ ++ pcpu_sched->runtime = 0; ++ pcpu_sched->epoch = rq->cpu_epoch; ++ epoch = rq->cpu_epoch; ++ } ++ ++ raw_spin_lock_init(&mm->mm_sched_lock); ++ mm->mm_sched_epoch = epoch; ++ mm->mm_sched_cpu = -1; ++ ++ /* ++ * The update to mm->pcpu_sched should not be reordered ++ * before initialization to mm's other fields, in case ++ * the readers may get invalid mm_sched_epoch, etc. ++ */ ++ smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++} ++ ++/* because why would C be fully specified */ ++static __always_inline void __shr_u64(u64 *val, unsigned int n) ++{ ++ if (n >= 64) { ++ *val = 0; ++ return; ++ } ++ *val >>= n; ++} ++ ++static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ lockdep_assert_held(&rq->cpu_epoch_lock); ++ ++ unsigned long n, now = jiffies; ++ long delta = now - rq->cpu_epoch_next; ++ ++ if (delta > 0) { ++ n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ rq->cpu_epoch += n; ++ rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ __shr_u64(&rq->cpu_runtime, n); ++ } ++ ++ n = rq->cpu_epoch - pcpu_sched->epoch; ++ if (n) { ++ pcpu_sched->epoch += n; ++ __shr_u64(&pcpu_sched->runtime, n); ++ } ++} ++ ++static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) ++{ ++ guard(raw_spinlock_irqsave)(&rq->cpu_epoch_lock); ++ ++ __update_mm_sched(rq, pcpu_sched); ++ ++ /* ++ * Runtime is a geometric series (r=0.5) and as such will sum to twice ++ * the accumulation period, this means the multiplcation here should ++ * not overflow. ++ */ ++ return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); ++} ++ ++static inline ++void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_sched *pcpu_sched; ++ unsigned long epoch; ++ ++ if (!sched_cache_enabled()) ++ return; ++ ++ if (p->sched_class != &fair_sched_class) ++ return; ++ /* ++ * init_task and kthreads don't having mm ++ */ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ ++ scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { ++ __update_mm_sched(rq, pcpu_sched); ++ pcpu_sched->runtime += delta_exec; ++ rq->cpu_runtime += delta_exec; ++ epoch = rq->cpu_epoch; ++ } ++ ++ /* ++ * If this task hasn't hit task_cache_work() for a while, or it ++ * has only 1 thread, invalidate its preferred state. 
++ */ ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ } ++} ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ struct mm_struct *mm = p->mm; ++ ++ if (!sched_cache_enabled()) ++ return; ++ ++ if (!mm || !mm->pcpu_sched) ++ return; ++ ++ /* avoid moving backwards */ ++ if (mm->mm_sched_epoch >= rq->cpu_epoch) ++ return; ++ ++ guard(raw_spinlock)(&mm->mm_sched_lock); ++ ++ if (work->next == work) { ++ task_work_add(p, work, TWA_RESUME); ++ WRITE_ONCE(mm->mm_sched_epoch, rq->cpu_epoch); ++ } ++} ++ ++static void __no_profile task_cache_work(struct callback_head *work) ++{ ++ struct task_struct *p = current; ++ struct mm_struct *mm = p->mm; ++ unsigned long m_a_occ = 0; ++ unsigned long curr_m_a_occ = 0; ++ int cpu, m_a_cpu = -1; ++ cpumask_var_t cpus; ++ ++ WARN_ON_ONCE(work != &p->cache_work); ++ ++ work->next = work; ++ ++ if (p->flags & PF_EXITING) ++ return; ++ ++ if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) ++ return; ++ ++ scoped_guard (cpus_read_lock) { ++ cpumask_copy(cpus, cpu_online_mask); ++ ++ for_each_cpu(cpu, cpus) { ++ /* XXX sched_cluster_active */ ++ struct sched_domain *sd = per_cpu(sd_llc, cpu); ++ unsigned long occ, m_occ = 0, a_occ = 0; ++ int m_cpu = -1, i; ++ ++ if (!sd) ++ continue; ++ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ occ = fraction_mm_sched(cpu_rq(i), ++ per_cpu_ptr(mm->pcpu_sched, i)); ++ a_occ += occ; ++ if (occ > m_occ) { ++ m_occ = occ; ++ m_cpu = i; ++ } ++ } ++ ++ /* ++ * Compare the accumulated occupancy of each LLC. The ++ * reason for using accumulated occupancy rather than average ++ * per CPU occupancy is that it works better in asymmetric LLC ++ * scenarios. ++ * For example, if there are 2 threads in a 4CPU LLC and 3 ++ * threads in an 8CPU LLC, it might be better to choose the one ++ * with 3 threads. However, this would not be the case if the ++ * occupancy is divided by the number of CPUs in an LLC (i.e., ++ * if average per CPU occupancy is used). ++ * Besides, NUMA balancing fault statistics behave similarly: ++ * the total number of faults per node is compared rather than ++ * the average number of faults per CPU. This strategy is also ++ * followed here. ++ */ ++ if (a_occ > m_a_occ) { ++ m_a_occ = a_occ; ++ m_a_cpu = m_cpu; ++ } ++ ++ if (llc_id(cpu) == llc_id(mm->mm_sched_cpu)) ++ curr_m_a_occ = a_occ; ++ ++ cpumask_andnot(cpus, cpus, sched_domain_span(sd)); ++ } ++ } ++ ++ if (m_a_occ > (2 * curr_m_a_occ)) { ++ /* ++ * Avoid switching mm_sched_cpu too fast. ++ * The reason to choose 2X is because: ++ * 1. It is better to keep the preferred LLC stable, ++ * rather than changing it frequently and cause migrations ++ * 2. 2X means the new preferred LLC has at least 1 more ++ * busy CPU than the old one(200% vs 100%, eg) ++ * 3. 2X is chosen based on test results, as it delivers ++ * the optimal performance gain so far. 
++ */ ++ mm->mm_sched_cpu = m_a_cpu; ++ } ++ ++ free_cpumask_var(cpus); ++} ++ ++void init_sched_mm(struct task_struct *p) ++{ ++ struct callback_head *work = &p->cache_work; ++ ++ init_task_work(work, task_cache_work); ++ work->next = work; ++} ++ ++#else ++ ++static inline void account_mm_sched(struct rq *rq, struct task_struct *p, ++ s64 delta_exec) { } ++ ++void init_sched_mm(struct task_struct *p) { } ++ ++static void task_tick_cache(struct rq *rq, struct task_struct *p) { } ++ ++#endif ++ + /* + * Used by other classes to account runtime. + */ +@@ -13124,6 +13380,8 @@ static void task_tick_fair(struct rq *rq, struct task_struct *curr, int queued) + if (static_branch_unlikely(&sched_numa_balancing)) + task_tick_numa(rq, curr); + ++ task_tick_cache(rq, curr); ++ + update_misfit_status(curr, rq); + check_update_overutilized_status(task_rq(curr)); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index adfb6e3409d7..84118b522f22 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1194,6 +1194,12 @@ struct rq { + u64 clock_pelt_idle_copy; + u64 clock_idle_copy; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ raw_spinlock_t cpu_epoch_lock ____cacheline_aligned; ++ u64 cpu_runtime; ++ unsigned long cpu_epoch; ++ unsigned long cpu_epoch_next; ++#endif + + atomic_t nr_iowait; + +@@ -3819,6 +3825,8 @@ static inline void task_tick_mm_cid(struct rq *rq, struct task_struct *curr) { } + static inline void init_sched_mm_cid(struct task_struct *t) { } + #endif /* !CONFIG_SCHED_MM_CID */ + ++extern void init_sched_mm(struct task_struct *p); ++ + extern u64 avg_vruntime(struct cfs_rq *cfs_rq); + extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); + static inline +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip new file mode 100644 index 0000000..ad4bcfd --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-02-23-sched-cache-Record-per-LLC-utilization-to-guide-cache-aware-scheduling-decisions.patch.skip @@ -0,0 +1,229 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8872B2EC0A3 + for ; Wed, 3 Dec 2025 23:01:21 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802884; cv=none; b=PIVYWfHNGhpYcL5pUf5pbJV6z5GC4MufyMLaT00/IZT2eIAKxBzqzRglsyVDKa18ZuvGOOBF6720BmFO1QjbQTlm++JQNaJ2Li4EQo87RGn9XE96gbHXFQW46Ye00LdP+tH7Hh5mDSD6E7sACuXB9wl4PappMcJ/np+rPkSv+fk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802884; c=relaxed/simple; + bh=1tsEZhdWTsEDcQ9RmMyka/N/6UwyydH6Z8nvicoX744=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version:Content-Type; b=HIfoZU9H/SZm0t6eE4dquYqikhNAFvY4+BXlcqSIZ3CtUZjOzIUSOC63YZp9YVMZHXi1YQfdjTLmXM4JflgdOMpsYGcmIdM9y97XnpuLltYZndJJ3UMie+BQAS7WTzwavGBbWlwvukQFWzaAt18tTAj+n7TvfZUbdaq3Hd/PwnQ= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) 
header.d=intel.com header.i=@intel.com header.b=gY2DTyL8; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="gY2DTyL8" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802881; x=1796338881; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=1tsEZhdWTsEDcQ9RmMyka/N/6UwyydH6Z8nvicoX744=; + b=gY2DTyL8yWnM8kjFl12irX509n24BDz4iFKCqM6WCNUCLXRN5a5IlNUP + CfYI2+/YpAT4bu6uPNEPMLhPBFM2XD4LK26owQYwXoYEFxXYOPyRzMCCr + rISEhzC11YficDTuxwWe3QvPX3HaXsnsqXtK9HLG/hiT6NfkxrHYuu33P + 2QVChiY0MqYwc1nvL417RDFrqZbCy7kRQLG02T5nK00USUuGMRvgZv+U3 + gt7oM5XlbDtNyyU+5sVU7KIViaRsZSfklkuYRaOOMQ39LYUdIFQ+Ue6G0 + EAocEYO+P59FhkDZmjjHTJ9I3dlRH+Fcb/w/MBdqObwG/r+XHEjXGxmQZ + g==; +X-CSE-ConnectionGUID: leGPfNk6R8KUwcSASjrrFg== +X-CSE-MsgGUID: gXlnURSyTm+Cie+BIy/27w== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136204" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136204" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:21 -0800 +X-CSE-ConnectionGUID: R/PsDZqXSOeZVekLwfPG7Q== +X-CSE-MsgGUID: gEHdD8uJSdmMqZ2X+NMi/w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763741" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:20 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 02/23] sched/cache: Record per-LLC utilization to guide cache-aware scheduling decisions +Date: Wed, 3 Dec 2025 15:07:21 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Type: text/plain; charset=UTF-8 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +When a system becomes busy and a process’s preferred LLC is +saturated with too many threads, tasks within that LLC migrate +frequently. These in LLC migrations introduce latency and degrade +performance. To avoid this, task aggregation should be suppressed when +the preferred LLC is overloaded, which requires a metric to indicate +LLC utilization. + +Record per LLC utilization/cpu capacity during periodic load +balancing. These statistics will be used in later patches to decide +whether tasks should be aggregated into their preferred LLC. + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Refine the comments in record_sg_llc_stats().(Peter Zijlstra). 
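[Editor's note — illustrative sketch, not part of the upstream patch or of this overlay. The commit message above describes caching each LLC's aggregate utilization and capacity in its shared domain structure during periodic load balancing, rewriting the shared fields only when the value actually changes so the shared cache line is not dirtied on every pass. The standalone userspace C sketch below models that writer/reader pattern with plain structs and hypothetical names (llc_stats, record_llc_stats, llc_is_busy); the kernel code instead uses sched_domain_shared with READ_ONCE()/WRITE_ONCE() under RCU.]

/* Standalone userspace sketch of the "record per-LLC utilization" idea. */
#include <stdio.h>
#include <stdbool.h>

/* Hypothetical stand-in for the fields added to struct sched_domain_shared. */
struct llc_stats {
	unsigned long util_avg;   /* last recorded aggregate utilization */
	unsigned long capacity;   /* last recorded aggregate capacity   */
};

/*
 * Writer side (periodic load balance in the kernel): publish the freshly
 * computed group statistics, but only touch the shared fields when the
 * value changed, to avoid dirtying the shared cache line needlessly.
 */
static void record_llc_stats(struct llc_stats *s,
			     unsigned long group_util,
			     unsigned long group_capacity)
{
	if (s->util_avg != group_util)
		s->util_avg = group_util;
	if (s->capacity != group_capacity)
		s->capacity = group_capacity;
}

/* Reader side (used by later patches): is this LLC above the overload mark? */
static bool llc_is_busy(const struct llc_stats *s, unsigned int overload_pct)
{
	return s->util_avg * 100 >= s->capacity * overload_pct;
}

int main(void)
{
	struct llc_stats llc = { 0, 0 };

	record_llc_stats(&llc, 612, 1024);	/* ~60% utilized */
	printf("util=%lu cap=%lu busy=%d\n",
	       llc.util_avg, llc.capacity, llc_is_busy(&llc, 50));

	record_llc_stats(&llc, 307, 1024);	/* ~30% utilized */
	printf("util=%lu cap=%lu busy=%d\n",
	       llc.util_avg, llc.capacity, llc_is_busy(&llc, 50));
	return 0;
}

[End of editor's note; the original patch continues below.]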
+ + include/linux/sched/topology.h | 4 ++ + kernel/sched/fair.c | 69 ++++++++++++++++++++++++++++++++++ + 2 files changed, 73 insertions(+) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index bbcfdf12aa6e..0ba4697d74ba 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -68,6 +68,10 @@ struct sched_domain_shared { + atomic_t nr_busy_cpus; + int has_idle_cores; + int nr_idle_scan; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned long util_avg; ++ unsigned long capacity ____cacheline_aligned_in_smp; ++#endif + }; + + struct sched_domain { +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index cb82f558dc5b..b9f336300f14 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9622,6 +9622,29 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + return 0; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* Called from load balancing paths with rcu_read_lock held */ ++static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, cpu)); ++ if (!sd_share) ++ return false; ++ ++ *util = READ_ONCE(sd_share->util_avg); ++ *cap = READ_ONCE(sd_share->capacity); ++ ++ return true; ++} ++#else ++static inline bool get_llc_stats(int cpu, unsigned long *util, ++ unsigned long *cap) ++{ ++ return false; ++} ++#endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? + */ +@@ -10592,6 +10615,51 @@ sched_reduced_capacity(struct rq *rq, struct sched_domain *sd) + return check_cpu_capacity(rq, sd); + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Record the statistics for this scheduler group for later ++ * use. These values guide load balancing on aggregating tasks ++ * to a LLC. ++ */ ++static void record_sg_llc_stats(struct lb_env *env, ++ struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ struct sched_domain_shared *sd_share; ++ ++ if (!sched_cache_enabled() || env->idle == CPU_NEWLY_IDLE) ++ return; ++ ++ /* Only care about sched domain spanning multiple LLCs */ ++ if (env->sd->child != rcu_dereference(per_cpu(sd_llc, env->dst_cpu))) ++ return; ++ ++ /* ++ * At this point we know this group spans a LLC domain. ++ * Record the statistic of this group in its corresponding ++ * shared LLC domain. ++ * Note: sd_share cannot be obtained via sd->child->shared, because ++ * it refers to the domain that covers the local group, while ++ * sd_share could represent any of the LLC group. ++ */ ++ sd_share = rcu_dereference(per_cpu(sd_llc_shared, ++ cpumask_first(sched_group_span(group)))); ++ if (!sd_share) ++ return; ++ ++ if (READ_ONCE(sd_share->util_avg) != sgs->group_util) ++ WRITE_ONCE(sd_share->util_avg, sgs->group_util); ++ ++ if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) ++ WRITE_ONCE(sd_share->capacity, sgs->group_capacity); ++} ++#else ++static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++} ++#endif ++ + /** + * update_sg_lb_stats - Update sched_group's statistics for load balancing. + * @env: The load balancing environment. 
+@@ -10681,6 +10749,7 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + ++ record_sg_llc_stats(env, sgs, group); + /* Computing avg_load makes sense only when group is overloaded */ + if (sgs->group_type == group_overloaded) + sgs->avg_load = (sgs->group_load * SCHED_CAPACITY_SCALE) / +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip new file mode 100644 index 0000000..821d67b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-03-23-sched-cache-Introduce-helper-functions-to-enforce-LLC-migration-policy.patch.skip @@ -0,0 +1,333 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2F2692EBDDE + for ; Wed, 3 Dec 2025 23:01:23 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802885; cv=none; b=fboZZkFKPl6gqpHDoF2b6zbyblNamhVu+FcjT54t3oU8vxsb1XXezAqbDtyJgvQY5nilFQH3AKBGOohsQ/SQ3tX2mRk+BSCtjeqUEVqOw4w0dDc2wtmgFtlHa6V/L30IDsIjeiViMUZM4y4AiA82fvOBsu4+NJQNRAWoaUu83no= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802885; c=relaxed/simple; + bh=TakEXE1LpDhxRe/Kb7GWIrlVFYabIDFNwz7qIMJeAzA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=lWQIStOFn4iy99stFlHV/qSBEi3k7WL/GF8q0g3QeYxyAInDLMtgRyHdyj4lgpwV4+hcrGelSaLn9GQ314YsxP62kdg4igNnwsJ5I/UGLtE/m0W5/zOTgeJYpf5nNjxi042Eu8UJR3sDuMQmXljn/+2COvTOKDQkes+q8dJg4fs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ZJw0lk7W; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ZJw0lk7W" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802883; x=1796338883; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=TakEXE1LpDhxRe/Kb7GWIrlVFYabIDFNwz7qIMJeAzA=; + b=ZJw0lk7W6RTQQr7pUyzeAA7+tFRn5rwcdkgzS49IJ6otxSXwAzwDWZIh + 72+xVH8b/09ZAgA4A4sjEOCcav+jAPzfD2L3N7AxSkmW/F8BHhBoUD3JQ + QbRstLbqNMnMwfrcQ+qBeU1Q3VwTeXm0rmxciTrI2u6z3GCHX79/Bxc9Y + tid45au2Oifch9e3/2xq9ljpUEYKZAVIVVPqiF3n86ssLv/OdDy75IUHo + 67RTdQeGc20OckklfmpRjpvC7cCT1mZKRlid3w67UBs6EEbQgCGzqXjOi + NdatFPNJvaFIWKoBtqpyQd9yFecmVzXENUGCr745w3Jqa3QUeXJGyO4fH + g==; +X-CSE-ConnectionGUID: /fs42l2aRamlkF2vGhwf9A== +X-CSE-MsgGUID: 2SoBy6EYTSqmAsupZBGzKg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136230" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136230" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by 
orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:22 -0800 +X-CSE-ConnectionGUID: O845bRyGQ8Wd6MpeSkE96g== +X-CSE-MsgGUID: Cz70j2EQQ0GHWMf5pEgBBw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763752" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:22 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 03/23] sched/cache: Introduce helper functions to enforce LLC migration policy +Date: Wed, 3 Dec 2025 15:07:22 -0800 +Message-Id: <12e90c8c26c690b40e48cc1e03c785f2f99fafa8.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware scheduling aggregates threads onto their preferred LLC, +mainly through load balancing. When the preferred LLC becomes +saturated, more threads are still placed there, increasing latency. +A mechanism is needed to limit aggregation so that the preferred LLC +does not become overloaded. + +Introduce helper functions can_migrate_llc() and +can_migrate_llc_task() to enforce the LLC migration policy: + + 1. Aggregate a task to its preferred LLC if both source and + destination LLCs are not too busy (<50% utilization), + or if doing so will not leave the preferred LLC much more + imbalanced than the non-preferred one (>20% utilization + difference, similar to imbalance_pct of the LLC domain). + 2. Allow moving a task from overloaded preferred LLC to a non preferred + LLC if this will not cause the non preferred LLC to become + too imbalanced to cause a later migration back. + 3. If both LLCs are too busy, let the generic load balance to spread + the tasks. + +Further (hysteresis)action could be taken in the future to prevent tasks +from being migrated into and out of the preferred LLC frequently (back and +forth): the threshold for migrating a task out of its preferred LLC should +be higher than that for migrating it into the LLC. + +Since aggregation tends to make the preferred LLC busier than others, +the imbalance tolerance is controlled by llc_imb_pct. If set to 0, +tasks may still aggregate to the preferred LLC as long as it is +not more utilized than the source LLC, preserving the preference. + +Co-developed-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + No change. 
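[Editor's note — illustrative sketch, not part of the upstream patch or of this overlay. The helpers introduced below gate LLC aggregation with two thresholds: an LLC counts as busy once it exceeds llc_overload_pct (default 50%) of its capacity, and the destination may exceed the source by at most llc_imb_pct (default 20%). The standalone userspace C program below re-implements only the "task moves towards its preferred LLC" direction so a few cells of the decision matrix can be checked by hand; the two macros mirror those added in the patch, while to_pref_decision(), main() and the sample numbers are made up for illustration.]

/*
 * Standalone userspace sketch of the LLC migration gate, "towards the
 * preferred LLC" direction only.
 */
#include <stdio.h>

static unsigned int llc_overload_pct = 50;	/* LLC counts as busy above this */
static unsigned int llc_imb_pct = 20;		/* tolerated dst-vs-src imbalance */

/* Same shape as the macros introduced in the patch below. */
#define fits_llc_capacity(util, max) ((util) * 100 < (max) * llc_overload_pct)
#define util_greater(u1, u2)         ((u1) * 100 > (u2) * (100 + llc_imb_pct))

/* 'Y' = migrate, 'N' = forbid, 'G' = leave it to generic load balance. */
static char to_pref_decision(unsigned long src_util, unsigned long dst_util,
			     unsigned long cap, unsigned long tsk_util)
{
	if (!fits_llc_capacity(dst_util, cap) && !fits_llc_capacity(src_util, cap))
		return 'G';	/* both LLCs already busy: generic balancing */

	src_util = src_util < tsk_util ? 0 : src_util - tsk_util;
	dst_util += tsk_util;

	if (!fits_llc_capacity(dst_util, cap) && util_greater(dst_util, src_util))
		return 'N';	/* would overload the preferred LLC too much */

	return 'Y';
}

int main(void)
{
	const unsigned long cap = 1024, task = 32;	/* task ~3% of one LLC */

	/* prints Y, N, G for these source/destination utilization pairs */
	printf("30%% -> 30%%: %c\n", to_pref_decision(307, 307, cap, task));
	printf("30%% -> 60%%: %c\n", to_pref_decision(307, 614, cap, task));
	printf("60%% -> 60%%: %c\n", to_pref_decision(614, 614, cap, task));
	return 0;
}

[End of editor's note; the original patch continues below.]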
+ + kernel/sched/fair.c | 153 +++++++++++++++++++++++++++++++++++++++++++ + kernel/sched/sched.h | 5 ++ + 2 files changed, 158 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b9f336300f14..710ed9943d27 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1205,6 +1205,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + #define EPOCH_PERIOD (HZ / 100) /* 10 ms */ + #define EPOCH_LLC_AFFINITY_TIMEOUT 5 /* 50 ms */ + ++__read_mostly unsigned int llc_overload_pct = 50; ++__read_mostly unsigned int llc_imb_pct = 20; ++ + static int llc_id(int cpu) + { + if (cpu < 0) +@@ -9623,6 +9626,27 @@ static inline int task_is_ineligible_on_dst_cpu(struct task_struct *p, int dest_ + } + + #ifdef CONFIG_SCHED_CACHE ++/* ++ * The margin used when comparing LLC utilization with CPU capacity. ++ * Parameter llc_overload_pct determines the LLC load level where ++ * active LLC aggregation is done. ++ * Derived from fits_capacity(). ++ * ++ * (default: ~50%) ++ */ ++#define fits_llc_capacity(util, max) \ ++ ((util) * 100 < (max) * llc_overload_pct) ++ ++/* ++ * The margin used when comparing utilization. ++ * is 'util1' noticeably greater than 'util2' ++ * Derived from capacity_greater(). ++ * Bias is in perentage. ++ */ ++/* Allows dst util to be bigger than src util by up to bias percent */ ++#define util_greater(util1, util2) \ ++ ((util1) * 100 > (util2) * (100 + llc_imb_pct)) ++ + /* Called from load balancing paths with rcu_read_lock held */ + static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +@@ -9638,6 +9662,135 @@ static __maybe_unused bool get_llc_stats(int cpu, unsigned long *util, + + return true; + } ++ ++/* ++ * Decision matrix according to the LLC utilization. To ++ * decide whether we can do task aggregation across LLC. ++ * ++ * By default, 50% is the threshold to treat the LLC as busy, ++ * and 20% is the utilization imbalance percentage to decide ++ * if the preferred LLC is busier than the non-preferred LLC. ++ * The hysteresis is used to avoid task bouncing between the ++ * preferred LLC and the non-preferred LLC. ++ * ++ * 1. moving towards the preferred LLC, dst is the preferred ++ * LLC, src is not. ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% Y Y Y N ++ * 40% Y Y Y Y ++ * 50% Y Y G G ++ * 60% Y Y G G ++ * ++ * 2. moving out of the preferred LLC, src is the preferred ++ * LLC, dst is not: ++ * ++ * src \ dst 30% 40% 50% 60% ++ * 30% N N N N ++ * 40% N N N N ++ * 50% N N G G ++ * 60% Y N G G ++ * ++ * src : src_util ++ * dst : dst_util ++ * Y : Yes, migrate ++ * N : No, do not migrate ++ * G : let the Generic load balance to even the load. ++ * ++ * The intention is that if both LLCs are quite busy, cache aware ++ * load balance should not be performed, and generic load balance ++ * should take effect. However, if one is busy and the other is not, ++ * the preferred LLC capacity(50%) and imbalance criteria(20%) should ++ * be considered to determine whether LLC aggregation should be ++ * performed to bias the load towards the preferred LLC. ++ */ ++ ++/* migration decision, 3 states are orthogonal. */ ++enum llc_mig { ++ mig_forbid = 0, /* N: Don't migrate task, respect LLC preference */ ++ mig_llc, /* Y: Do LLC preference based migration */ ++ mig_unrestricted /* G: Don't restrict generic load balance migration */ ++}; ++ ++/* ++ * Check if task can be moved from the source LLC to the ++ * destination LLC without breaking cache aware preferrence. 
++ * src_cpu and dst_cpu are arbitrary CPUs within the source ++ * and destination LLCs, respectively. ++ */ ++static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, ++ unsigned long tsk_util, ++ bool to_pref) ++{ ++ unsigned long src_util, dst_util, src_cap, dst_cap; ++ ++ if (!get_llc_stats(src_cpu, &src_util, &src_cap) || ++ !get_llc_stats(dst_cpu, &dst_util, &dst_cap)) ++ return mig_unrestricted; ++ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ !fits_llc_capacity(src_util, src_cap)) ++ return mig_unrestricted; ++ ++ src_util = src_util < tsk_util ? 0 : src_util - tsk_util; ++ dst_util = dst_util + tsk_util; ++ if (to_pref) { ++ /* ++ * llc_imb_pct is the imbalance allowed between ++ * preferred LLC and non-preferred LLC. ++ * Don't migrate if we will get preferred LLC too ++ * heavily loaded and if the dest is much busier ++ * than the src, in which case migration will ++ * increase the imbalance too much. ++ */ ++ if (!fits_llc_capacity(dst_util, dst_cap) && ++ util_greater(dst_util, src_util)) ++ return mig_forbid; ++ } else { ++ /* ++ * Don't migrate if we will leave preferred LLC ++ * too idle, or if this migration leads to the ++ * non-preferred LLC falls within sysctl_aggr_imb percent ++ * of preferred LLC, leading to migration again ++ * back to preferred LLC. ++ */ ++ if (fits_llc_capacity(src_util, src_cap) || ++ !util_greater(src_util, dst_util)) ++ return mig_forbid; ++ } ++ return mig_llc; ++} ++ ++/* ++ * Check if task p can migrate from source LLC to ++ * destination LLC in terms of cache aware load balance. ++ */ ++static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) ++{ ++ struct mm_struct *mm; ++ bool to_pref; ++ int cpu; ++ ++ mm = p->mm; ++ if (!mm) ++ return mig_unrestricted; ++ ++ cpu = mm->mm_sched_cpu; ++ if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) ++ return mig_unrestricted; ++ ++ if (cpus_share_cache(dst_cpu, cpu)) ++ to_pref = true; ++ else if (cpus_share_cache(src_cpu, cpu)) ++ to_pref = false; ++ else ++ return mig_unrestricted; ++ ++ return can_migrate_llc(src_cpu, dst_cpu, ++ task_util(p), to_pref); ++} ++ + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 84118b522f22..bf72c5bab506 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2828,6 +2828,11 @@ extern unsigned int sysctl_numa_balancing_scan_period_max; + extern unsigned int sysctl_numa_balancing_scan_size; + extern unsigned int sysctl_numa_balancing_hot_threshold; + ++#ifdef CONFIG_SCHED_CACHE ++extern unsigned int llc_overload_pct; ++extern unsigned int llc_imb_pct; ++#endif ++ + #ifdef CONFIG_SCHED_HRTICK + + /* +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip new file mode 100644 index 0000000..6926073 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-04-23-sched-cache-Make-LLC-id-continuous.patch.skip @@ -0,0 +1,257 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 55E9C2EC08D + for ; Wed, 3 Dec 2025 23:01:24 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none 
smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802886; cv=none; b=d3VrPdnnjHo1v15INzZi2Be9GCCRZHIzY8RvdjoDE/lVfQN7C6RgefM63jeAgMs+Ej4xBAgNM48bikZgcfBK97s516BGyLXX1Rbvhsn/lxdjOTLJb7/BzUSsXmqizKiXSV4Q40vVu+4KUJUTuTrw0EcRJX7axQAupxl66/Njl7g= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802886; c=relaxed/simple; + bh=/WXShEpYiDAFPDra61vUPdbNcgE+VqMlav+UUM59jU0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=sp6UO1OW6Q3DioPA4TyMAxm2w7jZEWfXn+BecCi+DY63bhyHNOAdo2gxE9qPcZ4H/AG5K6vG0sVgNdh5TPmn2YDZ1M3oPRXJYAPeKE66XGC3smKX35V4ctG4LeLd8SIPZYPGBwl8SDEjENvTH1Cw9AGh2YoAZb6Q6CfS4bRt+vY= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=mJ6yn4qm; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="mJ6yn4qm" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802884; x=1796338884; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=/WXShEpYiDAFPDra61vUPdbNcgE+VqMlav+UUM59jU0=; + b=mJ6yn4qmNCzGpuUPMZdx+lsUqY/Y8q397TD5tze5hB735PCmFim3TtR3 + Eh74z+kUDoOPtNaJnMct+g67IgKwnq6+WYRbc+f3oEEw9Wg1Gcg9yN7oU + vI5Oubm8s7zVFVo1CwCylUT7AAgUyeA+NaPz/BoikrttCBobaJqnnubeC + HmGkKxv21UFMqlb7bdh2Dv1ZUBuQd/5iPTCr2He8Z4My1BxTJHc0KlROt + IrrMfarEIQ6kjL275GsASGznmrL05FEBJGY2at3hHLlbpnBR+lPPkEK0Y + B/H+e/fK9u8hElcLfWPp6Axh3PPWmX2TiXZI/s6f1Be/ZF/FgJXPpYRSc + Q==; +X-CSE-ConnectionGUID: G2tkFvPIT6SY1+ZRXOxXBw== +X-CSE-MsgGUID: uYGT49/IQCatA5I80r2gog== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136249" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136249" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:23 -0800 +X-CSE-ConnectionGUID: uiDsRffYTdabgXsA6tZowg== +X-CSE-MsgGUID: ym+MS4XuQPSAYB5T1Atjpw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763756" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:23 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 04/23] sched/cache: Make LLC id continuous +Date: Wed, 3 Dec 2025 15:07:23 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce an index mapping between CPUs and their LLCs. This provides +a continuous per LLC index needed for cache-aware load balancing in +later patches. + +The existing per_cpu llc_id usually points to the first CPU of the +LLC domain, which is sparse and unsuitable as an array index. Using +llc_id directly would waste memory. + +With the new mapping, CPUs in the same LLC share a continuous id: + + per_cpu(llc_id, CPU=0...15) = 0 + per_cpu(llc_id, CPU=16...31) = 1 + per_cpu(llc_id, CPU=32...47) = 2 + ... + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Convert the static LLC id to be allocated sequentially as LLCs are + discovered, and replace the old sd_llc_id. (Peter Zijlstra) + + kernel/sched/fair.c | 9 ++++++- + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 60 +++++++++++++++++++++++++++++++++++++++-- + 3 files changed, 67 insertions(+), 3 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 710ed9943d27..0a3918269906 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1210,10 +1210,17 @@ __read_mostly unsigned int llc_imb_pct = 20; + + static int llc_id(int cpu) + { ++ int llc; ++ + if (cpu < 0) + return -1; + +- return per_cpu(sd_llc_id, cpu); ++ llc = per_cpu(sd_llc_id, cpu); ++ /* avoid race with cpu hotplug */ ++ if (unlikely(llc >= max_llcs)) ++ return -1; ++ ++ return llc; + } + + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index bf72c5bab506..728737641847 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2075,6 +2075,7 @@ DECLARE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + + extern struct static_key_false sched_asym_cpucapacity; + extern struct static_key_false sched_cluster_active; ++extern int max_llcs; + + static __always_inline bool sched_asym_cpucap_active(void) + { +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 444bdfdab731..f25d950ab015 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -17,6 +17,8 @@ void sched_domains_mutex_unlock(void) + mutex_unlock(&sched_domains_mutex); + } + ++int max_llcs; ++ + /* Protected by sched_domains_mutex: */ + static cpumask_var_t sched_domains_tmpmask; + static cpumask_var_t sched_domains_tmpmask2; +@@ -668,6 +670,55 @@ DEFINE_PER_CPU(struct sched_domain __rcu *, sd_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_asym_cpucapacity); + DEFINE_STATIC_KEY_FALSE(sched_cluster_active); + ++/* ++ * Assign continuous llc id for the CPU, and return ++ * the assigned llc id. 
++ */ ++static int update_llc_id(struct sched_domain *sd, ++ int cpu) ++{ ++ int id = per_cpu(sd_llc_id, cpu), i; ++ ++ if (id >= 0) ++ return id; ++ ++ if (sd) { ++ /* Look for any assigned id and reuse it.*/ ++ for_each_cpu(i, sched_domain_span(sd)) { ++ id = per_cpu(sd_llc_id, i); ++ ++ if (id >= 0) { ++ per_cpu(sd_llc_id, cpu) = id; ++ return id; ++ } ++ } ++ } ++ ++ /* ++ * When 1. there is no id assigned to this LLC domain, ++ * or 2. the sd is NULL, we reach here. ++ * Consider the following scenario, ++ * CPU0~CPU95 are in the node0, CPU96~CPU191 are ++ * in the node1. During bootup, maxcpus=96 is ++ * appended. ++ * case 1: When running cpu_attach_domain(CPU24) ++ * during boot up, CPU24 is the first CPU in its ++ * non-NULL LLC domain. However, ++ * its corresponding llc id has not been assigned yet. ++ * ++ * case 2: After boot up, the CPU100 is brought up ++ * via sysfs manually. As a result, CPU100 has only a ++ * Numa domain attached, because CPU100 is the only CPU ++ * of a sched domain, all its bottom domains are degenerated. ++ * The LLC domain pointer sd is NULL for CPU100. ++ * ++ * For both cases, we want to increase the number of LLCs. ++ */ ++ per_cpu(sd_llc_id, cpu) = max_llcs++; ++ ++ return per_cpu(sd_llc_id, cpu); ++} ++ + static void update_top_cache_domain(int cpu) + { + struct sched_domain_shared *sds = NULL; +@@ -677,14 +728,13 @@ static void update_top_cache_domain(int cpu) + + sd = highest_flag_domain(cpu, SD_SHARE_LLC); + if (sd) { +- id = cpumask_first(sched_domain_span(sd)); + size = cpumask_weight(sched_domain_span(sd)); + sds = sd->shared; + } + + rcu_assign_pointer(per_cpu(sd_llc, cpu), sd); + per_cpu(sd_llc_size, cpu) = size; +- per_cpu(sd_llc_id, cpu) = id; ++ id = update_llc_id(sd, cpu); + rcu_assign_pointer(per_cpu(sd_llc_shared, cpu), sds); + + sd = lowest_flag_domain(cpu, SD_CLUSTER); +@@ -2488,6 +2538,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + bool has_asym = false; + bool has_cluster = false; + ++ /* first scan of LLCs */ ++ if (!max_llcs) { ++ for_each_possible_cpu(i) ++ per_cpu(sd_llc_id, i) = -1; ++ } ++ + if (WARN_ON(cpumask_empty(cpu_map))) + goto error; + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip new file mode 100644 index 0000000..9eeeb42 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-05-23-sched-cache-Assign-preferred-LLC-ID-to-processes.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 679562ECEBB + for ; Wed, 3 Dec 2025 23:01:26 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802888; cv=none; b=ETXMSycIjg3hW2uD7ktvuDRCwlm80jzWlfuybxMLSJjuPv1gOLZC1i6pxE62EG9+cDFAU1hLySS0z9EjoSW7h+IC9WTpkMIZz2geJs1QP3R/eObNqU3OG+yETt/G54TGksleKQ7hmlJH6AIkTyDQ9XdCc+AMJOzQkCsvN6AteuA= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802888; c=relaxed/simple; + bh=0AOJ8UhIDlWuve34OwSELAi4hyIDL68J1uZ46Rj5j/U=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=oT6l76w5OE/CgwV2buKuyAjl0MI2Q/KFcNiA5tSmBm5YfGauRJZvP4km+gtrjR5EEwXVgaCsan/LhKN6+lL1MozMs4acvCaZOIR7MI0TH1a6DN/iL60iGgK73IOwTgFjrIfIZLKuBBoFD14Z4gbqwWYyV8VrRWfEVNe6RZksId4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=MYgWTb60; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="MYgWTb60" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802886; x=1796338886; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=0AOJ8UhIDlWuve34OwSELAi4hyIDL68J1uZ46Rj5j/U=; + b=MYgWTb60T6yG49rQ3nLnjAfGEf6N3B3x0R1ujoF4MP+f6thBTMMmV5A2 + 6gtXPzCButviIBBCpY7AZSw3brie2XhnzEv9X/ke/XBPmw9iwTMQXM9o0 + iuW5LJjdLixT+ECza7WcFjH4T9QTfvwhG/w9TZhOFFXAm15dszIkONvBa + SXqv+2sjbXByYYFdX59mzr/UJBdZJP29/Qsoq52Bq39LKfBUjAIOaxdni + O3Dd1ftGoYiiVuFKxIPrD6KHkaSzbffy0qzla2yFfiBHwoJt7cDfE6IuV + V+N5yhbYcGH4NZwhO7yAb7il3S4WiOKkWeUjmgInRdyyz/X833IZzWB7j + A==; +X-CSE-ConnectionGUID: kjwyz+xTQ4WR9aHYObBogw== +X-CSE-MsgGUID: zaCpH7h5Qxu4sNwXX6Krdg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136266" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136266" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:25 -0800 +X-CSE-ConnectionGUID: alNeGVnjSJOwjLHIJ4OuIg== +X-CSE-MsgGUID: 5cfoA8m6Su+MIcnd+RQoGg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763763" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:25 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 05/23] sched/cache: Assign preferred LLC ID to processes +Date: Wed, 3 Dec 2025 15:07:24 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +With cache-aware scheduling enabled, each task is assigned a +preferred LLC ID. This allows quick identification of the LLC domain +where the task prefers to run, similar to numa_preferred_nid in +NUMA balancing. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Align preferred LLC with NUMA balancing's preferred node. 
+ + include/linux/sched.h | 1 + + init/init_task.c | 3 +++ + kernel/sched/fair.c | 18 ++++++++++++++++++ + 3 files changed, 22 insertions(+) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 278b529c91df..1ad46220cd04 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1408,6 +1408,7 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ int preferred_llc; + #endif + + #ifdef CONFIG_RSEQ +diff --git a/init/init_task.c b/init/init_task.c +index a55e2189206f..44bae72b5b7d 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -191,6 +191,9 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_group = NULL, + .numa_faults = NULL, + #endif ++#ifdef CONFIG_SCHED_CACHE ++ .preferred_llc = -1, ++#endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) + .kasan_depth = 1, + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 0a3918269906..10cec83f65d5 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1300,6 +1300,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; + unsigned long epoch; ++ int mm_sched_llc = -1; + + if (!sched_cache_enabled()) + return; +@@ -1330,6 +1331,23 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } ++ ++ if (mm->mm_sched_cpu != -1) { ++ mm_sched_llc = llc_id(mm->mm_sched_cpu); ++ ++#ifdef CONFIG_NUMA_BALANCING ++ /* ++ * Don't assign preferred LLC if it ++ * conflicts with NUMA balancing. ++ */ ++ if (p->numa_preferred_nid >= 0 && ++ cpu_to_node(mm->mm_sched_cpu) != p->numa_preferred_nid) ++ mm_sched_llc = -1; ++#endif ++ } ++ ++ if (p->preferred_llc != mm_sched_llc) ++ p->preferred_llc = mm_sched_llc; + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip new file mode 100644 index 0000000..da576b4 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-06-23-sched-cache-Track-LLC-preferred-tasks-per-runqueue.patch.skip @@ -0,0 +1,289 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C93962EF652 + for ; Wed, 3 Dec 2025 23:01:27 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802890; cv=none; b=EDsiu7g2BtXvvoS9BKwrirW/B8ldDhmwGPx+cdJzoxBtklhxCuicf7XZFi+5IO9eicj+U0q988drhlH0OJjM+IwUt0amTGbw3mfM6d+6WZDelOH8Kc3PIbWBuITzHpbg31UVRdkj3UEviuqp+uvpMTrssPknIugATiCNu3Bm+08= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802890; c=relaxed/simple; + bh=VfUUqC84e+k4dM9OCiHr0qSll3wkyw96Z2hiwhlrd+g=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=bMgyCWF3/XMpBtns9xgAQbuvJYsQoxOLy5qU1v3Ure2zyH7eaHG4ZLbKyqgBn1NINjkU2O0RPcPn7whkPdiyLRm36oluEWQ4viCDhC3YxOj/EZYMjqKw4E92UmhMBk5j0NYcW2RvXkMIEQxCZjUg4qUDiMfwP1eraXWWdJgmvkk= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; 
dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=HkSBtET5; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="HkSBtET5" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802887; x=1796338887; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=VfUUqC84e+k4dM9OCiHr0qSll3wkyw96Z2hiwhlrd+g=; + b=HkSBtET5tJuyrYVLfwF6tgrJB3jPRTx01PEveXBF1wIqsiOJxkXhzAOm + sC1smgQCW8wgJyR4E1u9VSEyU2s5OeGIeEuC988/p/oKmWX8sR4t5I1+Q + tI0jgAIHPovP+AIphgRpysIDP7uveWJciGMii/zPUANlnHxP4W7VRq2eJ + sBFqpGeZy1Ve8fewNRoxQswiP1fA+sTe9iwHVjtYcP+1v4kzgt4NxJNt7 + wXwMA6vcMf7L8X5pDnsHkNo+K4j1B34n8SEcNJu9+4em9z3ghkY3MGzod + zaVcGH6lY2mH/znHiuVlkKaau6etkJB5XXnU6Zdt6/ZSkCkDGyN6SoMYU + A==; +X-CSE-ConnectionGUID: k1qC8aFmROqogX9M8c8KUg== +X-CSE-MsgGUID: rOmGsn1SSNWPs0ITFZygdw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136288" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136288" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:27 -0800 +X-CSE-ConnectionGUID: cvzDuwr8RH6q0DcIt04zOw== +X-CSE-MsgGUID: uxGA3PlMTN6liJURHzFh/g== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763775" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:26 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 06/23] sched/cache: Track LLC-preferred tasks per runqueue +Date: Wed, 3 Dec 2025 15:07:25 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +For each runqueue, track the number of tasks with an LLC preference +and how many of them are running on their preferred LLC. This mirrors +nr_numa_running and nr_preferred_running for NUMA balancing, and will +be used by cache-aware load balancing in later patches. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Invoke task_of() once and reuse its result afterwards. 
+ (Peter Zijlstra) + Remove hacky reset_llc_stats() and introduce sched_llc_active flag + to properly pair enqueue/dequeue statistics update (Peter Zijlstra, K Prateek Nayak) + + include/linux/sched.h | 2 ++ + init/init_task.c | 1 + + kernel/sched/core.c | 5 ++++ + kernel/sched/fair.c | 60 ++++++++++++++++++++++++++++++++++++++++--- + kernel/sched/sched.h | 6 +++++ + 5 files changed, 71 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 1ad46220cd04..466ba8b7398c 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -1408,6 +1408,8 @@ struct task_struct { + + #ifdef CONFIG_SCHED_CACHE + struct callback_head cache_work; ++ /*the p is currently refcounted in a rq's preferred llc stats*/ ++ bool sched_llc_active; + int preferred_llc; + #endif + +diff --git a/init/init_task.c b/init/init_task.c +index 44bae72b5b7d..ee78837b0aa2 100644 +--- a/init/init_task.c ++++ b/init/init_task.c +@@ -192,6 +192,7 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { + .numa_faults = NULL, + #endif + #ifdef CONFIG_SCHED_CACHE ++ .sched_llc_active = false, + .preferred_llc = -1, + #endif + #if defined(CONFIG_KASAN_GENERIC) || defined(CONFIG_KASAN_SW_TAGS) +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index e8bdf03a4b7f..48626c81ba8e 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -531,6 +531,11 @@ void __trace_set_current_state(int state_value) + } + EXPORT_SYMBOL(__trace_set_current_state); + ++int task_llc(const struct task_struct *p) ++{ ++ return per_cpu(sd_llc_id, task_cpu(p)); ++} ++ + /* + * Serialization rules: + * +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 10cec83f65d5..d46a70a9d9fb 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1223,6 +1223,43 @@ static int llc_id(int cpu) + return llc; + } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) ++{ ++ int pref_llc; ++ ++ if (!sched_cache_enabled()) ++ return; ++ ++ pref_llc = p->preferred_llc; ++ if (pref_llc < 0) ++ return; ++ ++ rq->nr_llc_running++; ++ rq->nr_pref_llc_running += (pref_llc == task_llc(p)); ++ p->sched_llc_active = true; ++} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) ++{ ++ int pref_llc; ++ ++ /* ++ * Borrow the uc_se->active from uclamp_rq_inc_id(), ++ * uclamp_rq_dec_id() to avoid the unbalanced calculation ++ * of rq statistics. 
++ */ ++ if (unlikely(!p->sched_llc_active)) ++ return; ++ ++ pref_llc = p->preferred_llc; ++ if (pref_llc < 0) ++ return; ++ ++ rq->nr_llc_running--; ++ rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); ++ p->sched_llc_active = false; ++} ++ + void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + { + unsigned long epoch; +@@ -1294,6 +1331,8 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + return div64_u64(NICE_0_LOAD * pcpu_sched->runtime, rq->cpu_runtime + 1); + } + ++static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { +@@ -1346,8 +1385,13 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + #endif + } + +- if (p->preferred_llc != mm_sched_llc) ++ /* task not on rq accounted later in account_entity_enqueue() */ ++ if (task_running_on_cpu(rq->cpu, p) && ++ p->preferred_llc != mm_sched_llc) { ++ account_llc_dequeue(rq, p); + p->preferred_llc = mm_sched_llc; ++ account_llc_enqueue(rq, p); ++ } + } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) +@@ -1475,6 +1519,10 @@ void init_sched_mm(struct task_struct *p) { } + + static void task_tick_cache(struct rq *rq, struct task_struct *p) { } + ++static void account_llc_enqueue(struct rq *rq, struct task_struct *p) {} ++ ++static void account_llc_dequeue(struct rq *rq, struct task_struct *p) {} ++ + #endif + + /* +@@ -3965,9 +4013,11 @@ account_entity_enqueue(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + update_load_add(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { ++ struct task_struct *p = task_of(se); + struct rq *rq = rq_of(cfs_rq); + +- account_numa_enqueue(rq, task_of(se)); ++ account_numa_enqueue(rq, p); ++ account_llc_enqueue(rq, p); + list_add(&se->group_node, &rq->cfs_tasks); + } + cfs_rq->nr_queued++; +@@ -3978,7 +4028,11 @@ account_entity_dequeue(struct cfs_rq *cfs_rq, struct sched_entity *se) + { + update_load_sub(&cfs_rq->load, se->load.weight); + if (entity_is_task(se)) { +- account_numa_dequeue(rq_of(cfs_rq), task_of(se)); ++ struct task_struct *p = task_of(se); ++ struct rq *rq = rq_of(cfs_rq); ++ ++ account_numa_dequeue(rq, p); ++ account_llc_dequeue(rq, p); + list_del_init(&se->group_node); + } + cfs_rq->nr_queued--; +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 728737641847..ee8b70647835 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1126,6 +1126,10 @@ struct rq { + unsigned int nr_preferred_running; + unsigned int numa_migrate_on; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc_running; ++ unsigned int nr_llc_running; ++#endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; + unsigned int has_blocked_load; +@@ -1980,6 +1984,8 @@ init_numa_balancing(u64 clone_flags, struct task_struct *p) + + #endif /* !CONFIG_NUMA_BALANCING */ + ++int task_llc(const struct task_struct *p); ++ + static inline void + queue_balance_callback(struct rq *rq, + struct balance_callback *head, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip new file mode 100644 index 0000000..56edbcb --- /dev/null +++ 
b/sys-kernel/gentoo-sources-6.18/PATCH-v2-07-23-sched-cache-Introduce-per-runqueue-task-LLC-preference-counter.patch.skip @@ -0,0 +1,293 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 7F2A72F0696 + for ; Wed, 3 Dec 2025 23:01:29 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802892; cv=none; b=JuI4HP7FPjUZRRvIF57U5a+nyKFVaejSLBjOwb2o4K+dyMy+TzvS6alNai1tmhDlx/F2kpTdrbKJxXsp0ye0xTv9vWh98FuHcXDXimNg3p+EZ0AClnIocNRkMFznzOXiGUgsNO6KJzOsOmRV7MqRji4PoMn2fV9YYulhopDCdW0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802892; c=relaxed/simple; + bh=Kgfm8ZrVAem+cuIFSErLp11pWO+uaVSLeCf068dctEI=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=oWgrj/+vmbD4ydPoKoPApP5RMU0UBhjF4mxsiADMVL/t5AARqr//6C8rqPkshWdzhhrhMPF1AzqYud7ZATo+YBem2D9OjWwAWcvEU+adG0BNbDeKX0F/tFC7FpYkxBtH1K1PhGVx8OIwbNowGJZ5W0OZkvMWwyvk09t3vXbHMn4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Ff+wBHml; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Ff+wBHml" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802889; x=1796338889; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Kgfm8ZrVAem+cuIFSErLp11pWO+uaVSLeCf068dctEI=; + b=Ff+wBHmlGI9Ls+hPQ/icfiRSQpZE9xFA2dMFUkAvN4HoLDq9rsPxZPeZ + 8VRONCVnKKzfdp0/tx6ByohayUgQnukEiUM/5FG80edcOUwn8pLvcV6CD + rsakyGnOPLHSStQkG1+f0q6DnjhqobEUdJaywwMsE54fftDticAbLprId + 3bhB2AwAPJQjK37rs0/N96in+m4FjW7qil9FvPJrQKe2CXx6Vw8vc05XH + UOnoKjT+4VoaXotKSh3uNxjPZTKFSxLyHcD1a3z71R7y9pyahaHenJnCZ + 3UkyBEcsW2m1c1Cx8k4IAc/bj/uxMr+zGfxYNNEZL+3nmX/2zLcKYH7UG + w==; +X-CSE-ConnectionGUID: XVdRsMs0TMKO/Xjz8IoNBA== +X-CSE-MsgGUID: 8lt8Jb1nTZqY7huRsHsSQw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136318" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136318" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:28 -0800 +X-CSE-ConnectionGUID: HuS/ZH/YT/Cjm+dSF3UiRw== +X-CSE-MsgGUID: YDDbEJCdQwCGXEo7YEaNTQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763787" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:28 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . 
Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 07/23] sched/cache: Introduce per runqueue task LLC preference counter +Date: Wed, 3 Dec 2025 15:07:26 -0800 +Message-Id: <63091f7ca7bb473fbc176af86a87d27a07a6e149.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Each runqueue is assigned an array where each element tracks +the number of tasks preferring a given LLC, indexed from 0 to +max_llcs - 1. + +For example, rq->nr_pref_llc[3] = 2 signifies that there are 2 tasks on +this runqueue which prefer to run within LLC3. + +The load balancer can use this information to identify busy +runqueues and migrate tasks to their preferred LLC domains. +This array will be reallocated at runtime if the number of LLCs +increases due to CPU hotplug. Only extending the buffer(rather +than shrinking it) is supported to simplify the implementation. + +Introduce the buffer allocation mechanism, and the statistics +will be calculated in the subsequent patch. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Remove static allocation of per runqueue LLC preference arrays. + Allocate array size to the actual number of LLCs online. 
(Peter Zijlstra, Madadi Vineeth Reddy) + + kernel/sched/core.c | 1 + + kernel/sched/sched.h | 1 + + kernel/sched/topology.c | 117 +++++++++++++++++++++++++++++++++++++++- + 3 files changed, 118 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 48626c81ba8e..ce533dc485f5 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -8800,6 +8800,7 @@ void __init sched_init(void) + #ifdef CONFIG_SCHED_CACHE + raw_spin_lock_init(&rq->cpu_epoch_lock); + rq->cpu_epoch_next = jiffies; ++ rq->nr_pref_llc = NULL; + #endif + + zalloc_cpumask_var_node(&rq->scratch_mask, GFP_KERNEL, cpu_to_node(i)); +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index ee8b70647835..8f2a779825e4 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1129,6 +1129,7 @@ struct rq { + #ifdef CONFIG_SCHED_CACHE + unsigned int nr_pref_llc_running; + unsigned int nr_llc_running; ++ unsigned int *nr_pref_llc; + #endif + #ifdef CONFIG_NO_HZ_COMMON + unsigned long last_blocked_load_update_tick; +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index f25d950ab015..d583399fc6a1 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -17,8 +17,121 @@ void sched_domains_mutex_unlock(void) + mutex_unlock(&sched_domains_mutex); + } + ++/* the number of max LLCs being detected */ ++static int new_max_llcs; ++/* the current number of max LLCs */ + int max_llcs; + ++#ifdef CONFIG_SCHED_CACHE ++ ++static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc) ++{ ++ unsigned int *new = NULL; ++ ++ new = kcalloc(new_max_llcs, sizeof(unsigned int), ++ GFP_KERNEL | __GFP_NOWARN); ++ ++ if (!new) { ++ *gc = NULL; ++ } else { ++ /* ++ * Place old entry in garbage collector ++ * for later disposal. ++ */ ++ *gc = old; ++ } ++ return new; ++} ++ ++static void populate_new_pref_llcs(unsigned int *old, unsigned int *new) ++{ ++ int i; ++ ++ if (!old) ++ return; ++ ++ for (i = 0; i < max_llcs; i++) ++ new[i] = old[i]; ++} ++ ++static int resize_llc_pref(void) ++{ ++ unsigned int *__percpu *tmp_llc_pref; ++ int i, ret = 0; ++ ++ if (new_max_llcs <= max_llcs) ++ return 0; ++ ++ /* ++ * Allocate temp percpu pointer for old llc_pref, ++ * which will be released after switching to the ++ * new buffer. ++ */ ++ tmp_llc_pref = alloc_percpu_noprof(unsigned int *); ++ if (!tmp_llc_pref) ++ return -ENOMEM; ++ ++ for_each_present_cpu(i) ++ *per_cpu_ptr(tmp_llc_pref, i) = NULL; ++ ++ /* ++ * Resize the per rq nr_pref_llc buffer and ++ * switch to this new buffer. ++ */ ++ for_each_present_cpu(i) { ++ struct rq_flags rf; ++ unsigned int *new; ++ struct rq *rq; ++ ++ rq = cpu_rq(i); ++ new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i)); ++ if (!new) { ++ ret = -ENOMEM; ++ ++ goto release_old; ++ } ++ ++ /* ++ * Locking rq ensures that rq->nr_pref_llc values ++ * don't change with new task enqueue/dequeue ++ * when we repopulate the newly enlarged array. ++ */ ++ rq_lock_irqsave(rq, &rf); ++ populate_new_pref_llcs(rq->nr_pref_llc, new); ++ rq->nr_pref_llc = new; ++ rq_unlock_irqrestore(rq, &rf); ++ } ++ ++release_old: ++ /* ++ * Load balance is done under rcu_lock. ++ * Wait for load balance before and during resizing to ++ * be done. They may refer to old nr_pref_llc[] ++ * that hasn't been resized. 
++ */ ++ synchronize_rcu(); ++ for_each_present_cpu(i) ++ kfree(*per_cpu_ptr(tmp_llc_pref, i)); ++ ++ free_percpu(tmp_llc_pref); ++ ++ /* succeed and update */ ++ if (!ret) ++ max_llcs = new_max_llcs; ++ ++ return ret; ++} ++ ++#else ++ ++static int resize_llc_pref(void) ++{ ++ max_llcs = new_max_llcs; ++ return 0; ++} ++ ++#endif ++ + /* Protected by sched_domains_mutex: */ + static cpumask_var_t sched_domains_tmpmask; + static cpumask_var_t sched_domains_tmpmask2; +@@ -714,7 +827,7 @@ static int update_llc_id(struct sched_domain *sd, + * + * For both cases, we want to increase the number of LLCs. + */ +- per_cpu(sd_llc_id, cpu) = max_llcs++; ++ per_cpu(sd_llc_id, cpu) = new_max_llcs++; + + return per_cpu(sd_llc_id, cpu); + } +@@ -2674,6 +2787,8 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + ++ resize_llc_pref(); ++ + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip new file mode 100644 index 0000000..4706f26 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-08-23-sched-cache-Calculate-the-per-runqueue-task-LLC-preference.patch.skip @@ -0,0 +1,142 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D24952F0C5B + for ; Wed, 3 Dec 2025 23:01:30 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802893; cv=none; b=oh6ql8wRtQTKo8nnK9dUK9t3JsNVUN1SrqTTOLrpZpUDsIKZ+qt9qst5oOs9c5FDd2R9eecOFriCSP4q8iJw0WZIClfw/A2n3lz9QanZX0TndqedBRildmD/ptw2VXSsbXzzCrUFl3ehtEIBnQQqE0gyq5YyFY1waemEa1gZMq0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802893; c=relaxed/simple; + bh=ubwbCrnLe+FpFs84fmQJ8NDFPPh85CKovnWcqS4HszM=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Q4hmeOs7hnwqOE8JDGvxpGVeABvVS45aiDvLk6ZpSrPGuTfn+4YcfZc0AFuBMnvnutRPD41rCA1to3LTp3U/rg4Ky2sVe8bcd4xUTzxW+ljCc0tBYewYHhc60QRARoN5k0NGQJalWwDG5Ur5+u4g9f7uSgwIhh8HrXiFwlORSOs= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=ngl+EBZ5; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="ngl+EBZ5" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802890; x=1796338890; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ubwbCrnLe+FpFs84fmQJ8NDFPPh85CKovnWcqS4HszM=; + 
b=ngl+EBZ5jKSYuF1GoScWtzUvUawQCvSqX6vXeypzCig51al5M6EFhEW6 + 6ZPkta/KDGc5tm3cLZAn+Q0r4sAGXevBcvNbEeEF94NWh0Q5o4Qi40yoE + 6fENyQt6WsIYC5Biv3AXCHk/Ns+vA3D+5k8K971vxD5ci0G6jwAhua/Ip + V4EYKsxzhnY36WL45Wqmck026Nhmf3XpLNt/wYGNgwSMFF7INI6pnMGxW + qdO3IW9AZPldmpFj84igpzlIJMlsU2GHA5/5/K1uwnar4bbN3Va12Jz5l + CXyXS2But8o6/1q/DIrjmb1ErBv9PahFCMwFzVlsm1m+7SCCYHQiWGEv4 + A==; +X-CSE-ConnectionGUID: 1JiC3BwvQxSVrDu/qUQ3kA== +X-CSE-MsgGUID: lKtqqExhQ5+Xp1L40Y0AnQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136340" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136340" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:30 -0800 +X-CSE-ConnectionGUID: lyR4BYigTY+QEoG74KEnKQ== +X-CSE-MsgGUID: lE96dcq2TgepzkLPnZNnrg== +X-Ironport-Invalid-End-Of-Message: True +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763795" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:30 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 08/23] sched/cache: Calculate the per runqueue task LLC preference +Date: Wed, 3 Dec 2025 15:07:27 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Calculate the number of tasks' LLC preferences for each runqueue. +This statistic is computed during task enqueue and dequeue +operations, and is used by the cache-aware load balancing. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Split from previous patch for easier review. 
+ + kernel/sched/fair.c | 6 ++++-- + 1 file changed, 4 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d46a70a9d9fb..b0e87616e377 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1231,11 +1231,12 @@ static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + return; + + pref_llc = p->preferred_llc; +- if (pref_llc < 0) ++ if (pref_llc < 0 || pref_llc >= max_llcs) + return; + + rq->nr_llc_running++; + rq->nr_pref_llc_running += (pref_llc == task_llc(p)); ++ rq->nr_pref_llc[pref_llc]++; + p->sched_llc_active = true; + } + +@@ -1252,11 +1253,12 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + return; + + pref_llc = p->preferred_llc; +- if (pref_llc < 0) ++ if (pref_llc < 0 || pref_llc >= max_llcs) + return; + + rq->nr_llc_running--; + rq->nr_pref_llc_running -= (pref_llc == task_llc(p)); ++ rq->nr_pref_llc[pref_llc]--; + p->sched_llc_active = false; + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip new file mode 100644 index 0000000..0efcbfc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-09-23-sched-cache-Count-tasks-prefering-destination-LLC-in-a-sched-group.patch.skip @@ -0,0 +1,160 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 761A92F12BE + for ; Wed, 3 Dec 2025 23:01:32 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802894; cv=none; b=VOvR4Yo5MT+v4vvHJHnJrL04tUMLwfYbb4+GQbWJ3QO13hC1zjlHArO6dzcuGLllayHXLBw43BKllYMjOKohjC7Fzd9T9m3hYmCRq3WLpZzHqcCQuO2JcQTdEeD/rjnDRhN1lGZeCfQEi5WHKdPb8iHSUPG9WfZsKEu6JozCWHQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802894; c=relaxed/simple; + bh=Hwjod13ydyBeyAl1Bc0MaWee5egwZS7IehFiRUr+3EU=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Qm3SflXuxBKuYopJgqhcvipXf7FPYSYSF15V5hLWMr9nUpsAfdv+d2spbB0P7Tw1LmX/zkoTpJ7guZJ5VbPuMzy9Baf9HL/h+ZfC7oU8NJtxgafnNNwl0O1u1CDaxlhc7yoqMW17JyUgVXekWAPj30g3bMDCDrz5uBQLCvlVneA= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=cSsts8rq; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="cSsts8rq" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802892; x=1796338892; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Hwjod13ydyBeyAl1Bc0MaWee5egwZS7IehFiRUr+3EU=; + b=cSsts8rq9lESYUplMXqyaf7fQNdZgkgjFqCazxZIqivu0ulnrxBxtLfr + 
2q49FeXJtEzQZUFodeAzsWSFeSbbR0eNrEPCzAiJg3hLVd3plskFuoc8R + LSKLX41Wp9fMgp9Ou54k2TxPn+ZJpABPQDMRZBxyysFrDh3CB41EwtGEs + RrfwNP72MRObV0Rpqk7QGgKlk2FmXjIY1nC71X0MFH6YEKKSRhWDNHOyK + 9xcJGzOrMyQT5S0kQJJP+Yjr1dE5itsHoR0sqlWiS8N54X7izsEc5kZbZ + a2UxxHPNluXsMUFiW8C3sWBY39nJzoHIE5rPFYFCFz7BLdiv2vnTIfuTx + g==; +X-CSE-ConnectionGUID: +jOJhU2XTqKlvSAAbvSNZg== +X-CSE-MsgGUID: bWKw4Hx3R3mw3p0kqP8M9w== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136361" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136361" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:31 -0800 +X-CSE-ConnectionGUID: fNxL8O0TTpG29riKu1HOzA== +X-CSE-MsgGUID: yN1bkFJBSRe3C9XnLxsEJA== +X-Ironport-Invalid-End-Of-Message: True +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763802" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:31 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 09/23] sched/cache: Count tasks prefering destination LLC in a sched group +Date: Wed, 3 Dec 2025 15:07:28 -0800 +Message-Id: <1eb6a231ec82b37483208983f0cf10eec823ec9d.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, tabulate the number of tasks on each runqueue +that prefer the LLC contains the env->dst_cpu in a sched group. + +For example, consider a system with 4 LLC sched groups (LLC0 to LLC3) +balancing towards LLC3. LLC0 has 3 tasks preferring LLC3, LLC1 has +2, and LLC2 has 1. LLC0, having the most tasks preferring LLC3, is +selected as the busiest source to pick tasks from. + +Within a source LLC, the total number of tasks preferring a destination +LLC is computed by summing counts across all CPUs in that LLC. For +instance, if LLC0 has CPU0 with 2 tasks and CPU1 with 1 task preferring +LLC3, the total for LLC0 is 3. + +These statistics allow the load balancer to choose tasks from source +sched groups that best match their preferred LLCs. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Convert nr_pref_llc array in sg_lb_stats to a single + variable as only the dst LLC stat is needed. 
+ (K Prateek Nayak) + + kernel/sched/fair.c | 12 ++++++++++++ + 1 file changed, 12 insertions(+) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b0e87616e377..4d7803f69a74 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10445,6 +10445,9 @@ struct sg_lb_stats { + unsigned int nr_numa_running; + unsigned int nr_preferred_running; + #endif ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int nr_pref_llc; ++#endif + }; + + /* +@@ -10912,6 +10915,9 @@ static inline void update_sg_lb_stats(struct lb_env *env, + { + int i, nr_running, local_group, sd_flags = env->sd->flags; + bool balancing_at_rd = !env->sd->parent; ++#ifdef CONFIG_SCHED_CACHE ++ int dst_llc = llc_id(env->dst_cpu); ++#endif + + memset(sgs, 0, sizeof(*sgs)); + +@@ -10932,6 +10938,12 @@ static inline void update_sg_lb_stats(struct lb_env *env, + if (cpu_overutilized(i)) + *sg_overutilized = 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && llc_id(i) != dst_llc && ++ dst_llc >= 0) ++ sgs->nr_pref_llc += rq->nr_pref_llc[dst_llc]; ++#endif ++ + /* + * No need to call idle_cpu() if nr_running is not 0 + */ +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip new file mode 100644 index 0000000..0bb5b14 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-10-23-sched-cache-Check-local_group-only-once-in-update_sg_lb_stats.patch.skip @@ -0,0 +1,142 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3209E2EDD45 + for ; Wed, 3 Dec 2025 23:01:34 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802897; cv=none; b=AN9K8aiWJQG7HDbeaWXpGDetIW2icpqGbDr6zs/psxf+4ZLm2ceitwFSdlkxUNnHO69aqE5S3Lgw8UXlsXoedmM4Pr7i5RbMpn7L1KrlbpjXV6xeAEYh8XRvFtihZU5ev2z3gpc9wUtfTNoORHKd7LfpH7/RywEIWMBBa/DRGKQ= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802897; c=relaxed/simple; + bh=p+3h65+/r+G8M/UVKx3C3o18pTa5Qaadr44RFr//JJM=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=MmMfghNG3eQEQnrI1wgmAlkBPcwScfTCOYIB2L9oD0PhxTEQvycV+raEGlUU7tq/cOm1m41tgx1zgYVTnsY1VCpNGnM6slJtSvukwWoNbVbq6sVz9SyOM9hVO35VnfPEJ/kFPYJD7nSsZDAVCSBbwe4MWGUKumJjlC3jPA1Gp5w= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=oK3XGSFi; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="oK3XGSFi" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802894; x=1796338894; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + 
bh=p+3h65+/r+G8M/UVKx3C3o18pTa5Qaadr44RFr//JJM=; + b=oK3XGSFi2bGDDnHY3Lou8C7HjUQfAlxc1xp5Jsb4tWssOTetEyKk8VhS + xWt++svfjbe9DJCu7kK8NB54Iyuv23cDcsruzAVgtKiHf34SlRWKEmzrW + D+oCFG7YN+VzH5prFgSppmI032uc/cJAJ/qAKAOk+5EqFUqWcIySUNujp + dnKCK0NZsBYY0rnhzU9NpLtzRd0sgBD+P+q/gVsngGR9F8P7Ojt0z+4k+ + FNbn0vTsTTr/tR3CHEUKYnt1XKHxIQth0oKpXgg30ClUCUHrWShO5n1wq + sHaXMI4sp88m3bKftZXPxnzsOaTk5Sy2iUOBeydtIg4kqCpHbvNeeio00 + A==; +X-CSE-ConnectionGUID: eUWbdnCGTbS8UdiOjQqeaw== +X-CSE-MsgGUID: M8ATD04uQSWlmkQNqjCJ+A== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136382" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136382" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:33 -0800 +X-CSE-ConnectionGUID: +B4a0CVGS5aDMise1kmgcw== +X-CSE-MsgGUID: mYFfuf8aQyCTLl73SGkFmQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763810" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:33 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 10/23] sched/cache: Check local_group only once in update_sg_lb_stats() +Date: Wed, 3 Dec 2025 15:07:29 -0800 +Message-Id: <2581fa14a0083bbd22b50837cd86003e59192c00.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +There is no need to check the local group twice for both group_asym_packing +and group_smt_balance. Adjust the code to facilitate future checks for group +types (cache-aware load balancing) as well. + +No functional changes are expected. + +Suggested-by: Peter Zijlstra (Intel) +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + New code cleanup patch. 
(Peter Zijlstra) + + kernel/sched/fair.c | 18 ++++++++++-------- + 1 file changed, 10 insertions(+), 8 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 4d7803f69a74..6e4c1ae1bdda 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10984,14 +10984,16 @@ static inline void update_sg_lb_stats(struct lb_env *env, + + sgs->group_weight = group->group_weight; + +- /* Check if dst CPU is idle and preferred to this group */ +- if (!local_group && env->idle && sgs->sum_h_nr_running && +- sched_group_asym(env, sgs, group)) +- sgs->group_asym_packing = 1; +- +- /* Check for loaded SMT group to be balanced to dst CPU */ +- if (!local_group && smt_balance(env, sgs, group)) +- sgs->group_smt_balance = 1; ++ if (!local_group) { ++ /* Check if dst CPU is idle and preferred to this group */ ++ if (env->idle && sgs->sum_h_nr_running && ++ sched_group_asym(env, sgs, group)) ++ sgs->group_asym_packing = 1; ++ ++ /* Check for loaded SMT group to be balanced to dst CPU */ ++ if (smt_balance(env, sgs, group)) ++ sgs->group_smt_balance = 1; ++ } + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip new file mode 100644 index 0000000..0f73957 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-11-23-sched-cache-Prioritize-tasks-preferring-destination-LLC-during-balancing.patch.skip @@ -0,0 +1,276 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C1A762F6160 + for ; Wed, 3 Dec 2025 23:01:36 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802899; cv=none; b=Y+KSvOIGNo37S4ppd6Zqb+qeXMYg9H7oOVVSwDUONcSmPmmNo+OfFtkUVVLhQy9Kszncjru9WbcIa9UEetZqhMPsmMY2k5fVZ6RAWQZpLFm3o5ZOTcH4gt2vkBWUME5YgLQA3NYdBf+3LQy/lgsvGtAErx6vO+QUxr5PuBX7rAE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802899; c=relaxed/simple; + bh=CwrhaA/K9sEcx5ifxeMnRiF7w0oKVkh5kmhIRkZCI08=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=oku/UCNYCKxtHDLcs7jWCa04/T613otu/fvMOx46pM5Fk461C8jmF88SnvfkaEKbY/tPKG6ssSj+6jJ5qq4aFqOkczxx9qajmomVw1d15n0Nxc/H0Jxmj7YmItsjsTy0cRx3h5fJ6U2M4vg8NuLnjq+H/GqT2czhMHBhUawwwc0= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=NQQGq9b9; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="NQQGq9b9" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802896; x=1796338896; + 
h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=CwrhaA/K9sEcx5ifxeMnRiF7w0oKVkh5kmhIRkZCI08=; + b=NQQGq9b9vgYmeBpCZdLnkhTURcvp8LDZjz79tucf4QAOjRH6WMoJ7DIc + VCEpH4PZk5+dZi9trvIpapAwsuwYkQegVq+/LDqHzSrIt129SaHxgL94Y + 5nrvAHUr0MUD5UNXllanE0V0Fykum1uE2UTQDl3LnIDioTcTzOYpAO1X1 + 4qycYWShsJLluL7efSyQ+/SgISKYo/HIyxL8OBYx1D4XH6mSLaqEpIaiX + g8GbNG2ofsWe9Fe2YAYpsC9b78PtUUg4W2Vm4/GWu3tuk8/oeCtghHVCm + rv/mHq9+NoDA+NgB2cghgRnsU5NYvBkjZ9v38NvuhidP8frlEkqZR1gb1 + Q==; +X-CSE-ConnectionGUID: p32r4lRGQkiQ0lJdqe2vYQ== +X-CSE-MsgGUID: Aoa+xDJNReuNdKo3ZUU7Lw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136420" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136420" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:36 -0800 +X-CSE-ConnectionGUID: bwFbXM5NTD2aXs2HmVfWRA== +X-CSE-MsgGUID: oV/d19IyQLihLj6NBNyYIA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763827" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:35 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 11/23] sched/cache: Prioritize tasks preferring destination LLC during balancing +Date: Wed, 3 Dec 2025 15:07:30 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During LLC load balancing, first check for tasks that prefer the +destination LLC and balance them to it before others. + +Mark source sched groups containing tasks preferring non local LLCs +with the group_llc_balance flag. This ensures the load balancer later +pulls or pushes these tasks toward their preferred LLCs. + +The load balancer selects the busiest sched_group and migrates tasks +to less busy groups to distribute load across CPUs. + +With cache-aware scheduling enabled, the busiest sched_group is +the one with most tasks preferring the destination LLC. If +the group has the llc_balance flag set, cache aware load balancing is +triggered. + +Introduce the helper function update_llc_busiest() to identify the +sched_group with the most tasks preferring the destination LLC. + +Suggested-by: K Prateek Nayak +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Fix comparison in can_migrate_llc(), which uses an uninitialized + env->src_cpu. Use the candidate group's first CPU instead. (Aaron Lu) + + Fix a race condition during bootup with build_sched_domains(), + where the per-cpu(sd_llc_id) is reset to -1. (lkp/0day) + Put the set of group_llc_balance and the usage of it into + 1 patch. (Peter Zijlstra) + + Change group_llc_balance priority to be lower than group_overloaded + and embed it into normal load balance path. 
(Peter Zijlstra) + + Remove the sched group's SD_SHARE_LLC check in llc_balance(), because + we should allow tasks migration across NUMA nodes to their preferred LLC, + where the domain does not have SD_SHARE_LLC flag. + + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6e4c1ae1bdda..db555c11b5b8 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9531,6 +9531,11 @@ enum group_type { + * from balancing the load across the system. + */ + group_imbalanced, ++ /* ++ * There are tasks running on non-preferred LLC, possible to move ++ * them to their preferred LLC without creating too much imbalance. ++ */ ++ group_llc_balance, + /* + * The CPU is overloaded and can't provide expected CPU cycles to all + * tasks. +@@ -10440,6 +10445,7 @@ struct sg_lb_stats { + enum group_type group_type; + unsigned int group_asym_packing; /* Tasks should be moved to preferred CPU */ + unsigned int group_smt_balance; /* Task on busy SMT be moved */ ++ unsigned int group_llc_balance; /* Tasks should be moved to preferred LLC */ + unsigned long group_misfit_task_load; /* A CPU has a task too big for its capacity */ + #ifdef CONFIG_NUMA_BALANCING + unsigned int nr_numa_running; +@@ -10698,6 +10704,9 @@ group_type group_classify(unsigned int imbalance_pct, + if (group_is_overloaded(imbalance_pct, sgs)) + return group_overloaded; + ++ if (sgs->group_llc_balance) ++ return group_llc_balance; ++ + if (sg_imbalanced(group)) + return group_imbalanced; + +@@ -10890,11 +10899,55 @@ static void record_sg_llc_stats(struct lb_env *env, + if (unlikely(READ_ONCE(sd_share->capacity) != sgs->group_capacity)) + WRITE_ONCE(sd_share->capacity, sgs->group_capacity); + } ++ ++/* ++ * Do LLC balance on sched group that contains LLC, and have tasks preferring ++ * to run on LLC in idle dst_cpu. ++ */ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (env->sd->flags & SD_SHARE_LLC) ++ return false; ++ ++ if (sgs->nr_pref_llc && ++ can_migrate_llc(cpumask_first(sched_group_span(group)), ++ env->dst_cpu, 0, true) == mig_llc) ++ return true; ++ ++ return false; ++} ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ /* ++ * There are more tasks that want to run on dst_cpu's LLC. 
++ */ ++ return sgs->nr_pref_llc > busiest->nr_pref_llc; ++} + #else + static inline void record_sg_llc_stats(struct lb_env *env, struct sg_lb_stats *sgs, + struct sched_group *group) + { + } ++ ++static inline bool llc_balance(struct lb_env *env, struct sg_lb_stats *sgs, ++ struct sched_group *group) ++{ ++ return false; ++} ++ ++static bool update_llc_busiest(struct lb_env *env, ++ struct sg_lb_stats *busiest, ++ struct sg_lb_stats *sgs) ++{ ++ return false; ++} + #endif + + /** +@@ -10993,6 +11046,10 @@ static inline void update_sg_lb_stats(struct lb_env *env, + /* Check for loaded SMT group to be balanced to dst CPU */ + if (smt_balance(env, sgs, group)) + sgs->group_smt_balance = 1; ++ ++ /* Check for tasks in this group can be moved to their preferred LLC */ ++ if (llc_balance(env, sgs, group)) ++ sgs->group_llc_balance = 1; + } + + sgs->group_type = group_classify(env->sd->imbalance_pct, group, sgs); +@@ -11056,6 +11113,10 @@ static bool update_sd_pick_busiest(struct lb_env *env, + /* Select the overloaded group with highest avg_load. */ + return sgs->avg_load > busiest->avg_load; + ++ case group_llc_balance: ++ /* Select the group with most tasks preferring dst LLC */ ++ return update_llc_busiest(env, busiest, sgs); ++ + case group_imbalanced: + /* + * Select the 1st imbalanced group as we don't have any way to +@@ -11318,6 +11379,7 @@ static bool update_pick_idlest(struct sched_group *idlest, + return false; + break; + ++ case group_llc_balance: + case group_imbalanced: + case group_asym_packing: + case group_smt_balance: +@@ -11450,6 +11512,7 @@ sched_balance_find_dst_group(struct sched_domain *sd, struct task_struct *p, int + return NULL; + break; + ++ case group_llc_balance: + case group_imbalanced: + case group_asym_packing: + case group_smt_balance: +@@ -11949,7 +12012,8 @@ static struct sched_group *sched_balance_find_src_group(struct lb_env *env) + * group's child domain. 
+ */ + if (sds.prefer_sibling && local->group_type == group_has_spare && +- sibling_imbalance(env, &sds, busiest, local) > 1) ++ (busiest->group_type == group_llc_balance || ++ sibling_imbalance(env, &sds, busiest, local) > 1)) + goto force_balance; + + if (busiest->group_type != group_overloaded) { +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip new file mode 100644 index 0000000..b5197ec --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-12-23-sched-cache-Add-migrate_llc_task-migration-type-for-cache-aware-balancing.patch.skip @@ -0,0 +1,191 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 8446B2FCBE3 + for ; Wed, 3 Dec 2025 23:01:38 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802901; cv=none; b=AczUsF+ErJIRlmzmMhdwLmi7ZupDah78/dCkfXKoZGQ3XVlhu9qwGaFYSDg3FFQU9754xRJEORkGrcVZU1ssicX++R+V0FXfSTdSUEZWfvt980XcoUhlWnK7J8un6y7YNQXxJBfZVrhj31WyccQPJJevDmK67sgqqF6PsKk29mc= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802901; c=relaxed/simple; + bh=da90OiAHbhR9NPA8Ratl9FUXidYv15t1ql0bzkXvmzA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=hCA0ZezNljYOVjtzlpNDPYqpoGKoW7yU4ihuYN4DdplXI8ZjqyOysntDUcfzbne+6CzBonX2R+LOUwUNh5V4ZvlW0NEG+WGaT266Gr89t7EmmUAyb0SQ4i4NDSbCHrELFwlVL45n3XsDuBwIKNxjYMRKZj90lzt9XJuGVK0hJpE= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=E/oDxO7e; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="E/oDxO7e" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802898; x=1796338898; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=da90OiAHbhR9NPA8Ratl9FUXidYv15t1ql0bzkXvmzA=; + b=E/oDxO7ef4nI4G5J3jOvjR+X/vFua+P9e3AZXKLJcxFJriNr7Ua944xG + AxkcNTluTudW0fa7LiL2oLSyXQGNm4wxTedztXy+Kb3GNW3m1xItQPgjY + yaKpw+/5zQcwTUlI7cSSe2yq6pGi70PjZnOQeUYqx+6LdidqnzQeT9x0d + oKfUVrBxLwV+bxjJ5X7pfb+amTWF/9P1/Z2cwQnN4MgR4+xZfJ/oQETi0 + OhZkv30WMo989iIGaDW9QOVZENXrnIYuSR0poLGwGoz4vGxEA6oadIK33 + rSOZLBiBoM9ORQbnZoVJ4AxudF9GCXu3fDkCd/li1EhJxcKamQHTatJeP + g==; +X-CSE-ConnectionGUID: Ktrog9qIS3GVBMh0FikIKg== +X-CSE-MsgGUID: wFdH+7CRS02fjEaxSZiyog== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136444" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136444" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 
15:01:37 -0800 +X-CSE-ConnectionGUID: 6HowfZdBQD20KHd0gzJbtg== +X-CSE-MsgGUID: 6WhOzrMuS8+5P3U6JQUdyA== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763835" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:37 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 12/23] sched/cache: Add migrate_llc_task migration type for cache-aware balancing +Date: Wed, 3 Dec 2025 15:07:31 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Introduce a new migration type, migrate_llc_task, to support +cache-aware load balancing. + +After identifying the busiest sched_group (having the most tasks +preferring the destination LLC), mark migrations with this type. +During load balancing, each runqueue in the busiest sched_group is +examined, and the runqueue with the highest number of tasks preferring +the destination CPU is selected as the busiest runqueue. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Remove unnecessary cpus_share_cache() check in + sched_balance_find_src_rq() (K Prateek Nayak) + + kernel/sched/fair.c | 32 +++++++++++++++++++++++++++++++- + 1 file changed, 31 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index db555c11b5b8..529adf342ce0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9547,7 +9547,8 @@ enum migration_type { + migrate_load = 0, + migrate_util, + migrate_task, +- migrate_misfit ++ migrate_misfit, ++ migrate_llc_task + }; + + #define LBF_ALL_PINNED 0x01 +@@ -10134,6 +10135,10 @@ static int detach_tasks(struct lb_env *env) + env->imbalance -= util; + break; + ++ case migrate_llc_task: ++ env->imbalance--; ++ break; ++ + case migrate_task: + env->imbalance--; + break; +@@ -11766,6 +11771,15 @@ static inline void calculate_imbalance(struct lb_env *env, struct sd_lb_stats *s + return; + } + ++#ifdef CONFIG_SCHED_CACHE ++ if (busiest->group_type == group_llc_balance) { ++ /* Move a task that prefer local LLC */ ++ env->migration_type = migrate_llc_task; ++ env->imbalance = 1; ++ return; ++ } ++#endif ++ + if (busiest->group_type == group_imbalanced) { + /* + * In the group_imb case we cannot rely on group-wide averages +@@ -12073,6 +12087,10 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + struct rq *busiest = NULL, *rq; + unsigned long busiest_util = 0, busiest_load = 0, busiest_capacity = 1; + unsigned int busiest_nr = 0; ++#ifdef CONFIG_SCHED_CACHE ++ unsigned int busiest_pref_llc = 0; ++ int dst_llc; ++#endif + int i; + + for_each_cpu_and(i, sched_group_span(group), env->cpus) { +@@ -12181,6 +12199,16 @@ static struct rq *sched_balance_find_src_rq(struct lb_env *env, + } + break; + ++ case migrate_llc_task: ++#ifdef CONFIG_SCHED_CACHE ++ dst_llc = llc_id(env->dst_cpu); ++ if (dst_llc >= 0 && ++ busiest_pref_llc < 
rq->nr_pref_llc[dst_llc]) { ++ busiest_pref_llc = rq->nr_pref_llc[dst_llc]; ++ busiest = rq; ++ } ++#endif ++ break; + case migrate_task: + if (busiest_nr < nr_running) { + busiest_nr = nr_running; +@@ -12363,6 +12391,8 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + case migrate_misfit: + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; ++ case migrate_llc_task: ++ break; + } + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip new file mode 100644 index 0000000..35fef7d --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-13-23-sched-cache-Handle-moving-single-tasks-to-from-their-preferred-LLC.patch.skip @@ -0,0 +1,195 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 97BB42FFF98 + for ; Wed, 3 Dec 2025 23:01:39 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802903; cv=none; b=KDFpHcGAhKnHRBZFMFMtHMoRnhc4icrwIxIA8u+Vif5oz7Z18LHjkzu1IOV8tRYJFy4lXDjG6wYe22JV6BPtT9JAf2mUHKRyigHv1MkoPNBeRIKSEJ51iH0zebfyiiIhyx46QCps5MkfKG9xVMGg3N7ENza6Vv2+y6dsL+Zp0lE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802903; c=relaxed/simple; + bh=8SM2jHHpi12dQS+zJornGRPQxkuowwvNXMVhwIeDBGA=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=U/JdvWoJw/IU3s2ub70NWLePIaQBRwHwPYibO+bbRJhw5I3xBFJgWmgkN/HfBIb1ABZRWNcUN5ladx9wdRE4q84V9sG4/k/92/pAoHRgP60/SkA1N0lBh+0oNDDaOMmcaJymNEYAB4Y+PlNTanSz07u82e6zrOmPcftrMVq0eQg= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=QFrFmdLP; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="QFrFmdLP" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802900; x=1796338900; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=8SM2jHHpi12dQS+zJornGRPQxkuowwvNXMVhwIeDBGA=; + b=QFrFmdLPstihmD8vLzP896hsrOed6TFf664ZbLxgCKDyVP1ElFu/KxlL + cWka8HAx7lSbtKJIRs2zDLb662V+u3vSkOL/+GmAmBZOGy6YahHgzdZ+w + Cm8JPiAUQ0kzPS2n/rAw++vW0A14d5QX1S2PZ0RvAxgtjOMIEQght4vtw + NlNGyMxSykwrfzzHo/Khc6YFVxKydWs7zQdFb7hjDddawl3rivgSTQ4lM + rXsDbUmw/L0HUCnUtshRY/GabXqs3gMSK3t3UfCRyfscjIhW5T7A4/xG6 + Ul+07Ph3CpTYgJ6hsHVxiRy1rZKIhjL1V7FiHZTJQ8OxeBn2eVIqDAWXn + g==; +X-CSE-ConnectionGUID: Ew56K6WcQq6rz+G+xWa07Q== +X-CSE-MsgGUID: UdKIolArSn+zoD5IfTBGAQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136469" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136469" +Received: from 
fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:39 -0800 +X-CSE-ConnectionGUID: 2oDVFmE5Rvu2YRtXhtrPTg== +X-CSE-MsgGUID: oS68+IQVQH2bxeYLezigJQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763850" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:38 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 13/23] sched/cache: Handle moving single tasks to/from their preferred LLC +Date: Wed, 3 Dec 2025 15:07:32 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +If the busiest runqueue has only one task, active balancing may be +invoked to move it. However, before migration, check whether the task +is running on its preferred LLC. + +Do not move a lone task to another LLC if it would move the task +away from its preferred LLC or cause excessive imbalance between LLCs. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Remove uneeded preferred LLC migration check from + active_load_balance_cpu_stop(). + + kernel/sched/fair.c | 51 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 50 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 529adf342ce0..aed3fab98d7c 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9878,12 +9878,57 @@ static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu + task_util(p), to_pref); + } + ++/* ++ * Check if active load balance breaks LLC locality in ++ * terms of cache aware load balance. ++ */ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ if (!sched_cache_enabled()) ++ return false; ++ ++ if (cpus_share_cache(env->src_cpu, env->dst_cpu)) ++ return false; ++ /* ++ * All tasks prefer to stay on their current CPU. ++ * Do not pull a task from its preferred CPU if: ++ * 1. It is the only task running there; OR ++ * 2. Migrating it away from its preferred LLC would violate ++ * the cache-aware scheduling policy. ++ */ ++ if (env->src_rq->nr_pref_llc_running == env->src_rq->cfs.h_nr_runnable) { ++ unsigned long util = 0; ++ struct task_struct *cur; ++ ++ if (env->src_rq->nr_running <= 1) ++ return true; ++ ++ rcu_read_lock(); ++ cur = rcu_dereference(env->src_rq->curr); ++ if (cur) ++ util = task_util(cur); ++ rcu_read_unlock(); ++ ++ if (can_migrate_llc(env->src_cpu, env->dst_cpu, ++ util, false) == mig_forbid) ++ return true; ++ } ++ ++ return false; ++} + #else + static inline bool get_llc_stats(int cpu, unsigned long *util, + unsigned long *cap) + { + return false; + } ++ ++static inline bool ++break_llc_locality(struct lb_env *env) ++{ ++ return false; ++} + #endif + /* + * can_migrate_task - may task p from runqueue rq be migrated to this_cpu? 
+@@ -12279,6 +12324,9 @@ static int need_active_balance(struct lb_env *env) + { + struct sched_domain *sd = env->sd; + ++ if (break_llc_locality(env)) ++ return 0; ++ + if (asym_active_balance(env)) + return 1; + +@@ -12298,7 +12346,8 @@ static int need_active_balance(struct lb_env *env) + return 1; + } + +- if (env->migration_type == migrate_misfit) ++ if (env->migration_type == migrate_misfit || ++ env->migration_type == migrate_llc_task) + return 1; + + return 0; +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip new file mode 100644 index 0000000..49a9124 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-14-23-sched-cache-Consider-LLC-preference-when-selecting-tasks-for-load-balancing.patch.skip @@ -0,0 +1,206 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id C1938F513 + for ; Wed, 3 Dec 2025 23:01:41 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802905; cv=none; b=sPmV7aM8SfneES++JSxoAMTpkJxsxkIaVzLucunnA9mKqP6A+4Tm600kyT9VTXTzXq34T39lXTUp9sHWoERIl8w+bTu7J1HC+rfyTlXxwEVQV8C99GFpkkbN1BPFHILnrVb4xczJGDnWK5dD50Ye9FIBTMyihvIerGvjfEsmqNE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802905; c=relaxed/simple; + bh=lHL2pgABc7GHr6ACmg9H32RJUswizn6AHQFobrHrbFw=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=G7rXkqjakmupf9n++e5JGAkMIXq3jqgQc6G6Gw5IYyY/VhHNnlVMfdVNOcDomPtYPBMavf9m7Y2bsSMUvQExqTt6CASUZ8aGZ8iX+XoR/Ej28b5EwCnggenbKxXL4Xj0/E38v+KIJD/T8MnOLbFEeGjSREtAQxxgu/2prdjZMw8= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=FiUlG+0K; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="FiUlG+0K" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802901; x=1796338901; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=lHL2pgABc7GHr6ACmg9H32RJUswizn6AHQFobrHrbFw=; + b=FiUlG+0K/UC9vVMh/oPWl1WUBZdhy5MrB44PaaHkXUAA4jYHkLTFSSsi + qocTAQQFuheK8JLYpFg2R7aU2iv4GZRGXge93BEc9kS9nTpx4oQOMWekm + +vXMxJj28JhCGkxAcIYAkVQvbks0I4+snX/or9+O6+kLtJoq4VW98lvHt + gsZRnKPvbTAbfB8BLT4mfbZqijYwb7I27I0TW2bqZx35wIeRxh9EeBFyi + ROuei6K/cuomwGMaKK20uTZT8/nP1CIoBiGImBAQQNhK7Hgo6jMMsFX23 + lLTcZHF+7w8PBbIBEKU+iwv08wqwC5Czno4lf4DE3GutioUzRJHIw2uIq + A==; +X-CSE-ConnectionGUID: LC9AWrvJQPSiwhGuWRZBoA== +X-CSE-MsgGUID: vAm1J3bzStCyct17U9yt9Q== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136497" +X-IronPort-AV: 
E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136497" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:40 -0800 +X-CSE-ConnectionGUID: pVhkYbuLQ6qhXibAglfMuQ== +X-CSE-MsgGUID: 3iy0SCYQQeWYaEaxUD8biw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763859" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:40 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 14/23] sched/cache: Consider LLC preference when selecting tasks for load balancing +Date: Wed, 3 Dec 2025 15:07:33 -0800 +Message-Id: <048601436d24f19e84c0a002e1c5897f95853276.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +Currently, task selection from the busiest runqueue ignores LLC +preferences. Reorder tasks in the busiest queue to prioritize selection +as follows: + + 1. Tasks preferring the destination CPU's LLC + 2. Tasks with no LLC preference + 3. Tasks preferring an LLC different from their current one + 4. Tasks preferring the LLC they are currently on + +This improves the likelihood that tasks are migrated to their +preferred LLC. + +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: No change. + + kernel/sched/fair.c | 66 ++++++++++++++++++++++++++++++++++++++++++++- + 1 file changed, 65 insertions(+), 1 deletion(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index aed3fab98d7c..dd09a816670e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10092,6 +10092,68 @@ static struct task_struct *detach_one_task(struct lb_env *env) + return NULL; + } + ++#ifdef CONFIG_SCHED_CACHE ++/* ++ * Prepare lists to detach tasks in the following order: ++ * 1. tasks that prefer dst cpu's LLC ++ * 2. tasks that have no preference in LLC ++ * 3. tasks that prefer LLC other than the ones they are on ++ * 4. tasks that prefer the LLC that they are currently on. 
++ */ ++static struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ struct task_struct *p; ++ LIST_HEAD(pref_old_llc); ++ LIST_HEAD(pref_new_llc); ++ LIST_HEAD(no_pref_llc); ++ LIST_HEAD(pref_other_llc); ++ ++ if (!sched_cache_enabled()) ++ return tasks; ++ ++ if (cpus_share_cache(env->dst_cpu, env->src_cpu)) ++ return tasks; ++ ++ while (!list_empty(tasks)) { ++ p = list_last_entry(tasks, struct task_struct, se.group_node); ++ ++ if (p->preferred_llc == llc_id(env->dst_cpu)) { ++ list_move(&p->se.group_node, &pref_new_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == llc_id(env->src_cpu)) { ++ list_move(&p->se.group_node, &pref_old_llc); ++ continue; ++ } ++ ++ if (p->preferred_llc == -1) { ++ list_move(&p->se.group_node, &no_pref_llc); ++ continue; ++ } ++ ++ list_move(&p->se.group_node, &pref_other_llc); ++ } ++ ++ /* ++ * We detach tasks from list tail in detach tasks. Put tasks ++ * to be chosen first at end of list. ++ */ ++ list_splice(&pref_new_llc, tasks); ++ list_splice(&no_pref_llc, tasks); ++ list_splice(&pref_other_llc, tasks); ++ list_splice(&pref_old_llc, tasks); ++ return tasks; ++} ++#else ++static inline struct list_head ++*order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) ++{ ++ return tasks; ++} ++#endif ++ + /* + * detach_tasks() -- tries to detach up to imbalance load/util/tasks from + * busiest_rq, as part of a balancing operation within domain "sd". +@@ -10100,7 +10162,7 @@ static struct task_struct *detach_one_task(struct lb_env *env) + */ + static int detach_tasks(struct lb_env *env) + { +- struct list_head *tasks = &env->src_rq->cfs_tasks; ++ struct list_head *tasks; + unsigned long util, load; + struct task_struct *p; + int detached = 0; +@@ -10119,6 +10181,8 @@ static int detach_tasks(struct lb_env *env) + if (env->imbalance <= 0) + return 0; + ++ tasks = order_tasks_by_llc(env, &env->src_rq->cfs_tasks); ++ + while (!list_empty(tasks)) { + /* + * We don't want to steal all, otherwise we may be treated likewise, +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip new file mode 100644 index 0000000..85bcc3b --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-15-23-sched-cache-Respect-LLC-preference-in-task-migration-and-detach.patch.skip @@ -0,0 +1,251 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id F15862EC0B3 + for ; Wed, 3 Dec 2025 23:01:43 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802907; cv=none; b=gaatxX9hyfNCQNZuo8e4RU3vaqRhxVWET62DnEKpixJNU5xDEVuougssJt9/6wdKqXoIUOBKaKYsQEEI9+soes2dovmZhy3fGDXwD4VJshA6aArNO/9BRtmRmrSUH+Qeb4uxqCx6TiODM+aPCVtCEwIA755BalFPfrmj7+qULOI= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802907; c=relaxed/simple; + bh=Qq+bxGUfP5y5uzFrPweEIf2ig+fLfO0Fva+8tsaaHnM=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=PMemmZdPDG2ErK7Z6ebePwKSI9cabjQRZi7fOaAynPsVbH0TYAxCQkgG7kmEu1N/+0Kmoqb2iEytzk5b6Y83O56eTuw4wsJTpcQbn5OA5nrv8fwKgYRvMuPqwTWStSC5o/clmWh6Un/rG7VXFCAXnoxf+tadmloUwr1ceD4Iuek= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=K1c6F2rL; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="K1c6F2rL" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802904; x=1796338904; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Qq+bxGUfP5y5uzFrPweEIf2ig+fLfO0Fva+8tsaaHnM=; + b=K1c6F2rLIDMugmXFGo0VPRa3CkwpTWx9IJrRa/hsq4UrL7DnV0pw8ajG + BaGeCuW4iC0q3KpRjUrb5Gjs2+rOB74bBmgvjzvP0Bgae0TPuFdvMjX23 + z6+gGGgG19Wv4ve1vRjEwTT08BRcUINH2YNXiTUVgX6ibcCJComlk0Y6n + quNDMVfwdU0hQZhwOtrSHXPRqMojx8I7m9WQ/PmD1woe8uT6yci0V4u2u + jfnFFUMEbPvj3J6FUSZjuQwGSGo/EqXqp0xk/5KRyXKafHJF8xEhV/udJ + e4v9JDT09EYShziT4Bzd1zuoH2hhzYHA7OeJFLCwdgppCCBWwVA2w3KmJ + w==; +X-CSE-ConnectionGUID: 9y9zHDIITpm9FBwYNTu03A== +X-CSE-MsgGUID: Qrhpktr7Tg2wRc89JHsg/g== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136537" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136537" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:42 -0800 +X-CSE-ConnectionGUID: JansXFpeT5WbVZuKS0jHBA== +X-CSE-MsgGUID: NjtVIxeZSgSD5Yg4dtwXow== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763888" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:42 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Tim Chen , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Aubrey Li , + Zhao Liu , + Chen Yu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 15/23] sched/cache: Respect LLC preference in task migration and detach +Date: Wed, 3 Dec 2025 15:07:34 -0800 +Message-Id: <1c75f54a2e259737eb9b15c98a5c1d1f142fdef6.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +During the final step of load balancing, can_migrate_task() now +considers a task's LLC preference before moving it out of its +preferred LLC. + +Additionally, add checks in detach_tasks() to prevent selecting tasks +that prefer their current LLC. + +Co-developed-by: Chen Yu +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Leave out tasks under core scheduling from the cache aware + load balance. 
(K Prateek Nayak) + + Reduce the degree of honoring preferred_llc in detach_tasks(). + If certain conditions are met, stop migrating tasks that prefer + their current LLC and instead continue load balancing from other + busiest runqueues. (K Prateek Nayak) + + kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++++++++-- + kernel/sched/sched.h | 13 +++++++++ + 2 files changed, 74 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index dd09a816670e..580a967efdac 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -9852,8 +9852,8 @@ static enum llc_mig can_migrate_llc(int src_cpu, int dst_cpu, + * Check if task p can migrate from source LLC to + * destination LLC in terms of cache aware load balance. + */ +-static __maybe_unused enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, +- struct task_struct *p) ++static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, ++ struct task_struct *p) + { + struct mm_struct *mm; + bool to_pref; +@@ -10025,6 +10025,13 @@ int can_migrate_task(struct task_struct *p, struct lb_env *env) + if (env->flags & LBF_ACTIVE_LB) + return 1; + ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled() && ++ can_migrate_llc_task(env->src_cpu, env->dst_cpu, p) == mig_forbid && ++ !task_has_sched_core(p)) ++ return 0; ++#endif ++ + degrades = migrate_degrades_locality(p, env); + if (!degrades) + hot = task_hot(p, env); +@@ -10146,12 +10153,55 @@ static struct list_head + list_splice(&pref_old_llc, tasks); + return tasks; + } ++ ++static bool stop_migrate_src_rq(struct task_struct *p, ++ struct lb_env *env, ++ int detached) ++{ ++ if (!sched_cache_enabled() || p->preferred_llc == -1 || ++ cpus_share_cache(env->src_cpu, env->dst_cpu) || ++ env->sd->nr_balance_failed) ++ return false; ++ ++ /* ++ * Stop migration for the src_rq and pull from a ++ * different busy runqueue in the following cases: ++ * ++ * 1. Trying to migrate task to its preferred ++ * LLC, but the chosen task does not prefer dest ++ * LLC - case 3 in order_tasks_by_llc(). This violates ++ * the goal of migrate_llc_task. However, we should ++ * stop detaching only if some tasks have been detached ++ * and the imbalance has been mitigated. ++ * ++ * 2. Don't detach more tasks if the remaining tasks want ++ * to stay. We know the remaining tasks all prefer the ++ * current LLC, because after order_tasks_by_llc(), the ++ * tasks that prefer the current LLC are the least favored ++ * candidates to be migrated out. ++ */ ++ if (env->migration_type == migrate_llc_task && ++ detached && llc_id(env->dst_cpu) != p->preferred_llc) ++ return true; ++ ++ if (llc_id(env->src_cpu) == p->preferred_llc) ++ return true; ++ ++ return false; ++} + #else + static inline struct list_head + *order_tasks_by_llc(struct lb_env *env, struct list_head *tasks) + { + return tasks; + } ++ ++static bool stop_migrate_src_rq(struct task_struct *p, ++ struct lb_env *env, ++ int detached) ++{ ++ return false; ++} + #endif + + /* +@@ -10205,6 +10255,15 @@ static int detach_tasks(struct lb_env *env) + + p = list_last_entry(tasks, struct task_struct, se.group_node); + ++ /* ++ * Check if detaching current src_rq should be stopped, because ++ * doing so would break cache aware load balance. If we stop ++ * here, the env->flags has LBF_ALL_PINNED, which would cause ++ * the load balance to pull from another busy runqueue. 
++ */ ++ if (stop_migrate_src_rq(p, env, detached)) ++ break; ++ + if (!can_migrate_task(p, env)) + goto next; + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 8f2a779825e4..40798a06e058 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1485,6 +1485,14 @@ extern void sched_core_dequeue(struct rq *rq, struct task_struct *p, int flags); + extern void sched_core_get(void); + extern void sched_core_put(void); + ++static inline bool task_has_sched_core(struct task_struct *p) ++{ ++ if (sched_core_disabled()) ++ return false; ++ ++ return !!p->core_cookie; ++} ++ + #else /* !CONFIG_SCHED_CORE: */ + + static inline bool sched_core_enabled(struct rq *rq) +@@ -1524,6 +1532,11 @@ static inline bool sched_group_cookie_match(struct rq *rq, + return true; + } + ++static inline bool task_has_sched_core(struct task_struct *p) ++{ ++ return false; ++} ++ + #endif /* !CONFIG_SCHED_CORE */ + + #ifdef CONFIG_RT_GROUP_SCHED +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip new file mode 100644 index 0000000..31f6859 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-16-23-sched-cache-Introduce-sched_cache_present-to-enable-cache-aware-scheduling-for-multi-LLCs-NUMA-node.patch.skip @@ -0,0 +1,192 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 6E3AB2EFDAD + for ; Wed, 3 Dec 2025 23:01:44 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802907; cv=none; b=DqEqfXaSW0ZZxydgpHnr//9Y+r8Kz4ipcj+CchWbORZ48RCt17FQ2DquLW8sfqca/x+abOrEYIPaq71/GVkzdhR5YktmlcdFPno7ta7IuxETAlghruG+YXcsfmrH3WvfypIFBRxcIK9G7zQ7Meao90BbtEmbg2ZH1AORZqaQMHw= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802907; c=relaxed/simple; + bh=UY2I5n5Zb5eoLU5mFytvpnggFlTCSd5WOZCBICo1NK0=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=ss76k4YY8rB/Z6uAGDFyQbUZ7bARhHHFMR8yOxKyMTjDj6HDUJk3fTrjyBpd8eZwWLWJd6uE+i5j5z2Y9c/kkgK7AnD0FSS5RcyHMwddwez0X8IBpyAwZBkh9Vkri2qy0caEGEQrs66nsLD9/pRtuqh/ensvo0F7AVsRu2xo2+M= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=b/WFlJ1d; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="b/WFlJ1d" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802904; x=1796338904; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=UY2I5n5Zb5eoLU5mFytvpnggFlTCSd5WOZCBICo1NK0=; + 
b=b/WFlJ1dlftZ7EAiu5bb8CTSjdtBeseHX8isQ4Wht5vD1dxWm6RURFOT + R1B3Vg98GKNKQd2LzX3IPnNH9KdzkcCltvIyuRjvzvHEAhFOFxsI/nNCA + UEadn+0Fte3u19UFuKUeR+zfOfQY/nrc24OBpPT4wpQKXE96Ne4Zzhez9 + CGKthr3Nhi0su6EqgFcgXSic3+e2vAZwxOJETpVdCkTcXOxPoH3AQRibc + 89EqfPOQ7c13HxarJn7Y8fuv5oRcK9m2z4cMXZ93jLuPQkW6wM0YzTFzA + la772T94DglzvBNsM6aU73BVVoFLW1MUMY65Xa6wwGE8bwa6iEUdQtZCN + w==; +X-CSE-ConnectionGUID: hV5QNWsDRNeWT6DD4+0r9w== +X-CSE-MsgGUID: zyp/cB/OQI6PxkSBENXT7w== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136566" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136566" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:44 -0800 +X-CSE-ConnectionGUID: iafqWAoBQZGMBBV/tLdIww== +X-CSE-MsgGUID: S+YoPmfDSRiUGxMt3Qjmgw== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763904" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:43 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org, + Libo Chen +Subject: [PATCH v2 16/23] sched/cache: Introduce sched_cache_present to enable cache aware scheduling for multi LLCs NUMA node +Date: Wed, 3 Dec 2025 15:07:35 -0800 +Message-Id: <7453e3f901878608959f23dacaa36dfc0432c05b.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Cache-aware load balancing should only be enabled if there are more +than 1 LLCs within 1 NUMA node. sched_cache_present is introduced to +indicate whether this platform supports this topology. + +Suggested-by: Libo Chen +Suggested-by: Adam Li +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: + Use flag sched_cache_present to indicate whether a platform + supports cache aware scheduling. Change this flag from staic key. + There should be only 1 static key to control the cache aware + scheduling. 
(Peter Zijlstra) + + kernel/sched/topology.c | 20 +++++++++++++++----- + 1 file changed, 15 insertions(+), 5 deletions(-) + +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index d583399fc6a1..9799e3a9a609 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -24,6 +24,8 @@ int max_llcs; + + #ifdef CONFIG_SCHED_CACHE + ++static bool sched_cache_present; ++ + static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc) + { + unsigned int *new = NULL; +@@ -54,7 +56,7 @@ static void populate_new_pref_llcs(unsigned int *old, unsigned int *new) + new[i] = old[i]; + } + +-static int resize_llc_pref(void) ++static int resize_llc_pref(bool has_multi_llcs) + { + unsigned int *__percpu *tmp_llc_pref; + int i, ret = 0; +@@ -102,6 +104,11 @@ static int resize_llc_pref(void) + rq_unlock_irqrestore(rq, &rf); + } + ++ if (has_multi_llcs) { ++ sched_cache_present = true; ++ pr_info_once("Cache aware load balance is enabled on the platform.\n"); ++ } ++ + release_old: + /* + * Load balance is done under rcu_lock. +@@ -124,7 +131,7 @@ static int resize_llc_pref(void) + + #else + +-static int resize_llc_pref(void) ++static int resize_llc_pref(bool has_multi_llcs) + { + max_llcs = new_max_llcs; + return 0; +@@ -2644,6 +2651,7 @@ static int + build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *attr) + { + enum s_alloc alloc_state = sa_none; ++ bool has_multi_llcs = false; + struct sched_domain *sd; + struct s_data d; + struct rq *rq = NULL; +@@ -2736,10 +2744,12 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + * between LLCs and memory channels. + */ + nr_llcs = sd->span_weight / child->span_weight; +- if (nr_llcs == 1) ++ if (nr_llcs == 1) { + imb = sd->span_weight >> 3; +- else ++ } else { + imb = nr_llcs; ++ has_multi_llcs = true; ++ } + imb = max(1U, imb); + sd->imb_numa_nr = imb; + +@@ -2787,7 +2797,7 @@ build_sched_domains(const struct cpumask *cpu_map, struct sched_domain_attr *att + if (has_cluster) + static_branch_inc_cpuslocked(&sched_cluster_active); + +- resize_llc_pref(); ++ resize_llc_pref(has_multi_llcs); + + if (rq && sched_debug_verbose) + pr_info("root domain span: %*pbl\n", cpumask_pr_args(cpu_map)); +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..c27b26e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-17-23-sched-cache-Record-the-number-of-active-threads-per-process-for-cache-aware-scheduling.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 2CA91309EF4 + for ; Wed, 3 Dec 2025 23:01:46 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802908; cv=none; b=nzlhLORShQGH6z2OKPCwgPj3fFYQBq0S4kjlB8PdpAMAbRvUDKx69/o9oLg1lRga1/7uLzN7ZJmwClhqm7REccEFVBXjMxnF8O6F1qeXlUxSc5j6wsPAdvgE25W54gtIVxKBjQRnZDVLeIGtXbaxk29EoCqp7pm1fCpS1IY7jQo= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + 
s=arc-20240116; t=1764802908; c=relaxed/simple; + bh=7G8GAR73tqFcdrEyXVcfBaeUwRwA82VAe47pEbdUV2w=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Jh1NMZniFEQvMeyAac4yMWESOURMqAUIKW5GcomnPyFPuACvinoSr0dUF9HnUWSFLODn+/4wiWm4ySl8YKMzKSgIL7OQSmo169aanmL/sbmdbfeduyjfscZaBGqL5cQYK99GiDZLKPt44QcYP3KC0gclEaC+Rkd8OiTRxeMU500= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=E0yq8JMN; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="E0yq8JMN" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802906; x=1796338906; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=7G8GAR73tqFcdrEyXVcfBaeUwRwA82VAe47pEbdUV2w=; + b=E0yq8JMN3sNhZ58s1b5iZ/cpqNuM9N0pDevJEvrPce0R2mUndVkmGScN + McHDjEQAdkFny/+9qg6ANdvlFmYlDA/4TibC4Yz5kBPZKGiM/VEgmSwNx + Wv+0fExbPAqEqTORsnJ61vyIc7KAkoB0P/ug+G27y1gOBAwA36EGLI/OA + /yCpUK6WyND+MO1j8Jd+Z6+AKRhUgaidNDGg0GWIIit5s7o17SsHVlDsV + qRWNYanMa3En1ALugyelInfcAx8tLNFNwwlqUz9ZCh6D2uuGRuoBR5fLH + VziKp+AH5f2oXxMZP43VD+u7hWt+ni9sCpFuAa1/qPyus5y+HPClviJWH + w==; +X-CSE-ConnectionGUID: oDqO/ga6T/+BT+4b/VYbEw== +X-CSE-MsgGUID: BsO2ZD3WSAih53lppi2XZQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136597" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136597" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:45 -0800 +X-CSE-ConnectionGUID: vKt+yECETT+2Z5MJs0mW1A== +X-CSE-MsgGUID: XpexGbaTSRGCCth9FMIgbg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763921" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:45 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 17/23] sched/cache: Record the number of active threads per process for cache-aware scheduling +Date: Wed, 3 Dec 2025 15:07:36 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +A performance regression was observed by Prateek when running hackbench +with many threads per process (high fd count). To avoid this, processes +with a large number of active threads are excluded from cache-aware +scheduling. 
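The accounting this change adds (described in the next paragraph and implemented in the fair.c hunk further down) boils down to sampling how many CPUs currently run a thread of the process and smoothing that sample. A minimal stand-alone sketch, with an assumed CPU-to-process table and a simplified floating-point average rather than the kernel's integer helper:

#include <stdio.h>

#define NR_CPUS 8

/* Count how many CPUs are currently running a thread of process 'pid'. */
static int count_running_threads(const int cpu_owner[NR_CPUS], int pid)
{
    int n = 0;

    for (int i = 0; i < NR_CPUS; i++)
        if (cpu_owner[i] == pid)    /* same process running on that CPU */
            n++;
    return n;
}

int main(void)
{
    /* assumed snapshot: which process each CPU is running (0 = idle) */
    int cpu_owner[NR_CPUS] = { 42, 42, 42, 7, 0, 42, 7, 0 };
    double nr_running_avg = 0.0;
    int sample = count_running_threads(cpu_owner, 42);

    /* smooth the sample, moving 1/8 of the way toward it each epoch */
    nr_running_avg += (sample - nr_running_avg) / 8.0;
    printf("sample=%d avg=%.2f\n", sample, nr_running_avg);
    return 0;
}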
+ +With sched_cache enabled, record the number of active threads in each +process during the periodic task_cache_work(). While iterating over +CPUs, if the currently running task belongs to the same process as the +task that launched task_cache_work(), increment the active thread count. + +This number will be used by subsequent patch to inhibit cache aware +load balance. + +Suggested-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: No change. + + include/linux/mm_types.h | 1 + + kernel/sched/fair.c | 11 +++++++++-- + 2 files changed, 10 insertions(+), 2 deletions(-) + +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 1ea16ef90566..04743983de4d 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -1043,6 +1043,7 @@ struct mm_struct { + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; ++ u64 nr_running_avg ____cacheline_aligned_in_smp; + #endif + + #ifdef CONFIG_MMU +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 580a967efdac..2f38ad82688f 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1421,11 +1421,11 @@ static void task_tick_cache(struct rq *rq, struct task_struct *p) + + static void __no_profile task_cache_work(struct callback_head *work) + { +- struct task_struct *p = current; ++ struct task_struct *p = current, *cur; + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; +- int cpu, m_a_cpu = -1; ++ int cpu, m_a_cpu = -1, nr_running = 0; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1458,6 +1458,12 @@ static void __no_profile task_cache_work(struct callback_head *work) + m_occ = occ; + m_cpu = i; + } ++ rcu_read_lock(); ++ cur = rcu_dereference(cpu_rq(i)->curr); ++ if (cur && !(cur->flags & (PF_EXITING | PF_KTHREAD)) && ++ cur->mm == mm) ++ nr_running++; ++ rcu_read_unlock(); + } + + /* +@@ -1501,6 +1507,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + mm->mm_sched_cpu = m_a_cpu; + } + ++ update_avg(&mm->nr_running_avg, nr_running); + free_cpumask_var(cpus); + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip new file mode 100644 index 0000000..d348566 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-18-23-sched-cache-Disable-cache-aware-scheduling-for-processes-with-high-thread-counts.patch.skip @@ -0,0 +1,175 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 3BC5B30C376 + for ; Wed, 3 Dec 2025 23:01:48 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802910; cv=none; b=onpA+M+D8g+bB7DNpp5zLpepvUh9w8T9C2/oqeKTUWUlV9lpl/W31aZarTCR7uvwI9r/kkm/FD7MwcDDnX7hNWvSaLIvFHtht8DxsLrUWb3j5NtWoxy2IAV7VHzxT0RxTQbEVmk6ub/tCK+n4V2wt8/jU8sGCZYABu8xUNFmQzE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802910; c=relaxed/simple; + bh=BCBRwLmdA+4IVzADPAWhC/3F5wk90mYr0XsPdVDldug=; + 
h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=P22IAZf3pO0DwcaeGaXfPF45reu5KwrXd9udmOhkXnd4XQpVPzlUupze8eBT005FfxLXJRNYY4JgHS7VRdg5qBGX8VhBoX9G0rOKgnTr7U9RHG4jdp1TU4xtGdenBrAxzksuJ/5c09oa/Ni6O8HCwsplWWOi+6exHbX7OKSFqwo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Ty7FUw1A; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Ty7FUw1A" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802908; x=1796338908; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=BCBRwLmdA+4IVzADPAWhC/3F5wk90mYr0XsPdVDldug=; + b=Ty7FUw1AorJFrTn1pShKiLwJJ/bjWAtb7y1krTlw9/SRwaxzgqmmczqo + u3N/1SifTNffuhxC1c0FAisXDHgXvvqPgSL0eykN2kILgw5XGJw02WLu5 + DTsTU9YL6pY9pb/nL5ZARaF9QKCpSpfipEIM2etVGVvo5Q7kFSTOXs+H8 + iIxOD/4oSuYwezAxsdbkRhhzIdd7YfjUSvB9o0XWfU4YnsJl/heMOcJ7B + H3ZduMD5RF+5BphEK1nTa5CXhVJ0S2nzOaIUo5QipmWAbfGExiFD7Dfvc + B8hxG4haeF2aHk7F8TdO+F6bVlL/xt/ae41Mu5pc0GlLavso3K0AzD+Xh + Q==; +X-CSE-ConnectionGUID: 93dV145yReO721FOecTa9w== +X-CSE-MsgGUID: cCQ1dcHHRZCkHxfc9efEaQ== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136621" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136621" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:47 -0800 +X-CSE-ConnectionGUID: gYbWyA1jQSuPW79ZakwQKg== +X-CSE-MsgGUID: TU95ucBJS6iZ5dz55kzZsQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763946" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:47 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 18/23] sched/cache: Disable cache aware scheduling for processes with high thread counts +Date: Wed, 3 Dec 2025 15:07:37 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +If the number of active threads within the process exceeds the number +of Cores(divided by SMTs number) in the LLC, do not enable cache-aware +scheduling. This is because there is a risk of cache contention within +the preferred LLC when too many threads are present. + +Suggested-by: K Prateek Nayak +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: No change. 
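A minimal stand-alone sketch of the threshold described above: a process is treated as too wide for one LLC when its average running-thread count, scaled by the SMT width, exceeds the number of CPUs in that LLC. The topology numbers are assumptions for the example, not taken from the patch:

#include <stdbool.h>
#include <stdio.h>

/* True when avg_running threads would oversubscribe the cores of one LLC. */
static bool exceeds_llc_cores(unsigned long avg_running,
                              unsigned int smt_per_core,
                              unsigned int llc_cpus)
{
    /* scale by SMT width so the comparison is against physical cores */
    return avg_running * smt_per_core > llc_cpus;
}

int main(void)
{
    unsigned int llc_cpus = 16, smt = 2;    /* assumed: 8 cores x 2 SMT */

    printf("6 threads  -> skip? %d\n", exceeds_llc_cores(6, smt, llc_cpus));
    printf("10 threads -> skip? %d\n", exceeds_llc_cores(10, smt, llc_cpus));
    return 0;
}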
+ + kernel/sched/fair.c | 29 +++++++++++++++++++++++++++-- + 1 file changed, 27 insertions(+), 2 deletions(-) + +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 2f38ad82688f..6afa3f9a4e9b 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1223,6 +1223,18 @@ static int llc_id(int cpu) + return llc; + } + ++static bool exceed_llc_nr(struct mm_struct *mm, int cpu) ++{ ++ int smt_nr = 1; ++ ++#ifdef CONFIG_SCHED_SMT ++ if (sched_smt_active()) ++ smt_nr = cpumask_weight(cpu_smt_mask(cpu)); ++#endif ++ ++ return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++} ++ + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) + { + int pref_llc; +@@ -1365,10 +1377,12 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + + /* + * If this task hasn't hit task_cache_work() for a while, or it +- * has only 1 thread, invalidate its preferred state. ++ * has only 1 thread, or has too many active threads, invalidate ++ * its preferred state. + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || +- get_nr_threads(p) <= 1) { ++ get_nr_threads(p) <= 1 || ++ exceed_llc_nr(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1435,6 +1449,13 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + ++ if (get_nr_threads(p) <= 1) { ++ if (mm->mm_sched_cpu != -1) ++ mm->mm_sched_cpu = -1; ++ ++ return; ++ } ++ + if (!zalloc_cpumask_var(&cpus, GFP_KERNEL)) + return; + +@@ -9874,6 +9895,10 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + ++ /* skip cache aware load balance for single/too many threads */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ return mig_unrestricted; ++ + if (cpus_share_cache(dst_cpu, cpu)) + to_pref = true; + else if (cpus_share_cache(src_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip new file mode 100644 index 0000000..f83468e --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-19-23-sched-cache-Avoid-cache-aware-scheduling-for-memory-heavy-processes.patch.skip @@ -0,0 +1,258 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 9101E2EC54D + for ; Wed, 3 Dec 2025 23:01:50 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802913; cv=none; b=mZ6zgozB73YTe2Q60NzNJeXcrA6dwd6hmTIv0PKyoFj0ekz5KBJkRG1qM2/BURh0aF7CFHE0sYQDT25Sh/ho6UmSGiIRzP3Vlf26ErGeRZYynNy7Hu4jA7k4JybnWrC09LDy8qEGxsIyAxdcr/3QTceL1Zxm0kxxCEBV46nlDEI= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802913; c=relaxed/simple; + bh=ty+thnKFxG9+3T4ifTVEX04pmBe/l14iXANMioAm72I=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=bowCsa1//bbyzKU9WSJiWQsUHsXrqBQlvs/cAKgMyk/m4Bld010TDYg5UwVzdHKRvlpaid+xFoVz12quGwWlGa5F6HadDbBqKTBPP6/p1CNg91urhPN3p32qxubeGCoBIbuMM7MCO6I/YdFGB6u4/f5TpvPg3YmLnLcjC8/C7Xc= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=bxWe6OeK; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="bxWe6OeK" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802910; x=1796338910; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=ty+thnKFxG9+3T4ifTVEX04pmBe/l14iXANMioAm72I=; + b=bxWe6OeKUH0dPxqgW1jI5HE2e1z6OmOiyR4hMvqwqKai+AqvYcbOCYwu + JOlPn9ZWYosHECHx5UGnkdTGEzkOmDWCRC2K3ypKwePUhIyD1337RCjJ3 + uixa8Z2lYSQS2J5GJVC48B2f/yhUzBFPqFV4CEHvCoMLsK1cOf7W1aP4l + eQBVHvIxVJB4mpBt3ae1f/13ipHHAFwfwmFLo4k5SToBHKxSAT6nyvK8a + Vm37u8PzhAmKBcxxBJlGGGzpwc2T4MC/PWSin17i5/r/Xk+DaSUzLnxaF + ZlP2B1+lT/NuonQU/h16sWvSe3/WRw4AeV5gKIbsttEfaewPOisfGEd7j + g==; +X-CSE-ConnectionGUID: Jgmht7L1SaW2ul5kAUA6dw== +X-CSE-MsgGUID: 8CE3l3r/SEaFaHk/6vdVRg== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136653" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136653" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:49 -0800 +X-CSE-ConnectionGUID: 88MGzjBCTmOWjRxLdU7vUw== +X-CSE-MsgGUID: Bi68ivGaS76IdMdbGxb19w== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763965" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:49 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 19/23] sched/cache: Avoid cache-aware scheduling for memory-heavy processes +Date: Wed, 3 Dec 2025 15:07:38 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Prateek and Tingyin reported that memory-intensive workloads (such as +stream) can saturate memory bandwidth and caches on the preferred LLC +when sched_cache aggregates too many threads. + +To mitigate this, estimate a process's memory footprint by comparing +its RSS (anonymous and shared pages) to the size of the LLC. If RSS +exceeds the LLC size, skip cache-aware scheduling. + +Note that RSS is only an approximation of the memory footprint. 
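For illustration, a stand-alone sketch of that comparison, with assumed page and cache sizes in place of the kernel's cacheinfo and mm counters:

#include <stdbool.h>
#include <stdio.h>

#define PAGE_SIZE 4096UL

/* True when the resident anon+shmem footprint is at least as large as the LLC. */
static bool exceeds_llc_capacity(unsigned long rss_pages, unsigned long llc_bytes)
{
    return llc_bytes <= rss_pages * PAGE_SIZE;
}

int main(void)
{
    unsigned long llc_bytes = 32UL << 20;   /* assume a 32 MiB LLC */

    printf("4 MiB RSS  -> skip? %d\n", exceeds_llc_capacity(1024, llc_bytes));
    printf("40 MiB RSS -> skip? %d\n", exceeds_llc_capacity(10240, llc_bytes));
    return 0;
}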
+By default, the comparison is strict, but a later patch will allow +users to provide a hint to adjust this threshold. + +According to the test from Adam, some systems do not have shared L3 +but with shared L2 as clusters. In this case, the L2 becomes the LLC[1]. + +Link[1]: https://lore.kernel.org/all/3cb6ebc7-a2fd-42b3-8739-b00e28a09cb6@os.amperecomputing.com/ + +Co-developed-by: Tim Chen +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + +Notes: + v1->v2: Assigned curr_cpu in task_cache_work() before checking + exceed_llc_capacity(mm, curr_cpu) to avoid out-of-bound + access.(lkp/0day) + + include/linux/cacheinfo.h | 21 ++++++++++------- + kernel/sched/fair.c | 49 +++++++++++++++++++++++++++++++++++---- + 2 files changed, 57 insertions(+), 13 deletions(-) + +diff --git a/include/linux/cacheinfo.h b/include/linux/cacheinfo.h +index c8f4f0a0b874..82d0d59ca0e1 100644 +--- a/include/linux/cacheinfo.h ++++ b/include/linux/cacheinfo.h +@@ -113,18 +113,11 @@ int acpi_get_cache_info(unsigned int cpu, + + const struct attribute_group *cache_get_priv_group(struct cacheinfo *this_leaf); + +-/* +- * Get the cacheinfo structure for the cache associated with @cpu at +- * level @level. +- * cpuhp lock must be held. +- */ +-static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++static inline struct cacheinfo *_get_cpu_cacheinfo_level(int cpu, int level) + { + struct cpu_cacheinfo *ci = get_cpu_cacheinfo(cpu); + int i; + +- lockdep_assert_cpus_held(); +- + for (i = 0; i < ci->num_leaves; i++) { + if (ci->info_list[i].level == level) { + if (ci->info_list[i].attributes & CACHE_ID) +@@ -136,6 +129,18 @@ static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) + return NULL; + } + ++/* ++ * Get the cacheinfo structure for the cache associated with @cpu at ++ * level @level. ++ * cpuhp lock must be held. ++ */ ++static inline struct cacheinfo *get_cpu_cacheinfo_level(int cpu, int level) ++{ ++ lockdep_assert_cpus_held(); ++ ++ return _get_cpu_cacheinfo_level(cpu, level); ++} ++ + /* + * Get the id of the cache associated with @cpu at level @level. + * cpuhp lock must be held. +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 6afa3f9a4e9b..424ec601cfdf 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1223,6 +1223,38 @@ static int llc_id(int cpu) + return llc; + } + ++static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) ++{ ++ struct cacheinfo *ci; ++ unsigned long rss; ++ unsigned int llc; ++ ++ /* ++ * get_cpu_cacheinfo_level() can not be used ++ * because it requires the cpu_hotplug_lock ++ * to be held. Use _get_cpu_cacheinfo_level() ++ * directly because the 'cpu' can not be ++ * offlined at the moment. ++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 3); ++ if (!ci) { ++ /* ++ * On system without L3 but with shared L2, ++ * L2 becomes the LLC. 
++ */ ++ ci = _get_cpu_cacheinfo_level(cpu, 2); ++ if (!ci) ++ return true; ++ } ++ ++ llc = ci->size; ++ ++ rss = get_mm_counter(mm, MM_ANONPAGES) + ++ get_mm_counter(mm, MM_SHMEMPAGES); ++ ++ return (llc <= (rss * PAGE_SIZE)); ++} ++ + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { + int smt_nr = 1; +@@ -1382,7 +1414,8 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + */ + if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || + get_nr_threads(p) <= 1 || +- exceed_llc_nr(mm, cpu_of(rq))) { ++ exceed_llc_nr(mm, cpu_of(rq)) || ++ exceed_llc_capacity(mm, cpu_of(rq))) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + } +@@ -1439,7 +1472,7 @@ static void __no_profile task_cache_work(struct callback_head *work) + struct mm_struct *mm = p->mm; + unsigned long m_a_occ = 0; + unsigned long curr_m_a_occ = 0; +- int cpu, m_a_cpu = -1, nr_running = 0; ++ int cpu, m_a_cpu = -1, nr_running = 0, curr_cpu; + cpumask_var_t cpus; + + WARN_ON_ONCE(work != &p->cache_work); +@@ -1449,7 +1482,9 @@ static void __no_profile task_cache_work(struct callback_head *work) + if (p->flags & PF_EXITING) + return; + +- if (get_nr_threads(p) <= 1) { ++ curr_cpu = task_cpu(p); ++ if (get_nr_threads(p) <= 1 || ++ exceed_llc_capacity(mm, curr_cpu)) { + if (mm->mm_sched_cpu != -1) + mm->mm_sched_cpu = -1; + +@@ -9895,8 +9930,12 @@ static enum llc_mig can_migrate_llc_task(int src_cpu, int dst_cpu, + if (cpu < 0 || cpus_share_cache(src_cpu, dst_cpu)) + return mig_unrestricted; + +- /* skip cache aware load balance for single/too many threads */ +- if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu)) ++ /* ++ * Skip cache aware load balance for single/too many threads ++ * or large footprint. ++ */ ++ if (get_nr_threads(p) <= 1 || exceed_llc_nr(mm, dst_cpu) || ++ exceed_llc_capacity(mm, dst_cpu)) + return mig_unrestricted; + + if (cpus_share_cache(dst_cpu, cpu)) +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip new file mode 100644 index 0000000..8a42b76 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-20-23-sched-cache-Add-user-control-to-adjust-the-parameters-of-cache-aware-scheduling.patch.skip @@ -0,0 +1,478 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id E85E42F0C6F + for ; Wed, 3 Dec 2025 23:01:52 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802915; cv=none; b=NKB81c5nkJMF1m/c1AQra8pCalQ/VATWqz8ZHIWg0eoz6hnNECnbqY6IjBOdnDBFvVl/b9HVmkECeNM1mHW2uEI8K209dQ6+mwy42BNPEeHaX20qEOS7RazcHKvkjiS5SxHlmYAv1Sx5K4HGlnkZ+3m/wG0/DRyA26pbDpUaoF0= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802915; c=relaxed/simple; + bh=j5hfiRZ2EYaCTsQGDmAvNRTgCCnUI1j/ItMFRbl9uzY=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; 
b=V2hqbFyqQGneKfxIcpO2Kc5dagTB+TDzJUq23BN2DeHLv/PgsNga9e2rv+hmluwZMbEcHv9RyyZKJ8F8TwCiuK0Z3yMm4l1RIXSG3p6TYCnyj/3zsuh7jcDOrc/cJgzZvLgpTBDOt79ulEa8r4q4GzHG4PsV4tL2S7Y8MOiS1eo= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=aHHISq0g; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="aHHISq0g" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802913; x=1796338913; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=j5hfiRZ2EYaCTsQGDmAvNRTgCCnUI1j/ItMFRbl9uzY=; + b=aHHISq0gwB38J2pv7w+1lfXdj3ALD4Re5eBGYwuwYbgrSTS87mzWr9d9 + 6z8UE8JAD8ovVTi9HPH2Dj4nm47BQyJFWTB7aSIByFBZvHQDMif8JcxQo + YN44mNhAEn4CrrZXow3MjME9dhVbGveKvuIPn5IfCupOo2V/UomJWHR8v + dtkYFqLnVw3S3bkna5BsUdpRh9ZBimaMuGq/+WwGF2nx4rrzpNdxn0j5U + 3rhoVYZ01bV7elVPmaWw/ckqsd0iILZe0x+W0mSMx9qrnSVEtbw4rvo6z + M5hLadE9a+KUPXiCE/w4A03eCnExBDNTMSqLbTk/r37NYHjbU70zyE3SM + g==; +X-CSE-ConnectionGUID: EZWPyiB6S9KiFT6DKfxjxA== +X-CSE-MsgGUID: 07XoWa+5TBOCIV3mZenWgw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136682" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136682" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:51 -0800 +X-CSE-ConnectionGUID: MiEptcrPQgi3rw/P5nNNDA== +X-CSE-MsgGUID: DrdTMc52RGuwpeHC+Js9gg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763975" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:51 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 20/23] sched/cache: Add user control to adjust the parameters of cache-aware scheduling +Date: Wed, 3 Dec 2025 15:07:39 -0800 +Message-Id: +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Introduce a set of debugfs knobs to control the enabling of +and parameters for cache-aware load balancing. + +(1) llc_enabled +llc_enabled acts as the primary switch - users can toggle it to +enable or disable cache aware load balancing. + +(2) llc_aggr_tolerance +With sched_cache enabled, the scheduler uses a process's RSS as a +proxy for its LLC footprint to determine if aggregating tasks on the +preferred LLC could cause cache contention. If RSS exceeds the LLC +size, aggregation is skipped. 
Some workloads with large RSS but small +actual memory footprints may still benefit from aggregation. Since +the kernel cannot efficiently track per-task cache usage (resctrl is +user-space only), userspace can provide a more accurate hint. + +Introduce /sys/kernel/debug/sched/llc_aggr_tolerance to let +users control how strictly RSS limits aggregation. Values range from +0 to 100: + + - 0: Cache-aware scheduling is disabled. + - 1: Strict; tasks with RSS larger than LLC size are skipped. + - 100: Aggressive; tasks are aggregated regardless of RSS. + +For example, with a 32MB L3 cache: + + - llc_aggr_tolerance=1 -> tasks with RSS > 32MB are skipped. + - llc_aggr_tolerance=99 -> tasks with RSS > 784GB are skipped + (784GB = (1 + (99 - 1) * 256) * 32MB). + +Similarly, /sys/kernel/debug/sched/llc_aggr_tolerance also controls +how strictly the number of active threads is considered when doing +cache aware load balance. The number of SMTs is also considered. +High SMT counts reduce the aggregation capacity, preventing excessive +task aggregation on SMT-heavy systems like Power10/Power11. + +For example, with 8 Cores/16 CPUs in a L3: + + - llc_aggr_tolerance=1 -> tasks with nr_running > 8 are skipped. + - llc_aggr_tolerance=99 -> tasks with nr_running > 785 are skipped + 785 = (1 + (99 - 1) * 8). + +(3) llc_epoch_period/llc_epoch_affinity_timeout +Besides, llc_epoch_period and llc_epoch_affinity_timeout are also turned +into tunable. + +Suggested-by: K Prateek Nayak +Suggested-by: Madadi Vineeth Reddy +Suggested-by: Shrikanth Hegde +Suggested-by: Tingyin Duan +Co-developed-by: Tim Chen +Signed-off-by: Tim Chen +Signed-off-by: Chen Yu +--- + +Notes: + v1->v2: Remove the smt_nr check in fits_llc_capacity(). + (Aaron Lu) + + include/linux/sched.h | 4 ++- + kernel/sched/debug.c | 62 ++++++++++++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 63 ++++++++++++++++++++++++++++++++++++----- + kernel/sched/sched.h | 5 ++++ + kernel/sched/topology.c | 54 +++++++++++++++++++++++++++++++++-- + 5 files changed, 178 insertions(+), 10 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 466ba8b7398c..95bf080bbbf0 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2436,9 +2436,11 @@ extern void migrate_enable(void); + DEFINE_LOCK_GUARD_0(migrate, migrate_disable(), migrate_enable()) + + #ifdef CONFIG_SCHED_CACHE ++DECLARE_STATIC_KEY_FALSE(sched_cache_on); ++ + static inline bool sched_cache_enabled(void) + { +- return false; ++ return static_branch_unlikely(&sched_cache_on); + } + #endif + +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 02e16b70a790..cde324672103 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_CACHE ++#define SCHED_CACHE_CREATE_CONTROL(name, max) \ ++static ssize_t sched_cache_write_##name(struct file *filp, \ ++ const char __user *ubuf, \ ++ size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int val; \ ++ if (cnt > 15) \ ++ cnt = 15; \ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++ if (kstrtouint(buf, 10, &val)) \ ++ return -EINVAL; \ ++ if (val > (max)) \ ++ return -EINVAL; \ ++ llc_##name = val; \ ++ if (!strcmp(#name, "enabled")) \ ++ sched_cache_set(false); \ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++static int sched_cache_show_##name(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", llc_##name); \ ++ 
return 0; \ ++} \ ++static int sched_cache_open_##name(struct inode *inode, \ ++ struct file *filp) \ ++{ \ ++ return single_open(filp, sched_cache_show_##name, NULL); \ ++} \ ++static const struct file_operations sched_cache_fops_##name = { \ ++ .open = sched_cache_open_##name, \ ++ .write = sched_cache_write_##name, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++} ++ ++SCHED_CACHE_CREATE_CONTROL(overload_pct, 100); ++SCHED_CACHE_CREATE_CONTROL(imb_pct, 100); ++SCHED_CACHE_CREATE_CONTROL(aggr_tolerance, 100); ++SCHED_CACHE_CREATE_CONTROL(enabled, 1); ++#endif /* SCHED_CACHE */ ++ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -523,6 +570,21 @@ static __init int sched_init_debug(void) + debugfs_create_u32("hot_threshold_ms", 0644, numa, &sysctl_numa_balancing_hot_threshold); + #endif /* CONFIG_NUMA_BALANCING */ + ++#ifdef CONFIG_SCHED_CACHE ++ debugfs_create_file("llc_overload_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_overload_pct); ++ debugfs_create_file("llc_imb_pct", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_imb_pct); ++ debugfs_create_file("llc_aggr_tolerance", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_aggr_tolerance); ++ debugfs_create_file("llc_enabled", 0644, debugfs_sched, NULL, ++ &sched_cache_fops_enabled); ++ debugfs_create_u32("llc_epoch_period", 0644, debugfs_sched, ++ &llc_epoch_period); ++ debugfs_create_u32("llc_epoch_affinity_timeout", 0644, debugfs_sched, ++ &llc_epoch_affinity_timeout); ++#endif ++ + debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); + + debugfs_fair_server_init(); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 424ec601cfdf..a2e2d6742481 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1207,6 +1207,9 @@ static s64 update_se(struct rq *rq, struct sched_entity *se) + + __read_mostly unsigned int llc_overload_pct = 50; + __read_mostly unsigned int llc_imb_pct = 20; ++__read_mostly unsigned int llc_aggr_tolerance = 1; ++__read_mostly unsigned int llc_epoch_period = EPOCH_PERIOD; ++__read_mostly unsigned int llc_epoch_affinity_timeout = EPOCH_LLC_AFFINITY_TIMEOUT; + + static int llc_id(int cpu) + { +@@ -1223,11 +1226,22 @@ static int llc_id(int cpu) + return llc; + } + ++static inline int get_sched_cache_scale(int mul) ++{ ++ if (!llc_aggr_tolerance) ++ return 0; ++ ++ if (llc_aggr_tolerance == 100) ++ return INT_MAX; ++ ++ return (1 + (llc_aggr_tolerance - 1) * mul); ++} ++ + static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + { ++ unsigned int llc, scale; + struct cacheinfo *ci; + unsigned long rss; +- unsigned int llc; + + /* + * get_cpu_cacheinfo_level() can not be used +@@ -1252,19 +1266,54 @@ static bool exceed_llc_capacity(struct mm_struct *mm, int cpu) + rss = get_mm_counter(mm, MM_ANONPAGES) + + get_mm_counter(mm, MM_SHMEMPAGES); + +- return (llc <= (rss * PAGE_SIZE)); ++ /* ++ * Scale the LLC size by 256*llc_aggr_tolerance ++ * and compare it to the task's RSS size. ++ * ++ * Suppose the L3 size is 32MB. If the ++ * llc_aggr_tolerance is 1: ++ * When the RSS is larger than 32MB, the process ++ * is regarded as exceeding the LLC capacity. 
If ++ * the llc_aggr_tolerance is 99: ++ * When the RSS is larger than 784GB, the process ++ * is regarded as exceeding the LLC capacity because: ++ * 784GB = (1 + (99 - 1) * 256) * 32MB ++ */ ++ scale = get_sched_cache_scale(256); ++ if (scale == INT_MAX) ++ return false; ++ ++ return ((llc * scale) <= (rss * PAGE_SIZE)); + } + + static bool exceed_llc_nr(struct mm_struct *mm, int cpu) + { +- int smt_nr = 1; ++ int smt_nr = 1, scale; + + #ifdef CONFIG_SCHED_SMT + if (sched_smt_active()) + smt_nr = cpumask_weight(cpu_smt_mask(cpu)); + #endif ++ /* ++ * Scale the Core number in a LLC by llc_aggr_tolerance ++ * and compare it to the task's active threads. ++ * ++ * Suppose the number of Cores in LLC is 8. ++ * Every core has 2 SMTs. ++ * If the llc_aggr_tolerance is 1: When the ++ * nr_running is larger than 8, the process ++ * is regarded as exceeding the LLC capacity. ++ * If the llc_aggr_tolerance is 99: ++ * When the nr_running is larger than 785, ++ * the process is regarded as exceeding ++ * the LLC capacity: ++ * 785 = 1 + (99 - 1) * 8 ++ */ ++ scale = get_sched_cache_scale(1); ++ if (scale == INT_MAX) ++ return false; + +- return ((mm->nr_running_avg * smt_nr) > per_cpu(sd_llc_size, cpu)); ++ return ((mm->nr_running_avg * smt_nr) > (scale * per_cpu(sd_llc_size, cpu))); + } + + static void account_llc_enqueue(struct rq *rq, struct task_struct *p) +@@ -1350,9 +1399,9 @@ static inline void __update_mm_sched(struct rq *rq, struct mm_sched *pcpu_sched) + long delta = now - rq->cpu_epoch_next; + + if (delta > 0) { +- n = (delta + EPOCH_PERIOD - 1) / EPOCH_PERIOD; ++ n = (delta + llc_epoch_period - 1) / llc_epoch_period; + rq->cpu_epoch += n; +- rq->cpu_epoch_next += n * EPOCH_PERIOD; ++ rq->cpu_epoch_next += n * llc_epoch_period; + __shr_u64(&rq->cpu_runtime, n); + } + +@@ -1412,7 +1461,7 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + * has only 1 thread, or has too many active threads, invalidate + * its preferred state. + */ +- if (epoch - READ_ONCE(mm->mm_sched_epoch) > EPOCH_LLC_AFFINITY_TIMEOUT || ++ if (epoch - READ_ONCE(mm->mm_sched_epoch) > llc_epoch_affinity_timeout || + get_nr_threads(p) <= 1 || + exceed_llc_nr(mm, cpu_of(rq)) || + exceed_llc_capacity(mm, cpu_of(rq))) { +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 40798a06e058..15d126bd3728 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2852,6 +2852,11 @@ extern unsigned int sysctl_numa_balancing_hot_threshold; + #ifdef CONFIG_SCHED_CACHE + extern unsigned int llc_overload_pct; + extern unsigned int llc_imb_pct; ++extern unsigned int llc_aggr_tolerance; ++extern unsigned int llc_epoch_period; ++extern unsigned int llc_epoch_affinity_timeout; ++extern unsigned int llc_enabled; ++void sched_cache_set(bool locked); + #endif + + #ifdef CONFIG_SCHED_HRTICK +diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c +index 9799e3a9a609..818599ddaaef 100644 +--- a/kernel/sched/topology.c ++++ b/kernel/sched/topology.c +@@ -26,6 +26,49 @@ int max_llcs; + + static bool sched_cache_present; + ++unsigned int llc_enabled = 1; ++DEFINE_STATIC_KEY_FALSE(sched_cache_on); ++ ++/* ++ * Enable/disable cache aware scheduling according to ++ * user input and the presence of hardware support. 
++ */ ++static void _sched_cache_set(bool enable, bool locked) ++{ ++ if (enable) { ++ if (locked) ++ static_branch_enable_cpuslocked(&sched_cache_on); ++ else ++ static_branch_enable(&sched_cache_on); ++ } else { ++ if (locked) ++ static_branch_disable_cpuslocked(&sched_cache_on); ++ else ++ static_branch_disable(&sched_cache_on); ++ } ++} ++ ++void sched_cache_set(bool locked) ++{ ++ /* hardware does not support */ ++ if (!sched_cache_present) { ++ if (static_branch_likely(&sched_cache_on)) ++ _sched_cache_set(false, locked); ++ ++ return; ++ } ++ ++ /* user wants it or not ?*/ ++ if (llc_enabled) { ++ if (!static_branch_likely(&sched_cache_on)) ++ _sched_cache_set(true, locked); ++ ++ } else { ++ if (static_branch_likely(&sched_cache_on)) ++ _sched_cache_set(false, locked); ++ } ++} ++ + static unsigned int *alloc_new_pref_llcs(unsigned int *old, unsigned int **gc) + { + unsigned int *new = NULL; +@@ -70,8 +113,12 @@ static int resize_llc_pref(bool has_multi_llcs) + * new buffer. + */ + tmp_llc_pref = alloc_percpu_noprof(unsigned int *); +- if (!tmp_llc_pref) +- return -ENOMEM; ++ if (!tmp_llc_pref) { ++ sched_cache_present = false; ++ ret = -ENOMEM; ++ ++ goto out; ++ } + + for_each_present_cpu(i) + *per_cpu_ptr(tmp_llc_pref, i) = NULL; +@@ -89,6 +136,7 @@ static int resize_llc_pref(bool has_multi_llcs) + new = alloc_new_pref_llcs(rq->nr_pref_llc, per_cpu_ptr(tmp_llc_pref, i)); + if (!new) { + ret = -ENOMEM; ++ sched_cache_present = false; + + goto release_old; + } +@@ -126,6 +174,8 @@ static int resize_llc_pref(bool has_multi_llcs) + if (!ret) + max_llcs = new_max_llcs; + ++out: ++ sched_cache_set(true); + return ret; + } + +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip new file mode 100644 index 0000000..91cb19a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-21-23-DO-NOT-APPLY-sched-cache-stats-Add-schedstat-for-cache-aware-load-balancing.patch.skip @@ -0,0 +1,174 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id D10542F12DD + for ; Wed, 3 Dec 2025 23:01:53 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802915; cv=none; b=AmWzQXbFY2sN5heLcp4s9rWoLO7pjURsg464nsA8jjoqA5nJagwpJv9G+UJULof1tTaFgz2GmAr0hHkABofj6ydnfXE2fd4hRRYb7GE+M+4gERnZr5wAJOQw/zTEmxBeWSSE5iNgbAWmM054GBUn6MCdpITYzuKbb1BP7b3L3sk= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802915; c=relaxed/simple; + bh=heewblv8+VUSifHzkX3W2P+i26TuBbpse5E1oodIHu8=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=Xpw0JLbgiphYf3Sab645eHcm9Luo+Mx2FuFXrjcPsXJxnYfglU5zHbY1C3nGcYUTlQht3caQEhhC7tRceDrXIkNZHUg5zn5pvhgic99RbM9RtmxCAUWRJKHEvQHILxmwPExiCxVB0m/pqwl8+stVV67Gqhqd6Lhw1hT41ldDB9s= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=Cxg4oTl4; arc=none smtp.client-ip=198.175.65.11 
+Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="Cxg4oTl4" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802913; x=1796338913; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=heewblv8+VUSifHzkX3W2P+i26TuBbpse5E1oodIHu8=; + b=Cxg4oTl4FXmQXHOKywDD1PXh0TwFbaiKduxzegiGnyiEGbaHQGeStB45 + heDXhCr5sdgqIhbxUFp1vM0glTwn0l4/6ZiEL/dgHN9LNlGjaYsII9jc1 + 2qGZ9JRhqrUWqdc8Jm6fWF0Wuz16A6ncwR05z1/osHOGjbKNCnVNF9Y0l + 4FSdn5Pg7wz/0mo5Tfd9kz21TLqYSS8tlCVsn5MnhfbvMVKYOtZOb0WKR + 3KiZKcH2I7DsvpgO/euP9zAwOTpRdP8eIGES5K1LCg7I6oiUiavAKbHWR + nP3xATAIJENhZb+rdETusA0Fs1MIUcnKK88Vr8NJIw3yCIQUWh4CdT9qz + A==; +X-CSE-ConnectionGUID: 32GrqILbQRmayJvwXZJ8Bg== +X-CSE-MsgGUID: F77VqvbYTl+X7/J43cXgzw== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136713" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136713" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:53 -0800 +X-CSE-ConnectionGUID: U2qPsSdSRnej2tO9wNUi2g== +X-CSE-MsgGUID: YiZKZfnpSMaNZy2O6pI9Xg== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199763990" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:52 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 21/23] -- DO NOT APPLY!!! -- sched/cache/stats: Add schedstat for cache aware load balancing +Date: Wed, 3 Dec 2025 15:07:40 -0800 +Message-Id: <71b94a7547f7843230270e20b84ecb0a540ab604.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Debug patch only. + +With cache-aware load balancing enabled, statistics related to its activity +are exposed via /proc/schedstat and debugfs. For instance, if users want to +verify metrics like the number of exceeding RSS and nr_running limits, they +can filter the output of /sys/kernel/debug/sched/debug and compute the required +statistics manually: + +llc_exceed_cap SUM: 6 +llc_exceed_nr SUM: 4531 + +Furthermore, these statistics exposed in /proc/schedstats can be queried manually +or via perf sched stats[1] with minor modifications. 
+ +Link: https://lore.kernel.org/all/20250909114227.58802-1-swapnil.sapkal@amd.com #1 + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/linux/sched/topology.h | 1 + + kernel/sched/fair.c | 1 + + kernel/sched/stats.c | 5 +++-- + 3 files changed, 5 insertions(+), 2 deletions(-) + +diff --git a/include/linux/sched/topology.h b/include/linux/sched/topology.h +index 0ba4697d74ba..8702c1e731a0 100644 +--- a/include/linux/sched/topology.h ++++ b/include/linux/sched/topology.h +@@ -108,6 +108,7 @@ struct sched_domain { + unsigned int lb_imbalance_util[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_task[CPU_MAX_IDLE_TYPES]; + unsigned int lb_imbalance_misfit[CPU_MAX_IDLE_TYPES]; ++ unsigned int lb_imbalance_llc[CPU_MAX_IDLE_TYPES]; + unsigned int lb_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_hot_gained[CPU_MAX_IDLE_TYPES]; + unsigned int lb_nobusyg[CPU_MAX_IDLE_TYPES]; +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index a2e2d6742481..742e455b093e 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -12684,6 +12684,7 @@ static void update_lb_imbalance_stat(struct lb_env *env, struct sched_domain *sd + __schedstat_add(sd->lb_imbalance_misfit[idle], env->imbalance); + break; + case migrate_llc_task: ++ __schedstat_add(sd->lb_imbalance_llc[idle], env->imbalance); + break; + } + } +diff --git a/kernel/sched/stats.c b/kernel/sched/stats.c +index d1c9429a4ac5..3736f6102261 100644 +--- a/kernel/sched/stats.c ++++ b/kernel/sched/stats.c +@@ -104,7 +104,7 @@ void __update_stats_enqueue_sleeper(struct rq *rq, struct task_struct *p, + * Bump this up when changing the output format or the meaning of an existing + * format, so that tools can adapt (or abort) + */ +-#define SCHEDSTAT_VERSION 17 ++#define SCHEDSTAT_VERSION 18 + + static int show_schedstat(struct seq_file *seq, void *v) + { +@@ -139,7 +139,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + seq_printf(seq, "domain%d %s %*pb", dcount++, sd->name, + cpumask_pr_args(sched_domain_span(sd))); + for (itype = 0; itype < CPU_MAX_IDLE_TYPES; itype++) { +- seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u", ++ seq_printf(seq, " %u %u %u %u %u %u %u %u %u %u %u %u", + sd->lb_count[itype], + sd->lb_balanced[itype], + sd->lb_failed[itype], +@@ -147,6 +147,7 @@ static int show_schedstat(struct seq_file *seq, void *v) + sd->lb_imbalance_util[itype], + sd->lb_imbalance_task[itype], + sd->lb_imbalance_misfit[itype], ++ sd->lb_imbalance_llc[itype], + sd->lb_gained[itype], + sd->lb_hot_gained[itype], + sd->lb_nobusyq[itype], +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip new file mode 100644 index 0000000..434c1b6 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-22-23-DO-NOT-APPLY-sched-cache-debug-Add-ftrace-to-track-the-load-balance-statistics.patch.skip @@ -0,0 +1,172 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id B53DB2EDD63 + for ; Wed, 3 Dec 2025 23:01:55 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; 
s=arc-20240116; + t=1764802917; cv=none; b=Sv2g8yh/ssOUkCxGvmjgju6aonEWXYABCuXTb+U7pmXY6LV36x4JKu1MuMeuYO1vCluXZy/7Ay7i1yE6FtkBqXrqbYaDn/USnq7xKePL08B+Z5erY6PuyaIsHhWqUANVdUR5D6Behj/PK8qsySaRT1rgt6AitMIk8lP+NbOAHCE= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802917; c=relaxed/simple; + bh=Z1RTLO9XI8wzi8HuLfkDHGWZXGFVHLiwaXN3uD70l1Y=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=CATqSWfOo+6YE9nXLVWZJO6JnMOLrl52x8cVMx1zwPuSpCTUr3IN5JnkiXN2GyKQ26mCPXBWcWxBHdzMY7E9cxtAmJLxGzbXdU2Fg+4DSuAYi1K0o6tozFHYiuKS+6QKbzMtYuK8+ri9bLYJjOu4P79WeHsP8FgYaIsrRFSRu2w= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=mJ2c+rcP; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="mJ2c+rcP" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802915; x=1796338915; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=Z1RTLO9XI8wzi8HuLfkDHGWZXGFVHLiwaXN3uD70l1Y=; + b=mJ2c+rcP1UOBgGP4yRYC4G9oY4qxvoF1rz/E8g2VluXVhdaKym+KKeiM + 98QozNlJsgm6c2psR2Mp1UJhkz/Z+hMiEVNErwajLDcIdLXPKWwrmkhgP + CWKO4YFSmv7sZsGBLUL6MPnqDCpqzgPQvR5FKXPgi7m3I3rXLqAaZgLzM + bfubfkiwaBvcluOfyoYhJ37GeqSNPw53SP+PU0pGAu+cSL5BeyuIN+g+r + dRFzsYKK0wBWGsqYyMy6aje2lH7qKav3U/83YEE1h0WkyFF5hAmr4RJRT + /HIg5gjIb43mMeVrXXMSuFG2ajgVo7HXw1utNSmLiOQiREq43MfL2zw8y + w==; +X-CSE-ConnectionGUID: 9oIPtAybQ2qo8rrXonYwDQ== +X-CSE-MsgGUID: zYiFP9hSSKCIjoTpSYg3HA== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136743" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136743" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:54 -0800 +X-CSE-ConnectionGUID: WESeCoKDRrGN0u/wU3Xx1Q== +X-CSE-MsgGUID: 8VfHl2BFSC60lwjXMiz/LQ== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199764003" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:54 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 22/23] -- DO NOT APPLY!!! 
-- sched/cache/debug: Add ftrace to track the load balance statistics +Date: Wed, 3 Dec 2025 15:07:41 -0800 +Message-Id: <445303c70d8d464c35c97f33d4be7b752e8db5ae.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Debug patch only. + +The user leverages this trace event (via bpftrace, etc)to monitor the cache +aware load balance activity - whether the tasks are moved to their preferred +LLC, or moved out of their preferred LLC. + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + include/trace/events/sched.h | 31 +++++++++++++++++++++++++++++++ + kernel/sched/fair.c | 10 ++++++++++ + 2 files changed, 41 insertions(+) + +diff --git a/include/trace/events/sched.h b/include/trace/events/sched.h +index 7b2645b50e78..bd03f49f7e3c 100644 +--- a/include/trace/events/sched.h ++++ b/include/trace/events/sched.h +@@ -10,6 +10,37 @@ + #include + #include + ++TRACE_EVENT(sched_attach_task, ++ ++ TP_PROTO(struct task_struct *t, int pref_cpu, int pref_llc, ++ int attach_cpu, int attach_llc), ++ ++ TP_ARGS(t, pref_cpu, pref_llc, attach_cpu, attach_llc), ++ ++ TP_STRUCT__entry( ++ __array( char, comm, TASK_COMM_LEN ) ++ __field( pid_t, pid ) ++ __field( int, pref_cpu ) ++ __field( int, pref_llc ) ++ __field( int, attach_cpu ) ++ __field( int, attach_llc ) ++ ), ++ ++ TP_fast_assign( ++ memcpy(__entry->comm, t->comm, TASK_COMM_LEN); ++ __entry->pid = t->pid; ++ __entry->pref_cpu = pref_cpu; ++ __entry->pref_llc = pref_llc; ++ __entry->attach_cpu = attach_cpu; ++ __entry->attach_llc = attach_llc; ++ ), ++ ++ TP_printk("comm=%s pid=%d pref_cpu=%d pref_llc=%d attach_cpu=%d attach_llc=%d", ++ __entry->comm, __entry->pid, ++ __entry->pref_cpu, __entry->pref_llc, ++ __entry->attach_cpu, __entry->attach_llc) ++); ++ + /* + * Tracepoint for calling kthread_stop, performed to end a kthread: + */ +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index 742e455b093e..e47b4096f0a6 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -10487,6 +10487,16 @@ static void attach_task(struct rq *rq, struct task_struct *p) + { + lockdep_assert_rq_held(rq); + ++#ifdef CONFIG_SCHED_CACHE ++ if (p->mm) { ++ int pref_cpu = p->mm->mm_sched_cpu; ++ ++ trace_sched_attach_task(p, ++ pref_cpu, ++ pref_cpu != -1 ? 
llc_id(pref_cpu) : -1, ++ cpu_of(rq), llc_id(cpu_of(rq))); ++ } ++#endif + WARN_ON_ONCE(task_rq(p) != rq); + activate_task(rq, p, ENQUEUE_NOCLOCK); + wakeup_preempt(rq, p, 0); +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip b/sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip new file mode 100644 index 0000000..6969f82 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/PATCH-v2-23-23-DO-NOT-APPLY-sched-cache-debug-Display-the-per-LLC-occupancy-for-each-process-via-proc-fs.patch.skip @@ -0,0 +1,323 @@ +From mboxrd@z Thu Jan 1 00:00:00 1970 +Received: from mgamail.intel.com (mgamail.intel.com [198.175.65.11]) + (using TLSv1.2 with cipher ECDHE-RSA-AES256-GCM-SHA384 (256/256 bits)) + (No client certificate requested) + by smtp.subspace.kernel.org (Postfix) with ESMTPS id 0CB492EBBB7 + for ; Wed, 3 Dec 2025 23:01:56 +0000 (UTC) +Authentication-Results: smtp.subspace.kernel.org; arc=none smtp.client-ip=198.175.65.11 +ARC-Seal:i=1; a=rsa-sha256; d=subspace.kernel.org; s=arc-20240116; + t=1764802919; cv=none; b=m/i8AM9jez30fmSC1ThjI0YmAYEwTjLN0aX4/W91cI/xJdwDY3yhTCxjuRQMXmg8XAbCVHRL4AColOXBfQy71E1URs7aT+GLFscw7WH4+OFmIN9YsDx0KaMus5WdBjhF8tzszL6TEZ12kmt42mlqOXQoE5Z3dqzJYmLCcriio58= +ARC-Message-Signature:i=1; a=rsa-sha256; d=subspace.kernel.org; + s=arc-20240116; t=1764802919; c=relaxed/simple; + bh=wLrJr/SuamOWjFO9gpHP5B2k8lcK+6x8dlASnUWXGe8=; + h=From:To:Cc:Subject:Date:Message-Id:In-Reply-To:References: + MIME-Version; b=eJbxWPHDUsl7XKuqPrYe829WccTGNXVp007ecq2JrHaVKwuvPh4j19TPROJM5V4vppIdkk1U3AT26iFdDx2qrmsewZCkwqlDeBPDJqbvbZbY+3Vimkg2ojZhH8CLl94yalOO4ZSXRjWefBovmf2taUbRtFOEBHGk1S0e1XvQ7G4= +ARC-Authentication-Results:i=1; smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com; spf=pass smtp.mailfrom=linux.intel.com; dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b=jEU0XIYU; arc=none smtp.client-ip=198.175.65.11 +Authentication-Results: smtp.subspace.kernel.org; dmarc=pass (p=none dis=none) header.from=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; spf=pass smtp.mailfrom=linux.intel.com +Authentication-Results: smtp.subspace.kernel.org; + dkim=pass (2048-bit key) header.d=intel.com header.i=@intel.com header.b="jEU0XIYU" +DKIM-Signature: v=1; a=rsa-sha256; c=relaxed/simple; + d=intel.com; i=@intel.com; q=dns/txt; s=Intel; + t=1764802917; x=1796338917; + h=from:to:cc:subject:date:message-id:in-reply-to: + references:mime-version:content-transfer-encoding; + bh=wLrJr/SuamOWjFO9gpHP5B2k8lcK+6x8dlASnUWXGe8=; + b=jEU0XIYUmUP1w1odUpoZztkux4d2T4uFzSQDEoeQkO6AEZ1yfHcuVfq9 + YwImXDzBWY46rQh33rL3qoP+4HJZhnOXgjU9/vwFZtLvGkGs5rHvI8YBx + jDLfActh0h/lcktc8ZNAWUhHLuPaktpxkehHuTNiQ+/PYiyL7+Hj8Xdrd + 41rYFhxJEN7aCEKecsCgMgtV2kyKG5rxF89kVp/FA/73jNvUXDa5pRoN7 + yqtdT/I+zUDFwYL0JDyMdCOZxceWrOHrciU5DroHkoBLTkvVc7oA5oIMh + KkFun1mmeV+tcvGf8EXfa3CUEmb0TvEhrDlTxbkcFqltiq0sEOiCw8NXE + w==; +X-CSE-ConnectionGUID: 7s0dCQLrSayFkNv254nlIw== +X-CSE-MsgGUID: oFf+c8koRFSSrN6sf+Ly3g== +X-IronPort-AV: E=McAfee;i="6800,10657,11631"; a="77136770" +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="77136770" +Received: from fmviesa004.fm.intel.com ([10.60.135.144]) + by orvoesa103.jf.intel.com with ESMTP/TLS/ECDHE-RSA-AES256-GCM-SHA384; 03 Dec 2025 15:01:56 -0800 +X-CSE-ConnectionGUID: F/ChnV0DRm2XulsnpHzSUQ== 
+X-CSE-MsgGUID: VHoDeeBRRb2BuWXAH6c04Q== +X-ExtLoop1: 1 +X-IronPort-AV: E=Sophos;i="6.20,247,1758610800"; + d="scan'208";a="199764012" +Received: from b04f130c83f2.jf.intel.com ([10.165.154.98]) + by fmviesa004.fm.intel.com with ESMTP; 03 Dec 2025 15:01:55 -0800 +From: Tim Chen +To: Peter Zijlstra , + Ingo Molnar , + K Prateek Nayak , + "Gautham R . Shenoy" , + Vincent Guittot +Cc: Chen Yu , + Juri Lelli , + Dietmar Eggemann , + Steven Rostedt , + Ben Segall , + Mel Gorman , + Valentin Schneider , + Madadi Vineeth Reddy , + Hillf Danton , + Shrikanth Hegde , + Jianyong Wu , + Yangyu Chen , + Tingyin Duan , + Vern Hao , + Vern Hao , + Len Brown , + Tim Chen , + Aubrey Li , + Zhao Liu , + Chen Yu , + Adam Li , + Aaron Lu , + Tim Chen , + linux-kernel@vger.kernel.org +Subject: [PATCH v2 23/23] -- DO NOT APPLY!!! -- sched/cache/debug: Display the per LLC occupancy for each process via proc fs +Date: Wed, 3 Dec 2025 15:07:42 -0800 +Message-Id: <0eaf9b9f89f0d97dbf46b760421f65aee3ffe063.1764801860.git.tim.c.chen@linux.intel.com> +X-Mailer: git-send-email 2.32.0 +In-Reply-To: +References: +Precedence: bulk +X-Mailing-List: linux-kernel@vger.kernel.org +List-Id: +List-Subscribe: +List-Unsubscribe: +MIME-Version: 1.0 +Content-Transfer-Encoding: 8bit + +From: Chen Yu + +Debug patch only. + +Show the per-LLC occupancy in /proc/{PID}/schedstat, with each column +corresponding to one LLC. This can be used to verify if the cache-aware +load balancer works as expected by aggregating threads onto dedicated LLCs. + +Suppose there are 2 LLCs and the sampling duration is 10 seconds: + +Enable the cache aware load balance: +0 12281 <--- LLC0 residency delta is 0, LLC1 is 12 seconds +0 18881 +0 16217 + +disable the cache aware load balance: +6497 15802 +9299 5435 +17811 8278 + +Signed-off-by: Chen Yu +Signed-off-by: Tim Chen +--- + fs/proc/base.c | 22 ++++++++++++++++++++++ + include/linux/mm_types.h | 19 +++++++++++++++++-- + include/linux/sched.h | 3 +++ + kernel/sched/fair.c | 40 ++++++++++++++++++++++++++++++++++++++-- + 4 files changed, 80 insertions(+), 4 deletions(-) + +diff --git a/fs/proc/base.c b/fs/proc/base.c +index 6299878e3d97..f4be96f4bd01 100644 +--- a/fs/proc/base.c ++++ b/fs/proc/base.c +@@ -518,6 +518,28 @@ static int proc_pid_schedstat(struct seq_file *m, struct pid_namespace *ns, + (unsigned long long)task->se.sum_exec_runtime, + (unsigned long long)task->sched_info.run_delay, + task->sched_info.pcount); ++#ifdef CONFIG_SCHED_CACHE ++ if (sched_cache_enabled()) { ++ struct mm_struct *mm = task->mm; ++ u64 *llc_runtime; ++ ++ if (!mm) ++ return 0; ++ ++ llc_runtime = kcalloc(max_llcs, sizeof(u64), GFP_KERNEL); ++ if (!llc_runtime) ++ return 0; ++ ++ if (get_mm_per_llc_runtime(task, llc_runtime)) ++ goto out; ++ ++ for (int i = 0; i < max_llcs; i++) ++ seq_printf(m, "%llu ", llc_runtime[i]); ++ seq_puts(m, "\n"); ++out: ++ kfree(llc_runtime); ++ } ++#endif + + return 0; + } +diff --git a/include/linux/mm_types.h b/include/linux/mm_types.h +index 04743983de4d..255c22be7312 100644 +--- a/include/linux/mm_types.h ++++ b/include/linux/mm_types.h +@@ -944,6 +944,10 @@ struct mm_sched { + unsigned long epoch; + }; + ++struct mm_time { ++ u64 runtime_ns; ++}; ++ + struct kioctx_table; + struct iommu_mm_data; + struct mm_struct { +@@ -1040,6 +1044,7 @@ struct mm_struct { + * See account_mm_sched() and ... 
+ */ + struct mm_sched __percpu *pcpu_sched; ++ struct mm_time __percpu *pcpu_time; + raw_spinlock_t mm_sched_lock; + unsigned long mm_sched_epoch; + int mm_sched_cpu; +@@ -1505,16 +1510,24 @@ static inline void mm_set_cpus_allowed(struct mm_struct *mm, const struct cpumas + #endif /* CONFIG_SCHED_MM_CID */ + + #ifdef CONFIG_SCHED_CACHE +-void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched); ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *pcpu_sched, ++ struct mm_time __percpu *pcpu_time); + + static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + { + struct mm_sched __percpu *pcpu_sched = alloc_percpu_noprof(struct mm_sched); ++ struct mm_time __percpu *pcpu_time; + + if (!pcpu_sched) + return -ENOMEM; + +- mm_init_sched(mm, pcpu_sched); ++ pcpu_time = alloc_percpu_noprof(struct mm_time); ++ if (!pcpu_time) { ++ free_percpu(mm->pcpu_sched); ++ return -ENOMEM; ++ } ++ ++ mm_init_sched(mm, pcpu_sched, pcpu_time); + return 0; + } + +@@ -1523,7 +1536,9 @@ static inline int mm_alloc_sched_noprof(struct mm_struct *mm) + static inline void mm_destroy_sched(struct mm_struct *mm) + { + free_percpu(mm->pcpu_sched); ++ free_percpu(mm->pcpu_time); + mm->pcpu_sched = NULL; ++ mm->pcpu_time = NULL; + } + #else /* !CONFIG_SCHED_CACHE */ + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 95bf080bbbf0..875ac3f4208b 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -2442,6 +2442,9 @@ static inline bool sched_cache_enabled(void) + { + return static_branch_unlikely(&sched_cache_on); + } ++ ++int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf); ++extern int max_llcs; + #endif + + #endif +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index e47b4096f0a6..205208f061bb 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -1355,16 +1355,19 @@ static void account_llc_dequeue(struct rq *rq, struct task_struct *p) + p->sched_llc_active = false; + } + +-void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) ++void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched, ++ struct mm_time __percpu *_pcpu_time) + { + unsigned long epoch; + int i; + + for_each_possible_cpu(i) { + struct mm_sched *pcpu_sched = per_cpu_ptr(_pcpu_sched, i); ++ struct mm_time *pcpu_time = per_cpu_ptr(_pcpu_time, i); + struct rq *rq = cpu_rq(i); + + pcpu_sched->runtime = 0; ++ pcpu_time->runtime_ns = 0; + pcpu_sched->epoch = rq->cpu_epoch; + epoch = rq->cpu_epoch; + } +@@ -1379,6 +1382,8 @@ void mm_init_sched(struct mm_struct *mm, struct mm_sched __percpu *_pcpu_sched) + * the readers may get invalid mm_sched_epoch, etc. 
+ */ + smp_store_release(&mm->pcpu_sched, _pcpu_sched); ++ /* same as above */ ++ smp_store_release(&mm->pcpu_time, _pcpu_time); + } + + /* because why would C be fully specified */ +@@ -1428,11 +1433,39 @@ static unsigned long __no_profile fraction_mm_sched(struct rq *rq, struct mm_sch + + static unsigned int task_running_on_cpu(int cpu, struct task_struct *p); + ++/* p->pi_lock is hold */ ++int get_mm_per_llc_runtime(struct task_struct *p, u64 *buf) ++{ ++ struct mm_struct *mm = p->mm; ++ struct mm_time *pcpu_time; ++ int cpu; ++ ++ if (!mm) ++ return -EINVAL; ++ ++ rcu_read_lock(); ++ for_each_online_cpu(cpu) { ++ int llc = llc_id(cpu); ++ u64 runtime_ms; ++ ++ if (llc < 0) ++ continue; ++ ++ pcpu_time = per_cpu_ptr(mm->pcpu_time, cpu); ++ runtime_ms = div_u64(pcpu_time->runtime_ns, NSEC_PER_MSEC); ++ buf[llc] += runtime_ms; ++ } ++ rcu_read_unlock(); ++ ++ return 0; ++} ++ + static inline + void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + { + struct mm_struct *mm = p->mm; + struct mm_sched *pcpu_sched; ++ struct mm_time *pcpu_time; + unsigned long epoch; + int mm_sched_llc = -1; + +@@ -1444,14 +1477,17 @@ void account_mm_sched(struct rq *rq, struct task_struct *p, s64 delta_exec) + /* + * init_task and kthreads don't having mm + */ +- if (!mm || !mm->pcpu_sched) ++ if (!mm || !mm->pcpu_sched || !mm->pcpu_time) + return; + + pcpu_sched = per_cpu_ptr(p->mm->pcpu_sched, cpu_of(rq)); ++ pcpu_time = per_cpu_ptr(p->mm->pcpu_time, cpu_of(rq)); + + scoped_guard (raw_spinlock, &rq->cpu_epoch_lock) { + __update_mm_sched(rq, pcpu_sched); + pcpu_sched->runtime += delta_exec; ++ /* pure runtime without decay */ ++ pcpu_time->runtime_ns += delta_exec; + rq->cpu_runtime += delta_exec; + epoch = rq->cpu_epoch; + } +-- +2.32.0 + + diff --git a/sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip b/sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip new file mode 100644 index 0000000..d5645f0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.18/mm_slab_introduce_kvfree_rcu_barrier_on_cache.patch.skip @@ -0,0 +1,259 @@ +From 0f35040de59371ad542b915d7b91176c9910dadc Mon Sep 17 00:00:00 2001 +From: Harry Yoo +Date: Mon, 8 Dec 2025 00:41:47 +0900 +Subject: mm/slab: introduce kvfree_rcu_barrier_on_cache() for cache + destruction + +Currently, kvfree_rcu_barrier() flushes RCU sheaves across all slab +caches when a cache is destroyed. This is unnecessary; only the RCU +sheaves belonging to the cache being destroyed need to be flushed. + +As suggested by Vlastimil Babka, introduce a weaker form of +kvfree_rcu_barrier() that operates on a specific slab cache. + +Factor out flush_rcu_sheaves_on_cache() from flush_all_rcu_sheaves() and +call it from flush_all_rcu_sheaves() and kvfree_rcu_barrier_on_cache(). + +Call kvfree_rcu_barrier_on_cache() instead of kvfree_rcu_barrier() on +cache destruction. + +The performance benefit is evaluated on a 12 core 24 threads AMD Ryzen +5900X machine (1 socket), by loading slub_kunit module. + +Before: + Total calls: 19 + Average latency (us): 18127 + Total time (us): 344414 + +After: + Total calls: 19 + Average latency (us): 10066 + Total time (us): 191264 + +Two performance regression have been reported: + - stress module loader test's runtime increases by 50-60% (Daniel) + - internal graphics test's runtime on Tegra234 increases by 35% (Jon) + +They are fixed by this change. 
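+
+In effect (a condensed sketch of the hunks below, not the applied diff
+itself; all names come from this patch), cache destruction now flushes
+only the dying cache's percpu sheaves before running the common barrier:
+
+	void kvfree_rcu_barrier_on_cache(struct kmem_cache *s)
+	{
+		if (s->cpu_sheaves)
+			flush_rcu_sheaves_on_cache(s);	/* this cache only */
+		__kvfree_rcu_barrier();			/* common batch drain */
+	}
+
+and kmem_cache_destroy(s) calls it in place of the global
+kvfree_rcu_barrier().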
+ +Suggested-by: Vlastimil Babka +Fixes: ec66e0d59952 ("slab: add sheaf support for batching kfree_rcu() operations") +Cc: stable@vger.kernel.org +Link: https://lore.kernel.org/linux-mm/1bda09da-93be-4737-aef0-d47f8c5c9301@suse.cz +Reported-and-tested-by: Daniel Gomez +Closes: https://lore.kernel.org/linux-mm/0406562e-2066-4cf8-9902-b2b0616dd742@kernel.org +Reported-and-tested-by: Jon Hunter +Closes: https://lore.kernel.org/linux-mm/e988eff6-1287-425e-a06c-805af5bbf262@nvidia.com +Signed-off-by: Harry Yoo +Link: https://patch.msgid.link/20251207154148.117723-1-harry.yoo@oracle.com +Signed-off-by: Vlastimil Babka +--- + include/linux/slab.h | 7 +++++++ + mm/slab.h | 1 + + mm/slab_common.c | 52 +++++++++++++++++++++++++++++++++++-------------- + mm/slub.c | 55 ++++++++++++++++++++++++++++------------------------ + 4 files changed, 75 insertions(+), 40 deletions(-) + +diff --git a/include/linux/slab.h b/include/linux/slab.h +index cf443f064a667e..2482992248dc9c 100644 +--- a/include/linux/slab.h ++++ b/include/linux/slab.h +@@ -1150,10 +1150,17 @@ static inline void kvfree_rcu_barrier(void) + rcu_barrier(); + } + ++static inline void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) ++{ ++ rcu_barrier(); ++} ++ + static inline void kfree_rcu_scheduler_running(void) { } + #else + void kvfree_rcu_barrier(void); + ++void kvfree_rcu_barrier_on_cache(struct kmem_cache *s); ++ + void kfree_rcu_scheduler_running(void); + #endif + +diff --git a/mm/slab.h b/mm/slab.h +index f730e012553ccd..e767aa7e91b098 100644 +--- a/mm/slab.h ++++ b/mm/slab.h +@@ -422,6 +422,7 @@ static inline bool is_kmalloc_normal(struct kmem_cache *s) + + bool __kfree_rcu_sheaf(struct kmem_cache *s, void *obj); + void flush_all_rcu_sheaves(void); ++void flush_rcu_sheaves_on_cache(struct kmem_cache *s); + + #define SLAB_CORE_FLAGS (SLAB_HWCACHE_ALIGN | SLAB_CACHE_DMA | \ + SLAB_CACHE_DMA32 | SLAB_PANIC | \ +diff --git a/mm/slab_common.c b/mm/slab_common.c +index 84dfff4f7b1fce..dd8a49d6f9cc3d 100644 +--- a/mm/slab_common.c ++++ b/mm/slab_common.c +@@ -492,7 +492,7 @@ void kmem_cache_destroy(struct kmem_cache *s) + return; + + /* in-flight kfree_rcu()'s may include objects from our cache */ +- kvfree_rcu_barrier(); ++ kvfree_rcu_barrier_on_cache(s); + + if (IS_ENABLED(CONFIG_SLUB_RCU_DEBUG) && + (s->flags & SLAB_TYPESAFE_BY_RCU)) { +@@ -2038,25 +2038,13 @@ void kvfree_call_rcu(struct rcu_head *head, void *ptr) + } + EXPORT_SYMBOL_GPL(kvfree_call_rcu); + +-/** +- * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. +- * +- * Note that a single argument of kvfree_rcu() call has a slow path that +- * triggers synchronize_rcu() following by freeing a pointer. It is done +- * before the return from the function. Therefore for any single-argument +- * call that will result in a kfree() to a cache that is to be destroyed +- * during module exit, it is developer's responsibility to ensure that all +- * such calls have returned before the call to kmem_cache_destroy(). +- */ +-void kvfree_rcu_barrier(void) ++static inline void __kvfree_rcu_barrier(void) + { + struct kfree_rcu_cpu_work *krwp; + struct kfree_rcu_cpu *krcp; + bool queued; + int i, cpu; + +- flush_all_rcu_sheaves(); +- + /* + * Firstly we detach objects and queue them over an RCU-batch + * for all CPUs. Finally queued works are flushed for each CPU. +@@ -2118,8 +2106,43 @@ void kvfree_rcu_barrier(void) + } + } + } ++ ++/** ++ * kvfree_rcu_barrier - Wait until all in-flight kvfree_rcu() complete. 
++ * ++ * Note that a single argument of kvfree_rcu() call has a slow path that ++ * triggers synchronize_rcu() following by freeing a pointer. It is done ++ * before the return from the function. Therefore for any single-argument ++ * call that will result in a kfree() to a cache that is to be destroyed ++ * during module exit, it is developer's responsibility to ensure that all ++ * such calls have returned before the call to kmem_cache_destroy(). ++ */ ++void kvfree_rcu_barrier(void) ++{ ++ flush_all_rcu_sheaves(); ++ __kvfree_rcu_barrier(); ++} + EXPORT_SYMBOL_GPL(kvfree_rcu_barrier); + ++/** ++ * kvfree_rcu_barrier_on_cache - Wait for in-flight kvfree_rcu() calls on a ++ * specific slab cache. ++ * @s: slab cache to wait for ++ * ++ * See the description of kvfree_rcu_barrier() for details. ++ */ ++void kvfree_rcu_barrier_on_cache(struct kmem_cache *s) ++{ ++ if (s->cpu_sheaves) ++ flush_rcu_sheaves_on_cache(s); ++ /* ++ * TODO: Introduce a version of __kvfree_rcu_barrier() that works ++ * on a specific slab cache. ++ */ ++ __kvfree_rcu_barrier(); ++} ++EXPORT_SYMBOL_GPL(kvfree_rcu_barrier_on_cache); ++ + static unsigned long + kfree_rcu_shrink_count(struct shrinker *shrink, struct shrink_control *sc) + { +@@ -2215,4 +2238,3 @@ void __init kvfree_rcu_init(void) + } + + #endif /* CONFIG_KVFREE_RCU_BATCHED */ +- +diff --git a/mm/slub.c b/mm/slub.c +index 2acce22590f846..f22ba8be29e060 100644 +--- a/mm/slub.c ++++ b/mm/slub.c +@@ -4122,42 +4122,47 @@ static void flush_rcu_sheaf(struct work_struct *w) + + + /* needed for kvfree_rcu_barrier() */ +-void flush_all_rcu_sheaves(void) ++void flush_rcu_sheaves_on_cache(struct kmem_cache *s) + { + struct slub_flush_work *sfw; +- struct kmem_cache *s; + unsigned int cpu; + +- cpus_read_lock(); +- mutex_lock(&slab_mutex); ++ mutex_lock(&flush_lock); + +- list_for_each_entry(s, &slab_caches, list) { +- if (!s->cpu_sheaves) +- continue; ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); + +- mutex_lock(&flush_lock); ++ /* ++ * we don't check if rcu_free sheaf exists - racing ++ * __kfree_rcu_sheaf() might have just removed it. ++ * by executing flush_rcu_sheaf() on the cpu we make ++ * sure the __kfree_rcu_sheaf() finished its call_rcu() ++ */ + +- for_each_online_cpu(cpu) { +- sfw = &per_cpu(slub_flush, cpu); ++ INIT_WORK(&sfw->work, flush_rcu_sheaf); ++ sfw->s = s; ++ queue_work_on(cpu, flushwq, &sfw->work); ++ } + +- /* +- * we don't check if rcu_free sheaf exists - racing +- * __kfree_rcu_sheaf() might have just removed it. 
+- * by executing flush_rcu_sheaf() on the cpu we make +- * sure the __kfree_rcu_sheaf() finished its call_rcu() +- */ ++ for_each_online_cpu(cpu) { ++ sfw = &per_cpu(slub_flush, cpu); ++ flush_work(&sfw->work); ++ } + +- INIT_WORK(&sfw->work, flush_rcu_sheaf); +- sfw->s = s; +- queue_work_on(cpu, flushwq, &sfw->work); +- } ++ mutex_unlock(&flush_lock); ++} + +- for_each_online_cpu(cpu) { +- sfw = &per_cpu(slub_flush, cpu); +- flush_work(&sfw->work); +- } ++void flush_all_rcu_sheaves(void) ++{ ++ struct kmem_cache *s; ++ ++ cpus_read_lock(); ++ mutex_lock(&slab_mutex); + +- mutex_unlock(&flush_lock); ++ list_for_each_entry(s, &slab_caches, list) { ++ if (!s->cpu_sheaves) ++ continue; ++ flush_rcu_sheaves_on_cache(s); + } + + mutex_unlock(&slab_mutex); +-- +cgit 1.2.3-korg + diff --git a/sys-kernel/gentoo-sources-6.19/0002-bbr3.patch b/sys-kernel/gentoo-sources-6.19/0002-bbr3.patch new file mode 100644 index 0000000..a2d49fc --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0002-bbr3.patch @@ -0,0 +1,3395 @@ +From 185514200e2848a5af6dc9e6165096ed34ee9d38 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:11:53 +0100 +Subject: [PATCH 2/8] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 71 +- + include/net/tcp_ecn.h | 6 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2233 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 42 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 17 files changed, 1938 insertions(+), 554 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 20b8c6e21fef..e334b7a7aac2 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -236,7 +236,8 @@ struct tcp_sock { + tcp_usec_ts : 1, /* TSval values in usec */ + is_sack_reneg:1, /* in recovery from loss with SACK reneg? */ + is_cwnd_limited:1,/* forward progress limited by snd_cwnd? */ +- recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_txrx); + + /* RX read-mostly hotpath cache lines */ +@@ -292,7 +293,8 @@ struct tcp_sock { + * 0x5?10 << 16 + snd_wnd in net byte order + */ + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + u8 received_ce_pending:4, /* Not yet transmit cnt of received_ce */ + unused2:4; + u8 accecn_minlen:2,/* Minimum length of AccECN option sent */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index ecb362025c4e..9de884b7fe01 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -137,8 +137,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index e0a5cf2f7818..6a4a5f38c072 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -406,6 +406,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) +@@ -851,6 +853,15 @@ static inline unsigned long tcp_reqsk_timeout(struct request_sock *req) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -956,6 +967,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1066,9 +1082,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1181,6 +1202,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1209,9 +1231,12 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_ECT_1_NEGOTIATION BIT(3) + /* Cannot fallback to RFC3168 during AccECN negotiation */ + #define TCP_CONG_NO_FALLBACK_RFC3168 BIT(4) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(5) + #define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN | \ + TCP_CONG_NEEDS_ACCECN | TCP_CONG_ECT_1_NEGOTIATION | \ +- TCP_CONG_NO_FALLBACK_RFC3168) ++ TCP_CONG_NO_FALLBACK_RFC3168 | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1231,10 +1256,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1245,7 +1273,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1269,8 +1299,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1336,6 +1369,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1376,6 +1417,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1388,6 +1430,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. 
In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2562,7 +2619,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/net/tcp_ecn.h b/include/net/tcp_ecn.h +index a709fb1756eb..5b2f85419201 100644 +--- a/include/net/tcp_ecn.h ++++ b/include/net/tcp_ecn.h +@@ -613,10 +613,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + use_ecn = tcp_ecn == TCP_ECN_IN_ECN_OUT_ECN || + tcp_ecn == TCP_ECN_IN_ACCECN_OUT_ECN || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn || use_accecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -634,6 +633,9 @@ static inline void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->syn_ect_snt = inet_sk(sk)->tos & INET_ECN_MASK; + } else { + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + } +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index dce3113787a7..6efba4f74f6f 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN enabled at conn init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index b71c22475c51..85d95a59708e 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 81666571ecfb..86d1a689b41a 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3471,6 +3471,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4251,6 +4252,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..9279be755c16 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,123 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return tcp_ecn_mode_any(tp) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +384,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +411,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +435,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +458,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +475,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +536,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +549,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +581,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +601,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +672,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +683,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +712,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +741,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +797,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +805,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +851,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +860,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +888,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +925,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +948,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +973,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2362,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2399,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index e9f6c77e0631..8e5e77a77e91 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -238,6 +238,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 0d080a3e27d6..bdc0cdda875d 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -358,7 +358,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR) && +@@ -376,7 +376,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + if (!tcp_ecn_mode_rfc3168(tp)) + break; +@@ -1305,7 +1305,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1670,6 +1675,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3905,7 +3921,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3922,6 +3939,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3932,6 +3950,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -4058,6 +4081,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -4130,7 +4154,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -4155,6 +4179,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -4180,7 +4205,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5909,13 +5934,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 9776c921d1bb..990df5f9e6c4 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -498,6 +498,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 479afb714bdf..a9eb14d0cf47 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -348,7 +348,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1762,7 +1763,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1837,6 +1838,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2193,13 +2218,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2940,6 +2964,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -3152,6 +3177,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 160080c9021d..06ee74f2c01d 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -702,6 +702,7 @@ void tcp_write_timer_handler(struct sock *sk) + tcp_timeout_expires(sk)); + return; + } ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0005-hdmi.patch b/sys-kernel/gentoo-sources-6.19/0005-hdmi.patch new file mode 100644 index 0000000..47563fe --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0005-hdmi.patch @@ -0,0 +1,1729 @@ +From 663014be05bfb67ae7852cbd651afec0db18995c Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:09:13 +0100 +Subject: [PATCH 5/8] hdmi + +Signed-off-by: Peter Jung +--- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 304 +++++++++++++---- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h | 4 + + .../amd/display/amdgpu_dm/amdgpu_dm_helpers.c | 49 ++- + drivers/gpu/drm/amd/display/dc/core/dc.c | 3 + + .../gpu/drm/amd/display/dc/core/dc_resource.c | 2 +- + drivers/gpu/drm/amd/display/dc/dc.h | 1 + + drivers/gpu/drm/amd/display/dc/dc_stream.h | 2 + + drivers/gpu/drm/amd/display/dc/dc_types.h | 7 +- + drivers/gpu/drm/amd/display/dc/dm_helpers.h | 2 +- + .../amd/display/include/ddc_service_types.h | 1 + + .../amd/display/modules/freesync/freesync.c | 4 + + .../amd/display/modules/inc/mod_info_packet.h | 17 +- + .../display/modules/info_packet/info_packet.c | 307 ++++++++++++------ + drivers/gpu/drm/amd/include/amd_shared.h | 6 + + drivers/gpu/drm/drm_atomic_uapi.c | 8 + + drivers/gpu/drm/drm_connector.c | 188 +++++++++++ + drivers/gpu/drm/drm_crtc.c | 2 + + drivers/gpu/drm/drm_edid.c | 41 ++- + drivers/gpu/drm/drm_mode_config.c | 6 + + 
include/drm/drm_connector.h | 99 ++++++ + include/drm/drm_crtc.h | 9 + + include/drm/drm_mode_config.h | 6 + + 22 files changed, 872 insertions(+), 196 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index bc4d6d5009bf..bc9aca604aa0 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -2069,6 +2069,9 @@ static int amdgpu_dm_init(struct amdgpu_device *adev) + if (amdgpu_dc_debug_mask & DC_SKIP_DETECTION_LT) + adev->dm.dc->debug.skip_detection_link_training = true; + ++ if (amdgpu_dc_debug_mask & DC_OVERRIDE_PCON_VRR_ID_CHECK) ++ adev->dm.dc->debug.override_pcon_vrr_id_check = true; ++ + adev->dm.dc->debug.visual_confirm = amdgpu_dc_visual_confirm; + + /* TODO: Remove after DP2 receiver gets proper support of Cable ID feature */ +@@ -7370,7 +7373,7 @@ create_stream_for_sink(struct drm_connector *connector, + update_stream_signal(stream, sink); + + if (stream->signal == SIGNAL_TYPE_HDMI_TYPE_A) +- mod_build_hf_vsif_infopacket(stream, &stream->vsp_infopacket); ++ mod_build_hf_vsif_infopacket(stream, &stream->hfvsif_infopacket); + + if (stream->signal == SIGNAL_TYPE_DISPLAY_PORT || + stream->signal == SIGNAL_TYPE_DISPLAY_PORT_MST || +@@ -7829,6 +7832,8 @@ amdgpu_dm_connector_atomic_duplicate_state(struct drm_connector *connector) + __drm_atomic_helper_connector_duplicate_state(connector, &new_state->base); + + new_state->freesync_capable = state->freesync_capable; ++ new_state->freesync_on_desktop_capable = ++ state->freesync_on_desktop_capable; + new_state->abm_level = state->abm_level; + new_state->scaling = state->scaling; + new_state->underscan_enable = state->underscan_enable; +@@ -8945,6 +8950,7 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + aconnector->audio_inst = -1; + aconnector->pack_sdp_v1_3 = false; + aconnector->as_type = ADAPTIVE_SYNC_TYPE_NONE; ++ aconnector->hdmi_allm_capable = false; + memset(&aconnector->vsdb_info, 0, sizeof(aconnector->vsdb_info)); + mutex_init(&aconnector->hpd_lock); + mutex_init(&aconnector->handle_mst_msg_ready); +@@ -9035,8 +9041,10 @@ void amdgpu_dm_connector_init_helper(struct amdgpu_display_manager *dm, + connector_type == DRM_MODE_CONNECTOR_eDP) { + drm_connector_attach_hdr_output_metadata_property(&aconnector->base); + +- if (!aconnector->mst_root) ++ if (!aconnector->mst_root) { + drm_connector_attach_vrr_capable_property(&aconnector->base); ++ drm_connector_attach_passive_vrr_capable_property(&aconnector->base); ++ } + + if (adev->dm.hdcp_workqueue) + drm_connector_attach_content_protection_property(&aconnector->base, true); +@@ -9140,6 +9148,10 @@ int amdgpu_dm_initialize_hdmi_connector(struct amdgpu_dm_connector *aconnector) + struct drm_device *ddev = aconnector->base.dev; + struct device *hdmi_dev = ddev->dev; + ++ /* ALLM */ ++ drm_connector_attach_allm_capable_property(&aconnector->base); ++ drm_connector_attach_allm_mode_property(&aconnector->base); ++ + if (amdgpu_dc_debug_mask & DC_DISABLE_HDMI_CEC) { + drm_info(ddev, "HDMI-CEC feature masked\n"); + return -EINVAL; +@@ -9607,7 +9619,11 @@ static void update_freesync_state_on_stream( + + aconn = (struct amdgpu_dm_connector *)new_stream->dm_stream_context; + +- if (aconn && (aconn->as_type == FREESYNC_TYPE_PCON_IN_WHITELIST || aconn->vsdb_info.replay_mode)) { ++ if (aconn && aconn->as_type == ADAPTIVE_SYNC_TYPE_HDMI) ++ packet_type = PACKET_TYPE_VTEM; ++ ++ else if (aconn && (aconn->as_type == 
ADAPTIVE_SYNC_TYPE_PCON_ALLOWED || ++ aconn->vsdb_info.replay_mode)) { + pack_sdp_v1_3 = aconn->pack_sdp_v1_3; + + if (aconn->vsdb_info.amd_vsdb_version == 1) +@@ -10826,6 +10842,31 @@ static int amdgpu_dm_atomic_setup_commit(struct drm_atomic_state *state) + return 0; + } + ++static void update_allm_state_on_crtc_stream(struct dm_crtc_state *new_crtc_state, ++ const struct drm_connector_state *new_conn) ++{ ++ struct mod_freesync_config *config = &new_crtc_state->freesync_config; ++ struct dc_stream_state *new_stream = new_crtc_state->stream; ++ bool allm_active = false; ++ ++ switch (new_conn->allm_mode) { ++ case DRM_ALLM_MODE_ENABLED_DYNAMIC: ++ allm_active = config->state == VRR_STATE_ACTIVE_VARIABLE || ++ new_stream->content_type == DISPLAY_CONTENT_TYPE_GAME; ++ break; ++ ++ case DRM_ALLM_MODE_ENABLED_FORCED: ++ allm_active = true; ++ break; ++ ++ case DRM_ALLM_MODE_DISABLED: ++ default: ++ allm_active = false; ++ } ++ ++ new_stream->hdmi_allm_active = allm_active; ++} ++ + /** + * amdgpu_dm_atomic_commit_tail() - AMDgpu DM's commit tail implementation. + * @state: The atomic state to commit +@@ -10868,12 +10909,14 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) + for_each_oldnew_connector_in_state(state, connector, old_con_state, new_con_state, i) { + struct dm_connector_state *dm_new_con_state = to_dm_connector_state(new_con_state); + struct dm_connector_state *dm_old_con_state = to_dm_connector_state(old_con_state); ++ struct amdgpu_dm_connector *dm_conn = to_amdgpu_dm_connector(connector); + struct amdgpu_crtc *acrtc = to_amdgpu_crtc(dm_new_con_state->base.crtc); + struct dc_surface_update *dummy_updates; + struct dc_stream_update stream_update; + struct dc_info_packet hdr_packet; + struct dc_stream_status *status = NULL; + bool abm_changed, hdr_changed, scaling_changed, output_color_space_changed = false; ++ bool allm_changed = false; + + memset(&stream_update, 0, sizeof(stream_update)); + +@@ -10903,7 +10946,11 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) + hdr_changed = + !drm_connector_atomic_hdr_metadata_equal(old_con_state, new_con_state); + +- if (!scaling_changed && !abm_changed && !hdr_changed && !output_color_space_changed) ++ allm_changed = dm_conn->hdmi_allm_capable && ++ (new_con_state->allm_mode != old_con_state->allm_mode); ++ ++ if (!scaling_changed && !abm_changed && !hdr_changed && ++ !output_color_space_changed && !allm_changed) + continue; + + stream_update.stream = dm_new_crtc_state->stream; +@@ -10933,6 +10980,17 @@ static void amdgpu_dm_atomic_commit_tail(struct drm_atomic_state *state) + stream_update.hdr_static_metadata = &hdr_packet; + } + ++ if (allm_changed) { ++ update_allm_state_on_crtc_stream(dm_new_crtc_state, new_con_state); ++ mod_build_hf_vsif_infopacket(dm_new_crtc_state->stream, ++ &dm_new_crtc_state->stream->hfvsif_infopacket); ++ ++ stream_update.hdmi_allm_active = ++ &dm_new_crtc_state->stream->hdmi_allm_active; ++ stream_update.hfvsif_infopacket = ++ &dm_new_crtc_state->stream->hfvsif_infopacket; ++ } ++ + status = dc_stream_get_status(dm_new_crtc_state->stream); + + if (WARN_ON(!status)) +@@ -11312,6 +11370,12 @@ static void get_freesync_config_for_crtc( + config.vsif_supported = true; + config.btr = true; + ++ if (new_con_state->freesync_on_desktop_capable) ++ new_crtc_state->stream->freesync_on_desktop = ++ !new_crtc_state->base.passive_vrr_disabled; ++ else ++ new_crtc_state->stream->freesync_on_desktop = false; ++ + if (fs_vid_mode) { + config.state = VRR_STATE_ACTIVE_FIXED; + 
config.fixed_refresh_in_uhz = new_crtc_state->freesync_config.fixed_refresh_in_uhz; +@@ -11323,6 +11387,7 @@ static void get_freesync_config_for_crtc( + } + } else { + config.state = VRR_STATE_UNSUPPORTED; ++ new_crtc_state->stream->freesync_on_desktop = false; + } + out: + new_crtc_state->freesync_config = config; +@@ -13114,8 +13179,8 @@ static void parse_edid_displayid_vrr(struct drm_connector *connector, + } + } + +-static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, +- const struct edid *edid, struct amdgpu_hdmi_vsdb_info *vsdb_info) ++static int parse_amd_vsdb_did(struct amdgpu_dm_connector *aconnector, ++ const struct edid *edid, struct amdgpu_hdmi_vsdb_info *vsdb_info) + { + u8 *edid_ext = NULL; + int i; +@@ -13131,6 +13196,9 @@ static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, + break; + } + ++ if (i == edid->extensions) ++ return false; ++ + while (j < EDID_LENGTH - sizeof(struct amd_vsdb_block)) { + struct amd_vsdb_block *amd_vsdb = (struct amd_vsdb_block *)&edid_ext[j]; + unsigned int ieeeId = (amd_vsdb->ieee_id[2] << 16) | (amd_vsdb->ieee_id[1] << 8) | (amd_vsdb->ieee_id[0]); +@@ -13149,13 +13217,13 @@ static int parse_amd_vsdb(struct amdgpu_dm_connector *aconnector, + return false; + } + +-static int parse_hdmi_amd_vsdb(struct amdgpu_dm_connector *aconnector, ++static bool parse_amd_vsdb_cea(struct amdgpu_dm_connector *aconnector, + const struct edid *edid, + struct amdgpu_hdmi_vsdb_info *vsdb_info) + { ++ struct amdgpu_hdmi_vsdb_info vsdb_local = {0}; + u8 *edid_ext = NULL; + int i; +- bool valid_vsdb_found = false; + + /*----- drm_find_cea_extension() -----*/ + /* No EDID or EDID extensions */ +@@ -13176,9 +13244,99 @@ static int parse_hdmi_amd_vsdb(struct amdgpu_dm_connector *aconnector, + if (edid_ext[0] != CEA_EXT) + return -ENODEV; + +- valid_vsdb_found = parse_edid_cea(aconnector, edid_ext, EDID_LENGTH, vsdb_info); ++ if (!parse_edid_cea(aconnector, edid_ext, EDID_LENGTH, &vsdb_local)) ++ return -ENODEV; ++ ++ *vsdb_info = vsdb_local; ++ return false; ++} ++ ++static bool is_monitor_range_invalid(const struct drm_connector *conn) ++{ ++ return conn->display_info.monitor_range.min_vfreq == 0 || ++ conn->display_info.monitor_range.max_vfreq == 0; ++} ++ ++/* ++ * Returns true if (max_vfreq - min_vfreq) > 10 ++ */ ++static bool is_freesync_capable(const struct drm_monitor_range_info *range) ++{ ++ return (range->max_vfreq - range->min_vfreq) > 10; ++} ++ ++static void monitor_range_from_vsdb(struct drm_display_info *display, ++ const struct amdgpu_hdmi_vsdb_info *vsdb) ++{ ++ display->monitor_range.min_vfreq = vsdb->min_refresh_rate_hz; ++ display->monitor_range.max_vfreq = vsdb->max_refresh_rate_hz; ++} ++ ++/** ++ * Get VRR range from HDMI VRR info in EDID. If VRRmax == 0, ++ * try getting upper bound from AMD vsdb. ++ * ++ * @conn: drm_connector with HDMI VRR info ++ * @vsdb: AMD vsdb from CAE ++ */ ++static void monitor_range_from_hdmi(struct drm_display_info *display, ++ const struct amdgpu_hdmi_vsdb_info *vsdb) ++{ ++ u16 vrr_max = display->hdmi.vrr_cap.vrr_max; ++ ++ /* Try getting upper vrr bound from AMD vsdb */ ++ if (vrr_max == 0) ++ vrr_max = vsdb->max_refresh_rate_hz; ++ ++ /* Use max possible BRR value as a last resort */ ++ if (vrr_max == 0) ++ vrr_max = VTEM_BRR_MAX; + +- return valid_vsdb_found ? 
i : -ENODEV; ++ display->monitor_range.min_vfreq = display->hdmi.vrr_cap.vrr_min; ++ display->monitor_range.max_vfreq = vrr_max; ++} ++ ++/* ++ * Returns true if connector is capable of freesync ++ * Optionally, can fetch the range from AMD vsdb ++ */ ++static bool copy_range_to_amdgpu_connector(struct drm_connector *conn) ++{ ++ struct amdgpu_dm_connector *aconn = to_amdgpu_dm_connector(conn); ++ struct drm_monitor_range_info *range = &conn->display_info.monitor_range; ++ ++ aconn->min_vfreq = range->min_vfreq; ++ aconn->max_vfreq = range->max_vfreq; ++ ++ return is_freesync_capable(range); ++} ++ ++static void extend_range_from_vsdb(struct drm_display_info *display, ++ const struct amdgpu_hdmi_vsdb_info *vsdb) ++{ ++ u16 vrr_min = display->monitor_range.min_vfreq; ++ u16 vrr_max = display->monitor_range.max_vfreq; ++ ++ /* Always extend upper limit */ ++ if (vsdb->max_refresh_rate_hz > vrr_max) ++ vrr_max = vsdb->max_refresh_rate_hz; ++ ++ /* ++ * Only extend lower limit if current one disables LFC. ++ ++ * During widespread testing, we found that some manufacturers probably ++ * had issues with their monitors' lower VRR boundaries and adjusted ++ * them up (Gigabyte X34GS with official range 48 - 180, AMD vsdb 48 - ++ * 180 yet Monitor Ranges 55 - 180). After setting the lower boundary ++ * from AMD vsdb, such monitors start having blanking issues. ++ * ++ * Work around that by not touching VRR min if it still supports LFC. ++ */ ++ if (vsdb->min_refresh_rate_hz < vrr_min && (vrr_min * 2 >= vrr_max)) ++ vrr_min = vsdb->min_refresh_rate_hz; ++ ++ display->monitor_range.min_vfreq = vrr_min; ++ display->monitor_range.max_vfreq = vrr_max; + } + + /** +@@ -13195,16 +13353,20 @@ static int parse_hdmi_amd_vsdb(struct amdgpu_dm_connector *aconnector, + void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + const struct drm_edid *drm_edid) + { +- int i = 0; + struct amdgpu_dm_connector *amdgpu_dm_connector = + to_amdgpu_dm_connector(connector); + struct dm_connector_state *dm_con_state = NULL; + struct dc_sink *sink; + struct amdgpu_device *adev = drm_to_adev(connector->dev); + struct amdgpu_hdmi_vsdb_info vsdb_info = {0}; ++ struct amdgpu_hdmi_vsdb_info vsdb_did = {0}; ++ struct drm_hdmi_vrr_cap hdmi_vrr = {0}; ++ struct dpcd_caps dpcd_caps = {0}; + const struct edid *edid; ++ bool freesync_on_desktop = false; + bool freesync_capable = false; +- enum adaptive_sync_type as_type = ADAPTIVE_SYNC_TYPE_NONE; ++ bool pcon_allowed = false; ++ bool is_pcon = false; + + if (!connector->state) { + drm_err(adev_to_drm(adev), "%s - Connector has no state", __func__); +@@ -13232,68 +13394,77 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + if (!adev->dm.freesync_module || !dc_supports_vrr(sink->ctx->dce_version)) + goto update; + ++ /* Gather all data */ + edid = drm_edid_raw(drm_edid); // FIXME: Get rid of drm_edid_raw() ++ parse_amd_vsdb_cea(amdgpu_dm_connector, edid, &vsdb_info); ++ hdmi_vrr = connector->display_info.hdmi.vrr_cap; ++ ++ if (amdgpu_dm_connector->dc_link) { ++ dpcd_caps = amdgpu_dm_connector->dc_link->dpcd_caps; ++ is_pcon = dpcd_caps.dongle_type == DISPLAY_DONGLE_DP_HDMI_CONVERTER; ++ pcon_allowed = dm_helpers_is_vrr_pcon_allowed( ++ amdgpu_dm_connector->dc_link, connector->dev); ++ } ++ ++ /* DP & eDP excluding PCONs */ ++ if ((sink->sink_signal == SIGNAL_TYPE_EDP || ++ sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT) && !is_pcon) { ++ /* Some eDP panels only have the refresh rate range info in DisplayID */ ++ if 
(is_monitor_range_invalid(connector)) ++ parse_edid_displayid_vrr(connector, edid); ++ /* ++ * Many monitors expose AMD vsdb in CAE even for DP and their ++ * monitor ranges do not contain Range Limits Only flag ++ */ ++ if (is_monitor_range_invalid(connector)) ++ monitor_range_from_vsdb(&connector->display_info, &vsdb_info); + +- /* Some eDP panels only have the refresh rate range info in DisplayID */ +- if ((connector->display_info.monitor_range.min_vfreq == 0 || +- connector->display_info.monitor_range.max_vfreq == 0)) +- parse_edid_displayid_vrr(connector, edid); +- +- if (edid && (sink->sink_signal == SIGNAL_TYPE_DISPLAY_PORT || +- sink->sink_signal == SIGNAL_TYPE_EDP)) { +- if (amdgpu_dm_connector->dc_link && +- amdgpu_dm_connector->dc_link->dpcd_caps.allow_invalid_MSA_timing_param) { +- amdgpu_dm_connector->min_vfreq = connector->display_info.monitor_range.min_vfreq; +- amdgpu_dm_connector->max_vfreq = connector->display_info.monitor_range.max_vfreq; +- if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) +- freesync_capable = true; +- } ++ /* Try extending range if found in AMD vsdb */ ++ extend_range_from_vsdb(&connector->display_info, &vsdb_info); + +- parse_amd_vsdb(amdgpu_dm_connector, edid, &vsdb_info); ++ if (dpcd_caps.allow_invalid_MSA_timing_param) ++ freesync_capable = copy_range_to_amdgpu_connector(connector); + +- if (vsdb_info.replay_mode) { +- amdgpu_dm_connector->vsdb_info.replay_mode = vsdb_info.replay_mode; +- amdgpu_dm_connector->vsdb_info.amd_vsdb_version = vsdb_info.amd_vsdb_version; ++ /* eDP */ ++ parse_amd_vsdb_did(amdgpu_dm_connector, edid, &vsdb_did); ++ if (vsdb_did.replay_mode) { ++ amdgpu_dm_connector->vsdb_info.replay_mode = vsdb_did.replay_mode; ++ amdgpu_dm_connector->vsdb_info.amd_vsdb_version = vsdb_did.amd_vsdb_version; + amdgpu_dm_connector->as_type = ADAPTIVE_SYNC_TYPE_EDP; + } + +- } else if (drm_edid && sink->sink_signal == SIGNAL_TYPE_HDMI_TYPE_A) { +- i = parse_hdmi_amd_vsdb(amdgpu_dm_connector, edid, &vsdb_info); +- if (i >= 0 && vsdb_info.freesync_supported) { +- amdgpu_dm_connector->min_vfreq = vsdb_info.min_refresh_rate_hz; +- amdgpu_dm_connector->max_vfreq = vsdb_info.max_refresh_rate_hz; +- if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) +- freesync_capable = true; +- +- connector->display_info.monitor_range.min_vfreq = vsdb_info.min_refresh_rate_hz; +- connector->display_info.monitor_range.max_vfreq = vsdb_info.max_refresh_rate_hz; +- } +- } +- +- if (amdgpu_dm_connector->dc_link) +- as_type = dm_get_adaptive_sync_support_type(amdgpu_dm_connector->dc_link); +- +- if (as_type == FREESYNC_TYPE_PCON_IN_WHITELIST) { +- i = parse_hdmi_amd_vsdb(amdgpu_dm_connector, edid, &vsdb_info); +- if (i >= 0 && vsdb_info.freesync_supported && vsdb_info.amd_vsdb_version > 0) { +- +- amdgpu_dm_connector->pack_sdp_v1_3 = true; +- amdgpu_dm_connector->as_type = as_type; ++ /* HDMI */ ++ } else if (sink->sink_signal == SIGNAL_TYPE_HDMI_TYPE_A) { ++ /* Prefer HDMI VRR */ ++ if (hdmi_vrr.supported) { ++ amdgpu_dm_connector->as_type = ADAPTIVE_SYNC_TYPE_HDMI; ++ monitor_range_from_hdmi(&connector->display_info, &vsdb_info); ++ } else if (vsdb_info.freesync_supported) ++ monitor_range_from_vsdb(&connector->display_info, &vsdb_info); ++ ++ freesync_capable = copy_range_to_amdgpu_connector(connector); ++ freesync_on_desktop = freesync_capable; ++ ++ /* DP -> HDMI PCON */ ++ } else if (pcon_allowed) { ++ /* Prefer HDMI VRR */ ++ if (hdmi_vrr.supported) ++ monitor_range_from_hdmi(&connector->display_info, &vsdb_info); 
++ else if (vsdb_info.freesync_supported) { + amdgpu_dm_connector->vsdb_info = vsdb_info; +- +- amdgpu_dm_connector->min_vfreq = vsdb_info.min_refresh_rate_hz; +- amdgpu_dm_connector->max_vfreq = vsdb_info.max_refresh_rate_hz; +- if (amdgpu_dm_connector->max_vfreq - amdgpu_dm_connector->min_vfreq > 10) +- freesync_capable = true; +- +- connector->display_info.monitor_range.min_vfreq = vsdb_info.min_refresh_rate_hz; +- connector->display_info.monitor_range.max_vfreq = vsdb_info.max_refresh_rate_hz; ++ monitor_range_from_vsdb(&connector->display_info, &vsdb_info); + } ++ ++ amdgpu_dm_connector->pack_sdp_v1_3 = true; ++ amdgpu_dm_connector->as_type = ADAPTIVE_SYNC_TYPE_PCON_ALLOWED; ++ freesync_capable = copy_range_to_amdgpu_connector(connector); + } + + update: +- if (dm_con_state) ++ if (dm_con_state) { + dm_con_state->freesync_capable = freesync_capable; ++ dm_con_state->freesync_on_desktop_capable = freesync_on_desktop; ++ } + + if (connector->state && amdgpu_dm_connector->dc_link && !freesync_capable && + amdgpu_dm_connector->dc_link->replay_settings.config.replay_supported) { +@@ -13302,8 +13473,15 @@ void amdgpu_dm_update_freesync_caps(struct drm_connector *connector, + } + + if (connector->vrr_capable_property) +- drm_connector_set_vrr_capable_property(connector, +- freesync_capable); ++ drm_connector_set_vrr_capable_property(connector, freesync_capable); ++ ++ if (connector->passive_vrr_capable_property) ++ drm_connector_set_passive_vrr_capable_property(connector, freesync_on_desktop); ++ ++ amdgpu_dm_connector->hdmi_allm_capable = connector->display_info.hdmi.allm; ++ if (connector->allm_capable_property) ++ drm_connector_set_allm_capable_property( ++ connector, connector->display_info.hdmi.allm); + } + + void amdgpu_dm_trigger_timing_sync(struct drm_device *dev) +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +index beb0d04d3e68..6376d12acb72 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.h +@@ -828,6 +828,9 @@ struct amdgpu_dm_connector { + unsigned int hdmi_hpd_debounce_delay_ms; + struct delayed_work hdmi_hpd_debounce_work; + struct dc_sink *hdmi_prev_sink; ++ ++ /* HDMI ALLM */ ++ bool hdmi_allm_capable; + }; + + static inline void amdgpu_dm_set_mst_status(uint8_t *status, +@@ -1001,6 +1004,7 @@ struct dm_connector_state { + uint8_t underscan_hborder; + bool underscan_enable; + bool freesync_capable; ++ bool freesync_on_desktop_capable; + bool update_hdcp; + bool abm_sysfs_forbidden; + uint8_t abm_level; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +index e5e993d3ef74..6413f2a587d5 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_helpers.c +@@ -137,7 +137,12 @@ enum dc_edid_status dm_helpers_parse_edid_caps( + edid_caps->display_name, + AUDIO_INFO_DISPLAY_NAME_SIZE_IN_CHARS); + +- edid_caps->edid_hdmi = connector->display_info.is_hdmi; ++ if (connector->display_info.is_hdmi) { ++ edid_caps->edid_hdmi = true; ++ edid_caps->allm = connector->display_info.hdmi.allm; ++ edid_caps->fva = connector->display_info.hdmi.vrr_cap.fva; ++ edid_caps->hdmi_vrr = connector->display_info.hdmi.vrr_cap.supported; ++ } + + if (edid_caps->edid_hdmi) + populate_hdmi_info_from_connector(&connector->display_info.hdmi, edid_caps); +@@ -1375,40 +1380,32 @@ void dm_helpers_dp_mst_update_branch_bandwidth( 
+ // TODO + } + +-static bool dm_is_freesync_pcon_whitelist(const uint32_t branch_dev_id) ++bool dm_helpers_is_vrr_pcon_allowed(const struct dc_link *link, const struct drm_device *dev) + { +- bool ret_val = false; ++ if (link->dpcd_caps.dongle_type != DISPLAY_DONGLE_DP_HDMI_CONVERTER) ++ return false; + +- switch (branch_dev_id) { ++ if (!link->dpcd_caps.allow_invalid_MSA_timing_param) ++ return false; ++ ++ if (!link->dpcd_caps.adaptive_sync_caps.dp_adap_sync_caps.bits.ADAPTIVE_SYNC_SDP_SUPPORT) ++ return false; ++ ++ switch (link->dpcd_caps.branch_dev_id) { + case DP_BRANCH_DEVICE_ID_0060AD: + case DP_BRANCH_DEVICE_ID_00E04C: + case DP_BRANCH_DEVICE_ID_90CC24: +- ret_val = true; +- break; +- default: +- break; ++ case DP_BRANCH_DEVICE_ID_2B02F0: ++ return true; + } + +- return ret_val; +-} +- +-enum adaptive_sync_type dm_get_adaptive_sync_support_type(struct dc_link *link) +-{ +- struct dpcd_caps *dpcd_caps = &link->dpcd_caps; +- enum adaptive_sync_type as_type = ADAPTIVE_SYNC_TYPE_NONE; +- +- switch (dpcd_caps->dongle_type) { +- case DISPLAY_DONGLE_DP_HDMI_CONVERTER: +- if (dpcd_caps->adaptive_sync_caps.dp_adap_sync_caps.bits.ADAPTIVE_SYNC_SDP_SUPPORT == true && +- dpcd_caps->allow_invalid_MSA_timing_param == true && +- dm_is_freesync_pcon_whitelist(dpcd_caps->branch_dev_id)) +- as_type = FREESYNC_TYPE_PCON_IN_WHITELIST; +- break; +- default: +- break; ++ if (link->dc->debug.override_pcon_vrr_id_check) { ++ drm_info(dev, "Overriding VRR PCON check for ID: 0x%06x\n", ++ link->dpcd_caps.branch_dev_id); ++ return true; + } + +- return as_type; ++ return false; + } + + bool dm_helpers_is_fullscreen(struct dc_context *ctx, struct dc_stream_state *stream) +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc.c b/drivers/gpu/drm/amd/display/dc/core/dc.c +index 8be9cbd43e18..b1db19175928 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc.c +@@ -3287,6 +3287,9 @@ static void copy_stream_update_to_stream(struct dc *dc, + if (update->vrr_active_fixed) + stream->vrr_active_fixed = *update->vrr_active_fixed; + ++ if (update->hdmi_allm_active) ++ stream->hdmi_allm_active = *update->hdmi_allm_active; ++ + if (update->crtc_timing_adjust) { + if (stream->adjust.v_total_min != update->crtc_timing_adjust->v_total_min || + stream->adjust.v_total_max != update->crtc_timing_adjust->v_total_max || +diff --git a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +index 848c267ef11e..230ada389e3a 100644 +--- a/drivers/gpu/drm/amd/display/dc/core/dc_resource.c ++++ b/drivers/gpu/drm/amd/display/dc/core/dc_resource.c +@@ -4659,7 +4659,7 @@ static void set_avi_info_frame( + vic = 0; + format = stream->timing.timing_3d_format; + /*todo, add 3DStereo support*/ +- if (format != TIMING_3D_FORMAT_NONE) { ++ if (format != TIMING_3D_FORMAT_NONE || stream->hdmi_allm_active) { + // Based on HDMI specs hdmi vic needs to be converted to cea vic when 3D is enabled + switch (pipe_ctx->stream->timing.hdmi_vic) { + case 1: +diff --git a/drivers/gpu/drm/amd/display/dc/dc.h b/drivers/gpu/drm/amd/display/dc/dc.h +index 0a9758a04258..f120dd5c05c6 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc.h ++++ b/drivers/gpu/drm/amd/display/dc/dc.h +@@ -1039,6 +1039,7 @@ struct dc_debug_options { + bool scl_reset_length10; + bool hdmi20_disable; + bool skip_detection_link_training; ++ bool override_pcon_vrr_id_check; + uint32_t edid_read_retry_times; + unsigned int force_odm_combine; //bit vector based on otg inst + unsigned int 
seamless_boot_odm_combine; +diff --git a/drivers/gpu/drm/amd/display/dc/dc_stream.h b/drivers/gpu/drm/amd/display/dc/dc_stream.h +index 321cfe92d799..e69c17413835 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_stream.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_stream.h +@@ -242,6 +242,7 @@ struct dc_stream_state { + bool vrr_active_variable; + bool freesync_on_desktop; + bool vrr_active_fixed; ++ bool hdmi_allm_active; + + bool converter_disable_audio; + uint8_t qs_bit; +@@ -343,6 +344,7 @@ struct dc_stream_update { + bool *allow_freesync; + bool *vrr_active_variable; + bool *vrr_active_fixed; ++ bool *hdmi_allm_active; + + struct colorspace_transform *gamut_remap; + enum dc_color_space *output_color_space; +diff --git a/drivers/gpu/drm/amd/display/dc/dc_types.h b/drivers/gpu/drm/amd/display/dc/dc_types.h +index 3e63d7bda166..57811bc85071 100644 +--- a/drivers/gpu/drm/amd/display/dc/dc_types.h ++++ b/drivers/gpu/drm/amd/display/dc/dc_types.h +@@ -210,9 +210,14 @@ struct dc_edid_caps { + + uint32_t max_tmds_clk_mhz; + +- /*HDMI 2.0 caps*/ ++ /* HDMI 2.0 caps */ + bool lte_340mcsc_scramble; + ++ /* HDMI 2.1 caps */ ++ bool allm; ++ bool fva; ++ bool hdmi_vrr; ++ + bool edid_hdmi; + bool hdr_supported; + bool rr_capable; +diff --git a/drivers/gpu/drm/amd/display/dc/dm_helpers.h b/drivers/gpu/drm/amd/display/dc/dm_helpers.h +index 9d160b39e8c5..ea94c52d2b87 100644 +--- a/drivers/gpu/drm/amd/display/dc/dm_helpers.h ++++ b/drivers/gpu/drm/amd/display/dc/dm_helpers.h +@@ -219,10 +219,10 @@ int dm_helpers_dmub_set_config_sync(struct dc_context *ctx, + const struct dc_link *link, + struct set_config_cmd_payload *payload, + enum set_config_status *operation_result); +-enum adaptive_sync_type dm_get_adaptive_sync_support_type(struct dc_link *link); + + enum dc_edid_status dm_helpers_get_sbios_edid(struct dc_link *link, struct dc_edid *edid); + ++bool dm_helpers_is_vrr_pcon_allowed(const struct dc_link *link, const struct drm_device *dev); + bool dm_helpers_is_fullscreen(struct dc_context *ctx, struct dc_stream_state *stream); + bool dm_helpers_is_hdr_on(struct dc_context *ctx, struct dc_stream_state *stream); + +diff --git a/drivers/gpu/drm/amd/display/include/ddc_service_types.h b/drivers/gpu/drm/amd/display/include/ddc_service_types.h +index 1c603b12957f..e838f7c1269c 100644 +--- a/drivers/gpu/drm/amd/display/include/ddc_service_types.h ++++ b/drivers/gpu/drm/amd/display/include/ddc_service_types.h +@@ -36,6 +36,7 @@ + #define DP_BRANCH_DEVICE_ID_006037 0x006037 + #define DP_BRANCH_DEVICE_ID_001CF8 0x001CF8 + #define DP_BRANCH_DEVICE_ID_0060AD 0x0060AD ++#define DP_BRANCH_DEVICE_ID_2B02F0 0x2B02F0 /* Chrontel CH7218 */ + #define DP_BRANCH_HW_REV_10 0x10 + #define DP_BRANCH_HW_REV_20 0x20 + +diff --git a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +index 1aae46d703ba..db197cf048e1 100644 +--- a/drivers/gpu/drm/amd/display/modules/freesync/freesync.c ++++ b/drivers/gpu/drm/amd/display/modules/freesync/freesync.c +@@ -27,6 +27,7 @@ + #include "dc.h" + #include "mod_freesync.h" + #include "core_types.h" ++#include "mod_info_packet.h" + + #define MOD_FREESYNC_MAX_CONCURRENT_STREAMS 32 + +@@ -955,6 +956,9 @@ void mod_freesync_build_vrr_infopacket(struct mod_freesync *mod_freesync, + return; + + switch (packet_type) { ++ case PACKET_TYPE_VTEM: ++ mod_build_vtem_infopacket(stream, vrr, infopacket); ++ break; + case PACKET_TYPE_FS_V3: + build_vrr_infopacket_v3(stream->signal, vrr, app_tf, infopacket, stream->freesync_on_desktop); + break; 
+diff --git a/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h b/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h +index 66dc9a19aebe..89d412772d16 100644 +--- a/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h ++++ b/drivers/gpu/drm/amd/display/modules/inc/mod_info_packet.h +@@ -33,6 +33,8 @@ struct dc_stream_state; + struct dc_info_packet; + struct mod_vrr_params; + ++#define VTEM_BRR_MAX 1023 ++ + void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + struct dc_info_packet *info_packet, + enum dc_color_space cs, +@@ -41,12 +43,17 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + void mod_build_hf_vsif_infopacket(const struct dc_stream_state *stream, + struct dc_info_packet *info_packet); + ++void mod_build_vtem_infopacket(const struct dc_stream_state *stream, ++ const struct mod_vrr_params *vrr, ++ struct dc_info_packet *infopacket); ++ + enum adaptive_sync_type { +- ADAPTIVE_SYNC_TYPE_NONE = 0, +- ADAPTIVE_SYNC_TYPE_DP = 1, +- FREESYNC_TYPE_PCON_IN_WHITELIST = 2, +- FREESYNC_TYPE_PCON_NOT_IN_WHITELIST = 3, +- ADAPTIVE_SYNC_TYPE_EDP = 4, ++ ADAPTIVE_SYNC_TYPE_NONE = 0, ++ ADAPTIVE_SYNC_TYPE_DP = 1, ++ ADAPTIVE_SYNC_TYPE_PCON_ALLOWED = 2, ++ ADAPTIVE_SYNC_TYPE_PCON_NOT_ALLOWED = 3, ++ ADAPTIVE_SYNC_TYPE_EDP = 4, ++ ADAPTIVE_SYNC_TYPE_HDMI = 5, + }; + + enum adaptive_sync_sdp_version { +diff --git a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +index b3d55cac3569..a16a94dffa8d 100644 +--- a/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c ++++ b/drivers/gpu/drm/amd/display/modules/info_packet/info_packet.c +@@ -44,8 +44,12 @@ enum vsc_packet_revision { + vsc_packet_rev5 = 5, + }; + ++#define HDMI_INFOFRAME_TYPE_EMP 0x7F + #define HDMI_INFOFRAME_TYPE_VENDOR 0x81 +-#define HF_VSIF_VERSION 1 ++#define HDMI_INFOFRAME_LENGTH_MASK 0x1F ++#define HF_VSIF_VERSION 1 ++#define HF_VSIF_3D_BIT 0 ++#define HF_VSIF_ALLM_BIT 1 + + // VTEM Byte Offset + #define VTEM_PB0 0 +@@ -56,64 +60,51 @@ enum vsc_packet_revision { + #define VTEM_PB5 5 + #define VTEM_PB6 6 + +-#define VTEM_MD0 7 +-#define VTEM_MD1 8 +-#define VTEM_MD2 9 +-#define VTEM_MD3 10 +- +- +-// VTEM Byte Masks +-//PB0 +-#define MASK_VTEM_PB0__RESERVED0 0x01 +-#define MASK_VTEM_PB0__SYNC 0x02 +-#define MASK_VTEM_PB0__VFR 0x04 +-#define MASK_VTEM_PB0__AFR 0x08 +-#define MASK_VTEM_PB0__DS_TYPE 0x30 +- //0: Periodic pseudo-static EM Data Set +- //1: Periodic dynamic EM Data Set +- //2: Unique EM Data Set +- //3: Reserved +-#define MASK_VTEM_PB0__END 0x40 +-#define MASK_VTEM_PB0__NEW 0x80 +- +-//PB1 +-#define MASK_VTEM_PB1__RESERVED1 0xFF +- +-//PB2 +-#define MASK_VTEM_PB2__ORGANIZATION_ID 0xFF +- //0: This is a Vendor Specific EM Data Set +- //1: This EM Data Set is defined by This Specification (HDMI 2.1 r102.clean) +- //2: This EM Data Set is defined by CTA-861-G +- //3: This EM Data Set is defined by VESA +-//PB3 +-#define MASK_VTEM_PB3__DATA_SET_TAG_MSB 0xFF +-//PB4 +-#define MASK_VTEM_PB4__DATA_SET_TAG_LSB 0xFF +-//PB5 +-#define MASK_VTEM_PB5__DATA_SET_LENGTH_MSB 0xFF +-//PB6 +-#define MASK_VTEM_PB6__DATA_SET_LENGTH_LSB 0xFF +- +- +- +-//PB7-27 (20 bytes): +-//PB7 = MD0 +-#define MASK_VTEM_MD0__VRR_EN 0x01 +-#define MASK_VTEM_MD0__M_CONST 0x02 +-#define MASK_VTEM_MD0__QMS_EN 0x04 +-#define MASK_VTEM_MD0__RESERVED2 0x08 +-#define MASK_VTEM_MD0__FVA_FACTOR_M1 0xF0 +- +-//MD1 +-#define MASK_VTEM_MD1__BASE_VFRONT 0xFF +- +-//MD2 +-#define MASK_VTEM_MD2__BASE_REFRESH_RATE_98 0x03 +-#define 
MASK_VTEM_MD2__RB 0x04 +-#define MASK_VTEM_MD2__NEXT_TFR 0xF8 +- +-//MD3 +-#define MASK_VTEM_MD3__BASE_REFRESH_RATE_07 0xFF ++#define VTEM_ORG_ID 1 ++#define VTEM_DATA_SET_TAG 1 ++#define VTEM_DATA_SET_LENGTH 4 ++ ++#define VTEM_M_CONST 0 ++#define VTEM_FVA_FACTOR 0 ++ ++#define VTEM_BRR_MASK_UPPER 0x03 ++#define VTEM_BRR_MASK_LOWER 0xFF ++ ++/* VTEM Byte Offset */ ++#define VTEM_PB0 0 ++#define VTEM_PB1 1 ++#define VTEM_PB2 2 ++#define VTEM_PB3 3 ++#define VTEM_PB4 4 ++#define VTEM_PB5 5 ++#define VTEM_PB6 6 ++ ++#define VTEM_MD0 7 ++#define VTEM_MD1 8 ++#define VTEM_MD2 9 ++#define VTEM_MD3 10 ++ ++/* Extended Metadata Packet */ ++/* Header */ ++#define EMP_LAST_BIT 6 ++#define EMP_FIRST_BIT 7 ++/* PB0 */ ++#define EMP_SNC_BIT 1 ++#define EMP_VFR_BIT 2 ++#define EMP_AFR_BIT 3 ++#define EMP_DST_BIT 4 ++#define EMP_END_BIT 6 ++#define EMP_NEW_BIT 7 ++/* PB7 = MD0 */ ++#define VTEM_VRR_BIT 0 ++#define VTEM_M_CONST_BIT 1 ++#define VTEM_FVA_BIT 4 ++/* MD1 Base_Vfront */ ++/* MD2 */ ++#define VTEM_BRR_UPPER_BIT 0 ++#define VTEM_RB_BIT 2 ++/* MD3 BRR Lower */ ++ + + enum ColorimetryRGBDP { + ColorimetryRGB_DP_sRGB = 0, +@@ -441,9 +432,29 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + } + } + ++static bool is_hdmi_vic_mode(const struct dc_stream_state *stream) ++{ ++ if (stream->timing.hdmi_vic == 0) ++ return false; ++ ++ if (stream->timing.h_total < 3840 || ++ stream->timing.v_total < 2160) ++ return false; ++ ++ /* 3D/ALLM forces HDMI VIC -> CTA VIC translation */ ++ if (stream->view_format != VIEW_3D_FORMAT_NONE) ++ return false; ++ ++ if (stream->hdmi_allm_active) ++ return false; ++ ++ return true; ++} ++ + /** + * mod_build_hf_vsif_infopacket - Prepare HDMI Vendor Specific info frame. + * Follows HDMI Spec to build up Vendor Specific info frame ++ * Conforms to h14b-vsif or hf-vsif based on the capabilities + * + * @stream: contains data we may need to construct VSIF (i.e. timing_3d_format, etc.) + * @info_packet: output structure where to store VSIF +@@ -451,63 +462,76 @@ void mod_build_vsc_infopacket(const struct dc_stream_state *stream, + void mod_build_hf_vsif_infopacket(const struct dc_stream_state *stream, + struct dc_info_packet *info_packet) + { +- unsigned int length = 5; + bool hdmi_vic_mode = false; ++ bool allm = false; ++ bool stereo = false; + uint8_t checksum = 0; +- uint32_t i = 0; ++ uint8_t offset = 0; ++ uint8_t i = 0; ++ uint8_t length = 5; ++ uint32_t oui = HDMI_IEEE_OUI; + enum dc_timing_3d_format format; + + info_packet->valid = false; +- format = stream->timing.timing_3d_format; +- if (stream->view_format == VIEW_3D_FORMAT_NONE) +- format = TIMING_3D_FORMAT_NONE; + +- if (stream->timing.hdmi_vic != 0 +- && stream->timing.h_total >= 3840 +- && stream->timing.v_total >= 2160 +- && format == TIMING_3D_FORMAT_NONE) +- hdmi_vic_mode = true; ++ allm = stream->hdmi_allm_active; ++ format = stream->view_format == VIEW_3D_FORMAT_NONE ? 
++ TIMING_3D_FORMAT_NONE : ++ stream->timing.timing_3d_format; ++ stereo = format != TIMING_3D_FORMAT_NONE; ++ hdmi_vic_mode = is_hdmi_vic_mode(stream); + +- if ((format == TIMING_3D_FORMAT_NONE) && !hdmi_vic_mode) ++ if (!stereo && !hdmi_vic_mode && !allm) + return; + +- info_packet->sb[1] = 0x03; +- info_packet->sb[2] = 0x0C; +- info_packet->sb[3] = 0x00; ++ if (allm) ++ oui = HDMI_FORUM_IEEE_OUI; + +- if (format != TIMING_3D_FORMAT_NONE) +- info_packet->sb[4] = (2 << 5); ++ info_packet->sb[1] = oui & 0xFF; ++ info_packet->sb[2] = (oui >> 8) & 0xFF; ++ info_packet->sb[3] = (oui >> 16) & 0xFF; + +- else if (hdmi_vic_mode) +- info_packet->sb[4] = (1 << 5); ++ if (oui == HDMI_FORUM_IEEE_OUI) { ++ offset = 2; ++ length += 2; ++ info_packet->sb[4] = HF_VSIF_VERSION; ++ info_packet->sb[5] = stereo << HF_VSIF_3D_BIT; ++ info_packet->sb[5] |= allm << HF_VSIF_ALLM_BIT; ++ } + +- switch (format) { +- case TIMING_3D_FORMAT_HW_FRAME_PACKING: +- case TIMING_3D_FORMAT_SW_FRAME_PACKING: +- info_packet->sb[5] = (0x0 << 4); +- break; ++ if (stereo) { ++ info_packet->sb[4 + offset] = (2 << 5); + +- case TIMING_3D_FORMAT_SIDE_BY_SIDE: +- case TIMING_3D_FORMAT_SBS_SW_PACKED: +- info_packet->sb[5] = (0x8 << 4); +- length = 6; +- break; ++ switch (format) { ++ case TIMING_3D_FORMAT_HW_FRAME_PACKING: ++ case TIMING_3D_FORMAT_SW_FRAME_PACKING: ++ info_packet->sb[5 + offset] = (0x0 << 4); ++ break; + +- case TIMING_3D_FORMAT_TOP_AND_BOTTOM: +- case TIMING_3D_FORMAT_TB_SW_PACKED: +- info_packet->sb[5] = (0x6 << 4); +- break; ++ case TIMING_3D_FORMAT_SIDE_BY_SIDE: ++ case TIMING_3D_FORMAT_SBS_SW_PACKED: ++ info_packet->sb[5 + offset] = (0x8 << 4); ++ ++length; ++ break; + +- default: +- break; +- } ++ case TIMING_3D_FORMAT_TOP_AND_BOTTOM: ++ case TIMING_3D_FORMAT_TB_SW_PACKED: ++ info_packet->sb[5 + offset] = (0x6 << 4); ++ break; ++ ++ default: ++ break; ++ } + +- if (hdmi_vic_mode) ++ /* Doesn't need the offset as it can't be used with hf-vsif */ ++ } else if (hdmi_vic_mode) { ++ info_packet->sb[4] = (1 << 5); + info_packet->sb[5] = stream->timing.hdmi_vic; ++ } + + info_packet->hb0 = HDMI_INFOFRAME_TYPE_VENDOR; + info_packet->hb1 = 0x01; +- info_packet->hb2 = (uint8_t) (length); ++ info_packet->hb2 = length & HDMI_INFOFRAME_LENGTH_MASK; + + checksum += info_packet->hb0; + checksum += info_packet->hb1; +@@ -521,6 +545,92 @@ void mod_build_hf_vsif_infopacket(const struct dc_stream_state *stream, + info_packet->valid = true; + } + ++static void build_vtem_infopacket_header(struct dc_info_packet *infopacket) ++{ ++ uint8_t pb0 = 0; ++ ++ /* might need logic in the future */ ++ pb0 |= 0 << EMP_SNC_BIT; ++ pb0 |= 1 << EMP_VFR_BIT; ++ pb0 |= 0 << EMP_AFR_BIT; ++ pb0 |= 0 << EMP_DST_BIT; ++ pb0 |= 0 << EMP_END_BIT; ++ pb0 |= 1 << EMP_NEW_BIT; ++ ++ infopacket->hb0 = HDMI_INFOFRAME_TYPE_EMP; ++ infopacket->hb1 = (1 << EMP_FIRST_BIT) | (1 << EMP_LAST_BIT); ++ infopacket->hb2 = 0; // sequence ++ ++ infopacket->sb[VTEM_PB0] = pb0; ++ infopacket->sb[VTEM_PB2] = VTEM_ORG_ID; ++ infopacket->sb[VTEM_PB4] = VTEM_DATA_SET_TAG; ++ infopacket->sb[VTEM_PB6] = VTEM_DATA_SET_LENGTH; ++} ++ ++static void build_vtem_infopacket_data(const struct dc_stream_state *stream, ++ const struct mod_vrr_params *vrr, ++ struct dc_info_packet *infopacket) ++{ ++ unsigned int hblank = 0; ++ unsigned int brr = 0; ++ bool vrr_active = false; ++ bool rb = false; ++ ++ /* ++ * Enables FreeSync-like behavior by keeping HDMI VRR signalling active ++ * in fixed refresh rate conditions like normal desktop work/web browsing. 
++ * Functionally behaves like non-VRR mode by keeping the actual refresh
++ * rate fixed.
++ */
++ if (stream->freesync_on_desktop) {
++ vrr_active = vrr->state != VRR_STATE_DISABLED &&
++ vrr->state != VRR_STATE_UNSUPPORTED;
++ } else {
++ vrr_active = vrr->state == VRR_STATE_ACTIVE_VARIABLE ||
++ vrr->state == VRR_STATE_ACTIVE_FIXED;
++ }
++
++ infopacket->sb[VTEM_MD0] = VTEM_M_CONST << VTEM_M_CONST_BIT;
++ infopacket->sb[VTEM_MD0] |= VTEM_FVA_FACTOR << VTEM_FVA_BIT;
++ infopacket->sb[VTEM_MD0] |= vrr_active << VTEM_VRR_BIT;
++
++ infopacket->sb[VTEM_MD1] = 0;
++ infopacket->sb[VTEM_MD2] = 0;
++ infopacket->sb[VTEM_MD3] = 0;
++
++ if (!vrr_active || is_hdmi_vic_mode(stream))
++ return;
++ /*
++ * In accordance with CVT 1.2 and CVT 2.1:
++ * Reduced Blanking standard defines a fixed value of
++ * 160 for hblank, further reduced to 80 in RB2. RB3 uses
++ * fixed hblank of 80 pixels + up to 120 additional pixels
++ * in 8-pixel steps.
++ */
++ hblank = stream->timing.h_total - stream->timing.h_addressable;
++ rb = (hblank >= 80 && hblank <= 200 && hblank % 8 == 0);
++ brr = div_u64(mod_freesync_calc_nominal_field_rate(stream), 1000000);
++
++ if (brr > VTEM_BRR_MAX) {
++ infopacket->valid = false;
++ return;
++ }
++
++ infopacket->sb[VTEM_MD1] = (uint8_t) stream->timing.v_front_porch;
++ infopacket->sb[VTEM_MD2] = rb << VTEM_RB_BIT;
++ infopacket->sb[VTEM_MD2] |= (brr >> 8) & VTEM_BRR_MASK_UPPER;
++ infopacket->sb[VTEM_MD3] = brr & VTEM_BRR_MASK_LOWER;
++}
++
++void mod_build_vtem_infopacket(const struct dc_stream_state *stream,
++ const struct mod_vrr_params *vrr,
++ struct dc_info_packet *infopacket)
++{
++ infopacket->valid = true;
++ build_vtem_infopacket_header(infopacket);
++ build_vtem_infopacket_data(stream, vrr, infopacket);
++}
++
+ void mod_build_adaptive_sync_infopacket(const struct dc_stream_state *stream,
+ enum adaptive_sync_type asType,
+ const struct AS_Df_params *param,
+@@ -535,12 +645,13 @@ void mod_build_adaptive_sync_infopacket(const struct dc_stream_state *stream,
+ if (stream != NULL)
+ mod_build_adaptive_sync_infopacket_v2(stream, param, info_packet);
+ break;
+- case FREESYNC_TYPE_PCON_IN_WHITELIST:
++ case ADAPTIVE_SYNC_TYPE_PCON_ALLOWED:
+ case ADAPTIVE_SYNC_TYPE_EDP:
+ mod_build_adaptive_sync_infopacket_v1(info_packet);
+ break;
+ case ADAPTIVE_SYNC_TYPE_NONE:
+- case FREESYNC_TYPE_PCON_NOT_IN_WHITELIST:
++ case ADAPTIVE_SYNC_TYPE_PCON_NOT_ALLOWED:
++ case ADAPTIVE_SYNC_TYPE_HDMI:
+ default:
+ break;
+ }
+diff --git a/drivers/gpu/drm/amd/include/amd_shared.h b/drivers/gpu/drm/amd/include/amd_shared.h
+index ac2d3701e2bd..894e1e738ce0 100644
+--- a/drivers/gpu/drm/amd/include/amd_shared.h
++++ b/drivers/gpu/drm/amd/include/amd_shared.h
+@@ -412,6 +412,12 @@ enum DC_DEBUG_MASK {
+ * @DC_SKIP_DETECTION_LT: (0x200000) If set, skip detection link training
+ */
+ DC_SKIP_DETECTION_LT = 0x200000,
++
++ /**
++ * @DC_OVERRIDE_PCON_VRR_ID_CHECK: (0x400000) If set, always return true when checking for
++ * PCON VRR compatibility and print its ID in the kernel log.
++ */ ++ DC_OVERRIDE_PCON_VRR_ID_CHECK = 0x400000, + }; + + enum amd_dpm_forced_level; +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index 7320db4b8489..94e1b7eb65f7 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -412,6 +412,8 @@ static int drm_atomic_crtc_set_property(struct drm_crtc *crtc, + return ret; + } else if (property == config->prop_vrr_enabled) { + state->vrr_enabled = val; ++ } else if (property == config->prop_passive_vrr_disabled) { ++ state->passive_vrr_disabled = val; + } else if (property == config->degamma_lut_property) { + ret = drm_property_replace_blob_from_id(dev, + &state->degamma_lut, +@@ -477,6 +479,8 @@ drm_atomic_crtc_get_property(struct drm_crtc *crtc, + *val = (state->mode_blob) ? state->mode_blob->base.id : 0; + else if (property == config->prop_vrr_enabled) + *val = state->vrr_enabled; ++ else if (property == config->prop_passive_vrr_disabled) ++ *val = state->passive_vrr_disabled; + else if (property == config->degamma_lut_property) + *val = (state->degamma_lut) ? state->degamma_lut->base.id : 0; + else if (property == config->ctm_property) +@@ -885,6 +889,8 @@ static int drm_atomic_connector_set_property(struct drm_connector *connector, + state->content_type = val; + } else if (property == connector->scaling_mode_property) { + state->scaling_mode = val; ++ } else if (property == connector->allm_mode_property) { ++ state->allm_mode = val; + } else if (property == config->content_protection_property) { + if (val == DRM_MODE_CONTENT_PROTECTION_ENABLED) { + drm_dbg_kms(dev, "only drivers can set CP Enabled\n"); +@@ -982,6 +988,8 @@ drm_atomic_connector_get_property(struct drm_connector *connector, + *val = state->colorspace; + } else if (property == connector->scaling_mode_property) { + *val = state->scaling_mode; ++ } else if (property == connector->allm_mode_property) { ++ *val = state->allm_mode; + } else if (property == config->hdr_output_metadata_property) { + *val = state->hdr_output_metadata ? + state->hdr_output_metadata->base.id : 0; +diff --git a/drivers/gpu/drm/drm_connector.c b/drivers/gpu/drm/drm_connector.c +index 4d6dc9ebfdb5..bdf49bb3c38e 100644 +--- a/drivers/gpu/drm/drm_connector.c ++++ b/drivers/gpu/drm/drm_connector.c +@@ -1226,6 +1226,12 @@ static const struct drm_prop_enum_list drm_content_type_enum_list[] = { + { DRM_MODE_CONTENT_TYPE_GAME, "Game" }, + }; + ++static const struct drm_prop_enum_list drm_allm_mode_enum_list[] = { ++ { DRM_ALLM_MODE_DISABLED, "Disabled" }, ++ { DRM_ALLM_MODE_ENABLED_DYNAMIC, "Dynamic" }, ++ { DRM_ALLM_MODE_ENABLED_FORCED, "Always On" }, ++}; ++ + static const struct drm_prop_enum_list drm_panel_orientation_enum_list[] = { + { DRM_MODE_PANEL_ORIENTATION_NORMAL, "Normal" }, + { DRM_MODE_PANEL_ORIENTATION_BOTTOM_UP, "Upside Down" }, +@@ -2346,6 +2352,16 @@ EXPORT_SYMBOL(drm_mode_create_scaling_mode_property); + * + * Absence of the property should indicate absence of support. + * ++ * "passive_vrr_capable": ++ * Optional &drm_connector boolean property that drivers should attach ++ * with drm_connector_attach_passive_vrr_capable_property() on ++ * connectors that could support keeping variable refresh rate signalling ++ * in fixed-refresh rate scenarios like desktop work. Drivers should update ++ * the property value by calling ++ * drm_connector_set_passive_vrr_capable_property(). ++ * ++ * Absence of the property should indicate absence of support. 
++ *
+ * "VRR_ENABLED":
+ * Default &drm_crtc boolean property that notifies the driver that the
+ * content on the CRTC is suitable for variable refresh rate presentation.
+@@ -2364,6 +2380,17 @@ EXPORT_SYMBOL(drm_mode_create_scaling_mode_property);
+ *
+ * The driver may place further restrictions within these minimum
+ * and maximum bounds.
++ *
++ * "PASSIVE_VRR_DISABLED":
++ * Default &drm_crtc boolean property that notifies the driver that the
++ * VRR signalling should be disabled in fixed refresh rate scenarios.
++ * Functionally, passive VRR works the same as VRR_ENABLED == false
++ * but works around displays blanking (mainly HDMI) that do not support
++ * seamless VRR transitions. Also helps with brightness flickering during
++ * VRR transitions.
++ *
++ * Passive VRR mode is not that useful for DP/eDP sinks where seamless VRR
++ * transitions are enforced by the standard.
+ */
+
+ /**
+@@ -2397,6 +2424,125 @@ int drm_connector_attach_vrr_capable_property(
+ }
+ EXPORT_SYMBOL(drm_connector_attach_vrr_capable_property);
+
++/**
++ * drm_connector_attach_passive_vrr_capable_property - creates the
++ * passive_vrr_capable property
++ * @connector: connector to create the passive_vrr_capable property on.
++ *
++ * This is used by atomic drivers to add support for querying
++ * variable refresh rate on desktop capability for a connector.
++ *
++ * Returns:
++ * Zero on success, negative errno on failure.
++ */
++int drm_connector_attach_passive_vrr_capable_property(
++ struct drm_connector *connector)
++{
++ struct drm_device *dev = connector->dev;
++ struct drm_property *prop;
++
++ if (!connector->passive_vrr_capable_property) {
++ prop = drm_property_create_bool(dev, DRM_MODE_PROP_IMMUTABLE,
++ "passive_vrr_capable");
++ if (!prop)
++ return -ENOMEM;
++
++ connector->passive_vrr_capable_property = prop;
++ drm_object_attach_property(&connector->base, prop, 0);
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(drm_connector_attach_passive_vrr_capable_property);
++
++/**
++ * DOC: Auto Low Latency Mode properties
++ *
++ * Auto Low Latency capable HDMI displays (be it PC monitors or TVs)
++ * can automatically enter a "low latency" mode, usually named "Game Mode", by
++ * receiving specific data in the HDMI Forum vendor-specific info frame.
++ *
++ * This is usually the best mode for PC usage but disables as much processing as
++ * possible, which might not be desirable on lower end devices, causing them to
++ * produce an image that's unsatisfactory to some users.
++ *
++ * "allm_capable":
++ * Optional &drm_connector boolean property that drivers should attach
++ * with drm_connector_attach_allm_capable_property() on connectors that
++ * could support Auto Low Latency Mode. Drivers should update the
++ * property value by calling drm_connector_set_allm_capable_property().
++ *
++ * Absence of the property should indicate absence of support.
++ *
++ * "ALLM_MODE":
++ * Optional &drm_connector enum property that enables compositors to control and
++ * expose ALLM triggering behavior modes to the end user where:
++ *
++ * - ALLM_MODE_DISABLED: completely disabled ALLM signalling.
++ * - ALLM_MODE_ENABLED_DYNAMIC: triggers ALLM based on current needs,
++ * preferably display content type hint being set to Game by the compositor
++ * or VRR being enabled and active.
++ * - ALLM_MODE_ENABLED_FORCED: always-on ALLM triggering.
++ *
++ * ALLM_MODE_ENABLED_DYNAMIC should behave like gaming devices such as
++ * consoles where ALLM is only triggered when needed. Its main purpose is
++ * gaming (part of so-called HDMI gaming features).
++ *
++ * If compositors wish to control ALLM completely on their own, they can
++ * switch between disabled and enabled_forced modes.
++ */
++
++/**
++ * drm_connector_attach_allm_capable_property - creates the
++ * allm_capable property
++ * @connector: connector to create the allm_capable property on.
++ *
++ * This is used by atomic drivers to add support for querying
++ * Auto Low Latency Mode capability for a connector.
++ *
++ * Returns:
++ * Zero on success, negative errno on failure.
++ */
++int drm_connector_attach_allm_capable_property(struct drm_connector *connector)
++{
++ struct drm_device *dev = connector->dev;
++ struct drm_property *prop;
++
++ if (!connector->allm_capable_property) {
++ prop = drm_property_create_bool(dev, DRM_MODE_PROP_IMMUTABLE,
++ "allm_capable");
++ if (!prop)
++ return -ENOMEM;
++
++ connector->allm_capable_property = prop;
++ drm_object_attach_property(&connector->base, prop, 0);
++ }
++
++ return 0;
++}
++EXPORT_SYMBOL(drm_connector_attach_allm_capable_property);
++
++int drm_connector_attach_allm_mode_property(struct drm_connector *connector)
++{
++ struct drm_property *prop;
++
++ if (connector->allm_mode_property)
++ return 0;
++
++ prop = drm_property_create_enum(connector->dev, 0, "allm_mode",
++ drm_allm_mode_enum_list,
++ ARRAY_SIZE(drm_allm_mode_enum_list));
++ if (!prop)
++ return -ENOMEM;
++
++ connector->allm_mode_property = prop;
++ drm_object_attach_property(&connector->base, prop,
++ DRM_ALLM_MODE_DISABLED);
++
++ return 0;
++}
++EXPORT_SYMBOL(drm_connector_attach_allm_mode_property);
++
+ /**
+ * drm_connector_attach_scaling_mode_property - attach atomic scaling mode property
+ * @connector: connector to attach scaling mode property on.
+@@ -2968,6 +3114,48 @@ void drm_connector_set_vrr_capable_property(
+ }
+ EXPORT_SYMBOL(drm_connector_set_vrr_capable_property);
+
++/**
++ * drm_connector_set_passive_vrr_capable_property - sets the variable refresh
++ * rate on desktop capable property for a connector
++ * @connector: drm connector
++ * @capable: True if the connector is variable refresh rate on desktop capable
++ *
++ * Should be used by atomic drivers to update the indicated support for
++ * variable refresh rate on desktop over a connector.
++ */
++void drm_connector_set_passive_vrr_capable_property(
++ struct drm_connector *connector, bool capable)
++{
++ if (!connector->passive_vrr_capable_property)
++ return;
++
++ drm_object_property_set_value(&connector->base,
++ connector->passive_vrr_capable_property,
++ capable);
++}
++EXPORT_SYMBOL(drm_connector_set_passive_vrr_capable_property);
++
++/**
++ * drm_connector_set_allm_capable_property - sets Auto Low Latency Mode
++ * capable property for a connector
++ * @connector: drm connector
++ * @capable: True if the connector is ALLM capable
++ *
++ * Should be used by atomic drivers to update the indicated support for
++ * Auto Low Latency Mode over a connector.
++ */
++void drm_connector_set_allm_capable_property(
++ struct drm_connector *connector, bool capable)
++{
++ if (!connector->allm_capable_property)
++ return;
++
++ drm_object_property_set_value(&connector->base,
++ connector->allm_capable_property,
++ capable);
++}
++EXPORT_SYMBOL(drm_connector_set_allm_capable_property);
++
+ /**
+ * drm_connector_set_panel_orientation - sets the connector's panel_orientation
+ * @connector: connector for which to set the panel-orientation property.
+diff --git a/drivers/gpu/drm/drm_crtc.c b/drivers/gpu/drm/drm_crtc.c +index a7797d260f1e..4f2c871552e5 100644 +--- a/drivers/gpu/drm/drm_crtc.c ++++ b/drivers/gpu/drm/drm_crtc.c +@@ -322,6 +322,8 @@ static int __drm_crtc_init_with_planes(struct drm_device *dev, struct drm_crtc * + config->prop_out_fence_ptr, 0); + drm_object_attach_property(&crtc->base, + config->prop_vrr_enabled, 0); ++ drm_object_attach_property(&crtc->base, ++ config->prop_passive_vrr_disabled, 0); + } + + return 0; +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 26bb7710a462..056eff8cbd1a 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -6152,6 +6152,33 @@ static void drm_parse_ycbcr420_deep_color_info(struct drm_connector *connector, + hdmi->y420_dc_modes = dc_mask; + } + ++static void drm_parse_hdmi_gaming_info(struct drm_hdmi_info *hdmi, const u8 *db) ++{ ++ struct drm_hdmi_vrr_cap *vrr = &hdmi->vrr_cap; ++ ++ if (cea_db_payload_len(db) < 8) ++ return; ++ ++ hdmi->fapa_start_location = db[8] & DRM_EDID_FAPA_START_LOCATION; ++ hdmi->allm = db[8] & DRM_EDID_ALLM; ++ vrr->fva = db[8] & DRM_EDID_FVA; ++ vrr->cnmvrr = db[8] & DRM_EDID_CNMVRR; ++ vrr->cinema_vrr = db[8] & DRM_EDID_CINEMA_VRR; ++ vrr->mdelta = db[8] & DRM_EDID_MDELTA; ++ ++ if (cea_db_payload_len(db) < 9) ++ return; ++ ++ vrr->vrr_min = db[9] & DRM_EDID_VRR_MIN_MASK; ++ vrr->supported = (vrr->vrr_min > 0 && vrr->vrr_min <= 48); ++ ++ if (cea_db_payload_len(db) < 10) ++ return; ++ ++ vrr->vrr_max = (db[9] & DRM_EDID_VRR_MAX_UPPER_MASK) << 2 | db[10]; ++ vrr->supported &= (vrr->vrr_max == 0 || vrr->vrr_max >= 100); ++} ++ + static void drm_parse_dsc_info(struct drm_hdmi_dsc_cap *hdmi_dsc, + const u8 *hf_scds) + { +@@ -6277,7 +6304,7 @@ static void drm_parse_hdmi_forum_scds(struct drm_connector *connector, + } + + drm_parse_ycbcr420_deep_color_info(connector, hf_scds); +- ++ drm_parse_hdmi_gaming_info(&connector->display_info.hdmi, hf_scds); + if (cea_db_payload_len(hf_scds) >= 11 && hf_scds[11]) { + drm_parse_dsc_info(hdmi_dsc, hf_scds); + dsc_support = true; +@@ -6287,6 +6314,18 @@ static void drm_parse_hdmi_forum_scds(struct drm_connector *connector, + "[CONNECTOR:%d:%s] HF-VSDB: max TMDS clock: %d KHz, HDMI 2.1 support: %s, DSC 1.2 support: %s\n", + connector->base.id, connector->name, + max_tmds_clock, str_yes_no(max_frl_rate), str_yes_no(dsc_support)); ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] FAPA in blanking: %s, ALLM support: %s, Fast Vactive support: %s\n", ++ connector->base.id, connector->name, str_yes_no(hdmi->fapa_start_location), ++ str_yes_no(hdmi->allm), str_yes_no(hdmi->vrr_cap.fva)); ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] Negative M VRR support: %s, CinemaVRR support: %s, Mdelta: %d\n", ++ connector->base.id, connector->name, str_yes_no(hdmi->vrr_cap.cnmvrr), ++ str_yes_no(hdmi->vrr_cap.cinema_vrr), hdmi->vrr_cap.mdelta); ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] VRRmin: %u, VRRmax: %u, VRR supported: %s\n", ++ connector->base.id, connector->name, hdmi->vrr_cap.vrr_min, ++ hdmi->vrr_cap.vrr_max, str_yes_no(hdmi->vrr_cap.supported)); + } + + static void drm_parse_hdmi_deep_color_info(struct drm_connector *connector, +diff --git a/drivers/gpu/drm/drm_mode_config.c b/drivers/gpu/drm/drm_mode_config.c +index d12db9b0bab8..231f54ba66f8 100644 +--- a/drivers/gpu/drm/drm_mode_config.c ++++ b/drivers/gpu/drm/drm_mode_config.c +@@ -345,6 +345,12 @@ static int drm_mode_create_standard_properties(struct drm_device *dev) + return -ENOMEM; + 
dev->mode_config.prop_vrr_enabled = prop;
+
++ prop = drm_property_create_bool(dev, 0,
++ "PASSIVE_VRR_DISABLED");
++ if (!prop)
++ return -ENOMEM;
++ dev->mode_config.prop_passive_vrr_disabled = prop;
++
+ prop = drm_property_create(dev,
+ DRM_MODE_PROP_BLOB,
+ "DEGAMMA_LUT", 0);
+diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h
+index 8f34f4b8183d..fa4abfe8971e 100644
+--- a/include/drm/drm_connector.h
++++ b/include/drm/drm_connector.h
+@@ -58,6 +58,12 @@ enum drm_connector_force {
+ DRM_FORCE_ON_DIGITAL, /* for DVI-I use digital connector */
+ };
+
++enum drm_allm_mode {
++ DRM_ALLM_MODE_DISABLED,
++ DRM_ALLM_MODE_ENABLED_DYNAMIC,
++ DRM_ALLM_MODE_ENABLED_FORCED,
++};
++
+ /**
+ * enum drm_connector_status - status for a &drm_connector
+ *
+@@ -254,6 +260,44 @@ struct drm_scdc {
+ struct drm_scrambling scrambling;
+ };
+
++/**
++ * struct drm_hdmi_vrr_cap - Information about VRR capabilities of an HDMI sink
++ *
++ * Describes the VRR support provided by an HDMI 2.1 sink. The information is
++ * fetched from additional HFVSDB blocks defined for HDMI 2.1.
++ */
++struct drm_hdmi_vrr_cap {
++ /** @fva: flag for Fast VActive (Quick Frame Transport) support */
++ bool fva;
++
++ /** @cnmvrr: flag for Negative M VRR support */
++ bool cnmvrr;
++
++ /** @cinema_vrr: flag for Cinema VRR support */
++ bool cinema_vrr;
++
++ /** @mdelta: flag for limited frame-to-frame compensation support */
++ bool mdelta;
++
++ /**
++ * @vrr_min: minimum supported variable refresh rate in Hz.
++ * Valid values are only inside the 1 - 48 range
++ */
++ u16 vrr_min;
++
++ /**
++ * @vrr_max: maximum supported variable refresh rate in Hz (optional).
++ * Valid values are either 0 (max based on video mode) or >= 100
++ */
++ u16 vrr_max;
++
++ /**
++ * @supported: flag for VRR support, based on VRRmin and VRRmax
++ * having valid values.
++ */
++ bool supported;
++};
++
+ /**
+ * struct drm_hdmi_dsc_cap - DSC capabilities of HDMI sink
+ *
+@@ -330,6 +374,15 @@ struct drm_hdmi_info {
+ /** @max_lanes: supported by sink */
+ u8 max_lanes;
+
++ /** @fapa_start_location: flag for the FAPA in blanking support */
++ bool fapa_start_location;
++
++ /** @allm: flag for Auto Low Latency Mode support by sink */
++ bool allm;
++
++ /** @vrr_cap: VRR capabilities of the sink */
++ struct drm_hdmi_vrr_cap vrr_cap;
++
+ /** @dsc_cap: DSC capabilities of the sink */
+ struct drm_hdmi_dsc_cap dsc_cap;
+ };
+@@ -1100,6 +1153,13 @@ struct drm_connector_state {
+ */
+ unsigned int content_protection;
+
++ /**
++ * @allm_mode: Connector property to control the
++ * HDMI Auto Low Latency Mode trigger setting.
++ * Valid values are the %DRM_ALLM_MODE_\* enum values.
++ */
++ enum drm_allm_mode allm_mode;
++
+ /**
+ * @colorspace: State variable for Connector property to request
+ * colorspace change on Sink. This is most commonly used to switch
+@@ -2054,6 +2114,37 @@ struct drm_connector {
+ */
+ struct drm_property *vrr_capable_property;
+
++ /**
++ * @passive_vrr_capable_property: Optional property to help userspace
++ * query hardware support for passive variable refresh rate on a
++ * connector. Drivers can add the property to a connector by
++ * calling drm_connector_attach_passive_vrr_capable_property().
++ *
++ * This should be updated only by calling
++ * drm_connector_set_passive_vrr_capable_property().
++ */
++ struct drm_property *passive_vrr_capable_property;
++
++ /**
++ * @allm_capable_property: Optional property to help userspace
++ * query hardware support for HDMI Auto Low Latency Mode on a connector.
++ * Drivers can add the property to a connector by calling
++ * drm_connector_attach_allm_capable_property().
++ *
++ * This should be updated only by calling
++ * drm_connector_set_allm_capable_property().
++ */
++ struct drm_property *allm_capable_property;
++
++ /**
++ * @allm_mode_property:
++ *
++ * Indicates HDMI Auto Low Latency Mode triggering mode for the connector.
++ * Support for the requested state will depend on driver and hardware
++ * capability - lacking support is not treated as failure.
++ */
++ struct drm_property *allm_mode_property;
++
+ /**
+ * @colorspace_property: Connector property to set the suitable
+ * colorspace supported by the sink.
+@@ -2448,6 +2539,10 @@ int drm_connector_attach_scaling_mode_property(struct drm_connector *connector,
+ u32 scaling_mode_mask);
+ int drm_connector_attach_vrr_capable_property(
+ struct drm_connector *connector);
++int drm_connector_attach_passive_vrr_capable_property(
++ struct drm_connector *connector);
++int drm_connector_attach_allm_capable_property(struct drm_connector *connector);
++int drm_connector_attach_allm_mode_property(struct drm_connector *connector);
+ int drm_connector_attach_broadcast_rgb_property(struct drm_connector *connector);
+ int drm_connector_attach_colorspace_property(struct drm_connector *connector);
+ int drm_connector_attach_hdr_output_metadata_property(struct drm_connector *connector);
+@@ -2470,6 +2565,10 @@ void drm_connector_set_link_status_property(struct drm_connector *connector,
+ uint64_t link_status);
+ void drm_connector_set_vrr_capable_property(
+ struct drm_connector *connector, bool capable);
++void drm_connector_set_passive_vrr_capable_property(
++ struct drm_connector *connector, bool capable);
++void drm_connector_set_allm_capable_property(
++ struct drm_connector *connector, bool capable);
+ int drm_connector_set_panel_orientation(
+ struct drm_connector *connector,
+ enum drm_panel_orientation panel_orientation);
+diff --git a/include/drm/drm_crtc.h b/include/drm/drm_crtc.h
+index 66278ffeebd6..59dbb7ce1358 100644
+--- a/include/drm/drm_crtc.h
++++ b/include/drm/drm_crtc.h
+@@ -299,6 +299,15 @@ struct drm_crtc_state {
+ */
+ bool vrr_enabled;
+
++ /**
++ * @passive_vrr_disabled:
++ *
++ * Indicates if variable refresh rate on desktop should be enabled for
++ * the CRTC. Support for the requested state will depend on driver and
++ * hardware capability - lacking support is not treated as failure.
++ */
++ bool passive_vrr_disabled;
++
+ /**
+ * @self_refresh_active:
+ *
+diff --git a/include/drm/drm_mode_config.h b/include/drm/drm_mode_config.h
+index 895fb820dba0..23ce744b233b 100644
+--- a/include/drm/drm_mode_config.h
++++ b/include/drm/drm_mode_config.h
+@@ -697,6 +697,12 @@ struct drm_mode_config {
+ * whether variable refresh rate should be enabled on the CRTC.
+ */
+ struct drm_property *prop_vrr_enabled;
++ /**
++ * @prop_passive_vrr_disabled: Default atomic CRTC property to indicate
++ * whether passive variable refresh rate should be disabled
++ * on the CRTC.
++ */ ++ struct drm_property *prop_passive_vrr_disabled; + + /** + * @dvi_i_subconnector_property: Optional DVI-I property to +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0006-r8125.patch b/sys-kernel/gentoo-sources-6.19/0006-r8125.patch new file mode 100644 index 0000000..ae00052 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0006-r8125.patch @@ -0,0 +1,29360 @@ +From 739c942b8335f00091b9c255370d5b27448af308 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 13 Feb 2026 16:53:25 +0100 +Subject: [PATCH 6/9] r8125 + +Signed-off-by: Peter Jung +--- + drivers/net/ethernet/realtek/Kconfig | 15 + + drivers/net/ethernet/realtek/Makefile | 2 + + drivers/net/ethernet/realtek/r8125.h | 3059 +++ + drivers/net/ethernet/realtek/r8125_dash.c | 573 + + drivers/net/ethernet/realtek/r8125_dash.h | 196 + + drivers/net/ethernet/realtek/r8125_fiber.c | 464 + + drivers/net/ethernet/realtek/r8125_fiber.h | 63 + + drivers/net/ethernet/realtek/r8125_firmware.c | 264 + + drivers/net/ethernet/realtek/r8125_firmware.h | 68 + + drivers/net/ethernet/realtek/r8125_n.c | 21312 ++++++++++++++++ + drivers/net/ethernet/realtek/r8125_ptp.c | 1472 ++ + drivers/net/ethernet/realtek/r8125_ptp.h | 159 + + drivers/net/ethernet/realtek/r8125_realwow.h | 118 + + drivers/net/ethernet/realtek/r8125_rss.c | 583 + + drivers/net/ethernet/realtek/r8125_rss.h | 76 + + drivers/net/ethernet/realtek/r8169_main.c | 6 +- + drivers/net/ethernet/realtek/rtl_eeprom.c | 284 + + drivers/net/ethernet/realtek/rtl_eeprom.h | 53 + + drivers/net/ethernet/realtek/rtltool.c | 312 + + drivers/net/ethernet/realtek/rtltool.h | 89 + + 20 files changed, 29166 insertions(+), 2 deletions(-) + create mode 100755 drivers/net/ethernet/realtek/r8125.h + create mode 100755 drivers/net/ethernet/realtek/r8125_dash.c + create mode 100755 drivers/net/ethernet/realtek/r8125_dash.h + create mode 100755 drivers/net/ethernet/realtek/r8125_fiber.c + create mode 100755 drivers/net/ethernet/realtek/r8125_fiber.h + create mode 100755 drivers/net/ethernet/realtek/r8125_firmware.c + create mode 100755 drivers/net/ethernet/realtek/r8125_firmware.h + create mode 100755 drivers/net/ethernet/realtek/r8125_n.c + create mode 100755 drivers/net/ethernet/realtek/r8125_ptp.c + create mode 100755 drivers/net/ethernet/realtek/r8125_ptp.h + create mode 100755 drivers/net/ethernet/realtek/r8125_realwow.h + create mode 100755 drivers/net/ethernet/realtek/r8125_rss.c + create mode 100755 drivers/net/ethernet/realtek/r8125_rss.h + create mode 100755 drivers/net/ethernet/realtek/rtl_eeprom.c + create mode 100755 drivers/net/ethernet/realtek/rtl_eeprom.h + create mode 100755 drivers/net/ethernet/realtek/rtltool.c + create mode 100755 drivers/net/ethernet/realtek/rtltool.h + +diff --git a/drivers/net/ethernet/realtek/Kconfig b/drivers/net/ethernet/realtek/Kconfig +index 272c83bfdc6c..dc7cf96add0c 100644 +--- a/drivers/net/ethernet/realtek/Kconfig ++++ b/drivers/net/ethernet/realtek/Kconfig +@@ -95,6 +95,21 @@ config 8139_OLD_RX_RESET + experience problems, you can enable this option to restore the + old RX-reset behavior. If unsure, say N. + ++config R8125 ++ tristate "Realtek 8125/8162 ethernet support" ++ depends on PCI ++ select FW_LOADER ++ select CRC32 ++ select PHYLIB ++ select REALTEK_PHY ++ help ++ Say Y here if you have a Realtek Ethernet adapter belonging to ++ the following families: ++ RTL8125 2.5GBit Ethernet ++ ++ To compile this driver as a module, choose M here: the module ++ will be called r8125. This is recommended. 
++ + config R8169 + tristate "Realtek 8169/8168/8101/8125 ethernet support" + depends on PCI +diff --git a/drivers/net/ethernet/realtek/Makefile b/drivers/net/ethernet/realtek/Makefile +index 046adf503ff4..dee73dfd003f 100644 +--- a/drivers/net/ethernet/realtek/Makefile ++++ b/drivers/net/ethernet/realtek/Makefile +@@ -9,4 +9,6 @@ obj-$(CONFIG_ATP) += atp.o + r8169-y += r8169_main.o r8169_firmware.o r8169_phy_config.o + r8169-$(CONFIG_R8169_LEDS) += r8169_leds.o + obj-$(CONFIG_R8169) += r8169.o ++r8125-y += r8125_n.o rtl_eeprom.o rtltool.o ++obj-$(CONFIG_R8125) += r8125.o + obj-$(CONFIG_RTASE) += rtase/ +diff --git a/drivers/net/ethernet/realtek/r8125.h b/drivers/net/ethernet/realtek/r8125.h +new file mode 100755 +index 000000000000..57b2b94872fd +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125.h +@@ -0,0 +1,3059 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#ifndef __R8125_H ++#define __R8125_H ++ ++#define CONFIG_SOC_LAN ++#define CONFIG_ASPM ++#define ENABLE_S5WOL ++#define ENABLE_EEE ++#define ENABLE_TX_NO_CLOSE ++#define ENABLE_GIGA_LITE ++ ++//#include ++#include ++#include ++#include ++#include "r8125_dash.h" ++#include "r8125_realwow.h" ++#ifdef ENABLE_FIBER_SUPPORT ++#include "r8125_fiber.h" ++#endif /* ENABLE_FIBER_SUPPORT */ ++#ifdef ENABLE_PTP_SUPPORT ++#include "r8125_ptp.h" ++#endif ++#include "r8125_rss.h" ++#ifdef ENABLE_LIB_SUPPORT ++#include "r8125_lib.h" ++#endif ++ ++#ifndef fallthrough ++#define fallthrough ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#define netif_xmit_stopped netif_tx_queue_stopped ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0) ++#ifndef MDIO_AN_EEE_ADV_100TX ++#define MDIO_AN_EEE_ADV_100TX 0x0002 /* Advertise 100TX EEE cap */ ++#endif ++#ifndef MDIO_AN_EEE_ADV_1000T ++#define MDIO_AN_EEE_ADV_1000T 0x0004 /* Advertise 1000T EEE cap */ ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++#define MDIO_EEE_100TX MDIO_AN_EEE_ADV_100TX /* 100TX EEE cap */ ++#define MDIO_EEE_1000T MDIO_AN_EEE_ADV_1000T /* 1000T EEE cap */ ++#define MDIO_EEE_10GT 0x0008 /* 10GT EEE cap */ ++#define MDIO_EEE_1000KX 0x0010 /* 1000KX EEE cap */ ++#define MDIO_EEE_10GKX4 0x0020 /* 10G KX4 EEE cap */ ++#define MDIO_EEE_10GKR 0x0040 /* 10G KR EEE cap */ ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) */ ++ ++static inline u32 mmd_eee_adv_to_ethtool_adv_t(u16 eee_adv) ++{ ++ u32 adv = 0; ++ ++ if (eee_adv & MDIO_EEE_100TX) ++ adv |= ADVERTISED_100baseT_Full; ++ if (eee_adv & MDIO_EEE_1000T) ++ adv |= ADVERTISED_1000baseT_Full; ++ if (eee_adv & MDIO_EEE_10GT) ++ adv |= ADVERTISED_10000baseT_Full; ++ if (eee_adv & MDIO_EEE_1000KX) ++ adv |= ADVERTISED_1000baseKX_Full; ++ if (eee_adv & MDIO_EEE_10GKX4) ++ adv |= ADVERTISED_10000baseKX4_Full; ++ if (eee_adv & MDIO_EEE_10GKR) ++ adv |= ADVERTISED_10000baseKR_Full; ++ ++ return adv; ++} ++ ++static inline u16 ethtool_adv_to_mmd_eee_adv_t(u32 adv) ++{ ++ u16 reg = 0; ++ ++ if (adv & ADVERTISED_100baseT_Full) ++ reg |= MDIO_EEE_100TX; ++ if (adv & ADVERTISED_1000baseT_Full) ++ reg |= MDIO_EEE_1000T; ++ if (adv & ADVERTISED_10000baseT_Full) ++ reg |= MDIO_EEE_10GT; ++ if (adv & ADVERTISED_1000baseKX_Full) ++ reg |= MDIO_EEE_1000KX; ++ if (adv & ADVERTISED_10000baseKX4_Full) ++ reg |= MDIO_EEE_10GKX4; ++ if (adv & ADVERTISED_10000baseKR_Full) ++ reg |= MDIO_EEE_10GKR; ++ ++ return reg; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,7,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) ++static inline bool skb_transport_header_was_set(const struct sk_buff *skb) ++{ ++ return skb->transport_header != ~0U; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,9,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,3,0) ++static inline ++ssize_t strscpy(char *dest, const char *src, size_t count) ++{ ++ long res = 0; ++ ++ if (count == 0) ++ return -E2BIG; ++ ++ while (count) { ++ char c; ++ ++ c = src[res]; ++ dest[res] = c; ++ if (!c) ++ return res; ++ res++; ++ count--; ++ } ++ ++ /* Hit buffer length without finding a NUL; force NUL-termination. 
*/ ++ if (res) ++ dest[res-1] = '\0'; ++ ++ return -E2BIG; ++} ++#endif ++ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0)) ++static inline unsigned char *skb_checksum_start(const struct sk_buff *skb) ++{ ++#if (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22)) ++ return skb->head + skb->csum_start; ++#else /* < 2.6.22 */ ++ return skb_transport_header(skb); ++#endif ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++static inline void netdev_tx_sent_queue(struct netdev_queue *dev_queue, ++ unsigned int bytes) ++{} ++static inline void netdev_tx_completed_queue(struct netdev_queue *dev_queue, ++ unsigned int pkts, ++ unsigned int bytes) ++{} ++static inline void netdev_tx_reset_queue(struct netdev_queue *q) {} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) ++static inline void fsleep(unsigned long usecs) ++{ ++ if (usecs <= 10) ++ udelay(usecs); ++ else if (usecs <= 20000) ++ usleep_range(usecs, 2 * usecs); ++ else ++ msleep(DIV_ROUND_UP(usecs, 1000)); ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,2,0) ++#define netdev_xmit_more() (0) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,8,0) ++#define netif_testing_on(dev) ++#define netif_testing_off(dev) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0) ++#define netdev_sw_irq_coalesce_default_on(dev) ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,2,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,32) ++typedef int netdev_tx_t; ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,12,0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,1,9) ++static inline bool page_is_pfmemalloc(struct page *page) ++{ ++ /* ++ * Page index cannot be this large so this must be ++ * a pfmemalloc page. ++ */ ++ return page->index == -1UL; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(4,1,9) */ ++static inline bool dev_page_is_reusable(struct page *page) ++{ ++ return likely(page_to_nid(page) == numa_mem_id() && ++ !page_is_pfmemalloc(page)); ++} ++#endif ++ ++/* ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0)&& !defined(ENABLE_LIB_SUPPORT) ++#define RTL_USE_NEW_INTR_API ++#endif ++*/ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++#define dma_map_page_attrs(dev, page, offset, size, dir, attrs) \ ++ dma_map_page(dev, page, offset, size, dir) ++#define dma_unmap_page_attrs(dev, page, size, dir, attrs) \ ++ dma_unmap_page(dev, page, size, dir) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++#define page_ref_inc(page) atomic_inc(&page->_count) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,4,216) ++#define page_ref_count(page) atomic_read(&page->_count) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,4,216) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++#define skb_transport_offset(skb) (skb->h.raw - skb->data) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,26) ++#define device_set_wakeup_enable(dev, val) do {} while (0) ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) ++static inline void ether_addr_copy(u8 *dst, const u8 *src) ++{ ++ u16 *a = (u16 *)dst; ++ const u16 *b = (const u16 *)src; ++ ++ a[0] = b[0]; ++ a[1] = b[1]; ++ a[2] = b[2]; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,15,0) ++#define IS_ERR_OR_NULL(ptr) (!ptr) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,13,0) ++#define reinit_completion(x) ((x)->done = 0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,39) ++#if 
LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++#define pm_runtime_mark_last_busy(x) ++#define pm_runtime_put_autosuspend(x) pm_runtime_put(x) ++#define pm_runtime_put_sync_autosuspend(x) pm_runtime_put_sync(x) ++ ++static inline bool pm_runtime_suspended(struct device *dev) ++{ ++ return dev->power.runtime_status == RPM_SUSPENDED ++ && !dev->power.disable_depth; ++} ++ ++static inline bool pm_runtime_active(struct device *dev) ++{ ++ return dev->power.runtime_status == RPM_ACTIVE ++ || dev->power.disable_depth; ++} ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,36) ++#define queue_delayed_work(long_wq, work, delay) schedule_delayed_work(work, delay) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34) ++#define netif_printk(priv, type, level, netdev, fmt, args...) \ ++ do { \ ++ if (netif_msg_##type(priv)) \ ++ printk(level "%s: " fmt,(netdev)->name , ##args); \ ++ } while (0) ++ ++#define netif_emerg(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_EMERG, netdev, fmt, ##args) ++#define netif_alert(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_ALERT, netdev, fmt, ##args) ++#define netif_crit(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_CRIT, netdev, fmt, ##args) ++#define netif_err(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_ERR, netdev, fmt, ##args) ++#define netif_warn(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_WARNING, netdev, fmt, ##args) ++#define netif_notice(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_NOTICE, netdev, fmt, ##args) ++#define netif_info(priv, type, netdev, fmt, args...) \ ++ netif_printk(priv, type, KERN_INFO, (netdev), fmt, ##args) ++#endif ++#endif ++#endif ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) ++#define setup_timer(_timer, _function, _data) \ ++do { \ ++ (_timer)->function = _function; \ ++ (_timer)->data = _data; \ ++ init_timer(_timer); \ ++} while (0) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,15) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ++#if defined(skb_vlan_tag_present) && !defined(vlan_tx_tag_present) ++#define vlan_tx_tag_present skb_vlan_tag_present ++#endif ++#if defined(skb_vlan_tag_get) && !defined(vlan_tx_tag_get) ++#define vlan_tx_tag_get skb_vlan_tag_get ++#endif ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ++ ++#define RTL_ALLOC_SKB_INTR(napi, length) dev_alloc_skb(length) ++#define R8125_USE_NAPI_ALLOC_SKB 0 ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ++#undef RTL_ALLOC_SKB_INTR ++#define RTL_ALLOC_SKB_INTR(napi, length) napi_alloc_skb(napi, length) ++#undef R8125_USE_NAPI_ALLOC_SKB ++#define R8125_USE_NAPI_ALLOC_SKB 1 ++#endif ++#endif ++ ++#define RTL_BUILD_SKB_INTR(data, frag_size) build_skb(data, frag_size) ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,12,0) ++#undef RTL_BUILD_SKB_INTR ++#define RTL_BUILD_SKB_INTR(data, frag_size) napi_build_skb(data, frag_size) ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++#define eth_random_addr(addr) random_ether_addr(addr) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++#define netdev_features_t u32 ++#endif ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,5,0) ++#define NETIF_F_ALL_CSUM NETIF_F_CSUM_MASK ++#else ++#ifndef NETIF_F_ALL_CSUM ++#define NETIF_F_ALL_CSUM NETIF_F_CSUM_MASK ++#endif ++#endif 
++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++#define ENABLE_R8125_PROCFS ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,11,0) ++#define ENABLE_R8125_SYSFS ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++#define NETIF_F_HW_VLAN_RX NETIF_F_HW_VLAN_CTAG_RX ++#define NETIF_F_HW_VLAN_TX NETIF_F_HW_VLAN_CTAG_TX ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,8,0) ++#define __devinit ++#define __devexit ++#define __devexit_p(func) func ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++#define CHECKSUM_PARTIAL CHECKSUM_HW ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++#define irqreturn_t void ++#define IRQ_HANDLED 1 ++#define IRQ_NONE 0 ++#define IRQ_RETVAL(x) ++#endif ++ ++#ifndef NETIF_F_RXALL ++#define NETIF_F_RXALL 0 ++#endif ++ ++#ifndef NETIF_F_RXFCS ++#define NETIF_F_RXFCS 0 ++#endif ++ ++#if !defined(HAVE_FREE_NETDEV) && (LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0)) ++#define free_netdev(x) kfree(x) ++#endif ++ ++#ifndef SET_NETDEV_DEV ++#define SET_NETDEV_DEV(net, pdev) ++#endif ++ ++#ifndef SET_MODULE_OWNER ++#define SET_MODULE_OWNER(dev) ++#endif ++ ++#ifndef SA_SHIRQ ++#define SA_SHIRQ IRQF_SHARED ++#endif ++ ++#ifndef NETIF_F_GSO ++#define gso_size tso_size ++#define gso_segs tso_segs ++#endif ++ ++#ifndef PCI_VENDOR_ID_DLINK ++#define PCI_VENDOR_ID_DLINK 0x1186 ++#endif ++ ++#ifndef dma_mapping_error ++#define dma_mapping_error(a,b) 0 ++#endif ++ ++#ifndef netif_err ++#define netif_err(a,b,c,d) ++#endif ++ ++#ifndef AUTONEG_DISABLE ++#define AUTONEG_DISABLE 0x00 ++#endif ++ ++#ifndef AUTONEG_ENABLE ++#define AUTONEG_ENABLE 0x01 ++#endif ++ ++#ifndef BMCR_SPEED1000 ++#define BMCR_SPEED1000 0x0040 ++#endif ++ ++#ifndef BMCR_SPEED100 ++#define BMCR_SPEED100 0x2000 ++#endif ++ ++#ifndef BMCR_SPEED10 ++#define BMCR_SPEED10 0x0000 ++#endif ++ ++#ifndef SPEED_UNKNOWN ++#define SPEED_UNKNOWN -1 ++#endif ++ ++#ifndef DUPLEX_UNKNOWN ++#define DUPLEX_UNKNOWN 0xff ++#endif ++ ++#ifndef SUPPORTED_Pause ++#define SUPPORTED_Pause (1 << 13) ++#endif ++ ++#ifndef SUPPORTED_Asym_Pause ++#define SUPPORTED_Asym_Pause (1 << 14) ++#endif ++ ++#ifndef MDIO_EEE_100TX ++#define MDIO_EEE_100TX 0x0002 ++#endif ++ ++#ifndef MDIO_EEE_1000T ++#define MDIO_EEE_1000T 0x0004 ++#endif ++ ++#ifndef MDIO_EEE_2_5GT ++#define MDIO_EEE_2_5GT 0x0001 ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) ++#define ethtool_keee ethtool_eee ++#define rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t ethtool_adv_to_mmd_eee_adv_t ++static inline u32 rtl8125_ethtool_adv_to_mmd_eee_adv_cap2_t(u32 adv) ++{ ++ u32 result = 0; ++ ++ if (adv & SUPPORTED_2500baseX_Full) ++ result |= MDIO_EEE_2_5GT; ++ ++ return result; ++} ++#else ++#define rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t linkmode_to_mii_eee_cap1_t ++#define rtl8125_ethtool_adv_to_mmd_eee_adv_cap2_t linkmode_to_mii_eee_cap2_t ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++#ifdef CONFIG_NET_POLL_CONTROLLER ++#define RTL_NET_POLL_CONTROLLER dev->poll_controller=rtl8125_netpoll ++#else ++#define RTL_NET_POLL_CONTROLLER ++#endif ++ ++#ifdef CONFIG_R8125_VLAN ++#define RTL_SET_VLAN dev->vlan_rx_register=rtl8125_vlan_rx_register ++#else ++#define RTL_SET_VLAN ++#endif ++ ++#define RTL_NET_DEVICE_OPS(ops) dev->open=rtl8125_open; \ ++ dev->hard_start_xmit=rtl8125_start_xmit; \ ++ dev->get_stats=rtl8125_get_stats; \ ++ dev->stop=rtl8125_close; \ ++ dev->tx_timeout=rtl8125_tx_timeout; \ ++ dev->set_multicast_list=rtl8125_set_rx_mode; \ ++ 
dev->change_mtu=rtl8125_change_mtu; \ ++ dev->set_mac_address=rtl8125_set_mac_address; \ ++ dev->do_ioctl=rtl8125_do_ioctl; \ ++ RTL_NET_POLL_CONTROLLER; \ ++ RTL_SET_VLAN; ++#else ++#define RTL_NET_DEVICE_OPS(ops) dev->netdev_ops=&ops ++#endif ++ ++#ifndef FALSE ++#define FALSE 0 ++#endif ++ ++#ifndef TRUE ++#define TRUE 1 ++#endif ++ ++#ifndef false ++#define false 0 ++#endif ++ ++#ifndef true ++#define true 1 ++#endif ++ ++//Hardware will continue interrupt 10 times after interrupt finished. ++#define RTK_KEEP_INTERRUPT_COUNT (10) ++ ++//the low 32 bit address of receive buffer must be 8-byte alignment. ++#ifndef NET_IP_ALIGN ++#define NET_IP_ALIGN 2 ++#endif ++#define R8125_RX_ALIGN NET_IP_ALIGN ++ ++#ifdef CONFIG_R8125_NAPI ++#define NAPI_SUFFIX "-NAPI" ++#else ++#define NAPI_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_REALWOW_SUPPORT) ++#define REALWOW_SUFFIX "-REALWOW" ++#else ++#define REALWOW_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_DASH_SUPPORT) ++#define DASH_SUFFIX "-DASH" ++#else ++#define DASH_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_PTP_SUPPORT) ++#define PTP_SUFFIX "-PTP" ++#else ++#define PTP_SUFFIX "" ++#endif ++ ++#if defined(ENABLE_RSS_SUPPORT) ++#define RSS_SUFFIX "-RSS" ++#else ++#define RSS_SUFFIX "" ++#endif ++ ++#define RTL8125_VERSION "9.016.01" NAPI_SUFFIX DASH_SUFFIX REALWOW_SUFFIX PTP_SUFFIX RSS_SUFFIX ++#define MODULENAME "r8125" ++#define PFX MODULENAME ": " ++ ++#define GPL_CLAIM "\ ++r8125 Copyright (C) 2025 Realtek NIC software team \n \ ++This program comes with ABSOLUTELY NO WARRANTY; for details, please see . \n \ ++This is free software, and you are welcome to redistribute it under certain conditions; see . \n" ++ ++#ifdef RTL8125_DEBUG ++#define assert(expr) \ ++ if(!(expr)) { \ ++ printk("Assertion failed! %s,%s,%s,line=%d\n", \ ++ #expr,__FILE__,__FUNCTION__,__LINE__); \ ++ } ++#define dprintk(fmt, args...) do { printk(PFX fmt, ## args); } while (0) ++#else ++#define assert(expr) do {} while (0) ++#define dprintk(fmt, args...) do {} while (0) ++#endif /* RTL8125_DEBUG */ ++ ++#define R8125_MSG_DEFAULT \ ++ (NETIF_MSG_DRV | NETIF_MSG_PROBE | NETIF_MSG_IFUP | NETIF_MSG_IFDOWN) ++ ++#ifdef CONFIG_R8125_NAPI ++#define rtl8125_rx_hwaccel_skb vlan_hwaccel_receive_skb ++#define rtl8125_rx_quota(count, quota) min(count, quota) ++#else ++#define rtl8125_rx_hwaccel_skb vlan_hwaccel_rx ++#define rtl8125_rx_quota(count, quota) count ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++#define r8125_spin_lock(lock, flags) (void)flags;spin_lock_bh(lock) ++#define r8125_spin_unlock(lock, flags) (void)flags;spin_unlock_bh(lock) ++#else ++#define r8125_spin_lock(lock, flags) spin_lock_irqsave(lock, flags) ++#define r8125_spin_unlock(lock, flags) spin_unlock_irqrestore(lock, flags) ++#endif ++ ++/* MAC address length */ ++#ifndef MAC_ADDR_LEN ++#define MAC_ADDR_LEN 6 ++#endif ++ ++#ifndef MAC_PROTOCOL_LEN ++#define MAC_PROTOCOL_LEN 2 ++#endif ++ ++#ifndef ETH_FCS_LEN ++#define ETH_FCS_LEN 4 ++#endif ++ ++#ifndef NETIF_F_TSO6 ++#define NETIF_F_TSO6 0 ++#endif ++ ++#define Reserved2_data 7 ++#define RX_DMA_BURST_unlimited 7 /* Maximum PCI burst, '7' is unlimited */ ++#define RX_DMA_BURST_512 5 ++#define RX_DMA_BURST_256 4 ++#define TX_DMA_BURST_unlimited 7 ++#define TX_DMA_BURST_1024 6 ++#define TX_DMA_BURST_512 5 ++#define TX_DMA_BURST_256 4 ++#define TX_DMA_BURST_128 3 ++#define TX_DMA_BURST_64 2 ++#define TX_DMA_BURST_32 1 ++#define TX_DMA_BURST_16 0 ++#define Reserved1_data 0x3F ++#define RxPacketMaxSize 0x3FE8 /* 16K - 1 - ETH_HLEN - VLAN - CRC... 
*/ ++#define Jumbo_Frame_1k ETH_DATA_LEN ++#define Jumbo_Frame_2k (2*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_3k (3*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_4k (4*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_5k (5*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_6k (6*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_7k (7*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_8k (8*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define Jumbo_Frame_9k (9*1024 - ETH_HLEN - VLAN_HLEN - ETH_FCS_LEN) ++#define InterFrameGap 0x03 /* 3 means InterFrameGap = the shortest one */ ++#define RxEarly_off_V1 (0x07 << 11) ++#define RxEarly_off_V2 (1 << 11) ++#define Rx_Single_fetch_V2 (1 << 14) ++#define Rx_Close_Multiple (1 << 21) ++#define Rx_Fetch_Number_8 (1 << 30) ++ ++#define R8125_REGS_SIZE (256) ++#define R8125_MAC_REGS_SIZE (256) ++#define R8125_PHY_REGS_SIZE (16*2) ++#define R8125_EPHY_REGS_SIZE (31*2) ++#define R8125_ERI_REGS_SIZE (0x100) ++#define R8125_REGS_DUMP_SIZE (0x400) ++#define R8125_PCI_REGS_SIZE (0x100) ++#define R8125_NAPI_WEIGHT 64 ++ ++#define R8125_MAX_MSIX_VEC_8125A 4 ++#define R8125_MAX_MSIX_VEC_8125B 32 ++#define R8125_MAX_MSIX_VEC_8125D 32 ++#define R8125_MIN_MSIX_VEC_8125B 22 ++#define R8125_MIN_MSIX_VEC_8125BP 32 ++#define R8125_MIN_MSIX_VEC_8125CP 31 ++#define R8125_MIN_MSIX_VEC_8125D 20 ++#define R8125_MAX_MSIX_VEC 32 ++#define R8125_MAX_RX_QUEUES_VEC_V3 (16) ++ ++#define RTL8125_TX_TIMEOUT (6 * HZ) ++#define RTL8125_LINK_TIMEOUT (1 * HZ) ++#define RTL8125_ESD_TIMEOUT (2 * HZ) ++#define RTL8125_DASH_TIMEOUT (0) ++ ++#define rtl8125_rx_page_size(order) (PAGE_SIZE << order) ++ ++#define MAX_NUM_TX_DESC 1024 /* Maximum number of Tx descriptor registers */ ++#define MAX_NUM_RX_DESC 1024 /* Maximum number of Rx descriptor registers */ ++ ++#define MIN_NUM_TX_DESC 256 /* Minimum number of Tx descriptor registers */ ++#define MIN_NUM_RX_DESC 256 /* Minimum number of Rx descriptor registers */ ++ ++#define NUM_TX_DESC MAX_NUM_TX_DESC /* Number of Tx descriptor registers */ ++#define NUM_RX_DESC MAX_NUM_RX_DESC /* Number of Rx descriptor registers */ ++ ++#ifdef ENABLE_DOUBLE_VLAN ++#define RX_BUF_SIZE 0x05F6 /* 0x05F6(1526) = 1514 + 8(double vlan) + 4(crc) bytes */ ++#define RT_VALN_HLEN 8 /* 8(double vlan) bytes */ ++#else ++#define RX_BUF_SIZE 0x05F2 /* 0x05F2(1522) = 1514 + 4(single vlan) + 4(crc) bytes */ ++#define RT_VALN_HLEN 4 /* 4(single vlan) bytes */ ++#endif ++ ++#define R8125_MAX_TX_QUEUES (2) ++#define R8125_MAX_RX_QUEUES_V2 (4) ++#define R8125_MAX_RX_QUEUES_V3 (16) ++#define R8125_MAX_RX_QUEUES R8125_MAX_RX_QUEUES_V3 ++#define R8125_MAX_QUEUES R8125_MAX_RX_QUEUES ++ ++#define OCP_STD_PHY_BASE 0xa400 ++ ++//Channel Wait Count ++#define R8125_CHANNEL_WAIT_COUNT (20000) ++#define R8125_CHANNEL_WAIT_TIME (1) // 1us ++#define R8125_CHANNEL_EXIT_DELAY_TIME (20) //20us ++ ++#ifdef ENABLE_LIB_SUPPORT ++#define R8125_MULTI_RX_Q(tp) 0 ++#else ++#define R8125_MULTI_RX_Q(tp) (tp->num_rx_rings > 1) ++#endif ++ ++#define NODE_ADDRESS_SIZE 6 ++ ++#define SHORT_PACKET_PADDING_BUF_SIZE 256 ++ ++#define RTK_MAGIC_DEBUG_VALUE 0x0badbeef ++ ++/* write/read MMIO register */ ++#define RTL_W8(tp, reg, val8) writeb((val8), tp->mmio_addr + (reg)) ++#define RTL_W16(tp, reg, val16) writew((val16), tp->mmio_addr + (reg)) ++#define RTL_W32(tp, reg, val32) writel((val32), tp->mmio_addr + (reg)) ++#define RTL_R8(tp, reg) readb(tp->mmio_addr + (reg)) ++#define RTL_R16(tp, reg) 
readw(tp->mmio_addr + (reg)) ++#define RTL_R32(tp, reg) ((unsigned long) readl(tp->mmio_addr + (reg))) ++ ++#ifndef DMA_64BIT_MASK ++#define DMA_64BIT_MASK 0xffffffffffffffffULL ++#endif ++ ++#ifndef DMA_32BIT_MASK ++#define DMA_32BIT_MASK 0x00000000ffffffffULL ++#endif ++ ++#ifndef NETDEV_TX_OK ++#define NETDEV_TX_OK 0 /* driver took care of packet */ ++#endif ++ ++#ifndef NETDEV_TX_BUSY ++#define NETDEV_TX_BUSY 1 /* driver tx path was busy*/ ++#endif ++ ++#ifndef NETDEV_TX_LOCKED ++#define NETDEV_TX_LOCKED -1t /* driver tx lock was already taken */ ++#endif ++ ++#ifndef ADVERTISED_Pause ++#define ADVERTISED_Pause (1 << 13) ++#endif ++ ++#ifndef ADVERTISED_Asym_Pause ++#define ADVERTISED_Asym_Pause (1 << 14) ++#endif ++ ++#ifndef ADVERTISE_PAUSE_CAP ++#define ADVERTISE_PAUSE_CAP 0x400 ++#endif ++ ++#ifndef ADVERTISE_PAUSE_ASYM ++#define ADVERTISE_PAUSE_ASYM 0x800 ++#endif ++ ++#ifndef MII_CTRL1000 ++#define MII_CTRL1000 0x09 ++#endif ++ ++#ifndef ADVERTISE_1000FULL ++#define ADVERTISE_1000FULL 0x200 ++#endif ++ ++#ifndef ADVERTISE_1000HALF ++#define ADVERTISE_1000HALF 0x100 ++#endif ++ ++#ifndef ADVERTISED_2500baseX_Full ++#define ADVERTISED_2500baseX_Full 0x8000 ++#endif ++ ++#define RTK_ADVERTISE_2500FULL 0x80 ++#define RTK_ADVERTISE_5000FULL 0x100 ++#define RTK_ADVERTISE_10000FULL 0x1000 ++#define RTK_LPA_ADVERTISE_2500FULL 0x20 ++#define RTK_LPA_ADVERTISE_5000FULL 0x40 ++#define RTK_LPA_ADVERTISE_10000FULL 0x800 ++ ++#define RTK_EEE_ADVERTISE_2500FULL BIT(0) ++#define RTK_EEE_ADVERTISE_5000FULL BIT(1) ++#define RTK_LPA_EEE_ADVERTISE_2500FULL BIT(0) ++#define RTK_LPA_EEE_ADVERTISE_5000FULL BIT(1) ++ ++/* Tx NO CLOSE */ ++#define MAX_TX_NO_CLOSE_DESC_PTR_V2 0x10000 ++#define MAX_TX_NO_CLOSE_DESC_PTR_MASK_V2 0xFFFF ++#define MAX_TX_NO_CLOSE_DESC_PTR_V3 0x100000000 ++#define MAX_TX_NO_CLOSE_DESC_PTR_MASK_V3 0xFFFFFFFF ++#define MAX_TX_NO_CLOSE_DESC_PTR_V4 0x80000000 ++#define MAX_TX_NO_CLOSE_DESC_PTR_MASK_V4 0x7FFFFFFF ++#define TX_NO_CLOSE_SW_PTR_MASK_V2 0x1FFFF ++ ++#ifndef ETH_MIN_MTU ++#define ETH_MIN_MTU 68 ++#endif ++ ++#define D0_SPEED_UP_SPEED_DISABLE 0 ++#define D0_SPEED_UP_SPEED_1000 1 ++#define D0_SPEED_UP_SPEED_2500 2 ++ ++#define RTL8125_MAC_MCU_PAGE_SIZE 256 //256 words ++ ++#ifndef WRITE_ONCE ++#define WRITE_ONCE(var, val) (*((volatile typeof(val) *)(&(var))) = (val)) ++#endif ++#ifndef READ_ONCE ++#define READ_ONCE(var) (*((volatile typeof(var) *)(&(var)))) ++#endif ++ ++#define R8125_LINK_STATE_OFF 0 ++#define R8125_LINK_STATE_ON 1 ++#define R8125_LINK_STATE_UNKNOWN 2 ++ ++/*****************************************************************************/ ++ ++//#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,3) ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2,4,27)) || \ ++ ((LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0)) && \ ++ (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,3)))) ++/* copied from linux kernel 2.6.20 include/linux/netdev.h */ ++#define NETDEV_ALIGN 32 ++#define NETDEV_ALIGN_CONST (NETDEV_ALIGN - 1) ++ ++static inline void *netdev_priv(struct net_device *dev) ++{ ++ return (char *)dev + ((sizeof(struct net_device) ++ + NETDEV_ALIGN_CONST) ++ & ~NETDEV_ALIGN_CONST); ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,3) ++ ++/*****************************************************************************/ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++#define RTLDEV tp ++#else ++#define RTLDEV dev ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++/*****************************************************************************/ ++ ++#if LINUX_VERSION_CODE < 
KERNEL_VERSION(2,6,24) ++typedef struct net_device *napi_ptr; ++typedef int *napi_budget; ++ ++#define napi dev ++#define RTL_NAPI_CONFIG(ndev, priv, function, weig) ndev->poll=function; \ ++ ndev->weight=weig; ++#define RTL_NAPI_QUOTA(budget, ndev) min(*budget, ndev->quota) ++#define RTL_GET_PRIV(stuct_ptr, priv_struct) netdev_priv(stuct_ptr) ++#define RTL_GET_NETDEV(priv_ptr) ++#define RTL_RX_QUOTA(budget) *budget ++#define RTL_NAPI_QUOTA_UPDATE(ndev, work_done, budget) *budget -= work_done; \ ++ ndev->quota -= work_done; ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) netif_rx_complete(dev) ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) netif_rx_schedule_prep(dev) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __netif_rx_schedule(dev) ++#define RTL_NAPI_RETURN_VALUE work_done >= work_to_do ++#define RTL_NAPI_ENABLE(dev, napi) netif_poll_enable(dev) ++#define RTL_NAPI_DISABLE(dev, napi) netif_poll_disable(dev) ++#define DMA_BIT_MASK(n) (((n) == 64) ? ~0ULL : ((1ULL<<(n))-1)) ++#else ++typedef struct napi_struct *napi_ptr; ++typedef int napi_budget; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,1,0) ++#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add_weight(ndev, &priv->napi, function, weight) ++#else ++#define RTL_NAPI_CONFIG(ndev, priv, function, weight) netif_napi_add(ndev, &priv->napi, function, weight) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(6,1,0) ++#define RTL_NAPI_QUOTA(budget, ndev) min(budget, budget) ++#define RTL_GET_PRIV(stuct_ptr, priv_struct) container_of(stuct_ptr, priv_struct, stuct_ptr) ++#define RTL_GET_NETDEV(priv_ptr) struct net_device *dev = priv_ptr->dev; ++#define RTL_RX_QUOTA(budget) budget ++#define RTL_NAPI_QUOTA_UPDATE(ndev, work_done, budget) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) netif_rx_complete(dev, napi) ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) netif_rx_schedule_prep(dev, napi) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __netif_rx_schedule(dev, napi) ++#endif ++#if LINUX_VERSION_CODE == KERNEL_VERSION(2,6,29) ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) netif_rx_complete(napi) ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) netif_rx_schedule_prep(napi) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __netif_rx_schedule(napi) ++#endif ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,29) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,19,0) ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) napi_complete_done(napi, work_done) ++#else ++#define RTL_NETIF_RX_COMPLETE(dev, napi, work_done) napi_complete(napi) ++#endif ++#define RTL_NETIF_RX_SCHEDULE_PREP(dev, napi) napi_schedule_prep(napi) ++#define __RTL_NETIF_RX_SCHEDULE(dev, napi) __napi_schedule(napi) ++#endif ++#define RTL_NAPI_RETURN_VALUE work_done ++#define RTL_NAPI_ENABLE(dev, napi) napi_enable(napi) ++#define RTL_NAPI_DISABLE(dev, napi) napi_disable(napi) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) ++#define RTL_NAPI_DEL(priv) ++#else ++#define RTL_NAPI_DEL(priv) netif_napi_del(&priv->napi) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,27) ++ ++/*****************************************************************************/ ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) napi_consume_skb(skb, budget) ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_consume_skb_any(skb); ++#else ++#define 
RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_kfree_skb_any(skb); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,0) ++#else //CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_consume_skb_any(skb); ++#else ++#define RTL_NAPI_CONSUME_SKB_ANY(skb, budget) dev_kfree_skb_any(skb); ++#endif ++#endif //CONFIG_R8125_NAPI ++ ++/*****************************************************************************/ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++#ifdef __CHECKER__ ++#define __iomem __attribute__((noderef, address_space(2))) ++extern void __chk_io_ptr(void __iomem *); ++#define __bitwise __attribute__((bitwise)) ++#else ++#define __iomem ++#define __chk_io_ptr(x) (void)0 ++#define __bitwise ++#endif ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++ ++/*****************************************************************************/ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) ++#ifdef __CHECKER__ ++#define __force __attribute__((force)) ++#else ++#define __force ++#endif ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,8) ++ ++#ifndef module_param ++#define module_param(v,t,p) MODULE_PARM(v, "i"); ++#endif ++ ++#ifndef PCI_DEVICE ++#define PCI_DEVICE(vend,dev) \ ++ .vendor = (vend), .device = (dev), \ ++ .subvendor = PCI_ANY_ID, .subdevice = PCI_ANY_ID ++#endif ++ ++/*****************************************************************************/ ++/* 2.5.28 => 2.4.23 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,5,28)) ++ ++static inline void _kc_synchronize_irq(void) ++{ ++ synchronize_irq(); ++} ++#undef synchronize_irq ++#define synchronize_irq(X) _kc_synchronize_irq() ++ ++#include ++#define work_struct tq_struct ++#undef INIT_WORK ++#define INIT_WORK(a,b,c) INIT_TQUEUE(a,(void (*)(void *))b,c) ++#undef container_of ++#define container_of list_entry ++#define schedule_work schedule_task ++#define flush_scheduled_work flush_scheduled_tasks ++#endif /* 2.5.28 => 2.4.17 */ ++ ++/*****************************************************************************/ ++/* 2.6.4 => 2.6.0 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4)) ++#define MODULE_VERSION(_version) MODULE_INFO(version, _version) ++#endif /* 2.6.4 => 2.6.0 */ ++/*****************************************************************************/ ++/* 2.6.0 => 2.5.28 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0)) ++#define MODULE_INFO(version, _version) ++#ifndef CONFIG_E1000_DISABLE_PACKET_SPLIT ++#define CONFIG_E1000_DISABLE_PACKET_SPLIT 1 ++#endif ++ ++#define pci_set_consistent_dma_mask(dev,mask) 1 ++ ++#undef dev_put ++#define dev_put(dev) __dev_put(dev) ++ ++#ifndef skb_fill_page_desc ++#define skb_fill_page_desc _kc_skb_fill_page_desc ++extern void _kc_skb_fill_page_desc(struct sk_buff *skb, int i, struct page *page, int off, int size); ++#endif ++ ++#ifndef pci_dma_mapping_error ++#define pci_dma_mapping_error _kc_pci_dma_mapping_error ++static inline int _kc_pci_dma_mapping_error(dma_addr_t dma_addr) ++{ ++ return dma_addr == 0; ++} ++#endif ++ ++#undef ALIGN ++#define ALIGN(x,a) (((x)+(a)-1)&~((a)-1)) ++ ++#endif /* 2.6.0 => 2.5.28 */ ++ ++/*****************************************************************************/ ++/* 2.4.22 => 2.4.17 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,4,22)) ++#define pci_name(x) ((x)->slot_name) ++#endif /* 2.4.22 => 2.4.17 */ ++ ++/*****************************************************************************/ ++/* 2.6.5 => 2.6.0 */ ++#if (LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5)) ++#define 
pci_dma_sync_single_for_cpu pci_dma_sync_single ++#define pci_dma_sync_single_for_device pci_dma_sync_single_for_cpu ++#endif /* 2.6.5 => 2.6.0 */ ++ ++/*****************************************************************************/ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++/* ++ * initialize a work-struct's func and data pointers: ++ */ ++#define PREPARE_WORK(_work, _func, _data) \ ++ do { \ ++ (_work)->func = _func; \ ++ (_work)->data = _data; \ ++ } while (0) ++ ++#endif ++/*****************************************************************************/ ++/* 2.6.4 => 2.6.0 */ ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(2,4,25) && \ ++ LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22)) || \ ++ (LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) && \ ++ LINUX_VERSION_CODE < KERNEL_VERSION(2,6,4))) ++#define ETHTOOL_OPS_COMPAT ++#endif /* 2.6.4 => 2.6.0 */ ++ ++/*****************************************************************************/ ++/* Installations with ethtool version without eeprom, adapter id, or statistics ++ * support */ ++ ++#ifndef ETH_GSTRING_LEN ++#define ETH_GSTRING_LEN 32 ++#endif ++ ++#ifndef ETHTOOL_GSTATS ++#define ETHTOOL_GSTATS 0x1d ++#undef ethtool_drvinfo ++#define ethtool_drvinfo k_ethtool_drvinfo ++struct k_ethtool_drvinfo { ++ u32 cmd; ++ char driver[32]; ++ char version[32]; ++ char fw_version[32]; ++ char bus_info[32]; ++ char reserved1[32]; ++ char reserved2[16]; ++ u32 n_stats; ++ u32 testinfo_len; ++ u32 eedump_len; ++ u32 regdump_len; ++}; ++ ++struct ethtool_stats { ++ u32 cmd; ++ u32 n_stats; ++ u64 data[0]; ++}; ++#endif /* ETHTOOL_GSTATS */ ++ ++#ifndef ETHTOOL_PHYS_ID ++#define ETHTOOL_PHYS_ID 0x1c ++#endif /* ETHTOOL_PHYS_ID */ ++ ++#ifndef ETHTOOL_GSTRINGS ++#define ETHTOOL_GSTRINGS 0x1b ++enum ethtool_stringset { ++ ETH_SS_TEST = 0, ++ ETH_SS_STATS, ++}; ++struct ethtool_gstrings { ++ u32 cmd; /* ETHTOOL_GSTRINGS */ ++ u32 string_set; /* string set id e.c. ETH_SS_TEST, etc*/ ++ u32 len; /* number of strings in the string set */ ++ u8 data[0]; ++}; ++#endif /* ETHTOOL_GSTRINGS */ ++ ++#ifndef ETHTOOL_TEST ++#define ETHTOOL_TEST 0x1a ++enum ethtool_test_flags { ++ ETH_TEST_FL_OFFLINE = (1 << 0), ++ ETH_TEST_FL_FAILED = (1 << 1), ++}; ++struct ethtool_test { ++ u32 cmd; ++ u32 flags; ++ u32 reserved; ++ u32 len; ++ u64 data[0]; ++}; ++#endif /* ETHTOOL_TEST */ ++ ++#ifndef ETHTOOL_GEEPROM ++#define ETHTOOL_GEEPROM 0xb ++#undef ETHTOOL_GREGS ++struct ethtool_eeprom { ++ u32 cmd; ++ u32 magic; ++ u32 offset; ++ u32 len; ++ u8 data[0]; ++}; ++ ++struct ethtool_value { ++ u32 cmd; ++ u32 data; ++}; ++#endif /* ETHTOOL_GEEPROM */ ++ ++#ifndef ETHTOOL_GLINK ++#define ETHTOOL_GLINK 0xa ++#endif /* ETHTOOL_GLINK */ ++ ++#ifndef ETHTOOL_GREGS ++#define ETHTOOL_GREGS 0x00000004 /* Get NIC registers */ ++#define ethtool_regs _kc_ethtool_regs ++/* for passing big chunks of data */ ++struct _kc_ethtool_regs { ++ u32 cmd; ++ u32 version; /* driver-specific, indicates different chips/revs */ ++ u32 len; /* bytes */ ++ u8 data[0]; ++}; ++#endif /* ETHTOOL_GREGS */ ++ ++#ifndef ETHTOOL_GMSGLVL ++#define ETHTOOL_GMSGLVL 0x00000007 /* Get driver message level */ ++#endif ++#ifndef ETHTOOL_SMSGLVL ++#define ETHTOOL_SMSGLVL 0x00000008 /* Set driver msg level, priv. 
*/ ++#endif ++#ifndef ETHTOOL_NWAY_RST ++#define ETHTOOL_NWAY_RST 0x00000009 /* Restart autonegotiation, priv */ ++#endif ++#ifndef ETHTOOL_GLINK ++#define ETHTOOL_GLINK 0x0000000a /* Get link status */ ++#endif ++#ifndef ETHTOOL_GEEPROM ++#define ETHTOOL_GEEPROM 0x0000000b /* Get EEPROM data */ ++#endif ++#ifndef ETHTOOL_SEEPROM ++#define ETHTOOL_SEEPROM 0x0000000c /* Set EEPROM data */ ++#endif ++#ifndef ETHTOOL_GCOALESCE ++#define ETHTOOL_GCOALESCE 0x0000000e /* Get coalesce config */ ++/* for configuring coalescing parameters of chip */ ++#define ethtool_coalesce _kc_ethtool_coalesce ++struct _kc_ethtool_coalesce { ++ u32 cmd; /* ETHTOOL_{G,S}COALESCE */ ++ ++ /* How many usecs to delay an RX interrupt after ++ * a packet arrives. If 0, only rx_max_coalesced_frames ++ * is used. ++ */ ++ u32 rx_coalesce_usecs; ++ ++ /* How many packets to delay an RX interrupt after ++ * a packet arrives. If 0, only rx_coalesce_usecs is ++ * used. It is illegal to set both usecs and max frames ++ * to zero as this would cause RX interrupts to never be ++ * generated. ++ */ ++ u32 rx_max_coalesced_frames; ++ ++ /* Same as above two parameters, except that these values ++ * apply while an IRQ is being serviced by the host. Not ++ * all cards support this feature and the values are ignored ++ * in that case. ++ */ ++ u32 rx_coalesce_usecs_irq; ++ u32 rx_max_coalesced_frames_irq; ++ ++ /* How many usecs to delay a TX interrupt after ++ * a packet is sent. If 0, only tx_max_coalesced_frames ++ * is used. ++ */ ++ u32 tx_coalesce_usecs; ++ ++ /* How many packets to delay a TX interrupt after ++ * a packet is sent. If 0, only tx_coalesce_usecs is ++ * used. It is illegal to set both usecs and max frames ++ * to zero as this would cause TX interrupts to never be ++ * generated. ++ */ ++ u32 tx_max_coalesced_frames; ++ ++ /* Same as above two parameters, except that these values ++ * apply while an IRQ is being serviced by the host. Not ++ * all cards support this feature and the values are ignored ++ * in that case. ++ */ ++ u32 tx_coalesce_usecs_irq; ++ u32 tx_max_coalesced_frames_irq; ++ ++ /* How many usecs to delay in-memory statistics ++ * block updates. Some drivers do not have an in-memory ++ * statistic block, and in such cases this value is ignored. ++ * This value must not be zero. ++ */ ++ u32 stats_block_coalesce_usecs; ++ ++ /* Adaptive RX/TX coalescing is an algorithm implemented by ++ * some drivers to improve latency under low packet rates and ++ * improve throughput under high packet rates. Some drivers ++ * only implement one of RX or TX adaptive coalescing. Anything ++ * not implemented by the driver causes these values to be ++ * silently ignored. ++ */ ++ u32 use_adaptive_rx_coalesce; ++ u32 use_adaptive_tx_coalesce; ++ ++ /* When the packet rate (measured in packets per second) ++ * is below pkt_rate_low, the {rx,tx}_*_low parameters are ++ * used. ++ */ ++ u32 pkt_rate_low; ++ u32 rx_coalesce_usecs_low; ++ u32 rx_max_coalesced_frames_low; ++ u32 tx_coalesce_usecs_low; ++ u32 tx_max_coalesced_frames_low; ++ ++ /* When the packet rate is below pkt_rate_high but above ++ * pkt_rate_low (both measured in packets per second) the ++ * normal {rx,tx}_* coalescing parameters are used. ++ */ ++ ++ /* When the packet rate is (measured in packets per second) ++ * is above pkt_rate_high, the {rx,tx}_*_high parameters are ++ * used. 
++ */ ++ u32 pkt_rate_high; ++ u32 rx_coalesce_usecs_high; ++ u32 rx_max_coalesced_frames_high; ++ u32 tx_coalesce_usecs_high; ++ u32 tx_max_coalesced_frames_high; ++ ++ /* How often to do adaptive coalescing packet rate sampling, ++ * measured in seconds. Must not be zero. ++ */ ++ u32 rate_sample_interval; ++}; ++#endif /* ETHTOOL_GCOALESCE */ ++ ++#ifndef ETHTOOL_SCOALESCE ++#define ETHTOOL_SCOALESCE 0x0000000f /* Set coalesce config. */ ++#endif ++#ifndef ETHTOOL_GRINGPARAM ++#define ETHTOOL_GRINGPARAM 0x00000010 /* Get ring parameters */ ++/* for configuring RX/TX ring parameters */ ++#define ethtool_ringparam _kc_ethtool_ringparam ++struct _kc_ethtool_ringparam { ++ u32 cmd; /* ETHTOOL_{G,S}RINGPARAM */ ++ ++ /* Read only attributes. These indicate the maximum number ++ * of pending RX/TX ring entries the driver will allow the ++ * user to set. ++ */ ++ u32 rx_max_pending; ++ u32 rx_mini_max_pending; ++ u32 rx_jumbo_max_pending; ++ u32 tx_max_pending; ++ ++ /* Values changeable by the user. The valid values are ++ * in the range 1 to the "*_max_pending" counterpart above. ++ */ ++ u32 rx_pending; ++ u32 rx_mini_pending; ++ u32 rx_jumbo_pending; ++ u32 tx_pending; ++}; ++#endif /* ETHTOOL_GRINGPARAM */ ++ ++#ifndef ETHTOOL_SRINGPARAM ++#define ETHTOOL_SRINGPARAM 0x00000011 /* Set ring parameters, priv. */ ++#endif ++#ifndef ETHTOOL_GPAUSEPARAM ++#define ETHTOOL_GPAUSEPARAM 0x00000012 /* Get pause parameters */ ++/* for configuring link flow control parameters */ ++#define ethtool_pauseparam _kc_ethtool_pauseparam ++struct _kc_ethtool_pauseparam { ++ u32 cmd; /* ETHTOOL_{G,S}PAUSEPARAM */ ++ ++ /* If the link is being auto-negotiated (via ethtool_cmd.autoneg ++ * being true) the user may set 'autonet' here non-zero to have the ++ * pause parameters be auto-negotiated too. In such a case, the ++ * {rx,tx}_pause values below determine what capabilities are ++ * advertised. ++ * ++ * If 'autoneg' is zero or the link is not being auto-negotiated, ++ * then {rx,tx}_pause force the driver to use/not-use pause ++ * flow control. ++ */ ++ u32 autoneg; ++ u32 rx_pause; ++ u32 tx_pause; ++}; ++#endif /* ETHTOOL_GPAUSEPARAM */ ++ ++#ifndef ETHTOOL_SPAUSEPARAM ++#define ETHTOOL_SPAUSEPARAM 0x00000013 /* Set pause parameters. */ ++#endif ++#ifndef ETHTOOL_GRXCSUM ++#define ETHTOOL_GRXCSUM 0x00000014 /* Get RX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_SRXCSUM ++#define ETHTOOL_SRXCSUM 0x00000015 /* Set RX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_GTXCSUM ++#define ETHTOOL_GTXCSUM 0x00000016 /* Get TX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_STXCSUM ++#define ETHTOOL_STXCSUM 0x00000017 /* Set TX hw csum enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_GSG ++#define ETHTOOL_GSG 0x00000018 /* Get scatter-gather enable ++* (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_SSG ++#define ETHTOOL_SSG 0x00000019 /* Set scatter-gather enable ++* (ethtool_value). */ ++#endif ++#ifndef ETHTOOL_TEST ++#define ETHTOOL_TEST 0x0000001a /* execute NIC self-test, priv. 
*/ ++#endif ++#ifndef ETHTOOL_GSTRINGS ++#define ETHTOOL_GSTRINGS 0x0000001b /* get specified string set */ ++#endif ++#ifndef ETHTOOL_PHYS_ID ++#define ETHTOOL_PHYS_ID 0x0000001c /* identify the NIC */ ++#endif ++#ifndef ETHTOOL_GSTATS ++#define ETHTOOL_GSTATS 0x0000001d /* get NIC-specific statistics */ ++#endif ++#ifndef ETHTOOL_GTSO ++#define ETHTOOL_GTSO 0x0000001e /* Get TSO enable (ethtool_value) */ ++#endif ++#ifndef ETHTOOL_STSO ++#define ETHTOOL_STSO 0x0000001f /* Set TSO enable (ethtool_value) */ ++#endif ++ ++#ifndef ETHTOOL_BUSINFO_LEN ++#define ETHTOOL_BUSINFO_LEN 32 ++#endif ++ ++/*****************************************************************************/ ++ ++enum RTL8125_registers { ++ MAC0 = 0x00, /* Ethernet hardware address. */ ++ MAC4 = 0x04, ++ MAR0 = 0x08, /* Multicast filter. */ ++ CounterAddrLow = 0x10, ++ CounterAddrHigh = 0x14, ++ CustomLED = 0x18, ++ TxDescStartAddrLow = 0x20, ++ TxDescStartAddrHigh = 0x24, ++ TxHDescStartAddrLow = 0x28, ++ TxHDescStartAddrHigh = 0x2c, ++ FLASH = 0x30, ++ INT_CFG0_8125 = 0x34, ++ ERSR = 0x36, ++ ChipCmd = 0x37, ++ TxPoll = 0x38, ++ IntrMask = 0x3C, ++ IntrStatus = 0x3E, ++ TxConfig = 0x40, ++ RxConfig = 0x44, ++ TCTR = 0x48, ++ Cfg9346 = 0x50, ++ Config0 = 0x51, ++ Config1 = 0x52, ++ Config2 = 0x53, ++ Config3 = 0x54, ++ Config4 = 0x55, ++ Config5 = 0x56, ++ TDFNR = 0x57, ++ TimeInt0 = 0x58, ++ TimeInt1 = 0x5C, ++ PHYAR = 0x60, ++ CSIDR = 0x64, ++ CSIAR = 0x68, ++ PHYstatus = 0x6C, ++ MACDBG = 0x6D, ++ GPIO = 0x6E, ++ PMCH = 0x6F, ++ ERIDR = 0x70, ++ ERIAR = 0x74, ++ INT_CFG1_8125 = 0x7A, ++ EPHY_RXER_NUM = 0x7C, ++ EPHYAR = 0x80, ++ LEDSEL_2_8125 = 0x84, ++ LEDSEL_1_8125 = 0x86, ++ TimeInt2 = 0x8C, ++ LEDSEL_3_8125 = 0x96, ++ OCPDR = 0xB0, ++ MACOCP = 0xB0, ++ OCPAR = 0xB4, ++ SecMAC0 = 0xB4, ++ SecMAC4 = 0xB8, ++ PHYOCP = 0xB8, ++ DBG_reg = 0xD1, ++ TwiCmdReg = 0xD2, ++ MCUCmd_reg = 0xD3, ++ RxMaxSize = 0xDA, ++ EFUSEAR = 0xDC, ++ CPlusCmd = 0xE0, ++ IntrMitigate = 0xE2, ++ RxDescAddrLow = 0xE4, ++ RxDescAddrHigh = 0xE8, ++ MTPS = 0xEC, ++ FuncEvent = 0xF0, ++ PPSW = 0xF2, ++ FuncEventMask = 0xF4, ++ TimeInt3 = 0xF4, ++ FuncPresetState = 0xF8, ++ CMAC_IBCR0 = 0xF8, ++ CMAC_IBCR2 = 0xF9, ++ CMAC_IBIMR0 = 0xFA, ++ CMAC_IBISR0 = 0xFB, ++ FuncForceEvent = 0xFC, ++ //8125 ++ IMR0_8125 = 0x38, ++ ISR0_8125 = 0x3C, ++ TPPOLL_8125 = 0x90, ++ IMR1_8125 = 0x800, ++ ISR1_8125 = 0x802, ++ IMR2_8125 = 0x804, ++ ISR2_8125 = 0x806, ++ IMR3_8125 = 0x808, ++ ISR3_8125 = 0x80A, ++ BACKUP_ADDR0_8125 = 0x19E0, ++ BACKUP_ADDR1_8125 = 0X19E4, ++ TCTR0_8125 = 0x0048, ++ TCTR1_8125 = 0x004C, ++ TCTR2_8125 = 0x0088, ++ TCTR3_8125 = 0x001C, ++ TIMER_INT0_8125 = 0x0058, ++ TIMER_INT1_8125 = 0x005C, ++ TIMER_INT2_8125 = 0x008C, ++ TIMER_INT3_8125 = 0x00F4, ++ INT_MITI_V2_0_RX = 0x0A00, ++ INT_MITI_V2_0_TX = 0x0A02, ++ INT_MITI_V2_1_RX = 0x0A08, ++ INT_MITI_V2_1_TX = 0x0A0A, ++ IMR_V2_CLEAR_REG_8125 = 0x0D00, ++ ISR_V2_8125 = 0x0D04, ++ IMR_V2_SET_REG_8125 = 0x0D0C, ++ TDU_STA_8125 = 0x0D08, ++ RDU_STA_8125 = 0x0D0A, ++ IMR_V4_L2_CLEAR_REG_8125 = 0x0D10, ++ IMR_V4_L2_SET_REG_8125 = 0x0D18, ++ ISR_V4_L2_8125 = 0x0D14, ++ SW_TAIL_PTR0_8125BP = 0x0D30, ++ SW_TAIL_PTR1_8125BP = 0x0D38, ++ HW_CLO_PTR0_8125BP = 0x0D34, ++ HW_CLO_PTR1_8125BP = 0x0D3C, ++ DOUBLE_VLAN_CONFIG = 0x1000, ++ TX_NEW_CTRL = 0x203E, ++ TNPDS_Q1_LOW_8125 = 0x2100, ++ PLA_TXQ0_IDLE_CREDIT = 0x2500, ++ PLA_TXQ1_IDLE_CREDIT = 0x2504, ++ SW_TAIL_PTR0_8125 = 0x2800, ++ HW_CLO_PTR0_8125 = 0x2802, ++ SW_TAIL_PTR0_8126 = 0x2800, ++ HW_CLO_PTR0_8126 = 0x2800, ++ RDSAR_Q1_LOW_8125 = 0x4000, ++ 
RSS_CTRL_8125 = 0x4500, ++ Q_NUM_CTRL_8125 = 0x4800, ++ RSS_KEY_8125 = 0x4600, ++ RSS_INDIRECTION_TBL_8125_V2 = 0x4700, ++ EEE_TXIDLE_TIMER_8125 = 0x6048, ++ /* mac ptp */ ++ PTP_CTRL_8125 = 0x6800, ++ PTP_STATUS_8125 = 0x6802, ++ PTP_ISR_8125 = 0x6804, ++ PTP_IMR_8125 = 0x6805, ++ PTP_TIME_CORRECT_CMD_8125 = 0x6806, ++ PTP_SOFT_CONFIG_Time_NS_8125 = 0x6808, ++ PTP_SOFT_CONFIG_Time_S_8125 = 0x680C, ++ PTP_SOFT_CONFIG_Time_Sign = 0x6812, ++ PTP_LOCAL_Time_SUB_NS_8125 = 0x6814, ++ PTP_LOCAL_Time_NS_8125 = 0x6818, ++ PTP_LOCAL_Time_S_8125 = 0x681C, ++ PTP_Time_SHIFTER_S_8125 = 0x6856, ++ PPS_RISE_TIME_NS_8125 = 0x68A0, ++ PPS_RISE_TIME_S_8125 = 0x68A4, ++ PTP_EGRESS_TIME_BASE_NS_8125 = 0XCF20, ++ PTP_EGRESS_TIME_BASE_S_8125 = 0XCF24, ++ /* phy ptp */ ++ PTP_CTL = 0xE400, ++ PTP_INER = 0xE402, ++ PTP_INSR = 0xE404, ++ PTP_SYNCE_CTL = 0xE406, ++ PTP_GEN_CFG = 0xE408, ++ PTP_CLK_CFG_8126 = 0xE410, ++ PTP_CFG_NS_LO_8126 = 0xE412, ++ PTP_CFG_NS_HI_8126 = 0xE414, ++ PTP_CFG_S_LO_8126 = 0xE416, ++ PTP_CFG_S_MI_8126 = 0xE418, ++ PTP_CFG_S_HI_8126 = 0xE41A, ++ PTP_TAI_CFG = 0xE420, ++ PTP_TAI_TS_S_LO = 0xE42A, ++ PTP_TAI_TS_S_HI = 0xE42C, ++ PTP_TRX_TS_STA = 0xE430, ++ PTP_TRX_TS_NS_LO = 0xE446, ++ PTP_TRX_TS_NS_HI = 0xE448, ++ PTP_TRX_TS_S_LO = 0xE44A, ++ PTP_TRX_TS_S_MI = 0xE44C, ++ PTP_TRX_TS_S_HI = 0xE44E, ++ ++ ++ //TCAM ++ TCAM_NOTVALID_ADDR = 0xA000, ++ TCAM_VALID_ADDR = 0xA800, ++ TCAM_MAC_ADDR = 448, ++ TCAM_VLAN_TAG = 496, ++ //TCAM V2 ++ TCAM_NOTVALID_ADDR_V2 = 0xA000, ++ TCAM_VALID_ADDR_V2 = 0xB000, ++ TCAM_MAC_ADDR_V2 = 0x00, ++ TCAM_VLAN_TAG_V2 = 0x03, ++ //ipc2 ++ IB2SOC_SET = 0x0010, ++ IB2SOC_DATA = 0x0014, ++ IB2SOC_CMD = 0x0018, ++ IB2SOC_IMR = 0x001C, ++ ++ RISC_IMR_8125BP = 0x0D20, ++ RISC_ISR_8125BP = 0x0D22, ++}; ++ ++enum RTL8125_register_content { ++ /* InterruptStatusBits */ ++ SYSErr = 0x8000, ++ PCSTimeout = 0x4000, ++ SWInt = 0x0100, ++ TxDescUnavail = 0x0080, ++ RxFIFOOver = 0x0040, ++ LinkChg = 0x0020, ++ RxDescUnavail = 0x0010, ++ TxErr = 0x0008, ++ TxOK = 0x0004, ++ RxErr = 0x0002, ++ RxOK = 0x0001, ++ RxDU1 = 0x0002, ++ RxOK1 = 0x0001, ++ ++ /* RxStatusDesc */ ++ RxRWT = (1 << 22), ++ RxRES = (1 << 21), ++ RxRUNT = (1 << 20), ++ RxCRC = (1 << 19), ++ ++ RxRWT_V3 = (1 << 18), ++ RxRES_V3 = (1 << 20), ++ RxRUNT_V3 = (1 << 19), ++ RxCRC_V3 = (1 << 17), ++ ++ RxRES_V4 = (1 << 22), ++ RxRUNT_V4 = (1 << 21), ++ RxCRC_V4 = (1 << 20), ++ ++ /* ChipCmdBits */ ++ StopReq = 0x80, ++ CmdReset = 0x10, ++ CmdRxEnb = 0x08, ++ CmdTxEnb = 0x04, ++ RxBufEmpty = 0x01, ++ ++ /* Cfg9346Bits */ ++ Cfg9346_EEM_MASK = 0xC0, ++ Cfg9346_Lock = 0x00, ++ Cfg9346_Unlock = 0xC0, ++ Cfg9346_EEDO = (1 << 0), ++ Cfg9346_EEDI = (1 << 1), ++ Cfg9346_EESK = (1 << 2), ++ Cfg9346_EECS = (1 << 3), ++ Cfg9346_EEM0 = (1 << 6), ++ Cfg9346_EEM1 = (1 << 7), ++ ++ /* rx_mode_bits */ ++ AcceptErr = 0x20, ++ AcceptRunt = 0x10, ++ AcceptBroadcast = 0x08, ++ AcceptMulticast = 0x04, ++ AcceptMyPhys = 0x02, ++ AcceptAllPhys = 0x01, ++ AcceppVlanPhys = 0x8000, ++ ++ /* Transmit Priority Polling*/ ++ HPQ = 0x80, ++ NPQ = 0x40, ++ FSWInt = 0x01, ++ ++ /* RxConfigBits */ ++ Reserved2_shift = 13, ++ RxCfgDMAShift = 8, ++ EnableRxDescV3 = (1 << 24), ++ EnableRxDescV4_1 = (1 << 24), ++ EnableOuterVlan = (1 << 23), ++ EnableInnerVlan = (1 << 22), ++ RxCfg_128_int_en = (1 << 15), ++ RxCfg_fet_multi_en = (1 << 14), ++ RxCfg_half_refetch = (1 << 13), ++ RxCfg_pause_slot_en = (1 << 11), ++ RxCfg_9356SEL = (1 << 6), ++ EnableRxDescV4_0 = (1 << 1), //not in rcr ++ ++ /* TxConfigBits */ ++ TxInterFrameGapShift = 24, ++ TxDMAShift = 
8, /* DMA burst value (0-7) is shift this many bits */ ++ TxMACLoopBack = (1 << 17), /* MAC loopback */ ++ ++ /* Config1 register */ ++ LEDS1 = (1 << 7), ++ LEDS0 = (1 << 6), ++ Speed_down = (1 << 4), ++ MEMMAP = (1 << 3), ++ IOMAP = (1 << 2), ++ VPD = (1 << 1), ++ PMEnable = (1 << 0), /* Power Management Enable */ ++ ++ /* Config2 register */ ++ PMSTS_En = (1 << 5), ++ ++ /* Config3 register */ ++ Isolate_en = (1 << 12), /* Isolate enable */ ++ MagicPacket = (1 << 5), /* Wake up when receives a Magic Packet */ ++ LinkUp = (1 << 4), /* This bit is reserved in RTL8125B.*/ ++ /* Wake up when the cable connection is re-established */ ++ ECRCEN = (1 << 3), /* This bit is reserved in RTL8125B*/ ++ Jumbo_En0 = (1 << 2), /* This bit is reserved in RTL8125B*/ ++ RDY_TO_L23 = (1 << 1), /* This bit is reserved in RTL8125B*/ ++ Beacon_en = (1 << 0), /* This bit is reserved in RTL8125B*/ ++ ++ /* Config4 register */ ++ Jumbo_En1 = (1 << 1), /* This bit is reserved in RTL8125B*/ ++ ++ /* Config5 register */ ++ BWF = (1 << 6), /* Accept Broadcast wakeup frame */ ++ MWF = (1 << 5), /* Accept Multicast wakeup frame */ ++ UWF = (1 << 4), /* Accept Unicast wakeup frame */ ++ LanWake = (1 << 1), /* LanWake enable/disable */ ++ PMEStatus = (1 << 0), /* PME status can be reset by PCI RST# */ ++ ++ /* CPlusCmd */ ++ EnableBist = (1 << 15), ++ Macdbgo_oe = (1 << 14), ++ Normal_mode = (1 << 13), ++ Force_halfdup = (1 << 12), ++ Force_rxflow_en = (1 << 11), ++ Force_txflow_en = (1 << 10), ++ Cxpl_dbg_sel = (1 << 9),//This bit is reserved in RTL8125B ++ ASF = (1 << 8),//This bit is reserved in RTL8125C ++ PktCntrDisable = (1 << 7), ++ RxVlan = (1 << 6), ++ RxChkSum = (1 << 5), ++ Macdbgo_sel = 0x001C, ++ INTT_0 = 0x0000, ++ INTT_1 = 0x0001, ++ INTT_2 = 0x0002, ++ INTT_3 = 0x0003, ++ ++ /* rtl8125_PHYstatus */ ++ PowerSaveStatus = 0x80, ++ _1000bpsL = 0x80000, ++ _5000bpsF = 0x1000, ++ _2500bpsF = 0x400, ++ _2500bpsL = 0x200, ++ TxFlowCtrl = 0x40, ++ RxFlowCtrl = 0x20, ++ _1000bpsF = 0x10, ++ _100bps = 0x08, ++ _10bps = 0x04, ++ LinkStatus = 0x02, ++ FullDup = 0x01, ++ ++ /* DBG_reg */ ++ Fix_Nak_1 = (1 << 4), ++ Fix_Nak_2 = (1 << 3), ++ DBGPIN_E2 = (1 << 0), ++ ++ /* ResetCounterCommand */ ++ CounterReset = 0x1, ++ /* DumpCounterCommand */ ++ CounterDump = 0x8, ++ ++ /* PHY access */ ++ PHYAR_Flag = 0x80000000, ++ PHYAR_Write = 0x80000000, ++ PHYAR_Read = 0x00000000, ++ PHYAR_Reg_Mask = 0x1f, ++ PHYAR_Reg_shift = 16, ++ PHYAR_Data_Mask = 0xffff, ++ ++ /* EPHY access */ ++ EPHYAR_Flag = 0x80000000, ++ EPHYAR_Write = 0x80000000, ++ EPHYAR_Read = 0x00000000, ++ EPHYAR_Reg_Mask = 0x3f, ++ EPHYAR_Reg_Mask_v2 = 0x7f, ++ EPHYAR_Reg_shift = 16, ++ EPHYAR_Data_Mask = 0xffff, ++ ++ /* CSI access */ ++ CSIAR_Flag = 0x80000000, ++ CSIAR_Write = 0x80000000, ++ CSIAR_Read = 0x00000000, ++ CSIAR_ByteEn = 0x0f, ++ CSIAR_ByteEn_shift = 12, ++ CSIAR_Addr_Mask = 0x0fff, ++ ++ /* ERI access */ ++ ERIAR_Flag = 0x80000000, ++ ERIAR_Write = 0x80000000, ++ ERIAR_Read = 0x00000000, ++ ERIAR_Addr_Align = 4, /* ERI access register address must be 4 byte alignment */ ++ ERIAR_ExGMAC = 0, ++ ERIAR_MSIX = 1, ++ ERIAR_ASF = 2, ++ ERIAR_OOB = 2, ++ ERIAR_Type_shift = 16, ++ ERIAR_ByteEn = 0x0f, ++ ERIAR_ByteEn_shift = 12, ++ ++ /* OCP GPHY access */ ++ OCPDR_Write = 0x80000000, ++ OCPDR_Read = 0x00000000, ++ OCPDR_Reg_Mask = 0xFF, ++ OCPDR_Data_Mask = 0xFFFF, ++ OCPDR_GPHY_Reg_shift = 16, ++ OCPAR_Flag = 0x80000000, ++ OCPAR_GPHY_Write = 0x8000F060, ++ OCPAR_GPHY_Read = 0x0000F060, ++ OCPR_Write = 0x80000000, ++ OCPR_Read = 0x00000000, ++ 
OCPR_Addr_Reg_shift = 16, ++ OCPR_Flag = 0x80000000, ++ OCP_STD_PHY_BASE_PAGE = 0x0A40, ++ ++ /* MCU Command */ ++ Now_is_oob = (1 << 7), ++ Txfifo_empty = (1 << 5), ++ Rxfifo_empty = (1 << 4), ++ ++ /* E-FUSE access */ ++ EFUSE_WRITE = 0x80000000, ++ EFUSE_WRITE_OK = 0x00000000, ++ EFUSE_READ = 0x00000000, ++ EFUSE_READ_OK = 0x80000000, ++ EFUSE_WRITE_V3 = 0x40000000, ++ EFUSE_WRITE_OK_V3 = 0x00000000, ++ EFUSE_READ_V3 = 0x80000000, ++ EFUSE_READ_OK_V3 = 0x00000000, ++ EFUSE_Reg_Mask = 0x03FF, ++ EFUSE_Reg_Shift = 8, ++ EFUSE_Check_Cnt = 300, ++ EFUSE_READ_FAIL = 0xFF, ++ EFUSE_Data_Mask = 0x000000FF, ++ ++ /* GPIO */ ++ GPIO_en = (1 << 0), ++ ++ /* PTP */ ++ PTP_ISR_TOK = (1 << 1), ++ PTP_ISR_TER = (1 << 2), ++ PTP_EXEC_CMD = (1 << 7), ++ PTP_ADJUST_TIME_NS_NEGATIVE = (1 << 30), ++ PTP_ADJUST_TIME_S_NEGATIVE = (1ULL << 48), ++ PTP_SOFT_CONFIG_TIME_NS_NEGATIVE = (1 << 30), ++ PTP_SOFT_CONFIG_TIME_S_NEGATIVE = (1ULL << 48), ++ ++ /* New Interrupt Bits */ ++ INT_CFG0_ENABLE_8125 = (1 << 0), ++ INT_CFG0_TIMEOUT0_BYPASS_8125 = (1 << 1), ++ INT_CFG0_MITIGATION_BYPASS_8125 = (1 << 2), ++ INT_CFG0_RDU_BYPASS_8126 = (1 << 4), ++ INT_CFG0_MSIX_ENTRY_NUM_MODE = (1 << 5), ++ INT_CFG0_AUTO_CLEAR_IMR = (1 << 5), ++ INT_CFG0_AVOID_MISS_INTR = (1 << 6), ++ ISRIMR_V2_ROK_Q0 = (1 << 0), ++ ISRIMR_TOK_Q0 = (1 << 16), ++ ISRIMR_TOK_Q1 = (1 << 18), ++ ISRIMR_V2_LINKCHG = (1 << 21), ++ ++ ISRIMR_V4_ROK_Q0 = (1 << 0), ++ ISRIMR_V4_LINKCHG = (1 << 29), ++ ISRIMR_V4_LAYER2_INTR_STS = (1 << 31), ++ ISRIMR_V4_L2_IPC2 = (1 << 17), ++ ++ ISRIMR_V5_ROK_Q0 = (1 << 0), ++ ISRIMR_V5_TOK_Q0 = (1 << 16), ++ ISRIMR_V5_TOK_Q1 = (1 << 17), ++ ISRIMR_V5_LINKCHG = (1 << 18), ++ ++ ISRIMR_V7_ROK_Q0 = (1 << 0), ++ ISRIMR_V7_TOK_Q0 = (1 << 27), ++ ISRIMR_V7_TOK_Q1 = (1 << 28), ++ ISRIMR_V7_LINKCHG = (1 << 29), ++ ++ /* IPC2 */ ++ RISC_IPC2_INTR = (1 << 1), ++ ++ /* Magic Number */ ++ RTL8125_MAGIC_NUMBER = 0x0badbadbadbadbadull, ++}; ++ ++enum _DescStatusBit { ++ DescOwn = (1 << 31), /* Descriptor is owned by NIC */ ++ RingEnd = (1 << 30), /* End of descriptor ring */ ++ FirstFrag = (1 << 29), /* First segment of a packet */ ++ LastFrag = (1 << 28), /* Final segment of a packet */ ++ ++ DescOwn_V3 = (DescOwn), /* Descriptor is owned by NIC */ ++ RingEnd_V3 = (RingEnd), /* End of descriptor ring */ ++ FirstFrag_V3 = (1 << 25), /* First segment of a packet */ ++ LastFrag_V3 = (1 << 24), /* Final segment of a packet */ ++ ++ DescOwn_V4 = (DescOwn), /* Descriptor is owned by NIC */ ++ RingEnd_V4 = (RingEnd), /* End of descriptor ring */ ++ FirstFrag_V4 = (FirstFrag), /* First segment of a packet */ ++ LastFrag_V4 = (LastFrag), /* Final segment of a packet */ ++ ++ /* Tx private */ ++ /*------ offset 0 of tx descriptor ------*/ ++ LargeSend = (1 << 27), /* TCP Large Send Offload (TSO) */ ++ GiantSendv4 = (1 << 26), /* TCP Giant Send Offload V4 (GSOv4) */ ++ GiantSendv6 = (1 << 25), /* TCP Giant Send Offload V6 (GSOv6) */ ++ LargeSend_DP = (1 << 16), /* TCP Large Send Offload (TSO) */ ++ MSSShift = 16, /* MSS value position */ ++ MSSMask = 0x7FFU, /* MSS value 11 bits */ ++ TxIPCS = (1 << 18), /* Calculate IP checksum */ ++ TxUDPCS = (1 << 17), /* Calculate UDP/IP checksum */ ++ TxTCPCS = (1 << 16), /* Calculate TCP/IP checksum */ ++ TxVlanTag = (1 << 17), /* Add VLAN tag */ ++ ++ /*@@@@@@ offset 4 of tx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ TxUDPCS_C = (1 << 31), /* Calculate UDP/IP checksum */ ++ TxTCPCS_C = (1 << 30), /* Calculate TCP/IP checksum */ ++ TxIPCS_C = (1 << 29), /* Calculate IP checksum */ ++ TxIPV6F_C = 
(1 << 28), /* Indicate it is an IPv6 packet */ ++ /*@@@@@@ offset 4 of tx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ ++ /* Rx private */ ++ /*------ offset 0 of rx descriptor ------*/ ++ PID1 = (1 << 18), /* Protocol ID bit 1/2 */ ++ PID0 = (1 << 17), /* Protocol ID bit 2/2 */ ++ ++#define RxProtoUDP (PID1) ++#define RxProtoTCP (PID0) ++#define RxProtoIP (PID1 | PID0) ++#define RxProtoMask RxProtoIP ++ ++ RxIPF = (1 << 16), /* IP checksum failed */ ++ RxUDPF = (1 << 15), /* UDP/IP checksum failed */ ++ RxTCPF = (1 << 14), /* TCP/IP checksum failed */ ++ RxVlanTag = (1 << 16), /* VLAN tag available */ ++ ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxUDPT = (1 << 18), ++ RxTCPT = (1 << 17), ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxV6F = (1 << 31), ++ RxV4F = (1 << 30), ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ ++ PID1_v3 = (1 << 29), /* Protocol ID bit 1/2 */ ++ PID0_v3 = (1 << 28), /* Protocol ID bit 2/2 */ ++ ++#define RxProtoUDP_v3 (PID1_v3) ++#define RxProtoTCP_v3 (PID0_v3) ++#define RxProtoIP_v3 (PID1_v3 | PID0_v3) ++#define RxProtoMask_v3 RxProtoIP_v3 ++ ++ RxIPF_v3 = (1 << 26), /* IP checksum failed */ ++ RxUDPF_v3 = (1 << 25), /* UDP/IP checksum failed */ ++ RxTCPF_v3 = (1 << 24), /* TCP/IP checksum failed */ ++ RxSCTPF_v3 = (1 << 23), /* SCTP checksum failed */ ++ RxVlanTag_v3 = (RxVlanTag), /* VLAN tag available */ ++ ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxUDPT_v3 = (1 << 29), ++ RxTCPT_v3 = (1 << 28), ++ RxSCTP_v3 = (1 << 27), ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxV6F_v3 = (RxV6F), ++ RxV4F_v3 = (RxV4F), ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ RxIPF_v4 = (1 << 17), /* IP checksum failed */ ++ RxUDPF_v4 = (1 << 16), /* UDP/IP checksum failed */ ++ RxTCPF_v4 = (1 << 15), /* TCP/IP checksum failed */ ++ RxSCTPF_v4 = (1 << 19), /* SCTP checksum failed */ ++ RxVlanTag_v4 = (RxVlanTag), /* VLAN tag available */ ++ ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxUDPT_v4 = (1 << 19), ++ RxTCPT_v4 = (1 << 18), ++ RxSCTP_v4 = (1 << 19), ++ /*@@@@@@ offset 0 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++ ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only begin @@@@@@*/ ++ RxV6F_v4 = (RxV6F), ++ RxV4F_v4 = (RxV4F), ++ /*@@@@@@ offset 4 of rx descriptor => bits for RTL8125 only end @@@@@@*/ ++}; ++ ++enum features { ++// RTL_FEATURE_WOL = (1 << 0), ++ RTL_FEATURE_MSI = (1 << 1), ++ RTL_FEATURE_MSIX = (1 << 2), ++}; ++ ++enum wol_capability { ++ WOL_DISABLED = 0, ++ WOL_ENABLED = 1 ++}; ++ ++enum bits { ++ BIT_0 = (1 << 0), ++ BIT_1 = (1 << 1), ++ BIT_2 = (1 << 2), ++ BIT_3 = (1 << 3), ++ BIT_4 = (1 << 4), ++ BIT_5 = (1 << 5), ++ BIT_6 = (1 << 6), ++ BIT_7 = (1 << 7), ++ BIT_8 = (1 << 8), ++ BIT_9 = (1 << 9), ++ BIT_10 = (1 << 10), ++ BIT_11 = (1 << 11), ++ BIT_12 = (1 << 12), ++ BIT_13 = (1 << 13), ++ BIT_14 = (1 << 14), ++ BIT_15 = (1 << 15), ++ BIT_16 = (1 << 16), ++ BIT_17 = (1 << 17), ++ BIT_18 = (1 << 18), ++ BIT_19 = (1 << 19), ++ BIT_20 = (1 << 20), ++ BIT_21 = (1 << 21), ++ BIT_22 = (1 << 22), ++ BIT_23 = (1 << 23), ++ BIT_24 = (1 << 24), ++ BIT_25 = (1 << 25), ++ BIT_26 = (1 << 26), ++ BIT_27 = (1 << 27), ++ BIT_28 
= (1 << 28), ++ BIT_29 = (1 << 29), ++ BIT_30 = (1 << 30), ++ BIT_31 = (1 << 31) ++}; ++ ++/* Phy Fuse Dout */ ++#define R8125_PHY_FUSE_DOUT_NUM (32) ++#define R8125_MAX_PHY_FUSE_DOUT_NUM R8125_PHY_FUSE_DOUT_NUM ++ ++#define RTL8125_CP_NUM 4 ++#define RTL8125_MAX_SUPPORT_CP_LEN 110 ++ ++enum rtl8125_cp_status { ++ rtl8125_cp_normal = 0, ++ rtl8125_cp_short, ++ rtl8125_cp_open, ++ rtl8125_cp_mismatch, ++ rtl8125_cp_unknown ++}; ++ ++enum efuse { ++ EFUSE_NOT_SUPPORT = 0, ++ EFUSE_SUPPORT_V1, ++ EFUSE_SUPPORT_V2, ++ EFUSE_SUPPORT_V3, ++ EFUSE_SUPPORT_V4, ++}; ++#define RsvdMask 0x3fffc000 ++#define RsvdMaskV3 0x3fff8000 ++#define RsvdMaskV4 RsvdMaskV3 ++ ++struct TxDesc { ++ u32 opts1; ++ u32 opts2; ++ u64 addr; ++ u32 reserved0; ++ u32 reserved1; ++ u32 reserved2; ++ u32 reserved3; ++}; ++ ++struct RxDesc { ++ u32 opts1; ++ u32 opts2; ++ u64 addr; ++}; ++ ++struct RxDescV3 { ++ union { ++ struct { ++ u32 rsv1; ++ u32 rsv2; ++ } RxDescDDWord1; ++ }; ++ ++ union { ++ struct { ++ u32 RSSResult; ++ u16 HeaderBufferLen; ++ u16 HeaderInfo; ++ } RxDescNormalDDWord2; ++ ++ struct { ++ u32 rsv5; ++ u32 rsv6; ++ } RxDescDDWord2; ++ }; ++ ++ union { ++ u64 addr; ++ ++ struct { ++ u32 TimeStampLow; ++ u32 TimeStampHigh; ++ } RxDescTimeStamp; ++ ++ struct { ++ u32 rsv8; ++ u32 rsv9; ++ } RxDescDDWord3; ++ }; ++ ++ union { ++ struct { ++ u32 opts2; ++ u32 opts1; ++ } RxDescNormalDDWord4; ++ ++ struct { ++ u16 TimeStampHHigh; ++ u16 rsv11; ++ u32 opts1; ++ } RxDescPTPDDWord4; ++ }; ++}; ++ ++struct RxDescV4 { ++ union { ++ u64 addr; ++ ++ struct { ++ u32 RSSInfo; ++ u32 RSSResult; ++ } RxDescNormalDDWord1; ++ }; ++ ++ struct { ++ u32 opts2; ++ u32 opts1; ++ } RxDescNormalDDWord2; ++}; ++ ++enum rxdesc_type { ++ RXDESC_TYPE_NORMAL=0, ++ RXDESC_TYPE_NEXT, ++ RXDESC_TYPE_PTP, ++ RXDESC_TYPE_MAX ++}; ++ ++//Rx Desc Type ++enum rx_desc_ring_type { ++ RX_DESC_RING_TYPE_UNKNOWN=0, ++ RX_DESC_RING_TYPE_1, ++ RX_DESC_RING_TYPE_2, ++ RX_DESC_RING_TYPE_3, ++ RX_DESC_RING_TYPE_4, ++ RX_DESC_RING_TYPE_MAX ++}; ++ ++enum rx_desc_len { ++ RX_DESC_LEN_TYPE_1 = (sizeof(struct RxDesc)), ++ RX_DESC_LEN_TYPE_3 = (sizeof(struct RxDescV3)), ++ RX_DESC_LEN_TYPE_4 = (sizeof(struct RxDescV4)) ++}; ++ ++struct ring_info { ++ struct sk_buff *skb; ++ u32 len; ++ unsigned int bytecount; ++ unsigned short gso_segs; ++ u8 __pad[sizeof(void *) - sizeof(u32)]; ++}; ++ ++struct pci_resource { ++ u8 cmd; ++ u8 cls; ++ u16 io_base_h; ++ u16 io_base_l; ++ u16 mem_base_h; ++ u16 mem_base_l; ++ u8 ilr; ++ u16 resv_0x1c_h; ++ u16 resv_0x1c_l; ++ u16 resv_0x20_h; ++ u16 resv_0x20_l; ++ u16 resv_0x24_h; ++ u16 resv_0x24_l; ++ u16 resv_0x2c_h; ++ u16 resv_0x2c_l; ++ u32 pci_sn_l; ++ u32 pci_sn_h; ++}; ++ ++enum r8125_dash_req_flag { ++ R8125_RCV_REQ_SYS_OK = 0, ++ R8125_RCV_REQ_DASH_OK, ++ R8125_SEND_REQ_HOST_OK, ++ R8125_CMAC_RESET, ++ R8125_CMAC_DISALE_RX_FLAG_MAX, ++ R8125_DASH_REQ_FLAG_MAX ++}; ++ ++enum r8125_flag { ++ R8125_FLAG_DOWN = 0, ++ R8125_FLAG_TASK_RESET_PENDING, ++ R8125_FLAG_TASK_ESD_CHECK_PENDING, ++ R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, ++ R8125_FLAG_TASK_LINK_CHECK_PENDING, ++ R8125_FLAG_TASK_DASH_CHECK_PENDING, ++ R8125_FLAG_MAX ++}; ++ ++enum r8125_sysfs_flag { ++ R8125_SYSFS_RTL_ADV = 0, ++ R8125_SYSFS_FLAG_MAX ++}; ++ ++struct rtl8125_tx_ring { ++ void* priv; ++ struct net_device *netdev; ++ u32 index; ++ u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. 
*/ ++ u32 dirty_tx; ++ u32 num_tx_desc; /* Number of Tx descriptor registers */ ++ struct TxDesc *TxDescArray; /* 256-aligned Tx descriptor ring */ ++ dma_addr_t TxPhyAddr; ++ u32 TxDescAllocSize; ++ struct ring_info tx_skb[MAX_NUM_TX_DESC]; /* Tx data buffers */ ++ ++ u32 NextHwDesCloPtr; ++ u32 BeginHwDesCloPtr; ++ ++ u16 hw_clo_ptr_reg; ++ u16 sw_tail_ptr_reg; ++ ++ u16 tdsar_reg; /* Transmit Descriptor Start Address */ ++}; ++ ++struct rtl8125_rx_buffer { ++ struct page *page; ++ u32 page_offset; ++ dma_addr_t dma; ++ void* data; ++ struct sk_buff *skb; ++}; ++ ++struct rtl8125_rx_ring { ++ void* priv; ++ struct net_device *netdev; ++ u32 index; ++ u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */ ++ u32 dirty_rx; ++ u32 num_rx_desc; /* Number of Rx descriptor registers */ ++ struct RxDesc *RxDescArray; /* 256-aligned Rx descriptor ring */ ++ u32 RxDescAllocSize; ++ u64 RxDescPhyAddr[MAX_NUM_RX_DESC]; /* Rx desc physical address*/ ++ dma_addr_t RxPhyAddr; ++#ifdef ENABLE_PAGE_REUSE ++ struct rtl8125_rx_buffer rx_buffer[MAX_NUM_RX_DESC]; ++ u16 rx_offset; ++#else ++ struct sk_buff *Rx_skbuff[MAX_NUM_RX_DESC]; /* Rx data buffers */ ++#endif //ENABLE_PAGE_REUSE ++ ++ u16 rdsar_reg; /* Receive Descriptor Start Address */ ++}; ++ ++struct r8125_napi { ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,24) ++ struct napi_struct napi; ++#endif ++#endif ++ void* priv; ++ int index; ++}; ++ ++struct r8125_irq { ++ irq_handler_t handler; ++ unsigned int vector; ++ u8 requested; ++ char name[IFNAMSIZ + 10]; ++}; ++ ++#pragma pack(1) ++struct rtl8125_regs { ++ //00 ++ u8 mac_id[6]; ++ u16 reg_06; ++ u8 mar[8]; ++ //10 ++ u64 dtccr; ++ u16 ledsel0; ++ u16 legreg; ++ u32 tctr3; ++ //20 ++ u32 txq0_dsc_st_addr_0; ++ u32 txq0_dsc_st_addr_2; ++ u64 reg_28; ++ //30 ++ u16 rit; ++ u16 ritc; ++ u16 reg_34; ++ u8 reg_36; ++ u8 command; ++ u32 imr0; ++ u32 isr0; ++ //40 ++ u32 tcr; ++ u32 rcr; ++ u32 tctr0; ++ u32 tctr1; ++ //50 ++ u8 cr93c46; ++ u8 config0; ++ u8 config1; ++ u8 config2; ++ u8 config3; ++ u8 config4; ++ u8 config5; ++ u8 tdfnr; ++ u32 timer_int0; ++ u32 timer_int1; ++ //60 ++ u32 gphy_mdcmdio; ++ u32 csidr; ++ u32 csiar; ++ u16 phy_status; ++ u8 config6; ++ u8 pmch; ++ //70 ++ u32 eridr; ++ u32 eriar; ++ u16 config7; ++ u16 reg_7a; ++ u32 ephy_rxerr_cnt; ++ //80 ++ u32 ephy_mdcmdio; ++ u16 ledsel2; ++ u16 ledsel1; ++ u32 tctr2; ++ u32 timer_int2; ++ //90 ++ u8 tppoll0; ++ u8 reg_91; ++ u16 reg_92; ++ u16 led_feature; ++ u16 ledsel3; ++ u16 eee_led_config; ++ u16 reg_9a; ++ u32 reg_9c; ++ //a0 ++ u32 reg_a0; ++ u32 reg_a4; ++ u32 reg_a8; ++ u32 reg_ac; ++ //b0 ++ u32 patch_dbg; ++ u32 reg_b4; ++ u32 gphy_ocp; ++ u32 reg_bc; ++ //c0 ++ u32 reg_c0; ++ u32 reg_c4; ++ u32 reg_c8; ++ u16 otp_cmd; ++ u16 otp_pg_config; ++ //d0 ++ u16 phy_pwr; ++ u8 twsi_ctrl; ++ u8 oob_ctrl; ++ u16 mac_dbgo; ++ u16 mac_dbg; ++ u16 reg_d8; ++ u16 rms; ++ u32 efuse_data; ++ //e0 ++ u16 cplus_cmd; ++ u16 reg_e2; ++ u32 rxq0_dsc_st_addr_0; ++ u32 rxq0_dsc_st_addr_2; ++ u16 reg_ec; ++ u16 tx10midle_cnt; ++ //f0 ++ u16 misc0; ++ u16 misc1; ++ u32 timer_int3; ++ u32 cmac_ib; ++ u16 reg_fc; ++ u16 sw_rst; ++}; ++#pragma pack() ++ ++struct rtl8125_regs_save { ++ union { ++ u8 mac_io[R8125_MAC_REGS_SIZE]; ++ ++ struct rtl8125_regs mac_reg; ++ }; ++ u16 pcie_phy[R8125_EPHY_REGS_SIZE/2]; ++ u16 eth_phy[R8125_PHY_REGS_SIZE/2]; ++ u32 eri_reg[R8125_ERI_REGS_SIZE/4]; ++ u32 pci_reg[R8125_PCI_REGS_SIZE/4]; ++ u16 sw_tail_ptr_reg[R8125_MAX_TX_QUEUES]; ++ u16 
hw_clo_ptr_reg[R8125_MAX_TX_QUEUES]; ++ ++ //ktime_t begin_ktime; ++ //ktime_t end_ktime; ++ //u64 duration_ns; ++ ++ u16 sw0_tail_ptr; ++ u16 next_hwq0_clo_ptr; ++ u16 sw1_tail_ptr; ++ u16 next_hwq1_clo_ptr; ++ ++ u16 int_miti_rxq0; ++ u16 int_miti_txq0; ++ u16 int_miti_rxq1; ++ u16 int_miti_txq1; ++ u8 int_config; ++ u32 imr_new; ++ u32 isr_new; ++ ++ u8 tdu_status; ++ u16 rdu_status; ++ ++ u16 tc_mode; ++ ++ u32 txq1_dsc_st_addr_0; ++ u32 txq1_dsc_st_addr_2; ++ ++ u32 pla_tx_q0_idle_credit; ++ u32 pla_tx_q1_idle_credit; ++ ++ u32 rxq1_dsc_st_addr_0; ++ u32 rxq1_dsc_st_addr_2; ++ ++ u32 rss_ctrl; ++ u8 rss_key[RTL8125_RSS_KEY_SIZE]; ++ u8 rss_i_table[RTL8125_MAX_INDIRECTION_TABLE_ENTRIES]; ++ u16 rss_queue_num_sel_r; ++}; ++ ++struct rtl8125_counters { ++ /* legacy */ ++ u64 tx_packets; ++ u64 rx_packets; ++ u64 tx_errors; ++ u32 rx_errors; ++ u16 rx_missed; ++ u16 align_errors; ++ u32 tx_one_collision; ++ u32 tx_multi_collision; ++ u64 rx_unicast; ++ u64 rx_broadcast; ++ u32 rx_multicast; ++ u16 tx_aborted; ++ u16 tx_underrun; ++ ++ /* extended */ ++ u64 tx_octets; ++ u64 rx_octets; ++ u64 rx_multicast64; ++ u64 tx_unicast64; ++ u64 tx_broadcast64; ++ u64 tx_multicast64; ++ u32 tx_pause_on; ++ u32 tx_pause_off; ++ u32 tx_pause_all; ++ u32 tx_deferred; ++ u32 tx_late_collision; ++ u32 tx_all_collision; ++ u32 tx_aborted32; ++ u32 align_errors32; ++ u32 rx_frame_too_long; ++ u32 rx_runt; ++ u32 rx_pause_on; ++ u32 rx_pause_off; ++ u32 rx_pause_all; ++ u32 rx_unknown_opcode; ++ u32 rx_mac_error; ++ u32 tx_underrun32; ++ u32 rx_mac_missed; ++ u32 rx_tcam_dropped; ++ u32 tdu; ++ u32 rdu; ++}; ++ ++/* Flow Control Settings */ ++enum rtl8125_fc_mode { ++ rtl8125_fc_none = 0, ++ rtl8125_fc_rx_pause, ++ rtl8125_fc_tx_pause, ++ rtl8125_fc_full, ++ rtl8125_fc_default ++}; ++ ++enum rtl8125_state_t { ++ __RTL8125_TESTING = 0, ++ __RTL8125_RESETTING, ++ __RTL8125_DOWN, ++ __RTL8125_PTP_TX_IN_PROGRESS, ++}; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) ++struct ethtool_eee { ++ __u32 cmd; ++ __u32 supported; ++ __u32 advertised; ++ __u32 lp_advertised; ++ __u32 eee_active; ++ __u32 eee_enabled; ++ __u32 tx_lpi_enabled; ++ __u32 tx_lpi_timer; ++ __u32 reserved[2]; ++}; ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,6,0) */ ++ ++struct rtl8125_private { ++ void __iomem *mmio_addr; /* memory map physical address */ ++ struct pci_dev *pci_dev; /* Index of PCI device */ ++ struct net_device *dev; ++ struct r8125_napi r8125napi[R8125_MAX_MSIX_VEC]; ++ struct r8125_irq irq_tbl[R8125_MAX_MSIX_VEC]; ++ unsigned int irq_nvecs; ++ unsigned int max_irq_nvecs; ++ unsigned int min_irq_nvecs; ++ unsigned int hw_supp_irq_nvecs; ++ //struct msix_entry msix_entries[R8125_MAX_MSIX_VEC]; ++ struct net_device_stats stats; /* statistics of net device */ ++ unsigned long state; ++ u8 flags; ++ ++ u32 msg_enable; ++ u32 tx_tcp_csum_cmd; ++ u32 tx_udp_csum_cmd; ++ u32 tx_ip_csum_cmd; ++ u32 tx_ipv6_csum_cmd; ++ int max_jumbo_frame_size; ++ int chipset; ++ u32 mcfg; ++ //u32 cur_rx; /* Index into the Rx descriptor buffer of next Rx pkt. */ ++ //u32 cur_tx; /* Index into the Tx descriptor buffer of next Rx pkt. 
*/ ++ //u32 dirty_rx; ++ //u32 dirty_tx; ++ //struct TxDesc *TxDescArray; /* 256-aligned Tx descriptor ring */ ++ //struct RxDesc *RxDescArray; /* 256-aligned Rx descriptor ring */ ++ //dma_addr_t TxPhyAddr; ++ //dma_addr_t RxPhyAddr; ++ //struct sk_buff *Rx_skbuff[MAX_NUM_RX_DESC]; /* Rx data buffers */ ++ //struct ring_info tx_skb[MAX_NUM_TX_DESC]; /* Tx data buffers */ ++ unsigned rx_buf_sz; ++#ifdef ENABLE_PAGE_REUSE ++ unsigned rx_buf_page_order; ++ unsigned rx_buf_page_size; ++ u32 page_reuse_fail_cnt; ++#endif //ENABLE_PAGE_REUSE ++ u16 HwSuppNumTxQueues; ++ u16 HwSuppNumRxQueues; ++ unsigned int num_tx_rings; ++ unsigned int num_rx_rings; ++ struct rtl8125_tx_ring tx_ring[R8125_MAX_TX_QUEUES]; ++ struct rtl8125_rx_ring rx_ring[R8125_MAX_RX_QUEUES]; ++#ifdef ENABLE_LIB_SUPPORT ++ struct blocking_notifier_head lib_nh; ++ struct rtl8125_ring lib_tx_ring[R8125_MAX_TX_QUEUES]; ++ struct rtl8125_ring lib_rx_ring[R8125_MAX_RX_QUEUES]; ++#endif ++ //struct timer_list esd_timer; ++ //struct timer_list link_timer; ++ struct pci_resource pci_cfg_space; ++ unsigned int esd_flag; ++ unsigned int pci_cfg_is_read; ++ unsigned int rtl8125_rx_config; ++ u16 rms; ++ u16 cp_cmd; ++ u32 intr_mask; ++ u32 intr_l2_mask; ++ u32 timer_intr_mask; ++ u16 isr_reg[R8125_MAX_MSIX_VEC]; ++ u16 imr_reg[R8125_MAX_MSIX_VEC]; ++ int phy_auto_nego_reg; ++ int phy_1000_ctrl_reg; ++ int phy_2500_ctrl_reg; ++ u8 org_mac_addr[NODE_ADDRESS_SIZE]; ++ struct rtl8125_counters *tally_vaddr; ++ dma_addr_t tally_paddr; ++ ++#ifdef CONFIG_R8125_VLAN ++ struct vlan_group *vlgrp; ++#endif ++ u8 wol_enabled; ++ u32 wol_opts; ++ u8 efuse_ver; ++ u8 eeprom_type; ++ u8 autoneg; ++ u8 duplex; ++ u32 speed; ++ u64 advertising; ++ enum rtl8125_fc_mode fcpause; ++ u32 HwSuppMaxPhyLinkSpeed; ++ u16 eeprom_len; ++ u16 cur_page; ++ u32 bios_setting; ++ ++ int (*set_speed)(struct net_device *, u8 autoneg, u32 speed, u8 duplex, u64 adv); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ void (*get_settings)(struct net_device *, struct ethtool_cmd *); ++#else ++ void (*get_settings)(struct net_device *, struct ethtool_link_ksettings *); ++#endif ++ void (*phy_reset_enable)(struct net_device *); ++ unsigned int (*phy_reset_pending)(struct net_device *); ++ unsigned int (*link_ok)(struct net_device *); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++ struct work_struct reset_task; ++ struct work_struct esd_task; ++ struct work_struct linkchg_task; ++ struct work_struct link_task; ++ struct work_struct dash_task; ++#else ++ struct delayed_work reset_task; ++ struct delayed_work esd_task; ++ struct delayed_work linkchg_task; ++ struct delayed_work link_task; ++ struct delayed_work dash_task; ++#endif ++ DECLARE_BITMAP(task_flags, R8125_FLAG_MAX); ++ unsigned features; ++ ++ u8 org_pci_offset_99; ++ u8 org_pci_offset_180; ++ u8 issue_offset_99_event; ++ ++ u8 org_pci_offset_80; ++ u8 org_pci_offset_81; ++ u8 use_timer_interrupt; ++ ++ u32 keep_intr_cnt; ++ ++ u8 HwIcVerUnknown; ++ u8 NotWrRamCodeToMicroP; ++ u8 NotWrMcuPatchCode; ++ u8 HwHasWrRamCodeToMicroP; ++ ++ u16 sw_ram_code_ver; ++ u16 hw_ram_code_ver; ++ ++ u8 rtk_enable_diag; ++ ++ u8 ShortPacketSwChecksum; ++ ++ u8 UseSwPaddingShortPkt; ++ ++ u8 RequireAdcBiasPatch; ++ u16 AdcBiasPatchIoffset; ++ ++ u8 RequireAdjustUpsTxLinkPulseTiming; ++ u16 SwrCnt1msIni; ++ ++ u8 HwSuppNowIsOobVer; ++ ++ u8 RequiredSecLanDonglePatch; ++ ++ u8 RequiredPfmPatch; ++ ++ u8 RequirePhyMdiSwapPatch; ++ ++ u8 RequireLSOPatch; ++ ++ u32 HwFiberModeVer; ++ u32 HwFiberStat; ++ u8 HwSwitchMdiToFiber; ++ ++ 
u16 BackupLedSel[4]; ++ ++ u8 HwSuppMagicPktVer; ++ ++ u8 HwSuppLinkChgWakeUpVer; ++ ++ u8 HwSuppCheckPhyDisableModeVer; ++ ++ u8 random_mac; ++ ++ u16 phy_reg_aner; ++ u16 phy_reg_anlpar; ++ u16 phy_reg_gbsr; ++ u16 phy_reg_status_2500; ++ ++ u32 HwPcieSNOffset; ++ ++ u8 HwSuppEsdVer; ++ u8 TestPhyOcpReg; ++ u16 BackupPhyFuseDout[R8125_MAX_PHY_FUSE_DOUT_NUM]; ++ ++ u32 MaxTxDescPtrMask; ++ u8 HwSuppTxNoCloseVer; ++ u8 EnableTxNoClose; ++ ++ u8 HwSuppIsrVer; ++ u8 HwCurrIsrVer; ++ ++ u8 HwSuppIntMitiVer; ++ ++ u8 HwSuppExtendTallyCounterVer; ++ ++ u8 check_keep_link_speed; ++ u8 resume_not_chg_speed; ++ ++ u8 HwSuppD0SpeedUpVer; ++ u8 D0SpeedUpSpeed; ++ ++ u8 ring_lib_enabled; ++ ++ const char *fw_name; ++ struct rtl8125_fw *rtl_fw; ++ u32 ocp_base; ++ ++ //Dash+++++++++++++++++ ++ u8 HwSuppDashVer; ++ u8 DASH; ++ u8 HwPkgDet; ++ u8 HwSuppOcpChannelVer; ++ u32 DashFirmwareVersion; ++ u32 SizeOfSendToFwBuffer; ++ u32 SizeOfRecvFromFwBuffer; ++ u8 AllowAccessDashOcp; ++ DECLARE_BITMAP(dash_req_flags, R8125_DASH_REQ_FLAG_MAX); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ u16 AfterRecvFromFwBufLen; ++ u8 AfterRecvFromFwBuf[RECV_FROM_FW_BUF_SIZE]; ++ u32 RecvFromFwBufErrCnt; ++ u16 AfterSendToFwBufLen; ++ u8 AfterSendToFwBuf[SEND_TO_FW_BUF_SIZE]; ++ u16 SendToFwBufferLen; ++ ++ u8 OobReq; ++ u8 OobAck; ++ u32 OobReqComplete; ++ u32 OobAckComplete; ++ ++ u8 SendingToFw; ++ ++ u32 RecvFromDashFwCnt; ++ ++ u8 DashReqRegValue; ++ ++ //Dash----------------- ++#endif //ENABLE_DASH_SUPPORT ++ ++ //Realwow++++++++++++++ ++ u8 HwSuppKCPOffloadVer; ++ ++ u8 EnableDhcpTimeoutWake; ++ u8 EnableTeredoOffload; ++ u8 EnableKCPOffload; ++#ifdef ENABLE_REALWOW_SUPPORT ++ u32 DhcpTimeout; ++ MP_KCP_INFO MpKCPInfo; ++ //Realwow-------------- ++#endif //ENABLE_REALWOW_SUPPORT ++ ++ struct ethtool_keee eee; ++ ++#ifdef ENABLE_R8125_PROCFS ++ //Procfs support ++ struct proc_dir_entry *proc_dir; ++ struct proc_dir_entry *proc_dir_debug; ++ struct proc_dir_entry *proc_dir_test; ++#endif ++#ifdef ENABLE_R8125_SYSFS ++ //sysfs support ++ DECLARE_BITMAP(sysfs_flag, R8125_SYSFS_FLAG_MAX); ++ u32 testmode; ++#endif ++ u8 HwSuppRxDescType; ++ u8 InitRxDescType; ++ u16 RxDescLength; //V1 16 Byte V2 32 Bytes ++ ++ spinlock_t phy_lock; ++ ++ u8 HwSuppPtpVer; ++ u8 EnablePtp; ++ u8 ptp_master_mode; ++#ifdef ENABLE_PTP_SUPPORT ++ u32 tx_hwtstamp_timeouts; ++ u32 tx_hwtstamp_skipped; ++ struct work_struct ptp_tx_work; ++ struct sk_buff *ptp_tx_skb; ++ struct hwtstamp_config hwtstamp_config; ++ unsigned long ptp_tx_start; ++ struct ptp_clock_info ptp_clock_info; ++ struct ptp_clock *ptp_clock; ++ u8 syncE_en; ++ u8 pps_enable; ++ struct hrtimer pps_timer; ++#endif ++ ++ u8 HwSuppRssVer; ++ u8 EnableRss; ++ u16 HwSuppIndirTblEntries; ++#ifdef ENABLE_RSS_SUPPORT ++ u32 rss_flags; ++ /* Receive Side Scaling settings */ ++ u8 rss_key[RTL8125_RSS_KEY_SIZE]; ++ u8 rss_indir_tbl[RTL8125_MAX_INDIRECTION_TABLE_ENTRIES]; ++ u32 rss_options; ++#endif ++ ++ u8 HwSuppMacMcuVer; ++ u16 MacMcuPageSize; ++ u64 hw_mcu_patch_code_ver; ++ u64 bin_mcu_patch_code_ver; ++ ++ u8 HwSuppTcamVer; ++ ++ u16 TcamNotValidReg; ++ u16 TcamValidReg; ++ u16 TcamMaAddrcOffset; ++ u16 TcamVlanTagOffset; ++}; ++ ++#ifdef ENABLE_LIB_SUPPORT ++static inline unsigned int ++rtl8125_num_lib_tx_rings(struct rtl8125_private *tp) ++{ ++ int count, i; ++ ++ for (count = 0, i = tp->num_tx_rings; i < tp->HwSuppNumTxQueues; i++) ++ if(tp->lib_tx_ring[i].enabled) ++ count++; ++ ++ return count; ++} ++ ++static inline unsigned int ++rtl8125_num_lib_rx_rings(struct rtl8125_private *tp) ++{ 
++ int count, i; ++ ++ for (count = 0, i = 0; i < tp->HwSuppNumRxQueues; i++) ++ if(tp->lib_rx_ring[i].enabled) ++ count++; ++ ++ return count; ++} ++ ++#else ++static inline unsigned int ++rtl8125_num_lib_tx_rings(struct rtl8125_private *tp) ++{ ++ return 0; ++} ++ ++static inline unsigned int ++rtl8125_num_lib_rx_rings(struct rtl8125_private *tp) ++{ ++ return 0; ++} ++#endif ++ ++static inline unsigned int ++rtl8125_tot_tx_rings(struct rtl8125_private *tp) ++{ ++ return tp->num_tx_rings + rtl8125_num_lib_tx_rings(tp); ++} ++ ++static inline unsigned int ++rtl8125_tot_rx_rings(struct rtl8125_private *tp) ++{ ++ unsigned int num_lib_rx_rings; ++ ++ num_lib_rx_rings = rtl8125_num_lib_rx_rings(tp); ++ if (num_lib_rx_rings > 0) ++ return num_lib_rx_rings; ++ else ++ return tp->num_rx_rings; ++} ++ ++static inline struct netdev_queue *txring_txq(const struct rtl8125_tx_ring *ring) ++{ ++ return netdev_get_tx_queue(ring->netdev, ring->index); ++} ++ ++enum eetype { ++ EEPROM_TYPE_NONE=0, ++ EEPROM_TYPE_93C46, ++ EEPROM_TYPE_93C56, ++ EEPROM_TWSI ++}; ++ ++enum mcfg { ++ CFG_METHOD_2=2, ++ CFG_METHOD_3, ++ CFG_METHOD_4, ++ CFG_METHOD_5, ++ CFG_METHOD_6, ++ CFG_METHOD_7, ++ CFG_METHOD_8, ++ CFG_METHOD_9, ++ CFG_METHOD_10, ++ CFG_METHOD_11, ++ CFG_METHOD_12, ++ CFG_METHOD_13, ++ CFG_METHOD_DEFAULT, ++ CFG_METHOD_MAX ++}; ++ ++#define LSO_32K 32000 ++#define LSO_64K 64000 ++ ++#define NIC_MIN_PHYS_BUF_COUNT (2) ++#define NIC_MAX_PHYS_BUF_COUNT_LSO_64K (24) ++#define NIC_MAX_PHYS_BUF_COUNT_LSO2 (16*4) ++ ++#define GTTCPHO_SHIFT 18 ++#define GTTCPHO_MAX 0x70U ++#define GTPKTSIZE_MAX 0x3ffffU ++#define TCPHO_SHIFT 18 ++#define TCPHO_MAX 0x3ffU ++#define LSOPKTSIZE_MAX 0xffffU ++#define MSS_MAX 0x07ffu /* MSS value */ ++ ++#define OOB_CMD_RESET 0x00 ++#define OOB_CMD_DRIVER_START 0x05 ++#define OOB_CMD_DRIVER_STOP 0x06 ++#define OOB_CMD_SET_IPMAC 0x41 ++ ++#define WAKEUP_MAGIC_PACKET_NOT_SUPPORT (0) ++#define WAKEUP_MAGIC_PACKET_V1 (1) ++#define WAKEUP_MAGIC_PACKET_V2 (2) ++#define WAKEUP_MAGIC_PACKET_V3 (3) ++ ++//Ram Code Version ++#define NIC_RAMCODE_VERSION_CFG_METHOD_2 (0x0b11) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_3 (0x0b33) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_4 (0x0b17) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_5 (0x0b99) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_8 (0x0013) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_9 (0x0001) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_10 (0x0027) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_11 (0x0031) ++#define NIC_RAMCODE_VERSION_CFG_METHOD_12 (0x0010) ++ ++//hwoptimize ++#define HW_PATCH_SOC_LAN (BIT_0) ++#define HW_PATCH_SAMSUNG_LAN_DONGLE (BIT_2) ++ ++static const u16 other_q_intr_mask = (RxOK1 | RxDU1); ++ ++#define HW_PHY_STATUS_INI 1 ++#define HW_PHY_STATUS_EXT_INI 2 ++#define HW_PHY_STATUS_LAN_ON 3 ++ ++void rtl8125_mdio_write(struct rtl8125_private *tp, u16 RegAddr, u16 value); ++void rtl8125_mdio_prot_write(struct rtl8125_private *tp, u32 RegAddr, u32 value); ++void rtl8125_mdio_prot_direct_write_phy_ocp(struct rtl8125_private *tp, u32 RegAddr, u32 value); ++u32 rtl8125_mdio_read(struct rtl8125_private *tp, u16 RegAddr); ++u32 rtl8125_mdio_prot_read(struct rtl8125_private *tp, u32 RegAddr); ++u32 rtl8125_mdio_prot_direct_read_phy_ocp(struct rtl8125_private *tp, u32 RegAddr); ++void rtl8125_ephy_write(struct rtl8125_private *tp, int RegAddr, int value); ++void rtl8125_mac_ocp_write(struct rtl8125_private *tp, u16 reg_addr, u16 value); ++u16 rtl8125_mac_ocp_read(struct rtl8125_private *tp, u16 reg_addr); ++void rtl8125_clear_eth_phy_bit(struct 
rtl8125_private *tp, u8 addr, u16 mask); ++void rtl8125_set_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 mask); ++void rtl8125_ocp_write(struct rtl8125_private *tp, u16 addr, u8 len, u32 data); ++void rtl8125_init_ring_indexes(struct rtl8125_private *tp); ++void rtl8125_oob_mutex_lock(struct rtl8125_private *tp); ++u32 rtl8125_ocp_read(struct rtl8125_private *tp, u16 addr, u8 len); ++u32 rtl8125_ocp_read_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, u32 base_address); ++u32 rtl8125_ocp_write_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, u32 value, u32 base_address); ++u32 rtl8125_eri_read(struct rtl8125_private *tp, int addr, int len, int type); ++u32 rtl8125_eri_read_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, int type, u32 base_address); ++int rtl8125_eri_write(struct rtl8125_private *tp, int addr, int len, u32 value, int type); ++int rtl8125_eri_write_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, u32 value, int type, u32 base_address); ++u16 rtl8125_ephy_read(struct rtl8125_private *tp, int RegAddr); ++void rtl8125_wait_txrx_fifo_empty(struct net_device *dev); ++void rtl8125_enable_now_is_oob(struct rtl8125_private *tp); ++void rtl8125_disable_now_is_oob(struct rtl8125_private *tp); ++void rtl8125_oob_mutex_unlock(struct rtl8125_private *tp); ++void rtl8125_dash2_disable_tx(struct rtl8125_private *tp); ++void rtl8125_dash2_enable_tx(struct rtl8125_private *tp); ++void rtl8125_dash2_disable_rx(struct rtl8125_private *tp); ++void rtl8125_dash2_enable_rx(struct rtl8125_private *tp); ++void rtl8125_hw_disable_mac_mcu_bps(struct net_device *dev); ++void rtl8125_mark_to_asic(struct rtl8125_private *tp, struct RxDesc *desc, u32 rx_buf_sz); ++void rtl8125_mark_as_last_descriptor(struct rtl8125_private *tp, struct RxDesc *desc); ++ ++static inline void ++rtl8125_make_unusable_by_asic(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ ((struct RxDescV3 *)desc)->addr = RTL8125_MAGIC_NUMBER; ++ ((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts1 &= ~cpu_to_le32(DescOwn | RsvdMaskV3); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ ((struct RxDescV4 *)desc)->addr = RTL8125_MAGIC_NUMBER; ++ ((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts1 &= ~cpu_to_le32(DescOwn | RsvdMaskV4); ++ break; ++ default: ++ desc->addr = RTL8125_MAGIC_NUMBER; ++ desc->opts1 &= ~cpu_to_le32(DescOwn | RsvdMask); ++ break; ++ } ++} ++ ++static inline struct RxDesc* ++rtl8125_get_rxdesc(struct rtl8125_private *tp, struct RxDesc *RxDescBase, u32 const cur_rx) ++{ ++ return (struct RxDesc*)((u8*)RxDescBase + (cur_rx * tp->RxDescLength)); ++} ++ ++static inline void ++rtl8125_disable_hw_interrupt_v2(struct rtl8125_private *tp, ++ u32 message_id) ++{ ++ RTL_W32(tp, IMR_V2_CLEAR_REG_8125, BIT(message_id)); ++} ++ ++static inline void ++rtl8125_enable_hw_interrupt_v2(struct rtl8125_private *tp, u32 message_id) ++{ ++ RTL_W32(tp, IMR_V2_SET_REG_8125, BIT(message_id)); ++} ++ ++int rtl8125_open(struct net_device *dev); ++int rtl8125_close(struct net_device *dev); ++void rtl8125_hw_config(struct net_device *dev); ++void rtl8125_hw_set_timer_int(struct rtl8125_private *tp, u32 message_id, u8 timer_intmiti_val); ++void rtl8125_set_rx_q_num(struct rtl8125_private *tp, unsigned int num_rx_queues); ++void rtl8125_set_tx_q_num(struct rtl8125_private *tp, unsigned int num_tx_queues); ++void rtl8125_enable_mcu(struct rtl8125_private *tp, bool enable); ++void 
rtl8125_hw_start(struct net_device *dev); ++void rtl8125_hw_reset(struct net_device *dev); ++void rtl8125_tx_clear(struct rtl8125_private *tp); ++void rtl8125_rx_clear(struct rtl8125_private *tp); ++int rtl8125_init_ring(struct net_device *dev); ++void rtl8125_hw_set_rx_packet_filter(struct net_device *dev); ++void rtl8125_enable_hw_linkchg_interrupt(struct rtl8125_private *tp); ++int rtl8125_dump_tally_counter(struct rtl8125_private *tp, dma_addr_t paddr); ++void rtl8125_enable_napi(struct rtl8125_private *tp); ++void _rtl8125_wait_for_quiescence(struct net_device *dev); ++ ++void rtl8125_clear_mac_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++void rtl8125_set_mac_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++ ++void rtl8125_mdio_direct_write_phy_ocp(struct rtl8125_private *tp, u16 RegAddr,u16 value); ++u32 rtl8125_mdio_direct_read_phy_ocp(struct rtl8125_private *tp, u16 RegAddr); ++void rtl8125_clear_and_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 clearmask, u16 setmask); ++void rtl8125_clear_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++void rtl8125_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask); ++ ++#ifndef ENABLE_LIB_SUPPORT ++static inline void rtl8125_lib_reset_prepare(struct rtl8125_private *tp) { } ++static inline void rtl8125_lib_reset_complete(struct rtl8125_private *tp) { } ++#endif ++ ++#define HW_SUPPORT_CHECK_PHY_DISABLE_MODE(_M) ((_M)->HwSuppCheckPhyDisableModeVer > 0) ++#define HW_HAS_WRITE_PHY_MCU_RAM_CODE(_M) (((_M)->HwHasWrRamCodeToMicroP == TRUE) ? 1 : 0) ++#define HW_SUPPORT_D0_SPEED_UP(_M) ((_M)->HwSuppD0SpeedUpVer > 0) ++#define HW_SUPPORT_MAC_MCU(_M) ((_M)->HwSuppMacMcuVer > 0) ++#define HW_SUPPORT_TCAM(_M) ((_M)->HwSuppTcamVer > 0) ++ ++#define HW_SUPP_PHY_LINK_SPEED_GIGA(_M) ((_M)->HwSuppMaxPhyLinkSpeed >= 1000) ++#define HW_SUPP_PHY_LINK_SPEED_2500M(_M) ((_M)->HwSuppMaxPhyLinkSpeed >= 2500) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,34) ++#define netdev_mc_count(dev) ((dev)->mc_count) ++#define netdev_mc_empty(dev) (netdev_mc_count(dev) == 0) ++#define netdev_for_each_mc_addr(mclist, dev) \ ++ for (mclist = dev->mc_list; mclist; mclist = mclist->next) ++#endif ++ ++#endif /* __R8125_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_dash.c b/drivers/net/ethernet/realtek/r8125_dash.c +new file mode 100755 +index 000000000000..a71c19aea412 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_dash.c +@@ -0,0 +1,573 @@ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "r8125.h" ++#include "r8125_dash.h" ++#include "rtl_eeprom.h" ++ ++static void r8125_dash_set_ipc2_reg_bit(struct rtl8125_private *tp, unsigned long reg, u32 mask) ++{ ++ RTL_DASH_IPC2_W32(tp, reg, RTL_DASH_IPC2_R32(tp, reg) | mask); ++} ++ ++/* ++static void r8125_dash_clear_ipc2_reg_bit(struct rtl8125_private *tp, unsigned long reg, u32 mask) ++{ ++ RTL_DASH_IPC2_W32(tp, reg, RTL_DASH_IPC2_R32(tp, reg) & ~mask); ++} ++*/ ++ ++static void r8125_write_ipc2_tx_ack(struct rtl8125_private *tp) ++{ ++ if (!tp->DASH) ++ return; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ r8125_dash_set_ipc2_reg_bit(tp, IPC2_TX_SET_REG, IPC2_TX_ACK_BIT); ++} ++ ++static void r8125_write_ipc2_tx_polling(struct rtl8125_private *tp) ++{ ++ if (!tp->DASH) ++ return; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ r8125_dash_set_ipc2_reg_bit(tp, IPC2_TX_SET_REG, IPC2_TX_SEND_BIT); ++} ++ ++static unsigned long ++r8125_get_ipc2_rx_buffer(struct rtl8125_private *tp) ++{ ++ if (HW_DASH_SUPPORT_IPC2(tp)) ++ return IPC2_RX_BUFFER; ++ else ++ return 0; ++} ++ ++static u8 rtl8125_copy_from_ipc2(struct rtl8125_private *tp, u8 *dest, u32 len) ++{ ++ unsigned long const data_reg = r8125_get_ipc2_rx_buffer(tp); ++ u32 offset = 0; ++ u32 *pDword; ++ u8 *pByte; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ if (!dest) ++ goto exit; ++ ++ if (len == 0) ++ goto exit; ++ ++ pDword = (u32*)dest; ++ while (len > 3 && offset < (IPC2_BUFFER_LENGTH - 4)) { ++ *pDword++ = RTL_DASH_IPC2_R32(tp, data_reg + offset); ++ ++ len -= 4; ++ offset += 4; ++ } ++ ++ pByte = (u8*)pDword; ++ while (len > 0 && offset < (IPC2_BUFFER_LENGTH - 1)) { ++ *pByte++ = RTL_DASH_IPC2_R8(tp, data_reg + offset); ++ ++ len -= 1; ++ offset += 1; ++ } ++ ++exit: ++ return (len == 0) ? 
TRUE : FALSE; ++} ++ ++static void RecvFromDashFwComplete(struct rtl8125_private *tp) ++{ ++ if (!tp->DASH) ++ return; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ if (tp->DashReqRegValue == DASH_OOB_HDR_TYPE_REQ) { //rok ++ RX_DASH_BUFFER_TYPE_2 rxDashBufferType2 = {0}; ++ u32 dataLen; ++ ++ if (!tp->OobReq) ++ goto exit; ++ ++ /* copy header for check data length */ ++ if (!rtl8125_copy_from_ipc2(tp, ++ (u8*)&rxDashBufferType2, ++ sizeof(rxDashBufferType2))) ++ goto exit; ++ ++ dataLen = (u16)rxDashBufferType2.oobhdr.len; ++ ++ tp->AfterRecvFromFwBufLen = dataLen + sizeof(OSOOBHdr); ++ if (tp->AfterRecvFromFwBufLen > tp->SizeOfRecvFromFwBuffer) { ++ tp->AfterRecvFromFwBufLen = tp->SizeOfRecvFromFwBuffer; ++ tp->RecvFromFwBufErrCnt++; ++ } ++ ++ /* copy data */ ++ rtl8125_copy_from_ipc2(tp, ++ tp->AfterRecvFromFwBuf, ++ tp->AfterRecvFromFwBufLen); ++ ++ r8125_write_ipc2_tx_ack(tp); ++ ++ tp->OobReqComplete = TRUE; ++ ++ tp->RecvFromDashFwCnt++; ++ } else if (tp->DashReqRegValue == DASH_OOB_HDR_TYPE_ACK) { //rx ack ++ if (!tp->OobAck) ++ goto exit; ++ ++ tp->OobAckComplete = TRUE; ++ ++ tp->RecvFromDashFwCnt++; ++ } ++ ++exit: ++ return; ++} ++ ++static unsigned long r8125_get_ipc2_tx_buffer(struct rtl8125_private *tp) ++{ ++ if (HW_DASH_SUPPORT_IPC2(tp)) ++ return IPC2_TX_BUFFER; ++ else ++ return 0; ++} ++ ++static u32 rtl8125_copy_to_ipc2(struct rtl8125_private *tp, u8 *src, u32 len) ++{ ++ unsigned long const data_reg = r8125_get_ipc2_tx_buffer(tp); ++ u32 offset = 0; ++ u32 *pDword; ++ u8 *pByte; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ if (!src) ++ goto exit; ++ ++ if (len == 0) ++ goto exit; ++ ++ pDword = (u32*)src; ++ while (len > 3 && offset < (IPC2_BUFFER_LENGTH - 4)) { ++ RTL_DASH_IPC2_W32(tp, data_reg + offset, *pDword++); ++ ++ len -= 4; ++ offset += 4; ++ } ++ ++ pByte = (u8*)pDword; ++ while (len > 0 && offset < (IPC2_BUFFER_LENGTH - 1)) { ++ RTL_DASH_IPC2_W8(tp, data_reg + offset, *pByte++); ++ ++ len -= 1; ++ offset += 1; ++ } ++ ++exit: ++ return offset; ++} ++ ++static int SendToDashFw(struct rtl8125_private *tp, u8 *src, u16 len) ++{ ++ POSOOBHdr pOobHdr; ++ int rc = -1; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ if (TRUE == tp->SendingToFw) ++ goto exit; ++ ++ if (!src) ++ goto exit; ++ ++ if (len > tp->SizeOfSendToFwBuffer) ++ goto exit; ++ ++ if (len < sizeof(OSOOBHdr)) ++ goto exit; ++ ++ pOobHdr = (POSOOBHdr)src; ++ if (pOobHdr->hostReqV == DASH_OOB_HDR_TYPE_REQ) { ++ r8125_write_ipc2_tx_ack(tp); ++ rc = 0; ++ goto exit; ++ } ++ ++ tp->SendingToFw = TRUE; ++ ++ rtl8125_copy_to_ipc2(tp, src, len); ++ ++ r8125_write_ipc2_tx_polling(tp); ++ ++ tp->SendingToFw = FALSE; ++ ++ rc = 0; ++ ++exit: ++ if (!rc) ++ tp->AfterSendToFwBufLen = len; ++ else ++ tp->AfterSendToFwBufLen = 0; ++ ++ return rc; ++} ++ ++static u32 rtl8125_get_ipc2_isr(struct rtl8125_private *tp) ++{ ++ u32 isr = 0; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ isr = RTL_DASH_IPC2_R32(tp, IPC2_RX_STATUS_REG); ++ ++ if (isr == ULONG_MAX) ++ isr = 0; ++ ++exit: ++ return isr; ++} ++ ++static void rtl8125_set_ipc2_isr(struct rtl8125_private *tp, u32 val) ++{ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ RTL_DASH_IPC2_W32(tp, IPC2_RX_CLEAR_REG, val); ++} ++ ++void rtl8125_clear_ipc2_isr(struct rtl8125_private *tp) ++{ ++ rtl8125_set_ipc2_isr(tp, rtl8125_get_ipc2_isr(tp)); ++} ++ ++void rtl8125_set_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask) ++{ ++ if (FALSE == 
HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ RTL_W16(tp, RISC_IMR_8125BP, RTL_R16(tp, RISC_IMR_8125BP) | mask); ++} ++ ++void rtl8125_clear_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask) ++{ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ RTL_W16(tp, RISC_IMR_8125BP, RTL_R16(tp, RISC_IMR_8125BP) & ~mask); ++} ++ ++bool rtl8125_check_dash_interrupt(struct rtl8125_private *tp) ++{ ++ bool rc = false; ++ u32 isr; ++ ++ if(!tp->DASH) ++ goto exit; ++ ++ if (FALSE == HW_DASH_SUPPORT_IPC2(tp)) ++ goto exit; ++ ++ isr = rtl8125_get_ipc2_isr(tp); ++ ++ if (isr & (IPC2_RX_ROK_BIT | IPC2_RX_ACK_BIT)) { ++ set_bit(R8125_RCV_REQ_DASH_OK, tp->dash_req_flags); ++ if (isr & IPC2_RX_ROK_BIT) ++ tp->DashReqRegValue = DASH_OOB_HDR_TYPE_REQ; ++ else ++ tp->DashReqRegValue = DASH_OOB_HDR_TYPE_ACK; ++ } ++ ++ rtl8125_set_ipc2_isr(tp, isr); ++ ++exit: ++ return rc; ++} ++ ++void rtl8125_handle_dash_interrupt(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if(!tp->DASH) ++ return; ++ ++ if (test_and_clear_bit(R8125_RCV_REQ_DASH_OK, tp->dash_req_flags)) ++ RecvFromDashFwComplete(tp); ++} ++ ++static int DashIoctlGetRcvFromFwData(struct net_device *dev, struct rtl_dash_ioctl_struct *prtl_dash_usrdata) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ulInfoLen; ++ void *InformationBuffer; ++ u32 InformationBufferLength; ++ void *pInfo; ++ u8 *pByte; ++ u16 *pWord; ++ u8 *tmpBuf; ++ int ret = -EFAULT; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (!tp->rtk_enable_diag) ++ goto exit; ++ ++ if (tp->AfterRecvFromFwBufLen == 0) ++ goto exit; ++ ++ InformationBufferLength = prtl_dash_usrdata->len; ++ InformationBuffer = prtl_dash_usrdata->data_buffer; ++ ++ ulInfoLen = tp->AfterRecvFromFwBufLen + 2 + 2; ++ if (InformationBufferLength < ulInfoLen) { ++ ret = -EFAULT; ++ goto exit; ++ } ++ ++ if (!(tmpBuf = kmalloc(ulInfoLen, GFP_ATOMIC))) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ pInfo = (void*) tp->AfterRecvFromFwBuf; ++ pWord = (u16*) tmpBuf; ++ *pWord++ = tp->AfterRecvFromFwBufLen; ++ pByte = (u8*)pWord; ++ memcpy(pByte, pInfo, tp->AfterRecvFromFwBufLen); ++ pWord = (u16*)(pByte + tp->AfterRecvFromFwBufLen); ++ *pWord= tp->DashReqRegValue; ++ tp->AfterRecvFromFwBufLen = 0; ++ if (copy_to_user(InformationBuffer, tmpBuf, ulInfoLen)) { ++ kfree(tmpBuf); ++ ret = -EFAULT; ++ goto exit; ++ } ++ kfree(tmpBuf); ++ ret = 0; ++ ++exit: ++ return ret; ++} ++ ++static int DashIoctlCheckSendBufferToFwComplete(struct net_device *dev, struct rtl_dash_ioctl_struct *prtl_dash_usrdata) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ulInfoLen; ++ void *InformationBuffer; ++ u32 InformationBufferLength; ++ u16 *pWord; ++ u8 *tmpBuf; ++ int ret = -EFAULT; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (!tp->rtk_enable_diag) ++ goto exit; ++ ++ InformationBufferLength = prtl_dash_usrdata->len; ++ InformationBuffer = prtl_dash_usrdata->data_buffer; ++ ++ if (tp->SendingToFw == FALSE) ++ ulInfoLen = tp->AfterSendToFwBufLen + sizeof(u16); ++ else ++ ulInfoLen = sizeof(u16); ++ ++ if (InformationBufferLength < ulInfoLen) { ++ ret = -EFAULT; ++ goto exit; ++ } ++ ++ if (!(tmpBuf = kmalloc(ulInfoLen, GFP_ATOMIC))) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ pWord = (u16*) tmpBuf; ++ if (tp->SendingToFw == FALSE) { ++ *pWord++ = tp->AfterSendToFwBufLen; ++ memcpy(pWord, tp->AfterSendToFwBuf, tp->AfterSendToFwBufLen); ++ tp->AfterSendToFwBufLen = 0; ++ } else { ++ *pWord = 0xffff; ++ } ++ ++ if (copy_to_user(InformationBuffer, tmpBuf, ulInfoLen)) ++ ret = -EFAULT; ++ 
else ++ ret = 0; ++ ++ kfree(tmpBuf); ++ ++exit: ++ return ret; ++} ++ ++static int DashIoctlCheckSendBufferToFw(struct net_device *dev, struct rtl_dash_ioctl_struct *prtl_dash_usrdata) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ulInfoLen; ++ void *InformationBuffer; ++ u32 InformationBufferLength; ++ u16 *pWord; ++ u16 SetDataSize; ++ int ret = -EFAULT; ++ ++ if (!tp->DASH) ++ goto exit; ++ ++ if (!tp->rtk_enable_diag) ++ goto exit; ++ ++ InformationBufferLength = prtl_dash_usrdata->len; ++ if (!(InformationBuffer = kmalloc(InformationBufferLength, GFP_KERNEL))) { ++ ret = -ENOMEM; ++ goto exit; ++ } ++ ++ if (copy_from_user(InformationBuffer, prtl_dash_usrdata->data_buffer, ++ InformationBufferLength)) { ++ ret = -EFAULT; ++ goto free_mem; ++ } ++ ++ ulInfoLen = sizeof(u16) + sizeof(u16); ++ ++ if (InformationBufferLength < ulInfoLen) ++ goto free_mem; ++ ++ pWord = (u16*) InformationBuffer; ++ SetDataSize = *pWord++; ++ ++ if (InformationBufferLength < (SetDataSize + sizeof(u16) + sizeof(u16))) { ++ ret = -EFAULT; ++ goto free_mem; ++ } ++ ++ ret = SendToDashFw(tp, (u8*)pWord, SetDataSize); ++ ++free_mem: ++ kfree(InformationBuffer); ++ ++exit: ++ return ret; ++} ++ ++int rtl8125_dash_ioctl(struct net_device *dev, struct ifreq *ifr) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void *user_data = ifr->ifr_data; ++ struct rtl_dash_ioctl_struct rtl_dash_usrdata; ++ ++ int ret=0; ++ ++ if (FALSE == HW_DASH_SUPPORT_DASH(tp)) ++ return -EOPNOTSUPP; ++ ++ if (!tp->DASH) ++ return -EINVAL; ++ ++ if (copy_from_user(&rtl_dash_usrdata, user_data, ++ sizeof(struct rtl_dash_ioctl_struct))) ++ return -EFAULT; ++ ++ switch (rtl_dash_usrdata.cmd) { ++ case RTL_DASH_SEND_BUFFER_DATA_TO_DASH_FW: ++ ret = DashIoctlCheckSendBufferToFw(dev, &rtl_dash_usrdata); ++ break; ++ case RTL_DASH_CHECK_SEND_BUFFER_TO_DASH_FW_COMPLETE: ++ ret = DashIoctlCheckSendBufferToFwComplete(dev, ++ &rtl_dash_usrdata); ++ break; ++ case RTL_DASH_GET_RCV_FROM_FW_BUFFER_DATA: ++ ret = DashIoctlGetRcvFromFwData(dev, &rtl_dash_usrdata); ++ break; ++ case RTL_DASH_OOB_REQ: ++ tp->OobReq = TRUE; ++ tp->OobReqComplete = FALSE; ++ break; ++ case RTL_DASH_OOB_ACK: ++ tp->OobAck = TRUE; ++ tp->OobAckComplete = FALSE; ++ break; ++ case RTL_DASH_DETACH_OOB_REQ: ++ tp->OobReq = FALSE; ++ tp->OobReqComplete = FALSE; ++ break; ++ case RTL_DASH_DETACH_OOB_ACK: ++ tp->OobAck = FALSE; ++ tp->OobAckComplete = FALSE; ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return ret; ++} +diff --git a/drivers/net/ethernet/realtek/r8125_dash.h b/drivers/net/ethernet/realtek/r8125_dash.h +new file mode 100755 +index 000000000000..1a4b7dae624c +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_dash.h +@@ -0,0 +1,196 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. 
See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_R8125_DASH_H ++#define _LINUX_R8125_DASH_H ++ ++#include ++ ++#define SIOCDEVPRIVATE_RTLDASH SIOCDEVPRIVATE+2 ++ ++enum rtl_dash_cmd { ++ RTL_DASH_ARP_NS_OFFLOAD = 0, ++ RTL_DASH_SET_OOB_IPMAC, ++ RTL_DASH_NOTIFY_OOB, ++ ++ RTL_DASH_SEND_BUFFER_DATA_TO_DASH_FW, ++ RTL_DASH_CHECK_SEND_BUFFER_TO_DASH_FW_COMPLETE, ++ RTL_DASH_GET_RCV_FROM_FW_BUFFER_DATA, ++ RTL_DASH_OOB_REQ, ++ RTL_DASH_OOB_ACK, ++ RTL_DASH_DETACH_OOB_REQ, ++ RTL_DASH_DETACH_OOB_ACK, ++ ++ RTL_FW_SET_IPV4 = 0x10, ++ RTL_FW_GET_IPV4, ++ RTL_FW_SET_IPV6, ++ RTL_FW_GET_IPV6, ++ RTL_FW_SET_EXT_SNMP, ++ RTL_FW_GET_EXT_SNMP, ++ RTL_FW_SET_WAKEUP_PATTERN, ++ RTL_FW_GET_WAKEUP_PATTERN, ++ RTL_FW_DEL_WAKEUP_PATTERN, ++ ++ RTLT_DASH_COMMAND_INVALID, ++}; ++ ++struct rtl_dash_ip_mac { ++ struct sockaddr ifru_addr; ++ struct sockaddr ifru_netmask; ++ struct sockaddr ifru_hwaddr; ++}; ++ ++struct rtl_dash_ioctl_struct { ++ __u32 cmd; ++ __u32 offset; ++ __u32 len; ++ union { ++ __u32 data; ++ void *data_buffer; ++ }; ++}; ++ ++typedef struct _OSOOBHdr { ++ __le32 len; ++ u8 type; ++ u8 flag; ++ u8 hostReqV; ++ u8 res; ++} ++OSOOBHdr, *POSOOBHdr; ++ ++typedef struct _RX_DASH_BUFFER_TYPE_2 { ++ OSOOBHdr oobhdr; ++ u8 RxDataBuffer[0]; ++} ++RX_DASH_BUFFER_TYPE_2, *PRX_DASH_BUFFER_TYPE_2; ++ ++#define ALIGN_8 (0x7) ++#define ALIGN_16 (0xf) ++#define ALIGN_32 (0x1f) ++#define ALIGN_64 (0x3f) ++#define ALIGN_256 (0xff) ++#define ALIGN_4096 (0xfff) ++ ++#define OCP_REG_FIRMWARE_MAJOR_VERSION (0x120) ++ ++#define HW_DASH_SUPPORT_DASH(_M) ((_M)->HwSuppDashVer > 0) ++#define HW_DASH_SUPPORT_TYPE_1(_M) ((_M)->HwSuppDashVer == 1) ++#define HW_DASH_SUPPORT_TYPE_2(_M) ((_M)->HwSuppDashVer == 2) ++#define HW_DASH_SUPPORT_TYPE_3(_M) ((_M)->HwSuppDashVer == 3) ++#define HW_DASH_SUPPORT_TYPE_4(_M) ((_M)->HwSuppDashVer == 4) ++#define HW_DASH_SUPPORT_CMAC(_M) (HW_DASH_SUPPORT_TYPE_2(_M) || HW_DASH_SUPPORT_TYPE_3(_M)) ++#define HW_DASH_SUPPORT_IPC2(_M) (HW_DASH_SUPPORT_TYPE_4(_M)) ++#define HW_DASH_SUPPORT_GET_FIRMWARE_VERSION(_M) (HW_DASH_SUPPORT_TYPE_2(_M) || \ ++ HW_DASH_SUPPORT_TYPE_3(_M) || \ ++ HW_DASH_SUPPORT_TYPE_4(_M)) ++ ++#define RECV_FROM_FW_BUF_SIZE (1520) ++#define SEND_TO_FW_BUF_SIZE (1520) ++ ++#define TXS_CC3_0 (BIT_0|BIT_1|BIT_2|BIT_3) ++#define TXS_EXC BIT_4 ++#define TXS_LNKF BIT_5 ++#define TXS_OWC BIT_6 ++#define TXS_TES BIT_7 ++#define TXS_UNF BIT_9 ++#define TXS_LGSEN BIT_11 ++#define TXS_LS BIT_12 ++#define TXS_FS BIT_13 ++#define TXS_EOR BIT_14 ++#define TXS_OWN BIT_15 ++ ++#define TPPool_HRDY 0x20 ++ ++#define RXS_OWN BIT_15 ++#define RXS_EOR BIT_14 ++#define RXS_FS BIT_13 ++#define RXS_LS BIT_12 ++ ++#define ISRIMR_DASH_INTR_EN BIT_12 ++ ++#define NO_BASE_ADDRESS 0x00000000 ++ ++/* IB2SOC registers */ ++#define IPC2_SWISR_DRIVER_READY 0x05 ++#define IPC2_SWISR_DRIVER_EXIT 0x06 ++#define IPC2_SWISR_CLIENTTOOL_SYNC_HOSTNAME 0x20 ++#define 
IPC2_SWISR_DIS_DASH 0x55 ++#define IPC2_SWISR_EN_DASH 0x56 ++ ++#define IPC2_IB2SOC_SET 0x10 ++#define IPC2_IB2SOC_DATA 0x14 ++#define IPC2_IB2SOC_CMD 0x18 ++#define IPC2_IB2SOC_IMR 0x1C ++ ++/* IPC2 registers */ ++#define IPC2_PCIE_BASE 0xC100 ++#define IPC2_TX_SET_REG IPC2_PCIE_BASE ++#define IPC2_TX_STATUS_REG (IPC2_PCIE_BASE+0x04) ++#define IPC2_RX_STATUS_REG (IPC2_PCIE_BASE+0x08) ++#define IPC2_RX_CLEAR_REG (IPC2_PCIE_BASE+0x0C) ++#define IPC2_DATA_BASE 0x32000 ++#define IPC2_BUFFER_LENGTH 0x1000 ++#define IPC2_DATA_MASTER IPC2_DATA_BASE //dash tx buffer base ++#define IPC2_DATA_SLAVE (IPC2_DATA_BASE+IPC2_BUFFER_LENGTH) //dash rx buffer base ++#define IPC2_TX_BUFFER IPC2_DATA_MASTER ++#define IPC2_RX_BUFFER IPC2_DATA_SLAVE ++ ++#define IPC2_TX_SEND_BIT BIT_0 ++#define IPC2_TX_ACK_BIT BIT_8 ++#define IPC2_RX_ROK_BIT BIT_0 ++#define IPC2_RX_ACK_BIT BIT_8 ++ ++/* IPC2 write/read MMIO register */ ++#define RTL_DASH_IPC2_W8(tp, reg, val8) RTL_W8(tp, reg, val8) ++#define RTL_DASH_IPC2_W16(tp, reg, val16) RTL_W16(tp, reg, val16) ++#define RTL_DASH_IPC2_W32(tp, reg, val32) RTL_W32(tp, reg, val32) ++#define RTL_DASH_IPC2_R8(tp, reg) RTL_R8(tp, reg) ++#define RTL_DASH_IPC2_R16(tp, reg) RTL_R16(tp, reg) ++#define RTL_DASH_IPC2_R32(tp, reg) RTL_R32(tp, reg) ++ ++/* DASH OOB Header Type */ ++#define DASH_OOB_HDR_TYPE_REQ 0x91 ++#define DASH_OOB_HDR_TYPE_ACK 0x92 ++ ++struct rtl8125_private; ++ ++int rtl8125_dash_ioctl(struct net_device *dev, struct ifreq *ifr); ++bool rtl8125_check_dash_interrupt(struct rtl8125_private *tp); ++void rtl8125_handle_dash_interrupt(struct net_device *dev); ++void rtl8125_clear_ipc2_isr(struct rtl8125_private *tp); ++void rtl8125_set_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask); ++void rtl8125_clear_ipc2_soc_imr_bit(struct rtl8125_private *tp, u16 mask); ++ ++#endif /* _LINUX_R8125_DASH_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_fiber.c b/drivers/net/ethernet/realtek/r8125_fiber.c +new file mode 100755 +index 000000000000..76527719bf9c +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_fiber.c +@@ -0,0 +1,464 @@ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++ ++#include "r8125.h" ++#include "r8125_fiber.h" ++ ++static void ++rtl8125_fiber_set_mdc_gpio_c45(struct rtl8125_private *tp, bool pu) ++{ ++ if (pu) ++ rtl8125_set_mac_ocp_bit(tp, 0xDC52, BIT_7); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC52, BIT_7); ++ ++ //RtPciCommittp); ++} ++ ++static void ++rtl8125_fiber_set_mdc(struct rtl8125_private *tp, bool pu) ++{ ++ rtl8125_fiber_set_mdc_gpio_c45(tp, pu); ++} ++ ++static void ++rtl8125_fiber_set_mdcDownUp(struct rtl8125_private *tp) ++{ ++ udelay(1); ++ rtl8125_fiber_set_mdc(tp, 0); ++ udelay(1); ++ rtl8125_fiber_set_mdc(tp, 1); ++} ++ ++static void ++rtl8125_fiber_set_mdio_bit_gpio_c45(struct rtl8125_private *tp, bool pu) ++{ ++ if (pu) ++ rtl8125_set_mac_ocp_bit(tp, 0xDC52, BIT_2); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC52, BIT_2); ++ ++ //RtPciCommittp); ++ ++ rtl8125_fiber_set_mdcDownUp(tp); ++} ++ ++static void ++rtl8125_fiber_set_mdio_bit(struct rtl8125_private *tp, bool pu) ++{ ++ rtl8125_fiber_set_mdio_bit_gpio_c45(tp, pu); ++} ++ ++static u16 ++rtl8125_fiber_get_mdio_bit_gpio_c45(struct rtl8125_private *tp) ++{ ++ rtl8125_fiber_set_mdcDownUp(tp); ++ ++ return !!(rtl8125_mac_ocp_read(tp, 0xDC58) & BIT(2)); ++} ++ ++static u16 ++rtl8125_fiber_get_mdio_bit(struct rtl8125_private *tp) ++{ ++ return rtl8125_fiber_get_mdio_bit_gpio_c45(tp); ++} ++ ++static void ++rtl8125_fiber_shift_bit_in(struct rtl8125_private *tp, u32 val, int count) ++{ ++ int i; ++ ++ for (i = (count - 1); i >= 0; i--) ++ rtl8125_fiber_set_mdio_bit(tp, !!(val & BIT(i))); ++} ++ ++static u16 ++rtl8125_fiber_shift_bit_out(struct rtl8125_private *tp) ++{ ++ u16 data = 0; ++ int i; ++ ++ for (i = 15; i >= 0; i--) ++ data += (rtl8125_fiber_get_mdio_bit(tp) << i); ++ ++ return data; ++} ++ ++static void ++rtl8125_fiber_dir_gpio_c45(struct rtl8125_private *tp, bool output_mode) ++{ ++ if (output_mode) ++ rtl8125_set_mac_ocp_bit(tp, 0xDC4C, BIT_2); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC4C, BIT_2); ++} ++ ++static void ++rtl8125_fiber_dir(struct rtl8125_private *tp, bool output_mode) ++{ ++ rtl8125_fiber_dir_gpio_c45(tp, output_mode); ++} ++ ++//fiber ++#define R8125_FIBER_C22 (0) ++#define R8125_FIBER_C45 (1) ++ ++// sfp opcodes ++#define R8125_FIBER_ST (1) ++#define R8125_FIBER_OP_W (1) ++#define R8125_FIBER_OP_R (2) ++#define R8125_FIBER_TA (2) ++ ++// sfp C45 opcodes ++#define R8125_FIBER_MDIO_C45 (BIT(15)) ++#define R8125_FIBER_C45_ST (R8125_FIBER_MDIO_C45 | 0) ++#define R8125_FIBER_C45_OP_ADDR (R8125_FIBER_MDIO_C45 | 0) ++#define R8125_FIBER_C45_OP_W (R8125_FIBER_MDIO_C45 | 1) ++#define R8125_FIBER_C45_OP_R (R8125_FIBER_MDIO_C45 | 3) ++ ++static void ++rtl8125_fiber_cmd(struct rtl8125_private *tp, u32 cmd, u8 phy_addr, ++ u32 reg) ++{ ++ /* change to output mode */ ++ rtl8125_fiber_dir(tp, 1); ++ ++ /* preamble 32bit of 1 */ ++ rtl8125_fiber_shift_bit_in(tp, UINT_MAX, 32); ++ ++ /* start bit */ ++ if (cmd & R8125_FIBER_MDIO_C45) ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_C45_ST, 2); ++ else ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_ST, 2); ++ ++ /* opcode */ ++ rtl8125_fiber_shift_bit_in(tp, cmd, 2); ++ ++ /* phy address */ ++ rtl8125_fiber_shift_bit_in(tp, phy_addr, 5); ++ ++ /* phy reg */ ++ rtl8125_fiber_shift_bit_in(tp, reg, 5); ++} ++ ++static u8 ++rtl8125_fiber_cmdAddr(struct rtl8125_private *tp, u8 phy_addr, u32 reg) ++{ ++ u8 dev_addr = (reg >> 16) & 0x1F; ++ u16 addr = (u16)reg; ++ ++ rtl8125_fiber_cmd(tp, 
R8125_FIBER_C45_OP_ADDR, phy_addr, dev_addr); ++ ++ /* turn-around(TA) */ ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_TA, 2); ++ ++ rtl8125_fiber_shift_bit_in(tp, addr, 16); ++ ++ rtl8125_fiber_dir(tp, 0); ++ ++ rtl8125_fiber_get_mdio_bit(tp); ++ ++ return dev_addr; ++} ++ ++static void ++rtl8125_fiber_reset_gpio_c45(struct rtl8125_private *tp) ++{ ++ rtl8125_set_mac_ocp_bit(tp, 0xDC4C, (BIT_7 | BIT_2)); ++ ++ /* init sfp interface */ ++ rtl8125_clear_mac_ocp_bit(tp, 0xDC52, BIT_7); ++ rtl8125_set_mac_ocp_bit(tp, 0xDC52, BIT_2); ++} ++ ++static void ++rtl8125_fiber_write_common(struct rtl8125_private *tp, u16 val) ++{ ++ /* turn-around(TA) */ ++ rtl8125_fiber_shift_bit_in(tp, R8125_FIBER_TA, 2); ++ ++ /* write phy data */ ++ rtl8125_fiber_shift_bit_in(tp, val, 16); ++ ++ /* change to input mode */ ++ rtl8125_fiber_dir(tp, 0); ++ ++ rtl8125_fiber_get_mdio_bit(tp); ++} ++ ++static void ++rtl8125_fiber_mdio_write_gpio_c45( ++ struct rtl8125_private *tp, ++ u32 reg, ++ u16 val, ++ u8 phy_addr) ++{ ++ /* opcode write */ ++ reg = rtl8125_fiber_cmdAddr(tp, phy_addr, reg); ++ rtl8125_fiber_cmd(tp, R8125_FIBER_C45_OP_W, phy_addr, reg); ++ ++ rtl8125_fiber_write_common(tp, val); ++} ++ ++static u16 ++rtl8125_fiber_read_common(struct rtl8125_private *tp) ++{ ++ u16 data = 0; ++ ++ /* change to input mode */ ++ rtl8125_fiber_dir(tp, 0); ++ ++ /* TA 0 */ ++ rtl8125_fiber_get_mdio_bit(tp); ++ ++ /* read phy data */ ++ data = rtl8125_fiber_shift_bit_out(tp); ++ ++ rtl8125_fiber_get_mdio_bit(tp); ++ ++ return data; ++} ++ ++static u16 ++rtl8125_fiber_mdio_read_gpio_c45( ++ struct rtl8125_private *tp, ++ u32 reg, ++ u8 phy_addr) ++{ ++ reg = rtl8125_fiber_cmdAddr(tp, phy_addr, reg); ++ rtl8125_fiber_cmd(tp, R8125_FIBER_C45_OP_R, phy_addr, reg); ++ ++ return rtl8125_fiber_read_common(tp); ++} ++ ++void ++rtl8125_fiber_mdio_write( ++ struct rtl8125_private *tp, ++ u32 reg, ++ u16 val) ++{ ++ switch(tp->HwFiberStat) { ++ case FIBER_STAT_CONNECT_GPO_C45: ++ return rtl8125_fiber_mdio_write_gpio_c45(tp, reg, val, 0); ++ default: ++ return; ++ } ++} ++ ++u16 ++rtl8125_fiber_mdio_read( ++ struct rtl8125_private *tp, ++ u32 reg) ++{ ++ switch(tp->HwFiberStat) { ++ case FIBER_STAT_CONNECT_GPO_C45: ++ return rtl8125_fiber_mdio_read_gpio_c45(tp, reg, 0); ++ default: ++ return 0xffff; ++ } ++} ++ ++static void ++rtl8125_fiber_clear_and_set_phy_bit(struct rtl8125_private *tp, u32 addr, u16 clearmask, u16 setmask) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_fiber_mdio_read(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_fiber_mdio_write(tp, addr, PhyRegValue); ++} ++ ++static void ++rtl8125_fiber_clear_phy_bit(struct rtl8125_private *tp, u32 addr, u16 mask) ++{ ++ rtl8125_fiber_clear_and_set_phy_bit(tp, addr, mask, 0); ++} ++ ++static void ++rtl8125_fiber_set_phy_bit(struct rtl8125_private *tp, u32 addr, u16 mask) ++{ ++ rtl8125_fiber_clear_and_set_phy_bit(tp, addr, 0, mask); ++} ++ ++#define R8125_MAKE_C45_ADDR(_mmd, _addr) (_mmd << 16 | _addr) ++ ++static void ++rtl8125_fiber_phy_reset_8221d(struct rtl8125_private *tp) ++{ ++ u16 PhyRegValue; ++ u32 Timeout; ++ ++ rtl8125_fiber_set_phy_bit(tp, R8125_MAKE_C45_ADDR(0x01, 0x00), BIT_15); ++ ++ Timeout = 0; ++ do { ++ udelay(1000); ++ ++ PhyRegValue = rtl8125_fiber_mdio_read(tp, R8125_MAKE_C45_ADDR(0x01, 0x00)); ++ ++ Timeout++; ++ } while ((PhyRegValue & BIT_15) && (Timeout < 20)); ++} ++ ++static void ++rtl8125_fiber_phy_reset(struct rtl8125_private *tp) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ 
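++ /* RTL8221D companion fiber PHY: rtl8125_fiber_phy_reset_8221d() sets the
++  * reset bit (bit 15 of C45 device 1, register 0) over the bit-banged GPIO
++  * MDIO bus and polls it in 1 ms steps, giving up after about 20 ms. */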
rtl8125_fiber_phy_reset_8221d(tp); ++ break; ++ } ++} ++ ++static void ++rtl8125_fiber_set_rtl8221d_phy_mode(struct rtl8125_private *tp, u16 mode) ++{ ++ mode &= 0x3f; ++ ++ rtl8125_fiber_clear_phy_bit(tp, R8125_MAKE_C45_ADDR(30, 0x75F3), BIT_0); ++ rtl8125_fiber_clear_and_set_phy_bit(tp, ++ R8125_MAKE_C45_ADDR(30, 0x697A), ++ 0x003F, ++ mode); ++} ++ ++static void ++rtl8125_fiber_set_phy_mode(struct rtl8125_private *tp, u16 mode) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ rtl8125_fiber_set_rtl8221d_phy_mode(tp, mode); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_hw_rtl8221d_phy_config(struct rtl8125_private *tp) ++{ ++ rtl8125_fiber_reset_gpio_c45(tp); ++ ++ rtl8125_fiber_set_phy_mode(tp, (tp->speed == SPEED_2500) ? 0x02 : 0x04); ++ ++ ++ rtl8125_fiber_clear_phy_bit(tp, R8125_MAKE_C45_ADDR(0x07, 0x3C), (BIT_2 | BIT_1)); ++ rtl8125_fiber_clear_phy_bit(tp, R8125_MAKE_C45_ADDR(0x07, 0x3E), (BIT_1 | BIT_0)); ++ ++ ++ rtl8125_fiber_phy_reset(tp); ++} ++ ++void ++rtl8125_hw_fiber_phy_config(struct rtl8125_private *tp) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ rtl8125_hw_rtl8221d_phy_config(tp); ++ break; ++ default: ++ break; ++ } ++} ++ ++#define RTL8221D_PHY_ID_1 0x001C ++#define RTL8221D_PHY_ID_2 0xC849 ++static u32 ++rtl8125_fiber_get_connect_status_8221d(struct rtl8125_private *tp) ++{ ++ int i; ++ int const checkcnt = 4; ++ ++ rtl8125_fiber_reset_gpio_c45(tp); ++ ++ for (i = 0; i < checkcnt; i++) { ++ if (RTL8221D_PHY_ID_1 != rtl8125_fiber_mdio_read_gpio_c45(tp, R8125_MAKE_C45_ADDR(0x01, 0x02), 0) || ++ RTL8221D_PHY_ID_2 != rtl8125_fiber_mdio_read_gpio_c45(tp, R8125_MAKE_C45_ADDR(0x01, 0x03), 0)) ++ return FIBER_STAT_DISCONNECT; ++ } ++ ++ return FIBER_STAT_CONNECT_GPO_C45; ++} ++ ++static u32 ++rtl8125_fiber_get_connect_status(struct rtl8125_private *tp) ++{ ++ switch (tp->HwFiberModeVer) { ++ case FIBER_MODE_RTL8125D_RTL8221D: ++ return rtl8125_fiber_get_connect_status_8221d(tp); ++ default: ++ return FIBER_STAT_NOT_CHECKED; ++ } ++} ++ ++void ++rtl8125_check_fiber_mode_support(struct rtl8125_private *tp) ++{ ++ switch(tp->mcfg) { ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: { ++ u8 tmp = (u8)rtl8125_mac_ocp_read(tp, 0xD006); ++ if (tmp == 0x03) ++ tp->HwFiberModeVer = FIBER_MODE_RTL8125D_RTL8221D; ++ } ++ break; ++ } ++ ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ tp->HwFiberStat = rtl8125_fiber_get_connect_status(tp); ++} ++ ++unsigned int ++rtl8125_fiber_link_ok(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 status; ++ ++ switch (tp->HwFiberStat) { ++ case FIBER_STAT_CONNECT_GPO_C45: ++ status = rtl8125_fiber_mdio_read(tp, R8125_MAKE_C45_ADDR(30, 0x758D)); ++ if (status != USHRT_MAX && status & BIT_1) ++ return 1; ++ else ++ return 0; ++ break; ++ default: ++ return 0; ++ } ++} +diff --git a/drivers/net/ethernet/realtek/r8125_fiber.h b/drivers/net/ethernet/realtek/r8125_fiber.h +new file mode 100755 +index 000000000000..3a328574f291 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_fiber.h +@@ -0,0 +1,63 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. 
++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_R8125_FIBER_H ++#define _LINUX_R8125_FIBER_H ++ ++enum { ++ FIBER_MODE_NIC_ONLY = 0, ++ FIBER_MODE_RTL8125D_RTL8221D, ++ FIBER_MODE_MAX ++}; ++ ++enum { ++ FIBER_STAT_NOT_CHECKED = 0, ++ FIBER_STAT_DISCONNECT, ++ FIBER_STAT_CONNECT_GPO_C45, ++ FIBER_STAT_MAX ++}; ++ ++#define HW_FIBER_MODE_ENABLED(_M) ((_M)->HwFiberModeVer > 0) ++#define HW_FIBER_STATUS_CONNECTED(_M) (((_M)->HwFiberStat == FIBER_STAT_CONNECT_GPO_C45)) ++#define HW_FIBER_STATUS_DISCONNECTED(_M) ((_M)->HwFiberStat == FIBER_STAT_DISCONNECT) ++ ++struct rtl8125_private; ++ ++void rtl8125_hw_fiber_phy_config(struct rtl8125_private *tp); ++void rtl8125_check_fiber_mode_support(struct rtl8125_private *tp); ++void rtl8125_fiber_mdio_write( struct rtl8125_private *tp, u32 reg, u16 val); ++u16 rtl8125_fiber_mdio_read(struct rtl8125_private *tp, u32 reg); ++unsigned int rtl8125_fiber_link_ok(struct net_device *dev); ++ ++#endif /* _LINUX_R8125_FIBER_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_firmware.c b/drivers/net/ethernet/realtek/r8125_firmware.c +new file mode 100755 +index 000000000000..313c7d91b1c3 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_firmware.c +@@ -0,0 +1,264 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++ ++#include "r8125_firmware.h" ++ ++enum rtl_fw_opcode { ++ PHY_READ = 0x0, ++ PHY_DATA_OR = 0x1, ++ PHY_DATA_AND = 0x2, ++ PHY_BJMPN = 0x3, ++ PHY_MDIO_CHG = 0x4, ++ PHY_CLEAR_READCOUNT = 0x7, ++ PHY_WRITE = 0x8, ++ PHY_READCOUNT_EQ_SKIP = 0x9, ++ PHY_COMP_EQ_SKIPN = 0xa, ++ PHY_COMP_NEQ_SKIPN = 0xb, ++ PHY_WRITE_PREVIOUS = 0xc, ++ PHY_SKIPN = 0xd, ++ PHY_DELAY_MS = 0xe, ++}; ++ ++struct fw_info { ++ u32 magic; ++ char version[RTL8125_VER_SIZE]; ++ __le32 fw_start; ++ __le32 fw_len; ++ u8 chksum; ++} __packed; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,16,0) ++#define sizeof_field(TYPE, MEMBER) sizeof((((TYPE *)0)->MEMBER)) ++#endif ++#define FW_OPCODE_SIZE sizeof_field(struct rtl8125_fw_phy_action, code[0]) ++ ++static bool rtl8125_fw_format_ok(struct rtl8125_fw *rtl_fw) ++{ ++ const struct firmware *fw = rtl_fw->fw; ++ struct fw_info *fw_info = (struct fw_info *)fw->data; ++ struct rtl8125_fw_phy_action *pa = &rtl_fw->phy_action; ++ ++ if (fw->size < FW_OPCODE_SIZE) ++ return false; ++ ++ if (!fw_info->magic) { ++ size_t i, size, start; ++ u8 checksum = 0; ++ ++ if (fw->size < sizeof(*fw_info)) ++ return false; ++ ++ for (i = 0; i < fw->size; i++) ++ checksum += fw->data[i]; ++ if (checksum != 0) ++ return false; ++ ++ start = le32_to_cpu(fw_info->fw_start); ++ if (start > fw->size) ++ return false; ++ ++ size = le32_to_cpu(fw_info->fw_len); ++ if (size > (fw->size - start) / FW_OPCODE_SIZE) ++ return false; ++ ++ strscpy(rtl_fw->version, fw_info->version, RTL8125_VER_SIZE); ++ ++ pa->code = (__le32 *)(fw->data + start); ++ pa->size = size; ++ } else { ++ if (fw->size % FW_OPCODE_SIZE) ++ return false; ++ ++ strscpy(rtl_fw->version, rtl_fw->fw_name, RTL8125_VER_SIZE); ++ ++ pa->code = (__le32 *)fw->data; ++ pa->size = fw->size / FW_OPCODE_SIZE; ++ } ++ ++ return true; ++} ++ ++static bool rtl8125_fw_data_ok(struct rtl8125_fw *rtl_fw) ++{ ++ struct rtl8125_fw_phy_action *pa = &rtl_fw->phy_action; ++ size_t index; ++ ++ for (index = 0; index < pa->size; index++) { ++ u32 action = le32_to_cpu(pa->code[index]); ++ u32 val = action & 0x0000ffff; ++ u32 regno = (action & 0x0fff0000) >> 16; ++ ++ switch (action >> 28) { ++ case PHY_READ: ++ case PHY_DATA_OR: ++ case PHY_DATA_AND: ++ case PHY_CLEAR_READCOUNT: ++ case PHY_WRITE: ++ case PHY_WRITE_PREVIOUS: ++ case PHY_DELAY_MS: ++ break; ++ ++ case PHY_MDIO_CHG: ++ if (val > 1) ++ goto out; ++ break; ++ ++ case PHY_BJMPN: ++ if (regno > index) ++ goto out; ++ break; ++ case PHY_READCOUNT_EQ_SKIP: ++ if (index + 2 >= pa->size) ++ goto out; ++ break; ++ case PHY_COMP_EQ_SKIPN: ++ case PHY_COMP_NEQ_SKIPN: ++ case PHY_SKIPN: ++ if (index + 1 + regno >= pa->size) ++ goto out; ++ break; ++ ++ default: ++ dev_err(rtl_fw->dev, "Invalid action 0x%08x\n", action); ++ return false; ++ } ++ } ++ ++ return true; ++out: ++ dev_err(rtl_fw->dev, "Out of range of firmware\n"); ++ return false; ++} ++ ++void rtl8125_fw_write_firmware(struct rtl8125_private *tp, struct rtl8125_fw *rtl_fw) ++{ ++ struct rtl8125_fw_phy_action *pa = &rtl_fw->phy_action; ++ rtl8125_fw_write_t 
fw_write = rtl_fw->phy_write; ++ rtl8125_fw_read_t fw_read = rtl_fw->phy_read; ++ int predata = 0, count = 0; ++ size_t index; ++ ++ for (index = 0; index < pa->size; index++) { ++ u32 action = le32_to_cpu(pa->code[index]); ++ u32 data = action & 0x0000ffff; ++ u32 regno = (action & 0x0fff0000) >> 16; ++ enum rtl_fw_opcode opcode = action >> 28; ++ ++ if (!action) ++ break; ++ ++ switch (opcode) { ++ case PHY_READ: ++ predata = fw_read(tp, regno); ++ count++; ++ break; ++ case PHY_DATA_OR: ++ predata |= data; ++ break; ++ case PHY_DATA_AND: ++ predata &= data; ++ break; ++ case PHY_BJMPN: ++ index -= (regno + 1); ++ break; ++ case PHY_MDIO_CHG: ++ if (data) { ++ fw_write = rtl_fw->mac_mcu_write; ++ fw_read = rtl_fw->mac_mcu_read; ++ } else { ++ fw_write = rtl_fw->phy_write; ++ fw_read = rtl_fw->phy_read; ++ } ++ ++ break; ++ case PHY_CLEAR_READCOUNT: ++ count = 0; ++ break; ++ case PHY_WRITE: ++ fw_write(tp, regno, data); ++ break; ++ case PHY_READCOUNT_EQ_SKIP: ++ if (count == data) ++ index++; ++ break; ++ case PHY_COMP_EQ_SKIPN: ++ if (predata == data) ++ index += regno; ++ break; ++ case PHY_COMP_NEQ_SKIPN: ++ if (predata != data) ++ index += regno; ++ break; ++ case PHY_WRITE_PREVIOUS: ++ fw_write(tp, regno, predata); ++ break; ++ case PHY_SKIPN: ++ index += regno; ++ break; ++ case PHY_DELAY_MS: ++ mdelay(1 * data); ++ break; ++ } ++ } ++} ++ ++void rtl8125_fw_release_firmware(struct rtl8125_fw *rtl_fw) ++{ ++ release_firmware(rtl_fw->fw); ++} ++ ++int rtl8125_fw_request_firmware(struct rtl8125_fw *rtl_fw) ++{ ++ int rc; ++ ++ rc = request_firmware(&rtl_fw->fw, rtl_fw->fw_name, rtl_fw->dev); ++ if (rc < 0) ++ goto out; ++ ++ if (!rtl8125_fw_format_ok(rtl_fw) || !rtl8125_fw_data_ok(rtl_fw)) { ++ release_firmware(rtl_fw->fw); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ return 0; ++out: ++ dev_err(rtl_fw->dev, "Unable to load firmware %s (%d)\n", ++ rtl_fw->fw_name, rc); ++ return rc; ++} +diff --git a/drivers/net/ethernet/realtek/r8125_firmware.h b/drivers/net/ethernet/realtek/r8125_firmware.h +new file mode 100755 +index 000000000000..540c1d22f281 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_firmware.h +@@ -0,0 +1,68 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_rtl8125_FIRMWARE_H ++#define _LINUX_rtl8125_FIRMWARE_H ++ ++#include ++#include ++ ++struct rtl8125_private; ++typedef void (*rtl8125_fw_write_t)(struct rtl8125_private *tp, u16 reg, u16 val); ++typedef u32 (*rtl8125_fw_read_t)(struct rtl8125_private *tp, u16 reg); ++ ++#define RTL8125_VER_SIZE 32 ++ ++struct rtl8125_fw { ++ rtl8125_fw_write_t phy_write; ++ rtl8125_fw_read_t phy_read; ++ rtl8125_fw_write_t mac_mcu_write; ++ rtl8125_fw_read_t mac_mcu_read; ++ const struct firmware *fw; ++ const char *fw_name; ++ struct device *dev; ++ ++ char version[RTL8125_VER_SIZE]; ++ ++ struct rtl8125_fw_phy_action { ++ __le32 *code; ++ size_t size; ++ } phy_action; ++}; ++ ++int rtl8125_fw_request_firmware(struct rtl8125_fw *rtl_fw); ++void rtl8125_fw_release_firmware(struct rtl8125_fw *rtl_fw); ++void rtl8125_fw_write_firmware(struct rtl8125_private *tp, struct rtl8125_fw *rtl_fw); ++ ++#endif /* _LINUX_rtl8125_FIRMWARE_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_n.c b/drivers/net/ethernet/realtek/r8125_n.c +new file mode 100755 +index 000000000000..3d775975bfc4 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_n.c +@@ -0,0 +1,21312 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++/* ++ * This driver is modified from r8169.c in Linux kernel 2.6.18 ++ */ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++#include ++#include ++#endif ++#include ++#include ++#include ++#include ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,4,0) ++#include ++#endif ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++#include ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++#define dev_printk(A,B,fmt,args...) printk(A fmt,##args) ++#else ++#include ++#include ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,31) ++#include ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,4,10) ++#include ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,4,10) */ ++ ++#include ++#include ++ ++#include "r8125.h" ++#include "rtl_eeprom.h" ++#include "rtltool.h" ++#include "r8125_firmware.h" ++ ++#ifdef ENABLE_R8125_PROCFS ++#include ++#include ++#endif ++ ++#define FIRMWARE_8125A_3 "rtl_nic/rtl8125a-3.fw" ++#define FIRMWARE_8125B_2 "rtl_nic/rtl8125b-2.fw" ++#define FIRMWARE_8125BP_1 "rtl_nic/rtl8125bp-1.fw" ++#define FIRMWARE_8125BP_2 "rtl_nic/rtl8125bp-2.fw" ++#define FIRMWARE_8125D_1 "rtl_nic/rtl8125d-1.fw" ++#define FIRMWARE_8125D_2 "rtl_nic/rtl8125d-2.fw" ++#define FIRMWARE_8125CP_1 "rtl_nic/rtl8125cp-1.fw" ++ ++static const struct { ++ const char *name; ++ const char *fw_name; ++} rtl_chip_fw_infos[] = { ++ /* PCI-E devices. */ ++ [CFG_METHOD_2] = {"RTL8125A" }, ++ [CFG_METHOD_3] = {"RTL8125A", FIRMWARE_8125A_3}, ++ [CFG_METHOD_4] = {"RTL8125B", }, ++ [CFG_METHOD_5] = {"RTL8125B", FIRMWARE_8125B_2}, ++ [CFG_METHOD_6] = {"RTL8168KB", FIRMWARE_8125A_3}, ++ [CFG_METHOD_7] = {"RTL8168KB", FIRMWARE_8125B_2}, ++ [CFG_METHOD_8] = {"RTL8125BP", FIRMWARE_8125BP_1}, ++ [CFG_METHOD_9] = {"RTL8125BP", FIRMWARE_8125BP_2}, ++ [CFG_METHOD_10] = {"RTL8125D", FIRMWARE_8125D_1}, ++ [CFG_METHOD_11] = {"RTL8125D", FIRMWARE_8125D_2}, ++ [CFG_METHOD_12] = {"RTL8125CP", FIRMWARE_8125CP_1}, ++ [CFG_METHOD_13] = {"RTL8168KD", FIRMWARE_8125D_2}, ++ [CFG_METHOD_DEFAULT] = {"Unknown", }, ++}; ++ ++#define _R(NAME,MAC,RCR,MASK,JumFrameSz) \ ++ { .name = NAME, .mcfg = MAC, .RCR_Cfg = RCR, .RxConfigMask = MASK, .jumbo_frame_sz = JumFrameSz } ++ ++static const struct { ++ const char *name; ++ u8 mcfg; ++ u32 RCR_Cfg; ++ u32 RxConfigMask; /* Clears the bits supported by this chip */ ++ u32 jumbo_frame_sz; ++} rtl_chip_info[] = { ++ _R("RTL8125A", ++ CFG_METHOD_2, ++ Rx_Fetch_Number_8 | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125A", ++ CFG_METHOD_3, ++ Rx_Fetch_Number_8 | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125B", ++ CFG_METHOD_4, ++ Rx_Fetch_Number_8 | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125B", ++ CFG_METHOD_5, ++ Rx_Fetch_Number_8 | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8168KB", ++ CFG_METHOD_6, ++ Rx_Fetch_Number_8 | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ 
_R("RTL8168KB", ++ CFG_METHOD_7, ++ Rx_Fetch_Number_8 | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125BP", ++ CFG_METHOD_8, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125BP", ++ CFG_METHOD_9, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125D", ++ CFG_METHOD_10, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125D", ++ CFG_METHOD_11, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8125CP", ++ CFG_METHOD_12, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("RTL8168KD", ++ CFG_METHOD_13, ++ Rx_Fetch_Number_8 | Rx_Close_Multiple | RxCfg_pause_slot_en | EnableInnerVlan | EnableOuterVlan | (RX_DMA_BURST_256 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_9k), ++ ++ _R("Unknown", ++ CFG_METHOD_DEFAULT, ++ (RX_DMA_BURST_512 << RxCfgDMAShift), ++ 0xff7e5880, ++ Jumbo_Frame_1k) ++}; ++#undef _R ++ ++ ++#ifndef PCI_VENDOR_ID_DLINK ++#define PCI_VENDOR_ID_DLINK 0x1186 ++#endif ++ ++static struct pci_device_id rtl8125_pci_tbl[] = { ++ { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8125), }, ++ { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x8162), }, ++ { PCI_DEVICE(PCI_VENDOR_ID_REALTEK, 0x3000), }, ++ {0,}, ++}; ++ ++MODULE_DEVICE_TABLE(pci, rtl8125_pci_tbl); ++ ++static int use_dac = 1; ++static int timer_count = 0x2600; ++static int timer_count_v2 = (0x2600 / 0x100); ++ ++static struct { ++ u32 msg_enable; ++} debug = { -1 }; ++ ++static unsigned int speed_mode = SPEED_2500; ++static unsigned int duplex_mode = DUPLEX_FULL; ++static unsigned int autoneg_mode = AUTONEG_ENABLE; ++#ifdef CONFIG_ASPM ++static int aspm = 1; ++#else ++static int aspm = 0; ++#endif ++#ifdef ENABLE_S5WOL ++static int s5wol = 1; ++#else ++static int s5wol = 0; ++#endif ++#ifdef ENABLE_S5_KEEP_CURR_MAC ++static int s5_keep_curr_mac = 1; ++#else ++static int s5_keep_curr_mac = 0; ++#endif ++#ifdef ENABLE_EEE ++static int eee_enable = 1; ++#else ++static int eee_enable = 0; ++#endif ++#ifdef CONFIG_SOC_LAN ++static ulong hwoptimize = HW_PATCH_SOC_LAN; ++#else ++static ulong hwoptimize = 0; ++#endif ++#ifdef ENABLE_S0_MAGIC_PACKET ++static int s0_magic_packet = 1; ++#else ++static int s0_magic_packet = 0; ++#endif ++#ifdef ENABLE_TX_NO_CLOSE ++static int tx_no_close_enable = 1; ++#else ++static int tx_no_close_enable = 0; ++#endif ++#ifdef ENABLE_PTP_MASTER_MODE ++static int enable_ptp_master_mode = 1; ++#else ++static int enable_ptp_master_mode = 0; ++#endif ++#ifdef DISABLE_WOL_SUPPORT ++static int disable_wol_support = 1; ++#else ++static int disable_wol_support = 0; ++#endif ++#ifdef ENABLE_DOUBLE_VLAN ++static int enable_double_vlan = 1; ++#else ++static int enable_double_vlan = 0; ++#endif ++#ifdef ENABLE_GIGA_LITE ++static int eee_giga_lite = 1; ++#else ++static int eee_giga_lite = 0; ++#endif ++ ++MODULE_AUTHOR("Realtek and the Linux r8125 crew "); 
++MODULE_DESCRIPTION("Realtek r8125 Ethernet controller driver"); ++ ++module_param(speed_mode, uint, 0); ++MODULE_PARM_DESC(speed_mode, "force phy operation. Deprecated by ethtool (8)."); ++ ++module_param(duplex_mode, uint, 0); ++MODULE_PARM_DESC(duplex_mode, "force phy operation. Deprecated by ethtool (8)."); ++ ++module_param(autoneg_mode, uint, 0); ++MODULE_PARM_DESC(autoneg_mode, "force phy operation. Deprecated by ethtool (8)."); ++ ++module_param(aspm, int, 0); ++MODULE_PARM_DESC(aspm, "Enable ASPM."); ++ ++module_param(s5wol, int, 0); ++MODULE_PARM_DESC(s5wol, "Enable Shutdown Wake On Lan."); ++ ++module_param(s5_keep_curr_mac, int, 0); ++MODULE_PARM_DESC(s5_keep_curr_mac, "Enable Shutdown Keep Current MAC Address."); ++ ++module_param(use_dac, int, 0); ++MODULE_PARM_DESC(use_dac, "Enable PCI DAC. Unsafe on 32 bit PCI slot."); ++ ++module_param(timer_count, int, 0); ++MODULE_PARM_DESC(timer_count, "Timer Interrupt Interval."); ++ ++module_param(eee_enable, int, 0); ++MODULE_PARM_DESC(eee_enable, "Enable Energy Efficient Ethernet."); ++ ++module_param(hwoptimize, ulong, 0); ++MODULE_PARM_DESC(hwoptimize, "Enable HW optimization function."); ++ ++module_param(s0_magic_packet, int, 0); ++MODULE_PARM_DESC(s0_magic_packet, "Enable S0 Magic Packet."); ++ ++module_param(tx_no_close_enable, int, 0); ++MODULE_PARM_DESC(tx_no_close_enable, "Enable TX No Close."); ++ ++module_param(enable_ptp_master_mode, int, 0); ++MODULE_PARM_DESC(enable_ptp_master_mode, "Enable PTP Master Mode."); ++ ++module_param(disable_wol_support, int, 0); ++MODULE_PARM_DESC(disable_wol_support, "Disable PM support."); ++ ++module_param(enable_double_vlan, int, 0); ++MODULE_PARM_DESC(enable_double_vlan, "Enable Double VLAN."); ++ ++module_param(eee_giga_lite, int, 0); ++MODULE_PARM_DESC(eee_giga_lite, "Enable Giga Lite."); ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++module_param_named(debug, debug.msg_enable, int, 0); ++MODULE_PARM_DESC(debug, "Debug verbosity level (0=none, ..., 16=all)"); ++#endif//LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++MODULE_LICENSE("GPL"); ++#ifdef ENABLE_USE_FIRMWARE_FILE ++MODULE_FIRMWARE(FIRMWARE_8125A_3); ++MODULE_FIRMWARE(FIRMWARE_8125B_2); ++MODULE_FIRMWARE(FIRMWARE_8125BP_1); ++MODULE_FIRMWARE(FIRMWARE_8125BP_2); ++MODULE_FIRMWARE(FIRMWARE_8125D_1); ++MODULE_FIRMWARE(FIRMWARE_8125D_2); ++MODULE_FIRMWARE(FIRMWARE_8125CP_1); ++#endif ++ ++MODULE_VERSION(RTL8125_VERSION); ++ ++/* ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++static void rtl8125_esd_timer(unsigned long __opaque); ++#else ++static void rtl8125_esd_timer(struct timer_list *t); ++#endif ++*/ ++/* ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++static void rtl8125_link_timer(unsigned long __opaque); ++#else ++static void rtl8125_link_timer(struct timer_list *t); ++#endif ++*/ ++ ++static netdev_tx_t rtl8125_start_xmit(struct sk_buff *skb, struct net_device *dev); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance, struct pt_regs *regs); ++#else ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance); ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance, struct pt_regs *regs); ++#else ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance); ++#endif ++static void rtl8125_set_rx_mode(struct net_device *dev); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static void rtl8125_tx_timeout(struct net_device *dev, unsigned int 
txqueue); ++#else ++static void rtl8125_tx_timeout(struct net_device *dev); ++#endif ++static int rtl8125_rx_interrupt(struct net_device *, struct rtl8125_private *, struct rtl8125_rx_ring *, napi_budget); ++static int rtl8125_tx_interrupt(struct rtl8125_tx_ring *ring, int budget); ++static int rtl8125_tx_interrupt_with_vector(struct rtl8125_private *tp, const int message_id, int budget); ++static void rtl8125_wait_for_quiescence(struct net_device *dev); ++static int rtl8125_change_mtu(struct net_device *dev, int new_mtu); ++static void rtl8125_down(struct net_device *dev); ++ ++static int rtl8125_set_mac_address(struct net_device *dev, void *p); ++static void rtl8125_rar_set(struct rtl8125_private *tp, const u8 *addr); ++static void rtl8125_desc_addr_fill(struct rtl8125_private *); ++static void rtl8125_tx_desc_init(struct rtl8125_private *tp); ++static void rtl8125_rx_desc_init(struct rtl8125_private *tp); ++ ++static u16 rtl8125_get_hw_phy_mcu_code_ver(struct rtl8125_private *tp); ++static void rtl8125_phy_power_up(struct net_device *dev); ++static void rtl8125_phy_power_down(struct net_device *dev); ++static int rtl8125_set_speed(struct net_device *dev, u8 autoneg, u32 speed, u8 duplex, u64 adv); ++static bool rtl8125_set_phy_mcu_patch_request(struct rtl8125_private *tp); ++static bool rtl8125_clear_phy_mcu_patch_request(struct rtl8125_private *tp); ++ ++#ifdef CONFIG_R8125_NAPI ++static int rtl8125_poll(napi_ptr napi, napi_budget budget); ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_reset_task(void *_data); ++static void rtl8125_esd_task(void *_data); ++static void rtl8125_linkchg_task(void *_data); ++static void rtl8125_link_task(void *_data); ++static void rtl8125_dash_task(void *_data); ++#else ++static void rtl8125_reset_task(struct work_struct *work); ++static void rtl8125_esd_task(struct work_struct *work); ++static void rtl8125_linkchg_task(struct work_struct *work); ++static void rtl8125_link_task(struct work_struct *work); ++static void rtl8125_dash_task(struct work_struct *work); ++#endif ++static void rtl8125_schedule_reset_work(struct rtl8125_private *tp); ++static void rtl8125_schedule_esd_work(struct rtl8125_private *tp); ++static void rtl8125_schedule_linkchg_work(struct rtl8125_private *tp); ++static void rtl8125_schedule_link_work(struct rtl8125_private *tp); ++void rtl8125_schedule_dash_work(struct rtl8125_private *tp); ++static void rtl8125_init_all_schedule_work(struct rtl8125_private *tp); ++static void rtl8125_cancel_all_schedule_work(struct rtl8125_private *tp); ++ ++static inline struct device *tp_to_dev(struct rtl8125_private *tp) ++{ ++ return &tp->pci_dev->dev; ++} ++ ++#if ((LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) && \ ++ LINUX_VERSION_CODE >= KERNEL_VERSION(4,6,00))) ++void ethtool_convert_legacy_u32_to_link_mode(unsigned long *dst, ++ u32 legacy_u32) ++{ ++ bitmap_zero(dst, __ETHTOOL_LINK_MODE_MASK_NBITS); ++ dst[0] = legacy_u32; ++} ++ ++bool ethtool_convert_link_mode_to_legacy_u32(u32 *legacy_u32, ++ const unsigned long *src) ++{ ++ bool retval = true; ++ ++ /* TODO: following test will soon always be true */ ++ if (__ETHTOOL_LINK_MODE_MASK_NBITS > 32) { ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(ext); ++ ++ bitmap_zero(ext, __ETHTOOL_LINK_MODE_MASK_NBITS); ++ bitmap_fill(ext, 32); ++ bitmap_complement(ext, ext, __ETHTOOL_LINK_MODE_MASK_NBITS); ++ if (bitmap_intersects(ext, src, ++ __ETHTOOL_LINK_MODE_MASK_NBITS)) { ++ /* src mask goes beyond bit 31 */ ++ retval = false; ++ } ++ } ++ *legacy_u32 = src[0]; ++ return 
retval; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ ++#ifndef LPA_1000FULL ++#define LPA_1000FULL 0x0800 ++#endif ++ ++#ifndef LPA_1000HALF ++#define LPA_1000HALF 0x0400 ++#endif ++ ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,4,0) ++static inline void eth_hw_addr_random(struct net_device *dev) ++{ ++ random_ether_addr(dev->dev_addr); ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++#undef ethtool_ops ++#define ethtool_ops _kc_ethtool_ops ++ ++struct _kc_ethtool_ops { ++ int (*get_settings)(struct net_device *, struct ethtool_cmd *); ++ int (*set_settings)(struct net_device *, struct ethtool_cmd *); ++ void (*get_drvinfo)(struct net_device *, struct ethtool_drvinfo *); ++ int (*get_regs_len)(struct net_device *); ++ void (*get_regs)(struct net_device *, struct ethtool_regs *, void *); ++ void (*get_wol)(struct net_device *, struct ethtool_wolinfo *); ++ int (*set_wol)(struct net_device *, struct ethtool_wolinfo *); ++ u32 (*get_msglevel)(struct net_device *); ++ void (*set_msglevel)(struct net_device *, u32); ++ int (*nway_reset)(struct net_device *); ++ u32 (*get_link)(struct net_device *); ++ int (*get_eeprom_len)(struct net_device *); ++ int (*get_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); ++ int (*set_eeprom)(struct net_device *, struct ethtool_eeprom *, u8 *); ++ int (*get_coalesce)(struct net_device *, struct ethtool_coalesce *); ++ int (*set_coalesce)(struct net_device *, struct ethtool_coalesce *); ++ void (*get_ringparam)(struct net_device *, struct ethtool_ringparam *); ++ int (*set_ringparam)(struct net_device *, struct ethtool_ringparam *); ++ void (*get_pauseparam)(struct net_device *, ++ struct ethtool_pauseparam*); ++ int (*set_pauseparam)(struct net_device *, ++ struct ethtool_pauseparam*); ++ u32 (*get_rx_csum)(struct net_device *); ++ int (*set_rx_csum)(struct net_device *, u32); ++ u32 (*get_tx_csum)(struct net_device *); ++ int (*set_tx_csum)(struct net_device *, u32); ++ u32 (*get_sg)(struct net_device *); ++ int (*set_sg)(struct net_device *, u32); ++ u32 (*get_tso)(struct net_device *); ++ int (*set_tso)(struct net_device *, u32); ++ int (*self_test_count)(struct net_device *); ++ void (*self_test)(struct net_device *, struct ethtool_test *, u64 *); ++ void (*get_strings)(struct net_device *, u32 stringset, u8 *); ++ int (*phys_id)(struct net_device *, u32); ++ int (*get_stats_count)(struct net_device *); ++ void (*get_ethtool_stats)(struct net_device *, struct ethtool_stats *, ++ u64 *); ++} *ethtool_ops = NULL; ++ ++#undef SET_ETHTOOL_OPS ++#define SET_ETHTOOL_OPS(netdev, ops) (ethtool_ops = (ops)) ++ ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0) ++#ifndef SET_ETHTOOL_OPS ++#define SET_ETHTOOL_OPS(netdev,ops) \ ++ ((netdev)->ethtool_ops = (ops)) ++#endif //SET_ETHTOOL_OPS ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,16,0) ++ ++//#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5) ++#ifndef netif_msg_init ++#define netif_msg_init _kc_netif_msg_init ++/* copied from linux kernel 2.6.20 include/linux/netdevice.h */ ++static inline u32 netif_msg_init(int debug_value, int default_msg_enable_bits) ++{ ++ /* use default */ ++ if (debug_value < 0 || debug_value >= (sizeof(u32) * 8)) ++ return default_msg_enable_bits; ++ if (debug_value == 0) /* no output */ ++ return 0; ++ /* set low N bits */ ++ return (1 << debug_value) - 1; ++} ++ ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,5) ++ ++#if 
LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) ++static inline void eth_copy_and_sum (struct sk_buff *dest, ++ const unsigned char *src, ++ int len, int base) ++{ ++ skb_copy_to_linear_data(dest, src, len); ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,22) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++/* copied from linux kernel 2.6.20 /include/linux/time.h */ ++/* Parameters used to convert the timespec values: */ ++#define MSEC_PER_SEC 1000L ++ ++/* copied from linux kernel 2.6.20 /include/linux/jiffies.h */ ++/* ++ * Change timeval to jiffies, trying to avoid the ++ * most obvious overflows.. ++ * ++ * And some not so obvious. ++ * ++ * Note that we don't want to return MAX_LONG, because ++ * for various timeout reasons we often end up having ++ * to wait "jiffies+1" in order to guarantee that we wait ++ * at _least_ "jiffies" - so "jiffies+1" had better still ++ * be positive. ++ */ ++#define MAX_JIFFY_OFFSET ((~0UL >> 1)-1) ++ ++/* ++ * Convert jiffies to milliseconds and back. ++ * ++ * Avoid unnecessary multiplications/divisions in the ++ * two most common HZ cases: ++ */ ++static inline unsigned int _kc_jiffies_to_msecs(const unsigned long j) ++{ ++#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) ++ return (MSEC_PER_SEC / HZ) * j; ++#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) ++ return (j + (HZ / MSEC_PER_SEC) - 1)/(HZ / MSEC_PER_SEC); ++#else ++ return (j * MSEC_PER_SEC) / HZ; ++#endif ++} ++ ++static inline unsigned long _kc_msecs_to_jiffies(const unsigned int m) ++{ ++ if (m > _kc_jiffies_to_msecs(MAX_JIFFY_OFFSET)) ++ return MAX_JIFFY_OFFSET; ++#if HZ <= MSEC_PER_SEC && !(MSEC_PER_SEC % HZ) ++ return (m + (MSEC_PER_SEC / HZ) - 1) / (MSEC_PER_SEC / HZ); ++#elif HZ > MSEC_PER_SEC && !(HZ % MSEC_PER_SEC) ++ return m * (HZ / MSEC_PER_SEC); ++#else ++ return (m * HZ + MSEC_PER_SEC - 1) / MSEC_PER_SEC; ++#endif ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) ++ ++/* copied from linux kernel 2.6.12.6 /include/linux/pm.h */ ++typedef int __bitwise pci_power_t; ++ ++/* copied from linux kernel 2.6.12.6 /include/linux/pci.h */ ++typedef u32 __bitwise pm_message_t; ++ ++#define PCI_D0 ((pci_power_t __force) 0) ++#define PCI_D1 ((pci_power_t __force) 1) ++#define PCI_D2 ((pci_power_t __force) 2) ++#define PCI_D3hot ((pci_power_t __force) 3) ++#define PCI_D3cold ((pci_power_t __force) 4) ++#define PCI_POWER_ERROR ((pci_power_t __force) -1) ++ ++/* copied from linux kernel 2.6.12.6 /drivers/pci/pci.c */ ++/** ++ * pci_choose_state - Choose the power state of a PCI device ++ * @dev: PCI device to be suspended ++ * @state: target sleep state for the whole system. This is the value ++ * that is passed to suspend() function. ++ * ++ * Returns PCI power state suitable for given device and given system ++ * message. 
++ */ ++ ++pci_power_t pci_choose_state(struct pci_dev *dev, pm_message_t state) ++{ ++ if (!pci_find_capability(dev, PCI_CAP_ID_PM)) ++ return PCI_D0; ++ ++ switch (state) { ++ case 0: ++ return PCI_D0; ++ case 3: ++ return PCI_D3hot; ++ default: ++ printk("They asked me for state %d\n", state); ++// BUG(); ++ } ++ return PCI_D0; ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++/** ++ * msleep_interruptible - sleep waiting for waitqueue interruptions ++ * @msecs: Time in milliseconds to sleep for ++ */ ++#define msleep_interruptible _kc_msleep_interruptible ++unsigned long _kc_msleep_interruptible(unsigned int msecs) ++{ ++ unsigned long timeout = _kc_msecs_to_jiffies(msecs); ++ ++ while (timeout && !signal_pending(current)) { ++ set_current_state(TASK_INTERRUPTIBLE); ++ timeout = schedule_timeout(timeout); ++ } ++ return _kc_jiffies_to_msecs(timeout); ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,9) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++/* copied from linux kernel 2.6.20 include/linux/sched.h */ ++#ifndef __sched ++#define __sched __attribute__((__section__(".sched.text"))) ++#endif ++ ++/* copied from linux kernel 2.6.20 kernel/timer.c */ ++signed long __sched schedule_timeout_uninterruptible(signed long timeout) ++{ ++ __set_current_state(TASK_UNINTERRUPTIBLE); ++ return schedule_timeout(timeout); ++} ++ ++/* copied from linux kernel 2.6.20 include/linux/mii.h */ ++#undef if_mii ++#define if_mii _kc_if_mii ++static inline struct mii_ioctl_data *if_mii(struct ifreq *rq) ++{ ++ return (struct mii_ioctl_data *) &rq->ifr_ifru; ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,7) ++ ++static u16 _rtl8125_read_thermal_sensor(struct rtl8125_private *tp) ++{ ++ u16 ts_digout; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ ts_digout = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD84); ++ ts_digout &= 0x3ff; ++ break; ++ default: ++ ts_digout = 0xffff; ++ break; ++ } ++ ++ return ts_digout; ++} ++ ++static int rtl8125_read_thermal_sensor(struct rtl8125_private *tp) ++{ ++ int tmp; ++ ++ tmp = _rtl8125_read_thermal_sensor(tp); ++ if (tmp > 512) ++ return (0 - ((512 - (tmp - 512)) / 2)); ++ else ++ return (tmp / 2); ++} ++ ++int rtl8125_dump_tally_counter(struct rtl8125_private *tp, dma_addr_t paddr) ++{ ++ u32 cmd; ++ u32 WaitCnt; ++ int retval = -1; ++ ++ RTL_W32(tp, CounterAddrHigh, (u64)paddr >> 32); ++ cmd = (u64)paddr & DMA_BIT_MASK(32); ++ RTL_W32(tp, CounterAddrLow, cmd); ++ RTL_W32(tp, CounterAddrLow, cmd | CounterDump); ++ ++ WaitCnt = 0; ++ while (RTL_R32(tp, CounterAddrLow) & CounterDump) { ++ udelay(10); ++ ++ WaitCnt++; ++ if (WaitCnt > 20) ++ break; ++ } ++ ++ if (WaitCnt <= 20) ++ retval = 0; ++ ++ return retval; ++} ++ ++static u32 ++rtl8125_get_hw_clo_ptr(struct rtl8125_tx_ring *ring) ++{ ++ struct rtl8125_private *tp = ring->priv; ++ ++ if (!tp) ++ return 0; ++ ++ switch (tp->HwSuppTxNoCloseVer) { ++ case 3: ++ return RTL_R16(tp, ring->hw_clo_ptr_reg); ++ case 4: ++ case 5: ++ case 6: ++ return RTL_R32(tp, ring->hw_clo_ptr_reg); ++ default: ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ WARN_ON(1); ++#endif ++ return 0; ++ } ++} ++ ++static u32 ++rtl8125_get_sw_tail_ptr(struct rtl8125_tx_ring *ring) ++{ ++ struct rtl8125_private *tp = ring->priv; ++ ++ if (!tp) ++ return 0; ++ ++ switch 
(tp->HwSuppTxNoCloseVer) { ++ case 3: ++ return RTL_R16(tp, ring->sw_tail_ptr_reg); ++ case 4: ++ case 5: ++ case 6: ++ return RTL_R32(tp, ring->sw_tail_ptr_reg); ++ default: ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ WARN_ON(1); ++#endif ++ return 0; ++ } ++} ++ ++static u32 ++rtl8125_get_phy_status(struct rtl8125_private *tp) ++{ ++ return RTL_R32(tp, PHYstatus); ++} ++ ++static bool ++rtl8125_sysfs_testmode_on(struct rtl8125_private *tp) ++{ ++#ifdef ENABLE_R8125_SYSFS ++ return !!tp->testmode; ++#else ++ return 1; ++#endif ++} ++ ++static u32 rtl8125_convert_link_speed(u32 status) ++{ ++ u32 speed = SPEED_UNKNOWN; ++ ++ if (status & LinkStatus) { ++ if (status & _2500bpsF) ++ speed = SPEED_2500; ++ else if (status & (_1000bpsF | _2500bpsL | _1000bpsL)) ++ speed = SPEED_1000; ++ else if (status & _100bps) ++ speed = SPEED_100; ++ else if (status & _10bps) ++ speed = SPEED_10; ++ } ++ ++ return speed; ++} ++ ++static void rtl8125_mdi_swap(struct rtl8125_private *tp) ++{ ++ int i; ++ u16 reg, val, mdi_reverse; ++ u16 tps_p0, tps_p1, tps_p2, tps_p3, tps_p3_p0; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ reg = 0x8284; ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ reg = 0x81aa; ++ break; ++ default: ++ return; ++ }; ++ ++ tps_p3_p0 = rtl8125_mac_ocp_read(tp, 0xD440) & 0xF000; ++ tps_p3 = !!(tps_p3_p0 & BIT_15); ++ tps_p2 = !!(tps_p3_p0 & BIT_14); ++ tps_p1 = !!(tps_p3_p0 & BIT_13); ++ tps_p0 = !!(tps_p3_p0 & BIT_12); ++ mdi_reverse = rtl8125_mac_ocp_read(tp, 0xD442); ++ ++ if ((mdi_reverse & BIT_5) && tps_p3_p0 == 0xA000) ++ return; ++ ++ if (!(mdi_reverse & BIT_5)) ++ val = tps_p0 << 8 | ++ tps_p1 << 9 | ++ tps_p2 << 10 | ++ tps_p3 << 11; ++ else ++ val = tps_p3 << 8 | ++ tps_p2 << 9 | ++ tps_p1 << 10 | ++ tps_p0 << 11; ++ ++ for (i=8; i<12; i++) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, reg); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT(i), ++ val & BIT(i)); ++ } ++} ++ ++static int _rtl8125_vcd_test(struct rtl8125_private *tp) ++{ ++ u16 val; ++ u32 wait_cnt; ++ int ret = -1; ++ ++ rtl8125_mdi_swap(tp); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA422, BIT(0)); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA422, 0x00F0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA422, BIT(0)); ++ ++ wait_cnt = 0; ++ do { ++ mdelay(1); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA422); ++ wait_cnt++; ++ } while (!(val & BIT_15) && (wait_cnt < 5000)); ++ ++ if (wait_cnt == 5000) ++ goto exit; ++ ++ ret = 0; ++ ++exit: ++ return ret; ++} ++ ++static int rtl8125_vcd_test(struct rtl8125_private *tp, bool poe_mode) ++{ ++ int ret; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ /* update rtct threshold for poe mode */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, poe_mode ? 0x0A44 : 0x0000); ++ ++ /* enable rtct poe mode */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, poe_mode ? 
0x0100 : 0x0000); ++ ++ ret = _rtl8125_vcd_test(tp); ++ ++ /* disable rtct poe mode */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ ++ /* restore rtct threshold */ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ break; ++ default: ++ ret = _rtl8125_vcd_test(tp); ++ break; ++ } ++ ++ return ret; ++} ++ ++static void rtl8125_get_cp_len(struct rtl8125_private *tp, ++ int cp_len[RTL8125_CP_NUM]) ++{ ++ int i; ++ u32 status; ++ int tmp_cp_len; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) { ++ if (status & _10bps) { ++ tmp_cp_len = -1; ++ } else if (status & (_100bps | _1000bpsF)) { ++ rtl8125_mdio_write(tp, 0x1f, 0x0a88); ++ tmp_cp_len = rtl8125_mdio_read(tp, 0x10); ++ } else if (status & _2500bpsF) { ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_mdio_write(tp, 0x1f, 0x0ac5); ++ tmp_cp_len = rtl8125_mdio_read(tp, 0x14); ++ tmp_cp_len >>= 4; ++ break; ++ default: ++ rtl8125_mdio_write(tp, 0x1f, 0x0acb); ++ tmp_cp_len = rtl8125_mdio_read(tp, 0x15); ++ tmp_cp_len >>= 2; ++ break; ++ } ++ } else ++ tmp_cp_len = 0; ++ } else ++ tmp_cp_len = 0; ++ ++ if (tmp_cp_len > 0) ++ tmp_cp_len &= 0xff; ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ cp_len[i] = tmp_cp_len; ++ ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ if (cp_len[i] > RTL8125_MAX_SUPPORT_CP_LEN) ++ cp_len[i] = RTL8125_MAX_SUPPORT_CP_LEN; ++ ++ return; ++} ++ ++static int __rtl8125_get_cp_status(u16 val) ++{ ++ switch (val) { ++ case 0x0060: ++ return rtl8125_cp_normal; ++ case 0x0048: ++ return rtl8125_cp_open; ++ case 0x0050: ++ return rtl8125_cp_short; ++ case 0x0042: ++ case 0x0044: ++ return rtl8125_cp_mismatch; ++ default: ++ return rtl8125_cp_normal; ++ } ++} ++ ++static int _rtl8125_get_cp_status(struct rtl8125_private *tp, u8 pair_num) ++{ ++ u16 val; ++ int cp_status = rtl8125_cp_unknown; ++ ++ if (pair_num > 3) ++ goto exit; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8027 + 4 * pair_num); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA438); ++ ++ cp_status = __rtl8125_get_cp_status(val); ++ ++exit: ++ return cp_status; ++} ++ ++static const char * rtl8125_get_cp_status_string(int cp_status) ++{ ++ switch(cp_status) { ++ case rtl8125_cp_normal: ++ return "normal "; ++ case rtl8125_cp_short: ++ return "short "; ++ case rtl8125_cp_open: ++ return "open "; ++ case rtl8125_cp_mismatch: ++ return "mismatch"; ++ default: ++ return "unknown "; ++ } ++} ++ ++static u16 rtl8125_get_cp_pp(struct rtl8125_private *tp, u8 pair_num) ++{ ++ u16 pp = 0; ++ ++ if (pair_num > 3) ++ goto exit; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8029 + 4 * pair_num); ++ pp = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA438); ++ ++ pp &= 0x3fff; ++ pp /= 80; ++ ++exit: ++ return pp; ++} ++ ++static void rtl8125_get_cp_status(struct rtl8125_private *tp, ++ int cp_status[RTL8125_CP_NUM], ++ bool poe_mode) ++{ ++ u32 status; ++ int i; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus && !(status & (_10bps | _100bps))) { ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ cp_status[i] = rtl8125_cp_normal; ++ } else { ++ rtl8125_vcd_test(tp, poe_mode); ++ ++ for (i=0; i<RTL8125_CP_NUM; i++) ++ cp_status[i] = _rtl8125_get_cp_status(tp, i); ++ } ++} ++ ++#ifdef ENABLE_R8125_PROCFS ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++static int proc_get_driver_variable(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump Driver Variable\n"); ++ ++ rtnl_lock(); ++ ++ seq_puts(m, "Variable\tValue\n----------\t-----\n"); ++ seq_printf(m, "MODULENAME\t%s\n", MODULENAME); ++ seq_printf(m, "driver version\t%s\n", RTL8125_VERSION); ++ seq_printf(m, "mcfg\t%d\n", tp->mcfg); ++ seq_printf(m, "chipset\t%d\n", 
tp->chipset); ++ seq_printf(m, "chipset_name\t%s\n", rtl_chip_info[tp->chipset].name); ++ seq_printf(m, "mtu\t%d\n", dev->mtu); ++ seq_printf(m, "NUM_RX_DESC\t0x%x\n", tp->rx_ring[0].num_rx_desc); ++ seq_printf(m, "cur_rx0\t0x%x\n", tp->rx_ring[0].cur_rx); ++ seq_printf(m, "dirty_rx0\t0x%x\n", tp->rx_ring[0].dirty_rx); ++ seq_printf(m, "cur_rx1\t0x%x\n", tp->rx_ring[1].cur_rx); ++ seq_printf(m, "dirty_rx1\t0x%x\n", tp->rx_ring[1].dirty_rx); ++ seq_printf(m, "cur_rx2\t0x%x\n", tp->rx_ring[2].cur_rx); ++ seq_printf(m, "dirty_rx2\t0x%x\n", tp->rx_ring[2].dirty_rx); ++ seq_printf(m, "cur_rx3\t0x%x\n", tp->rx_ring[3].cur_rx); ++ seq_printf(m, "dirty_rx3\t0x%x\n", tp->rx_ring[3].dirty_rx); ++ seq_printf(m, "NUM_TX_DESC\t0x%x\n", tp->tx_ring[0].num_tx_desc); ++ seq_printf(m, "cur_tx0\t0x%x\n", tp->tx_ring[0].cur_tx); ++ seq_printf(m, "dirty_tx0\t0x%x\n", tp->tx_ring[0].dirty_tx); ++ seq_printf(m, "cur_tx1\t0x%x\n", tp->tx_ring[1].cur_tx); ++ seq_printf(m, "dirty_tx1\t0x%x\n", tp->tx_ring[1].dirty_tx); ++ seq_printf(m, "rx_buf_sz\t0x%x\n", tp->rx_buf_sz); ++#ifdef ENABLE_PAGE_REUSE ++ seq_printf(m, "rx_buf_page_order\t0x%x\n", tp->rx_buf_page_order); ++ seq_printf(m, "rx_buf_page_size\t0x%x\n", tp->rx_buf_page_size); ++ seq_printf(m, "page_reuse_fail_cnt\t0x%x\n", tp->page_reuse_fail_cnt); ++#endif //ENABLE_PAGE_REUSE ++ seq_printf(m, "esd_flag\t0x%x\n", tp->esd_flag); ++ seq_printf(m, "pci_cfg_is_read\t0x%x\n", tp->pci_cfg_is_read); ++ seq_printf(m, "rtl8125_rx_config\t0x%x\n", tp->rtl8125_rx_config); ++ seq_printf(m, "cp_cmd\t0x%x\n", tp->cp_cmd); ++ seq_printf(m, "intr_mask\t0x%x\n", tp->intr_mask); ++ seq_printf(m, "timer_intr_mask\t0x%x\n", tp->timer_intr_mask); ++ seq_printf(m, "wol_enabled\t0x%x\n", tp->wol_enabled); ++ seq_printf(m, "wol_opts\t0x%x\n", tp->wol_opts); ++ seq_printf(m, "efuse_ver\t0x%x\n", tp->efuse_ver); ++ seq_printf(m, "eeprom_type\t0x%x\n", tp->eeprom_type); ++ seq_printf(m, "autoneg\t0x%x\n", tp->autoneg); ++ seq_printf(m, "duplex\t0x%x\n", tp->duplex); ++ seq_printf(m, "speed\t%d\n", tp->speed); ++ seq_printf(m, "advertising\t0x%llx\n", tp->advertising); ++ seq_printf(m, "eeprom_len\t0x%x\n", tp->eeprom_len); ++ seq_printf(m, "cur_page\t0x%x\n", tp->cur_page); ++ seq_printf(m, "features\t0x%x\n", tp->features); ++ seq_printf(m, "org_pci_offset_99\t0x%x\n", tp->org_pci_offset_99); ++ seq_printf(m, "org_pci_offset_180\t0x%x\n", tp->org_pci_offset_180); ++ seq_printf(m, "issue_offset_99_event\t0x%x\n", tp->issue_offset_99_event); ++ seq_printf(m, "org_pci_offset_80\t0x%x\n", tp->org_pci_offset_80); ++ seq_printf(m, "org_pci_offset_81\t0x%x\n", tp->org_pci_offset_81); ++ seq_printf(m, "use_timer_interrupt\t0x%x\n", tp->use_timer_interrupt); ++ seq_printf(m, "HwIcVerUnknown\t0x%x\n", tp->HwIcVerUnknown); ++ seq_printf(m, "NotWrRamCodeToMicroP\t0x%x\n", tp->NotWrRamCodeToMicroP); ++ seq_printf(m, "NotWrMcuPatchCode\t0x%x\n", tp->NotWrMcuPatchCode); ++ seq_printf(m, "HwHasWrRamCodeToMicroP\t0x%x\n", tp->HwHasWrRamCodeToMicroP); ++ seq_printf(m, "sw_ram_code_ver\t0x%x\n", tp->sw_ram_code_ver); ++ seq_printf(m, "hw_ram_code_ver\t0x%x\n", tp->hw_ram_code_ver); ++ seq_printf(m, "rtk_enable_diag\t0x%x\n", tp->rtk_enable_diag); ++ seq_printf(m, "ShortPacketSwChecksum\t0x%x\n", tp->ShortPacketSwChecksum); ++ seq_printf(m, "UseSwPaddingShortPkt\t0x%x\n", tp->UseSwPaddingShortPkt); ++ seq_printf(m, "RequireAdcBiasPatch\t0x%x\n", tp->RequireAdcBiasPatch); ++ seq_printf(m, "AdcBiasPatchIoffset\t0x%x\n", tp->AdcBiasPatchIoffset); ++ seq_printf(m, 
"RequireAdjustUpsTxLinkPulseTiming\t0x%x\n", tp->RequireAdjustUpsTxLinkPulseTiming); ++ seq_printf(m, "SwrCnt1msIni\t0x%x\n", tp->SwrCnt1msIni); ++ seq_printf(m, "HwSuppNowIsOobVer\t0x%x\n", tp->HwSuppNowIsOobVer); ++ seq_printf(m, "HwFiberModeVer\t0x%x\n", tp->HwFiberModeVer); ++ seq_printf(m, "HwFiberStat\t0x%x\n", tp->HwFiberStat); ++ seq_printf(m, "HwSwitchMdiToFiber\t0x%x\n", tp->HwSwitchMdiToFiber); ++ seq_printf(m, "Led0\t0x%x\n", tp->BackupLedSel[0]); ++ seq_printf(m, "RequiredSecLanDonglePatch\t0x%x\n", tp->RequiredSecLanDonglePatch); ++ seq_printf(m, "RequiredPfmPatch\t0x%x\n", tp->RequiredPfmPatch); ++ seq_printf(m, "HwSuppDashVer\t0x%x\n", tp->HwSuppDashVer); ++ seq_printf(m, "DASH\t0x%x\n", tp->DASH); ++ seq_printf(m, "DashFirmwareVersion\t0x%x\n", tp->DashFirmwareVersion); ++ seq_printf(m, "HwSuppKCPOffloadVer\t0x%x\n", tp->HwSuppKCPOffloadVer); ++ seq_printf(m, "speed_mode\t0x%x\n", speed_mode); ++ seq_printf(m, "duplex_mode\t0x%x\n", duplex_mode); ++ seq_printf(m, "autoneg_mode\t0x%x\n", autoneg_mode); ++ seq_printf(m, "aspm\t0x%x\n", aspm); ++ seq_printf(m, "s5wol\t0x%x\n", s5wol); ++ seq_printf(m, "s5_keep_curr_mac\t0x%x\n", s5_keep_curr_mac); ++ seq_printf(m, "eee_enable\t0x%x\n", tp->eee.eee_enabled); ++ seq_printf(m, "hwoptimize\t0x%lx\n", hwoptimize); ++ seq_printf(m, "proc_init_num\t0x%x\n", proc_init_num); ++ seq_printf(m, "s0_magic_packet\t0x%x\n", s0_magic_packet); ++ seq_printf(m, "disable_wol_support\t0x%x\n", disable_wol_support); ++ seq_printf(m, "enable_double_vlan\t0x%x\n", enable_double_vlan); ++ seq_printf(m, "eee_giga_lite\t0x%x\n", eee_giga_lite); ++ seq_printf(m, "HwSuppMagicPktVer\t0x%x\n", tp->HwSuppMagicPktVer); ++ seq_printf(m, "HwSuppEsdVer\t0x%x\n", tp->HwSuppEsdVer); ++ seq_printf(m, "HwSuppLinkChgWakeUpVer\t0x%x\n", tp->HwSuppLinkChgWakeUpVer); ++ seq_printf(m, "HwSuppD0SpeedUpVer\t0x%x\n", tp->HwSuppD0SpeedUpVer); ++ seq_printf(m, "D0SpeedUpSpeed\t0x%x\n", tp->D0SpeedUpSpeed); ++ seq_printf(m, "HwSuppCheckPhyDisableModeVer\t0x%x\n", tp->HwSuppCheckPhyDisableModeVer); ++ seq_printf(m, "HwPkgDet\t0x%x\n", tp->HwPkgDet); ++ seq_printf(m, "HwSuppTxNoCloseVer\t0x%x\n", tp->HwSuppTxNoCloseVer); ++ seq_printf(m, "EnableTxNoClose\t0x%x\n", tp->EnableTxNoClose); ++ seq_printf(m, "NextHwDesCloPtr0\t0x%x\n", tp->tx_ring[0].NextHwDesCloPtr); ++ seq_printf(m, "BeginHwDesCloPtr0\t0x%x\n", tp->tx_ring[0].BeginHwDesCloPtr); ++ seq_printf(m, "hw_clo_ptr_reg0\t0x%x\n", rtl8125_get_hw_clo_ptr(&tp->tx_ring[0])); ++ seq_printf(m, "sw_tail_ptr_reg0\t0x%x\n", rtl8125_get_sw_tail_ptr(&tp->tx_ring[0])); ++ seq_printf(m, "NextHwDesCloPtr1\t0x%x\n", tp->tx_ring[1].NextHwDesCloPtr); ++ seq_printf(m, "BeginHwDesCloPtr1\t0x%x\n", tp->tx_ring[1].BeginHwDesCloPtr); ++ seq_printf(m, "hw_clo_ptr_reg1\t0x%x\n", rtl8125_get_hw_clo_ptr(&tp->tx_ring[1])); ++ seq_printf(m, "sw_tail_ptr_reg1\t0x%x\n", rtl8125_get_sw_tail_ptr(&tp->tx_ring[1])); ++ seq_printf(m, "InitRxDescType\t0x%x\n", tp->InitRxDescType); ++ seq_printf(m, "RxDescLength\t0x%x\n", tp->RxDescLength); ++ seq_printf(m, "num_rx_rings\t0x%x\n", tp->num_rx_rings); ++ seq_printf(m, "num_tx_rings\t0x%x\n", tp->num_tx_rings); ++ seq_printf(m, "tot_rx_rings\t0x%x\n", rtl8125_tot_rx_rings(tp)); ++ seq_printf(m, "tot_tx_rings\t0x%x\n", rtl8125_tot_tx_rings(tp)); ++ seq_printf(m, "HwSuppNumRxQueues\t0x%x\n", tp->HwSuppNumRxQueues); ++ seq_printf(m, "HwSuppNumTxQueues\t0x%x\n", tp->HwSuppNumTxQueues); ++ seq_printf(m, "EnableRss\t0x%x\n", tp->EnableRss); ++ seq_printf(m, "EnablePtp\t0x%x\n", tp->EnablePtp); ++ seq_printf(m, 
"ptp_master_mode\t0x%x\n", tp->ptp_master_mode); ++ seq_printf(m, "min_irq_nvecs\t0x%x\n", tp->min_irq_nvecs); ++ seq_printf(m, "irq_nvecs\t0x%x\n", tp->irq_nvecs); ++ seq_printf(m, "hw_supp_irq_nvecs\t0x%x\n", tp->hw_supp_irq_nvecs); ++ seq_printf(m, "ring_lib_enabled\t0x%x\n", tp->ring_lib_enabled); ++ seq_printf(m, "HwSuppIsrVer\t0x%x\n", tp->HwSuppIsrVer); ++ seq_printf(m, "HwCurrIsrVer\t0x%x\n", tp->HwCurrIsrVer); ++ seq_printf(m, "HwSuppMacMcuVer\t0x%x\n", tp->HwSuppMacMcuVer); ++ seq_printf(m, "MacMcuPageSize\t0x%x\n", tp->MacMcuPageSize); ++ seq_printf(m, "hw_mcu_patch_code_ver\t0x%llx\n", tp->hw_mcu_patch_code_ver); ++ seq_printf(m, "bin_mcu_patch_code_ver\t0x%llx\n", tp->bin_mcu_patch_code_ver); ++#ifdef ENABLE_PTP_SUPPORT ++ seq_printf(m, "tx_hwtstamp_timeouts\t0x%x\n", tp->tx_hwtstamp_timeouts); ++ seq_printf(m, "tx_hwtstamp_skipped\t0x%x\n", tp->tx_hwtstamp_skipped); ++#endif ++ seq_printf(m, "random_mac\t0x%x\n", tp->random_mac); ++ seq_printf(m, "org_mac_addr\t%pM\n", tp->org_mac_addr); ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ seq_printf(m, "perm_addr\t%pM\n", dev->perm_addr); ++#endif ++ seq_printf(m, "dev_addr\t%pM\n", dev->dev_addr); ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_tally_counter(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters; ++ dma_addr_t paddr; ++ ++ seq_puts(m, "\nDump Tally Counter\n"); ++ ++ rtnl_lock(); ++ ++ counters = tp->tally_vaddr; ++ paddr = tp->tally_paddr; ++ if (!counters) { ++ seq_puts(m, "\nDump Tally Counter Fail\n"); ++ goto out_unlock; ++ } ++ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ seq_puts(m, "Statistics\tValue\n----------\t-----\n"); ++ seq_printf(m, "tx_packets\t%lld\n", le64_to_cpu(counters->tx_packets)); ++ seq_printf(m, "rx_packets\t%lld\n", le64_to_cpu(counters->rx_packets)); ++ seq_printf(m, "tx_errors\t%lld\n", le64_to_cpu(counters->tx_errors)); ++ seq_printf(m, "rx_errors\t%d\n", le32_to_cpu(counters->rx_errors)); ++ seq_printf(m, "rx_missed\t%d\n", le16_to_cpu(counters->rx_missed)); ++ seq_printf(m, "align_errors\t%d\n", le16_to_cpu(counters->align_errors)); ++ seq_printf(m, "tx_one_collision\t%d\n", le32_to_cpu(counters->tx_one_collision)); ++ seq_printf(m, "tx_multi_collision\t%d\n", le32_to_cpu(counters->tx_multi_collision)); ++ seq_printf(m, "rx_unicast\t%lld\n", le64_to_cpu(counters->rx_unicast)); ++ seq_printf(m, "rx_broadcast\t%lld\n", le64_to_cpu(counters->rx_broadcast)); ++ seq_printf(m, "rx_multicast\t%d\n", le32_to_cpu(counters->rx_multicast)); ++ seq_printf(m, "tx_aborted\t%d\n", le16_to_cpu(counters->tx_aborted)); ++ seq_printf(m, "tx_underrun\t%d\n", le16_to_cpu(counters->tx_underrun)); ++ ++ seq_printf(m, "tx_octets\t%lld\n", le64_to_cpu(counters->tx_octets)); ++ seq_printf(m, "rx_octets\t%lld\n", le64_to_cpu(counters->rx_octets)); ++ seq_printf(m, "rx_multicast64\t%lld\n", le64_to_cpu(counters->rx_multicast64)); ++ seq_printf(m, "tx_unicast64\t%lld\n", le64_to_cpu(counters->tx_unicast64)); ++ seq_printf(m, "tx_broadcast64\t%lld\n", le64_to_cpu(counters->tx_broadcast64)); ++ seq_printf(m, "tx_multicast64\t%lld\n", le64_to_cpu(counters->tx_multicast64)); ++ seq_printf(m, "tx_pause_on\t%d\n", le32_to_cpu(counters->tx_pause_on)); ++ seq_printf(m, "tx_pause_off\t%d\n", le32_to_cpu(counters->tx_pause_off)); ++ seq_printf(m, "tx_pause_all\t%d\n", le32_to_cpu(counters->tx_pause_all)); ++ seq_printf(m, "tx_deferred\t%d\n", 
le32_to_cpu(counters->tx_deferred)); ++ seq_printf(m, "tx_late_collision\t%d\n", le32_to_cpu(counters->tx_late_collision)); ++ seq_printf(m, "tx_all_collision\t%d\n", le32_to_cpu(counters->tx_all_collision)); ++ seq_printf(m, "tx_aborted32\t%d\n", le32_to_cpu(counters->tx_aborted32)); ++ seq_printf(m, "align_errors32\t%d\n", le32_to_cpu(counters->align_errors32)); ++ seq_printf(m, "rx_frame_too_long\t%d\n", le32_to_cpu(counters->rx_frame_too_long)); ++ seq_printf(m, "rx_runt\t%d\n", le32_to_cpu(counters->rx_runt)); ++ seq_printf(m, "rx_pause_on\t%d\n", le32_to_cpu(counters->rx_pause_on)); ++ seq_printf(m, "rx_pause_off\t%d\n", le32_to_cpu(counters->rx_pause_off)); ++ seq_printf(m, "rx_pause_all\t%d\n", le32_to_cpu(counters->rx_pause_all)); ++ seq_printf(m, "rx_unknown_opcode\t%d\n", le32_to_cpu(counters->rx_unknown_opcode)); ++ seq_printf(m, "rx_mac_error\t%d\n", le32_to_cpu(counters->rx_mac_error)); ++ seq_printf(m, "tx_underrun32\t%d\n", le32_to_cpu(counters->tx_underrun32)); ++ seq_printf(m, "rx_mac_missed\t%d\n", le32_to_cpu(counters->rx_mac_missed)); ++ seq_printf(m, "rx_tcam_dropped\t%d\n", le32_to_cpu(counters->rx_tcam_dropped)); ++ seq_printf(m, "tdu\t%d\n", le32_to_cpu(counters->tdu)); ++ seq_printf(m, "rdu\t%d\n", le32_to_cpu(counters->rdu)); ++ ++ seq_putc(m, '\n'); ++ ++out_unlock: ++ rtnl_unlock(); ++ ++ return 0; ++} ++ ++static int proc_get_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_MAC_REGS_SIZE; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ ++ seq_puts(m, "\nDump MAC Registers\n"); ++ seq_puts(m, "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ max = 0xB00; ++ for (n = 0xA00; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ max = 0xD40; ++ for (n = 0xD00; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ max = 0x2840; ++ for (n = 0x2800; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_all_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ seq_puts(m, "\nDump All MAC Registers\n"); ++ seq_puts(m, "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ max = pci_resource_len(pdev, 2); ++ max = min(max, 0x8000); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%04x:\t", n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ seq_printf(m, "%02x ", byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_printf(m, "\nTotal length:0x%X", max); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_pcie_phy(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_EPHY_REGS_SIZE/2; ++ u16 word_rd; ++ 
struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump PCIE PHY\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%02x:\t", n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_ephy_read(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_eth_phy(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_PHY_REGS_SIZE/2; ++ u16 word_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ seq_puts(m, "\nDump Ethernet PHY\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ seq_puts(m, "\n####################page 0##################\n "); ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%02x:\t", n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_mdio_read(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ } ++ ++ seq_puts(m, "\n####################extra reg##################\n "); ++ n = 0xA400; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 8; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ n = 0xA410; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ n = 0xA434; ++ seq_printf(m, "\n0x%02x:\t", n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ ++ n = 0xA5D0; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 4; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ n = 0xA61A; ++ seq_printf(m, "\n0x%02x:\t", n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ ++ n = 0xA6D0; ++ seq_printf(m, "\n0x%02x:\t", n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ seq_printf(m, "%04x ", word_rd); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_extended_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_ERI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump Extended Registers\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%02x:\t", n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ dword_rd = rtl8125_eri_read(tp, n, 4, ERIAR_ExGMAC); ++ seq_printf(m, "%08x ", dword_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_pci_registers(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ int i, n, max = R8125_PCI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ seq_puts(m, "\nDump PCI Registers\n"); ++ seq_puts(m, "\nOffset\tValue\n------\t-----\n "); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ seq_printf(m, "\n0x%03x:\t", n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ seq_printf(m, "%08x ", dword_rd); ++ } ++ } ++ ++ n = 0x110; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ 
seq_printf(m, "\n0x%03x:\t%08x ", n, dword_rd); ++ n = 0x70c; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ seq_printf(m, "\n0x%03x:\t%08x ", n, dword_rd); ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_get_temperature(struct seq_file *m, void *v) ++{ ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int cel, fah; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ seq_puts(m, "\nChip Temperature\n"); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ rtnl_lock(); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ seq_puts(m, "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ rtnl_unlock(); ++ return 0; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ netif_testing_on(dev); ++ cel = rtl8125_read_thermal_sensor(tp); ++ netif_testing_off(dev); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ rtnl_unlock(); ++ ++ fah = rtl8125_cel_to_fah(cel); ++ ++ seq_printf(m, "Cel:%d\n", cel); ++ seq_printf(m, "Fah:%d\n", fah); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int _proc_get_cable_info(struct seq_file *m, void *v, bool poe_mode) ++{ ++ int i; ++ u32 status; ++ int cp_status[RTL8125_CP_NUM]; ++ int cp_len[RTL8125_CP_NUM] = {0}; ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ const char *pair_str[RTL8125_CP_NUM] = {"1-2", "3-6", "4-5", "7-8"}; ++ unsigned long flags; ++ int ret; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ /* support */ ++ break; ++ default: ++ ret = -EOPNOTSUPP; ++ goto error_out; ++ } ++ ++ rtnl_lock(); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ seq_puts(m, "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ ret = 0; ++ goto error_unlock; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ if (rtl8125_mdio_read(tp, MII_BMCR) & BMCR_PDOWN) { ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ret = -EIO; ++ goto error_unlock; ++ } ++ ++ netif_testing_on(dev); ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) ++ seq_printf(m, "\nlink speed:%d", ++ rtl8125_convert_link_speed(status)); ++ else ++ seq_puts(m, "\nlink status:off"); ++ ++ rtl8125_get_cp_len(tp, cp_len); ++ ++ rtl8125_get_cp_status(tp, cp_status, poe_mode); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ seq_puts(m, "\npair\tlength\tstatus \tpp\n"); ++ ++ for (i=0; iprivate; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ seq_printf(m, "\ndump rx %d desc:%d\n", i, ring->num_rx_desc); ++ ++ _proc_dump_desc(m, (void*)ring->RxDescArray, ring->RxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_rx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_rx_ring[i]; ++ if (lib_ring->enabled) { ++ seq_printf(m, "\ndump lib rx %d desc:%d\n", i, ++ lib_ring->ring_size); ++ _proc_dump_desc(m, (void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_dump_tx_desc(struct seq_file *m, void *v) ++{ 
++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ seq_printf(m, "\ndump tx %d desc:%d\n", i, ring->num_tx_desc); ++ ++ _proc_dump_desc(m, (void*)ring->TxDescArray, ring->TxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_tx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_tx_ring[i]; ++ if (lib_ring->enabled) { ++ seq_printf(m, "\ndump lib tx %d desc:%d\n", i, ++ lib_ring->ring_size); ++ _proc_dump_desc(m, (void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++static int proc_dump_msix_tbl(struct seq_file *m, void *v) ++{ ++ int i, j; ++ void __iomem *ioaddr; ++ struct net_device *dev = m->private; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* ioremap MMIO region */ ++ ioaddr = ioremap(pci_resource_start(tp->pci_dev, 4), pci_resource_len(tp->pci_dev, 4)); ++ if (!ioaddr) ++ return -EFAULT; ++ ++ rtnl_lock(); ++ ++ seq_printf(m, "\ndump MSI-X Table. Total Entry %d. \n", tp->hw_supp_irq_nvecs); ++ ++ for (i=0; i<tp->hw_supp_irq_nvecs; i++) { ++ seq_printf(m, "\n%04x ", i); ++ for (j=0; j<4; j++) ++ seq_printf(m, "%08x ", ++ readl(ioaddr + i*0x10 + 4*j)); ++ } ++ ++ rtnl_unlock(); ++ ++ iounmap(ioaddr); ++ ++ seq_putc(m, '\n'); ++ return 0; ++} ++ ++#else //LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ ++static int proc_get_driver_variable(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Driver Driver\n"); ++ ++ rtnl_lock(); ++ ++ len += snprintf(page + len, count - len, ++ "Variable\tValue\n----------\t-----\n"); ++ ++ len += snprintf(page + len, count - len, ++ "MODULENAME\t%s\n" ++ "driver version\t%s\n" ++ "mcfg\t%d\n" ++ "chipset\t%d\n" ++ "chipset_name\t%s\n" ++ "mtu\t%d\n" ++ "NUM_RX_DESC\t0x%x\n" ++ "cur_rx0\t0x%x\n" ++ "dirty_rx0\t0x%x\n" ++ "cur_rx1\t0x%x\n" ++ "dirty_rx1\t0x%x\n" ++ "cur_rx2\t0x%x\n" ++ "dirty_rx2\t0x%x\n" ++ "cur_rx3\t0x%x\n" ++ "dirty_rx3\t0x%x\n" ++ "NUM_TX_DESC\t0x%x\n" ++ "cur_tx0\t0x%x\n" ++ "dirty_tx0\t0x%x\n" ++ "cur_tx1\t0x%x\n" ++ "dirty_tx1\t0x%x\n" ++ "rx_buf_sz\t0x%x\n" ++#ifdef ENABLE_PAGE_REUSE ++ "rx_buf_page_order\t0x%x\n" ++ "rx_buf_page_size\t0x%x\n" ++ "page_reuse_fail_cnt\t0x%x\n" ++#endif //ENABLE_PAGE_REUSE ++ "esd_flag\t0x%x\n" ++ "pci_cfg_is_read\t0x%x\n" ++ "rtl8125_rx_config\t0x%x\n" ++ "cp_cmd\t0x%x\n" ++ "intr_mask\t0x%x\n" ++ "timer_intr_mask\t0x%x\n" ++ "wol_enabled\t0x%x\n" ++ "wol_opts\t0x%x\n" ++ "efuse_ver\t0x%x\n" ++ "eeprom_type\t0x%x\n" ++ "autoneg\t0x%x\n" ++ "duplex\t0x%x\n" ++ "speed\t%d\n" ++ "advertising\t0x%llx\n" ++ "eeprom_len\t0x%x\n" ++ "cur_page\t0x%x\n" ++ "features\t0x%x\n" ++ "org_pci_offset_99\t0x%x\n" ++ "org_pci_offset_180\t0x%x\n" ++ "issue_offset_99_event\t0x%x\n" ++ "org_pci_offset_80\t0x%x\n" ++ "org_pci_offset_81\t0x%x\n" ++ "use_timer_interrupt\t0x%x\n" ++ "HwIcVerUnknown\t0x%x\n" ++ "NotWrRamCodeToMicroP\t0x%x\n" ++ "NotWrMcuPatchCode\t0x%x\n" ++ "HwHasWrRamCodeToMicroP\t0x%x\n" ++ "sw_ram_code_ver\t0x%x\n" ++ "hw_ram_code_ver\t0x%x\n" ++ "rtk_enable_diag\t0x%x\n" ++ "ShortPacketSwChecksum\t0x%x\n" ++ 
"UseSwPaddingShortPkt\t0x%x\n" ++ "RequireAdcBiasPatch\t0x%x\n" ++ "AdcBiasPatchIoffset\t0x%x\n" ++ "RequireAdjustUpsTxLinkPulseTiming\t0x%x\n" ++ "SwrCnt1msIni\t0x%x\n" ++ "HwSuppNowIsOobVer\t0x%x\n" ++ "HwFiberModeVer\t0x%x\n" ++ "HwFiberStat\t0x%x\n" ++ "HwSwitchMdiToFiber\t0x%x\n" ++ "Led0\t0x%x\n" ++ "RequiredSecLanDonglePatch\t0x%x\n" ++ "RequiredPfmPatch\t0x%x\n" ++ "HwSuppDashVer\t0x%x\n" ++ "DASH\t0x%x\n" ++ "DashFirmwareVersion\t0x%x\n" ++ "HwSuppKCPOffloadVer\t0x%x\n" ++ "speed_mode\t0x%x\n" ++ "duplex_mode\t0x%x\n" ++ "autoneg_mode\t0x%x\n" ++ "aspm\t0x%x\n" ++ "s5wol\t0x%x\n" ++ "s5_keep_curr_mac\t0x%x\n" ++ "eee_enable\t0x%x\n" ++ "hwoptimize\t0x%lx\n" ++ "proc_init_num\t0x%x\n" ++ "s0_magic_packet\t0x%x\n" ++ "disable_wol_support\t0x%x\n" ++ "enable_double_vlan\t0x%x\n" ++ "eee_giga_lite\t0x%x\n" ++ "HwSuppMagicPktVer\t0x%x\n" ++ "HwSuppEsdVer\t0x%x\n" ++ "HwSuppLinkChgWakeUpVer\t0x%x\n" ++ "HwSuppD0SpeedUpVer\t0x%x\n" ++ "D0SpeedUpSpeed\t0x%x\n" ++ "HwSuppCheckPhyDisableModeVer\t0x%x\n" ++ "HwPkgDet\t0x%x\n" ++ "HwSuppTxNoCloseVer\t0x%x\n" ++ "EnableTxNoClose\t0x%x\n" ++ "NextHwDesCloPtr0\t0x%x\n" ++ "BeginHwDesCloPtr0\t0x%x\n" ++ "hw_clo_ptr_reg0\t0x%x\n" ++ "sw_tail_ptr_reg0\t0x%x\n" ++ "NextHwDesCloPtr1\t0x%x\n" ++ "BeginHwDesCloPtr1\t0x%x\n" ++ "hw_clo_ptr_reg1\t0x%x\n" ++ "sw_tail_ptr_reg1\t0x%x\n" ++ "InitRxDescType\t0x%x\n" ++ "RxDescLength\t0x%x\n" ++ "num_rx_rings\t0x%x\n" ++ "num_tx_rings\t0x%x\n" ++ "tot_rx_rings\t0x%x\n" ++ "tot_tx_rings\t0x%x\n" ++ "HwSuppNumRxQueues\t0x%x\n" ++ "HwSuppNumTxQueues\t0x%x\n" ++ "EnableRss\t0x%x\n" ++ "EnablePtp\t0x%x\n" ++ "ptp_master_mode\t0x%x\n" ++ "min_irq_nvecs\t0x%x\n" ++ "irq_nvecs\t0x%x\n" ++ "hw_supp_irq_nvecs\t0x%x\n" ++ "ring_lib_enabled\t0x%x\n" ++ "HwSuppIsrVer\t0x%x\n" ++ "HwCurrIsrVer\t0x%x\n" ++ "HwSuppMacMcuVer\t0x%x\n" ++ "MacMcuPageSize\t0x%x\n" ++ "hw_mcu_patch_code_ver\t0x%llx\n" ++ "bin_mcu_patch_code_ver\t0x%llx\n" ++#ifdef ENABLE_PTP_SUPPORT ++ "tx_hwtstamp_timeouts\t0x%x\n" ++ "tx_hwtstamp_skipped\t0x%x\n" ++#endif ++ "random_mac\t0x%x\n" ++ "org_mac_addr\t%pM\n" ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ "perm_addr\t%pM\n" ++#endif ++ "dev_addr\t%pM\n", ++ MODULENAME, ++ RTL8125_VERSION, ++ tp->mcfg, ++ tp->chipset, ++ rtl_chip_info[tp->chipset].name, ++ dev->mtu, ++ tp->rx_ring[0].num_rx_desc, ++ tp->rx_ring[0].cur_rx, ++ tp->rx_ring[0].dirty_rx, ++ tp->rx_ring[1].cur_rx, ++ tp->rx_ring[1].dirty_rx, ++ tp->rx_ring[2].cur_rx, ++ tp->rx_ring[2].dirty_rx, ++ tp->rx_ring[3].cur_rx, ++ tp->rx_ring[3].dirty_rx, ++ tp->tx_ring[0].num_tx_desc, ++ tp->tx_ring[0].cur_tx, ++ tp->tx_ring[0].dirty_tx, ++ tp->tx_ring[1].cur_tx, ++ tp->tx_ring[1].dirty_tx, ++ tp->rx_buf_sz, ++#ifdef ENABLE_PAGE_REUSE ++ tp->rx_buf_page_order, ++ tp->rx_buf_page_size, ++ tp->page_reuse_fail_cnt, ++#endif //ENABLE_PAGE_REUSE ++ tp->esd_flag, ++ tp->pci_cfg_is_read, ++ tp->rtl8125_rx_config, ++ tp->cp_cmd, ++ tp->intr_mask, ++ tp->timer_intr_mask, ++ tp->wol_enabled, ++ tp->wol_opts, ++ tp->efuse_ver, ++ tp->eeprom_type, ++ tp->autoneg, ++ tp->duplex, ++ tp->speed, ++ tp->advertising, ++ tp->eeprom_len, ++ tp->cur_page, ++ tp->features, ++ tp->org_pci_offset_99, ++ tp->org_pci_offset_180, ++ tp->issue_offset_99_event, ++ tp->org_pci_offset_80, ++ tp->org_pci_offset_81, ++ tp->use_timer_interrupt, ++ tp->HwIcVerUnknown, ++ tp->NotWrRamCodeToMicroP, ++ tp->NotWrMcuPatchCode, ++ tp->HwHasWrRamCodeToMicroP, ++ tp->sw_ram_code_ver, ++ tp->hw_ram_code_ver, ++ tp->rtk_enable_diag, ++ tp->ShortPacketSwChecksum, ++ 
tp->UseSwPaddingShortPkt, ++ tp->RequireAdcBiasPatch, ++ tp->AdcBiasPatchIoffset, ++ tp->RequireAdjustUpsTxLinkPulseTiming, ++ tp->SwrCnt1msIni, ++ tp->HwSuppNowIsOobVer, ++ tp->HwFiberModeVer, ++ tp->HwFiberStat, ++ tp->HwSwitchMdiToFiber, ++ tp->BackupLedSel[0], ++ tp->RequiredSecLanDonglePatch, ++ tp->RequiredPfmPatch, ++ tp->HwSuppDashVer, ++ tp->DASH, ++ tp->DashFirmwareVersion, ++ tp->HwSuppKCPOffloadVer, ++ speed_mode, ++ duplex_mode, ++ autoneg_mode, ++ aspm, ++ s5wol, ++ s5_keep_curr_mac, ++ tp->eee.eee_enabled, ++ hwoptimize, ++ proc_init_num, ++ s0_magic_packet, ++ disable_wol_support, ++ enable_double_vlan, ++ eee_giga_lite, ++ tp->HwSuppMagicPktVer, ++ tp->HwSuppEsdVer, ++ tp->HwSuppLinkChgWakeUpVer, ++ tp->HwSuppD0SpeedUpVer, ++ tp->D0SpeedUpSpeed, ++ tp->HwSuppCheckPhyDisableModeVer, ++ tp->HwPkgDet, ++ tp->HwSuppTxNoCloseVer, ++ tp->EnableTxNoClose, ++ tp->tx_ring[0].NextHwDesCloPtr, ++ tp->tx_ring[0].BeginHwDesCloPtr, ++ rtl8125_get_hw_clo_ptr(&tp->tx_ring[0]), ++ rtl8125_get_sw_tail_ptr(&tp->tx_ring[0]), ++ tp->tx_ring[1].NextHwDesCloPtr, ++ tp->tx_ring[1].BeginHwDesCloPtr, ++ rtl8125_get_hw_clo_ptr(&tp->tx_ring[1]), ++ rtl8125_get_sw_tail_ptr(&tp->tx_ring[1]), ++ tp->InitRxDescType, ++ tp->RxDescLength, ++ tp->num_rx_rings, ++ tp->num_tx_rings, ++ rtl8125_tot_rx_rings(tp), ++ rtl8125_tot_tx_rings(tp), ++ tp->HwSuppNumRxQueues, ++ tp->HwSuppNumTxQueues, ++ tp->EnableRss, ++ tp->EnablePtp, ++ tp->ptp_master_mode, ++ tp->min_irq_nvecs, ++ tp->irq_nvecs, ++ tp->hw_supp_irq_nvecs, ++ tp->ring_lib_enabled, ++ tp->HwSuppIsrVer, ++ tp->HwCurrIsrVer, ++ tp->HwSuppMacMcuVer, ++ tp->MacMcuPageSize, ++ tp->hw_mcu_patch_code_ver, ++ tp->bin_mcu_patch_code_ver, ++#ifdef ENABLE_PTP_SUPPORT ++ tp->tx_hwtstamp_timeouts, ++ tp->tx_hwtstamp_skipped, ++#endif ++ tp->random_mac, ++ tp->org_mac_addr, ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ dev->perm_addr, ++#endif ++ dev->dev_addr); ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_tally_counter(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters; ++ dma_addr_t paddr; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Tally Counter\n"); ++ ++ rtnl_lock(); ++ ++ counters = tp->tally_vaddr; ++ paddr = tp->tally_paddr; ++ if (!counters) { ++ len += snprintf(page + len, count - len, ++ "\nDump Tally Counter Fail\n"); ++ goto out_unlock; ++ } ++ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ len += snprintf(page + len, count - len, ++ "Statistics\tValue\n----------\t-----\n"); ++ ++ len += snprintf(page + len, count - len, ++ "tx_packets\t%lld\n" ++ "rx_packets\t%lld\n" ++ "tx_errors\t%lld\n" ++ "rx_errors\t%d\n" ++ "rx_missed\t%d\n" ++ "align_errors\t%d\n" ++ "tx_one_collision\t%d\n" ++ "tx_multi_collision\t%d\n" ++ "rx_unicast\t%lld\n" ++ "rx_broadcast\t%lld\n" ++ "rx_multicast\t%d\n" ++ "tx_aborted\t%d\n" ++ "tx_underrun\t%d\n" ++ ++ "tx_octets\t%lld\n" ++ "rx_octets\t%lld\n" ++ "rx_multicast64\t%lld\n" ++ "tx_unicast64\t%lld\n" ++ "tx_broadcast64\t%lld\n" ++ "tx_multicast64\t%lld\n" ++ "tx_pause_on\t%d\n" ++ "tx_pause_off\t%d\n" ++ "tx_pause_all\t%d\n" ++ "tx_deferred\t%d\n" ++ "tx_late_collision\t%d\n" ++ "tx_all_collision\t%d\n" ++ "tx_aborted32\t%d\n" ++ "align_errors32\t%d\n" ++ "rx_frame_too_long\t%d\n" ++ "rx_runt\t%d\n" ++ "rx_pause_on\t%d\n" ++ "rx_pause_off\t%d\n" 
++ "rx_pause_all\t%d\n" ++ "rx_unknown_opcode\t%d\n" ++ "rx_mac_error\t%d\n" ++ "tx_underrun32\t%d\n" ++ "rx_mac_missed\t%d\n" ++ "rx_tcam_dropped\t%d\n" ++ "tdu\t%d\n" ++ "rdu\t%d\n", ++ le64_to_cpu(counters->tx_packets), ++ le64_to_cpu(counters->rx_packets), ++ le64_to_cpu(counters->tx_errors), ++ le32_to_cpu(counters->rx_errors), ++ le16_to_cpu(counters->rx_missed), ++ le16_to_cpu(counters->align_errors), ++ le32_to_cpu(counters->tx_one_collision), ++ le32_to_cpu(counters->tx_multi_collision), ++ le64_to_cpu(counters->rx_unicast), ++ le64_to_cpu(counters->rx_broadcast), ++ le32_to_cpu(counters->rx_multicast), ++ le16_to_cpu(counters->tx_aborted), ++ le16_to_cpu(counters->tx_underrun), ++ ++ le64_to_cpu(counters->tx_octets), ++ le64_to_cpu(counters->rx_octets), ++ le64_to_cpu(counters->rx_multicast64), ++ le64_to_cpu(counters->tx_unicast64), ++ le64_to_cpu(counters->tx_broadcast64), ++ le64_to_cpu(counters->tx_multicast64), ++ le32_to_cpu(counters->tx_pause_on), ++ le32_to_cpu(counters->tx_pause_off), ++ le32_to_cpu(counters->tx_pause_all), ++ le32_to_cpu(counters->tx_deferred), ++ le32_to_cpu(counters->tx_late_collision), ++ le32_to_cpu(counters->tx_all_collision), ++ le32_to_cpu(counters->tx_aborted32), ++ le32_to_cpu(counters->align_errors32), ++ le32_to_cpu(counters->rx_frame_too_long), ++ le32_to_cpu(counters->rx_runt), ++ le32_to_cpu(counters->rx_pause_on), ++ le32_to_cpu(counters->rx_pause_off), ++ le32_to_cpu(counters->rx_pause_all), ++ le32_to_cpu(counters->rx_unknown_opcode), ++ le32_to_cpu(counters->rx_mac_error), ++ le32_to_cpu(counters->tx_underrun32), ++ le32_to_cpu(counters->rx_mac_missed), ++ le32_to_cpu(counters->rx_tcam_dropped), ++ le32_to_cpu(counters->tdu), ++ le32_to_cpu(counters->rdu)); ++ ++ len += snprintf(page + len, count - len, "\n"); ++out_unlock: ++ rtnl_unlock(); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_MAC_REGS_SIZE; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump MAC Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ max = 0xB00; ++ for (n = 0xA00; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ max = 0xD40; ++ for (n = 0xD00; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ max = 0x2840; ++ for (n = 0x2800; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int 
proc_get_all_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max; ++ u8 byte_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ struct pci_dev *pdev = tp->pci_dev; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump All MAC Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ max = pci_resource_len(pdev, 2); ++ max = min(max, 0x8000); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%04x:\t", ++ n); ++ ++ for (i = 0; i < 16 && n < max; i++, n++) { ++ byte_rd = readb(ioaddr + n); ++ len += snprintf(page + len, count - len, ++ "%02x ", ++ byte_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\nTotal length:0x%X", max); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_pcie_phy(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_EPHY_REGS_SIZE/2; ++ u16 word_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump PCIE PHY\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_ephy_read(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_eth_phy(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_PHY_REGS_SIZE/2; ++ u16 word_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Ethernet PHY\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ len += snprintf(page + len, count - len, ++ "\n####################page 0##################\n"); ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ ++ for (i = 0; i < 8 && n < max; i++, n++) { ++ word_rd = rtl8125_mdio_read(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ } ++ ++ len += snprintf(page + len, count - len, ++ "\n####################extra reg##################\n"); ++ n = 0xA400; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 8; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ n = 0xA410; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ n = 0xA434; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ ++ n = 0xA5D0; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 4; i++, n+=2) { ++ word_rd = 
rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ n = 0xA61A; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ ++ n = 0xA6D0; ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ for (i = 0; i < 3; i++, n+=2) { ++ word_rd = rtl8125_mdio_direct_read_phy_ocp(tp, n); ++ len += snprintf(page + len, count - len, ++ "%04x ", ++ word_rd); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_extended_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_ERI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump Extended Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%02x:\t", ++ n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ dword_rd = rtl8125_eri_read(tp, n, 4, ERIAR_ExGMAC); ++ len += snprintf(page + len, count - len, ++ "%08x ", ++ dword_rd); ++ } ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_pci_registers(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ int i, n, max = R8125_PCI_REGS_SIZE; ++ u32 dword_rd; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int len = 0; ++ ++ len += snprintf(page + len, count - len, ++ "\nDump PCI Registers\n" ++ "Offset\tValue\n------\t-----\n"); ++ ++ rtnl_lock(); ++ ++ for (n = 0; n < max;) { ++ len += snprintf(page + len, count - len, ++ "\n0x%03x:\t", ++ n); ++ ++ for (i = 0; i < 4 && n < max; i++, n+=4) { ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ len += snprintf(page + len, count - len, ++ "%08x ", ++ dword_rd); ++ } ++ } ++ ++ n = 0x110; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ len += snprintf(page + len, count - len, ++ "\n0x%03x:\t%08x ", ++ n, ++ dword_rd); ++ n = 0x70c; ++ pci_read_config_dword(tp->pci_dev, n, &dword_rd); ++ len += snprintf(page + len, count - len, ++ "\n0x%03x:\t%08x ", ++ n, ++ dword_rd); ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_temperature(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int cel, fah; ++ int len = 0; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ len += snprintf(page + len, count - len, ++ "\nChip Temperature\n"); ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ rtnl_lock(); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ len += snprintf(page + len, count - len, ++ "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ goto out_unlock; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ cel = rtl8125_read_thermal_sensor(tp); ++ 
r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ fah = rtl8125_cel_to_fah(cel); ++ ++ len += snprintf(page + len, count - len, ++ "Cel:%d\n", ++ cel); ++ len += snprintf(page + len, count - len, ++ "Fah:%d\n", ++ fah); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++out_unlock: ++ rtnl_unlock(); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int _proc_get_cable_info(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data, ++ bool poe_mode) ++{ ++ int i; ++ u32 status; ++ int len = 0; ++ struct net_device *dev = data; ++ int cp_status[RTL8125_CP_NUM] = {0}; ++ int cp_len[RTL8125_CP_NUM] = {0}; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ const char *pair_str[RTL8125_CP_NUM] = {"1-2", "3-6", "4-5", "7-8"}; ++ unsigned long flags; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ /* support */ ++ break; ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ rtnl_lock(); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (!rtl8125_sysfs_testmode_on(tp)) { ++ len += snprintf(page + len, count - len, ++ "\nPlease turn on ""/sys/class/net//rtk_adv/testmode"".\n\n"); ++ goto out_unlock; ++ } ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) ++ len += snprintf(page + len, count - len, ++ "\nlink speed:%d", ++ rtl8125_convert_link_speed(status)); ++ else ++ len += snprintf(page + len, count - len, ++ "\nlink status:off"); ++ ++ rtl8125_get_cp_len(tp, cp_len); ++ ++ rtl8125_get_cp_status(tp, cp_status, poe_mode); ++ ++ len += snprintf(page + len, count - len, ++ "\npair\tlength\tstatus \tpp\n"); ++ ++ for (i=0; iphy_lock, flags); ++ ++ rtnl_unlock(); ++ ++ *eof = 1; ++ return len; ++} ++ ++static int proc_get_cable_info(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ return _proc_get_cable_info(page, start, offset, count, eof, data, 0); ++} ++ ++static int proc_get_poe_cable_info(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ return _proc_get_cable_info(page, start, offset, count, eof, data, 1); ++} ++ ++static void _proc_dump_desc(char *page, int *page_len, int *count, void *desc_base, ++ u32 alloc_size) ++{ ++ u32 *pdword; ++ int i, len; ++ ++ if (desc_base == NULL || ++ alloc_size == 0) ++ return; ++ ++ len = *page_len; ++ pdword = (u32*)desc_base; ++ for (i=0; i<(alloc_size/4); i++) { ++ if (!(i % 4)) ++ len += snprintf(page + len, *count - len, ++ "\n%04x ", ++ i); ++ len += snprintf(page + len, *count - len, ++ "%08x ", ++ pdword[i]); ++ } ++ ++ len += snprintf(page + len, *count - len, "\n"); ++ ++ *page_len = len; ++ return; ++} ++ ++static int proc_dump_rx_desc(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ int i; ++ int len = 0; ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ len += snprintf(page + len, count - len, ++ "\ndump rx %d desc:%d", ++ i, ring->num_rx_desc); ++ ++ _proc_dump_desc(page, &len, &count, ++ ring->RxDescArray, ++ ring->RxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_rx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_rx_ring[i]; ++ if (lib_ring->enabled) { ++ len += snprintf(page + len, count - len, ++ "\ndump lib rx %d desc:%d", ++ i, ++ ring->ring_size); ++ _proc_dump_desc(page, &len, &count, ++ 
(void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ ++ return len; ++} ++ ++static int proc_dump_tx_desc(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ int len = 0; ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtnl_lock(); ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ ++ if (!ring) ++ continue; ++ ++ len += snprintf(page + len, count - len, ++ "\ndump tx desc:%d", ++ ring->num_tx_desc); ++ ++ _proc_dump_desc(page, &len, &count, ++ ring->TxDescArray, ++ ring->TxDescAllocSize); ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ if (rtl8125_num_lib_tx_rings(tp) > 0) { ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ struct rtl8125_ring *lib_ring = &tp->lib_tx_ring[i]; ++ if (lib_ring->enabled) { ++ len += snprintf(page + len, count - len, ++ "\ndump lib tx %d desc:%d", ++ i, ++ ring->ring_size); ++ _proc_dump_desc(page, &len, &count, ++ (void*)lib_ring->desc_addr, ++ lib_ring->desc_size); ++ } ++ } ++ } ++#endif //ENABLE_LIB_SUPPORT ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ ++ return len; ++} ++ ++static int proc_dump_msix_tbl(char *page, char **start, ++ off_t offset, int count, ++ int *eof, void *data) ++{ ++ int i, j; ++ int len = 0; ++ void __iomem *ioaddr; ++ struct net_device *dev = data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* ioremap MMIO region */ ++ ioaddr = ioremap(pci_resource_start(tp->pci_dev, 4), pci_resource_len(tp->pci_dev, 4)); ++ if (!ioaddr) ++ return -EFAULT; ++ ++ rtnl_lock(); ++ ++ len += snprintf(page + len, count - len, ++ "\ndump MSI-X Table. Total Entry %d. \n", ++ tp->hw_supp_irq_nvecs); ++ ++ for (i=0; i<tp->hw_supp_irq_nvecs; i++) { ++ len += snprintf(page + len, count - len, ++ "\n%04x ", i); ++ for (j=0; j<4; j++) ++ len += snprintf(page + len, count - len, "%08x ", ++ readl(ioaddr + i*0x10 + 4*j)); ++ } ++ ++ rtnl_unlock(); ++ ++ len += snprintf(page + len, count - len, "\n"); ++ ++ *eof = 1; ++ return 0; ++} ++ ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ ++static void rtl8125_proc_module_init(void) ++{ ++ //create /proc/net/r8125 ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++ rtl8125_proc = proc_mkdir(MODULENAME, init_net.proc_net); ++#else ++ rtl8125_proc = proc_mkdir(MODULENAME, proc_net); ++#endif ++ if (!rtl8125_proc) ++ dprintk("cannot create %s proc entry \n", MODULENAME); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++/* ++ * seq_file wrappers for procfile show routines. 
++ */ ++static int rtl8125_proc_open(struct inode *inode, struct file *file) ++{ ++ struct net_device *dev = proc_get_parent_data(inode); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++ int (*show)(struct seq_file *, void *) = pde_data(inode); ++#else ++ int (*show)(struct seq_file *, void *) = PDE_DATA(inode); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++ ++ return single_open(file, show, dev); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static const struct proc_ops rtl8125_proc_fops = { ++ .proc_open = rtl8125_proc_open, ++ .proc_read = seq_read, ++ .proc_lseek = seq_lseek, ++ .proc_release = single_release, ++}; ++#else ++static const struct file_operations rtl8125_proc_fops = { ++ .open = rtl8125_proc_open, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++#endif ++ ++#endif ++ ++/* ++ * Table of proc files we need to create. ++ */ ++struct rtl8125_proc_file { ++ char name[16]; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ int (*show)(struct seq_file *, void *); ++#else ++ int (*show)(char *, char **, off_t, int, int *, void *); ++#endif ++}; ++ ++static const struct rtl8125_proc_file rtl8125_debug_proc_files[] = { ++ { "driver_var", &proc_get_driver_variable }, ++ { "tally", &proc_get_tally_counter }, ++ { "registers", &proc_get_registers }, ++ { "registers2", &proc_get_all_registers }, ++ { "pcie_phy", &proc_get_pcie_phy }, ++ { "eth_phy", &proc_get_eth_phy }, ++ { "ext_regs", &proc_get_extended_registers }, ++ { "pci_regs", &proc_get_pci_registers }, ++ { "tx_desc", &proc_dump_tx_desc }, ++ { "rx_desc", &proc_dump_rx_desc }, ++ { "msix_tbl", &proc_dump_msix_tbl }, ++ { "", NULL } ++}; ++ ++static const struct rtl8125_proc_file rtl8125_test_proc_files[] = { ++ { "temp", &proc_get_temperature }, ++ { "cdt", &proc_get_cable_info }, ++ { "cdt_poe", &proc_get_poe_cable_info }, ++ { "", NULL } ++}; ++ ++#define R8125_PROC_DEBUG_DIR "debug" ++#define R8125_PROC_TEST_DIR "test" ++ ++static void rtl8125_proc_init(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ const struct rtl8125_proc_file *f; ++ struct proc_dir_entry *dir; ++ ++ if (!rtl8125_proc) ++ return; ++ ++ if (tp->proc_dir_debug || tp->proc_dir_test) ++ return; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ dir = proc_mkdir_data(dev->name, 0, rtl8125_proc, dev); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s\n", ++ MODULENAME, dev->name); ++ return; ++ } ++ tp->proc_dir = dir; ++ proc_init_num++; ++ ++ /* create debug entry */ ++ dir = proc_mkdir_data(R8125_PROC_DEBUG_DIR, 0, tp->proc_dir, dev); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR); ++ return; ++ } ++ ++ tp->proc_dir_debug = dir; ++ for (f = rtl8125_debug_proc_files; f->name[0]; f++) { ++ if (!proc_create_data(f->name, S_IFREG | S_IRUGO, dir, ++ &rtl8125_proc_fops, f->show)) { ++ printk("Unable to initialize " ++ "/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR, ++ f->name); ++ return; ++ } ++ } ++ ++ /* create test entry */ ++ dir = proc_mkdir_data(R8125_PROC_TEST_DIR, 0, tp->proc_dir, dev); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR); ++ return; ++ } ++ ++ tp->proc_dir_test = dir; ++ for (f = rtl8125_test_proc_files; f->name[0]; f++) { ++ if (!proc_create_data(f->name, S_IFREG | S_IRUGO, dir, ++ &rtl8125_proc_fops, f->show)) { ++ printk("Unable to initialize " ++ 
"/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR, ++ f->name); ++ return; ++ } ++ } ++#else ++ dir = proc_mkdir(dev->name, rtl8125_proc); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s\n", ++ MODULENAME, dev->name); ++ return; ++ } ++ ++ tp->proc_dir = dir; ++ proc_init_num++; ++ ++ /* create debug entry */ ++ dir = proc_mkdir(R8125_PROC_DEBUG_DIR, tp->proc_dir); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR); ++ return; ++ } ++ ++ tp->proc_dir_debug = dir; ++ for (f = rtl8125_debug_proc_files; f->name[0]; f++) { ++ if (!create_proc_read_entry(f->name, S_IFREG | S_IRUGO, ++ dir, f->show, dev)) { ++ printk("Unable to initialize " ++ "/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_DEBUG_DIR, ++ f->name); ++ return; ++ } ++ } ++ ++ /* create test entry */ ++ dir = proc_mkdir(R8125_PROC_TEST_DIR, tp->proc_dir); ++ if (!dir) { ++ printk("Unable to initialize /proc/net/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR); ++ return; ++ } ++ ++ tp->proc_dir_test = dir; ++ for (f = rtl8125_test_proc_files; f->name[0]; f++) { ++ if (!create_proc_read_entry(f->name, S_IFREG | S_IRUGO, ++ dir, f->show, dev)) { ++ printk("Unable to initialize " ++ "/proc/net/%s/%s/%s/%s\n", ++ MODULENAME, dev->name, R8125_PROC_TEST_DIR, ++ f->name); ++ return; ++ } ++ } ++#endif ++} ++ ++static void rtl8125_proc_remove(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->proc_dir) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ remove_proc_subtree(dev->name, rtl8125_proc); ++#else ++ const struct rtl8125_proc_file *f; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->proc_dir_debug) { ++ for (f = rtl8125_debug_proc_files; f->name[0]; f++) ++ remove_proc_entry(f->name, tp->proc_dir_debug); ++ remove_proc_entry(R8125_PROC_DEBUG_DIR, tp->proc_dir); ++ } ++ ++ if (tp->proc_dir_test) { ++ for (f = rtl8125_test_proc_files; f->name[0]; f++) ++ remove_proc_entry(f->name, tp->proc_dir_test); ++ remove_proc_entry(R8125_PROC_TEST_DIR, tp->proc_dir); ++ } ++ ++ remove_proc_entry(dev->name, rtl8125_proc); ++#endif ++ proc_init_num--; ++ ++ tp->proc_dir_debug = NULL; ++ tp->proc_dir_test = NULL; ++ tp->proc_dir = NULL; ++ } ++} ++ ++#endif //ENABLE_R8125_PROCFS ++ ++#ifdef ENABLE_R8125_SYSFS ++/**************************************************************************** ++* -----------------------------SYSFS STUFF------------------------- ++***************************************************************************** ++*/ ++static ssize_t testmode_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ ++ sprintf(buf, "%u\n", tp->testmode); ++ ++ return strlen(buf); ++} ++ ++static ssize_t testmode_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct net_device *netdev = to_net_dev(dev); ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ u32 testmode; ++ ++ if (sscanf(buf, "%u\n", &testmode) != 1) ++ return -EINVAL; ++ ++ if (tp->testmode != testmode) { ++ rtnl_lock(); ++ tp->testmode = testmode; ++ rtnl_unlock(); ++ } ++ ++ return count; ++} ++ ++static DEVICE_ATTR_RW(testmode); ++ ++static struct attribute *rtk_adv_attrs[] = { ++ &dev_attr_testmode.attr, ++ NULL ++}; ++ ++static struct attribute_group rtk_adv_grp = { ++ .name = "rtl_adv", ++ .attrs = 
rtk_adv_attrs, ++}; ++ ++static void rtl8125_sysfs_init(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret; ++ ++ /* init rtl_adv */ ++#ifdef ENABLE_LIB_SUPPORT ++ tp->testmode = 0; ++#else ++ tp->testmode = 1; ++#endif //ENABLE_LIB_SUPPORT ++ ++ ret = sysfs_create_group(&dev->dev.kobj, &rtk_adv_grp); ++ if (ret < 0) ++ netif_warn(tp, probe, dev, "create rtk_adv_grp fail\n"); ++ else ++ set_bit(R8125_SYSFS_RTL_ADV, tp->sysfs_flag); ++} ++ ++static void rtl8125_sysfs_remove(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (test_and_clear_bit(R8125_SYSFS_RTL_ADV, tp->sysfs_flag)) ++ sysfs_remove_group(&dev->dev.kobj, &rtk_adv_grp); ++} ++#endif //ENABLE_R8125_SYSFS ++ ++static inline u16 map_phy_ocp_addr(u16 PageNum, u8 RegNum) ++{ ++ u16 OcpPageNum = 0; ++ u8 OcpRegNum = 0; ++ u16 OcpPhyAddress = 0; ++ ++ if (PageNum == 0) { ++ OcpPageNum = OCP_STD_PHY_BASE_PAGE + (RegNum / 8); ++ OcpRegNum = 0x10 + (RegNum % 8); ++ } else { ++ OcpPageNum = PageNum; ++ OcpRegNum = RegNum; ++ } ++ ++ OcpPageNum <<= 4; ++ ++ if (OcpRegNum < 16) { ++ OcpPhyAddress = 0; ++ } else { ++ OcpRegNum -= 16; ++ OcpRegNum <<= 1; ++ ++ OcpPhyAddress = OcpPageNum + OcpRegNum; ++ } ++ ++ ++ return OcpPhyAddress; ++} ++ ++static void mdio_real_direct_write_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ u32 data32; ++ int i; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(RegAddr % 2); ++#endif ++ data32 = RegAddr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ data32 |= OCPR_Write | value; ++ ++ RTL_W32(tp, PHYOCP, data32); ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(RTL_R32(tp, PHYOCP) & OCPR_Flag)) ++ break; ++ } ++} ++ ++void rtl8125_mdio_direct_write_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ if (tp->rtk_enable_diag) ++ return; ++ ++ mdio_real_direct_write_phy_ocp(tp, RegAddr, value); ++} ++ ++/* ++void rtl8125_mdio_write_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr, ++ u32 value) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, ocp_addr, value); ++} ++*/ ++ ++static void rtl8125_mdio_real_write_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr, ++ u32 value) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ mdio_real_direct_write_phy_ocp(tp, ocp_addr, value); ++} ++ ++static void mdio_real_write(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ if (RegAddr == 0x1F) { ++ tp->cur_page = value; ++ return; ++ } ++ rtl8125_mdio_real_write_phy_ocp(tp, tp->cur_page, RegAddr, value); ++} ++ ++void rtl8125_mdio_write(struct rtl8125_private *tp, ++ u16 RegAddr, ++ u16 value) ++{ ++ if (tp->rtk_enable_diag) ++ return; ++ ++ mdio_real_write(tp, RegAddr, value); ++} ++ ++void rtl8125_mdio_prot_write(struct rtl8125_private *tp, ++ u32 RegAddr, ++ u32 value) ++{ ++ mdio_real_write(tp, RegAddr, value); ++} ++ ++void rtl8125_mdio_prot_direct_write_phy_ocp(struct rtl8125_private *tp, ++ u32 RegAddr, ++ u32 value) ++{ ++ mdio_real_direct_write_phy_ocp(tp, RegAddr, value); ++} ++ ++static u32 mdio_real_direct_read_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ u32 data32; ++ int i, value = 0; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(RegAddr % 2); ++#endif ++ data32 = RegAddr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ ++ RTL_W32(tp, PHYOCP, data32); 
++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (RTL_R32(tp, PHYOCP) & OCPR_Flag) ++ break; ++ } ++ value = RTL_R32(tp, PHYOCP) & OCPDR_Data_Mask; ++ ++ return value; ++} ++ ++u32 rtl8125_mdio_direct_read_phy_ocp(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ if (tp->rtk_enable_diag) ++ return 0xffffffff; ++ ++ return mdio_real_direct_read_phy_ocp(tp, RegAddr); ++} ++ ++/* ++static u32 rtl8125_mdio_read_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ return rtl8125_mdio_direct_read_phy_ocp(tp, ocp_addr); ++} ++*/ ++ ++static u32 rtl8125_mdio_real_read_phy_ocp(struct rtl8125_private *tp, ++ u16 PageNum, ++ u32 RegAddr) ++{ ++ u16 ocp_addr; ++ ++ ocp_addr = map_phy_ocp_addr(PageNum, RegAddr); ++ ++ return mdio_real_direct_read_phy_ocp(tp, ocp_addr); ++} ++ ++static u32 mdio_real_read(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ return rtl8125_mdio_real_read_phy_ocp(tp, tp->cur_page, RegAddr); ++} ++ ++u32 rtl8125_mdio_read(struct rtl8125_private *tp, ++ u16 RegAddr) ++{ ++ if (tp->rtk_enable_diag) ++ return 0xffffffff; ++ ++ return mdio_real_read(tp, RegAddr); ++} ++ ++u32 rtl8125_mdio_prot_read(struct rtl8125_private *tp, ++ u32 RegAddr) ++{ ++ return mdio_real_read(tp, RegAddr); ++} ++ ++u32 rtl8125_mdio_prot_direct_read_phy_ocp(struct rtl8125_private *tp, ++ u32 RegAddr) ++{ ++ return mdio_real_direct_read_phy_ocp(tp, RegAddr); ++} ++ ++static void rtl8125_clear_and_set_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 clearmask, u16 setmask) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_mdio_read(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_mdio_write(tp, addr, PhyRegValue); ++} ++ ++void rtl8125_clear_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_bit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++void rtl8125_set_eth_phy_bit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_bit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++void rtl8125_clear_and_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 clearmask, u16 setmask) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_mdio_direct_read_phy_ocp(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_mdio_direct_write_phy_ocp(tp, addr, PhyRegValue); ++} ++ ++void rtl8125_clear_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++void rtl8125_set_eth_phy_ocp_bit(struct rtl8125_private *tp, u16 addr, u16 mask) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++void rtl8125_mac_ocp_write(struct rtl8125_private *tp, u16 reg_addr, u16 value) ++{ ++ u32 data32; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(reg_addr % 2); ++#endif ++ ++ data32 = reg_addr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ data32 += value; ++ data32 |= OCPR_Write; ++ ++ RTL_W32(tp, MACOCP, data32); ++} ++ ++u16 rtl8125_mac_ocp_read(struct rtl8125_private *tp, u16 reg_addr) ++{ ++ u32 data32; ++ u16 data16 = 0; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(reg_addr % 2); ++#endif ++ ++ data32 = reg_addr/2; ++ data32 <<= OCPR_Addr_Reg_shift; ++ ++ RTL_W32(tp, MACOCP, data32); ++ data16 = (u16)RTL_R32(tp, MACOCP); ++ ++ return data16; ++} ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++static void 
mac_mcu_write(struct rtl8125_private *tp, u16 reg, u16 value) ++{ ++ if (reg == 0x1f) { ++ tp->ocp_base = value << 4; ++ return; ++ } ++ ++ rtl8125_mac_ocp_write(tp, tp->ocp_base + reg, value); ++} ++ ++static u32 mac_mcu_read(struct rtl8125_private *tp, u16 reg) ++{ ++ return rtl8125_mac_ocp_read(tp, tp->ocp_base + reg); ++} ++#endif ++ ++static void ++rtl8125_clear_set_mac_ocp_bit( ++ struct rtl8125_private *tp, ++ u16 addr, ++ u16 clearmask, ++ u16 setmask ++) ++{ ++ u16 PhyRegValue; ++ ++ PhyRegValue = rtl8125_mac_ocp_read(tp, addr); ++ PhyRegValue &= ~clearmask; ++ PhyRegValue |= setmask; ++ rtl8125_mac_ocp_write(tp, addr, PhyRegValue); ++} ++ ++void ++rtl8125_clear_mac_ocp_bit( ++ struct rtl8125_private *tp, ++ u16 addr, ++ u16 mask ++) ++{ ++ rtl8125_clear_set_mac_ocp_bit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++void ++rtl8125_set_mac_ocp_bit( ++ struct rtl8125_private *tp, ++ u16 addr, ++ u16 mask ++) ++{ ++ rtl8125_clear_set_mac_ocp_bit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++u32 rtl8125_ocp_read_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, const u32 base_address) ++{ ++ return rtl8125_eri_read_with_oob_base_address(tp, addr, len, ERIAR_OOB, base_address); ++} ++ ++u32 rtl8125_ocp_read(struct rtl8125_private *tp, u16 addr, u8 len) ++{ ++ if (!tp->AllowAccessDashOcp || tp->HwSuppOcpChannelVer != 2) ++ return 0xffffffff; ++ ++ return rtl8125_ocp_read_with_oob_base_address(tp, addr, len, ++ NO_BASE_ADDRESS); ++} ++ ++u32 rtl8125_ocp_write_with_oob_base_address(struct rtl8125_private *tp, u16 addr, u8 len, u32 value, const u32 base_address) ++{ ++ return rtl8125_eri_write_with_oob_base_address(tp, addr, len, value, ++ ERIAR_OOB, base_address); ++} ++ ++void rtl8125_ocp_write(struct rtl8125_private *tp, u16 addr, u8 len, u32 value) ++{ ++ if (!tp->AllowAccessDashOcp || tp->HwSuppOcpChannelVer != 2) ++ return; ++ ++ rtl8125_ocp_write_with_oob_base_address(tp, addr, len, value, NO_BASE_ADDRESS); ++} ++ ++void rtl8125_oob_mutex_lock(struct rtl8125_private *tp) ++{ ++ u8 reg_16, reg_a0; ++ u32 wait_cnt_0, wait_Cnt_1; ++ u16 ocp_reg_mutex_ib; ++ u16 ocp_reg_mutex_oob; ++ u16 ocp_reg_mutex_prio; ++ ++ if (!HW_DASH_SUPPORT_DASH(tp)) ++ return; ++ ++ if (!tp->DASH) ++ return; ++ ++ ocp_reg_mutex_oob = 0x110; ++ ocp_reg_mutex_ib = 0x114; ++ ocp_reg_mutex_prio = 0x11C; ++ ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, BIT_0); ++ reg_16 = rtl8125_ocp_read(tp, ocp_reg_mutex_oob, 1); ++ wait_cnt_0 = 0; ++ while(reg_16) { ++ reg_a0 = rtl8125_ocp_read(tp, ocp_reg_mutex_prio, 1); ++ if (reg_a0) { ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, 0x00); ++ reg_a0 = rtl8125_ocp_read(tp, ocp_reg_mutex_prio, 1); ++ wait_Cnt_1 = 0; ++ while(reg_a0) { ++ reg_a0 = rtl8125_ocp_read(tp, ocp_reg_mutex_prio, 1); ++ ++ wait_Cnt_1++; ++ ++ if (wait_Cnt_1 > 2000) ++ break; ++ }; ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, BIT_0); ++ ++ } ++ reg_16 = rtl8125_ocp_read(tp, ocp_reg_mutex_oob, 1); ++ ++ wait_cnt_0++; ++ ++ if (wait_cnt_0 > 2000) ++ break; ++ }; ++} ++ ++void rtl8125_oob_mutex_unlock(struct rtl8125_private *tp) ++{ ++ u16 ocp_reg_mutex_ib; ++ u16 ocp_reg_mutex_prio; ++ ++ if (!HW_DASH_SUPPORT_DASH(tp)) ++ return; ++ ++ if (!tp->DASH) ++ return; ++ ++ ocp_reg_mutex_ib = 0x114; ++ ocp_reg_mutex_prio = 0x11C; ++ ++ rtl8125_ocp_write(tp, ocp_reg_mutex_prio, 1, BIT_0); ++ rtl8125_ocp_write(tp, ocp_reg_mutex_ib, 1, 0x00); ++} ++ ++static bool rtl8125_is_allow_access_dash_ocp(struct rtl8125_private *tp) ++{ ++ bool allow_access = false; ++ u16 mac_ocp_data; ++ ++ if (!HW_DASH_SUPPORT_DASH(tp)) ++ 
goto exit; ++ ++ allow_access = true; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xd460); ++ if (mac_ocp_data == 0xffff || !(mac_ocp_data & BIT_0)) ++ allow_access = false; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xd4c0); ++ if (mac_ocp_data == 0xffff || (mac_ocp_data & BIT_3)) ++ allow_access = false; ++ break; ++ default: ++ goto exit; ++ } ++exit: ++ return allow_access; ++} ++ ++static u32 rtl8125_get_dash_fw_ver(struct rtl8125_private *tp) ++{ ++ u32 ver = 0xffffffff; ++ ++ if (FALSE == HW_DASH_SUPPORT_GET_FIRMWARE_VERSION(tp)) ++ goto exit; ++ ++ ver = rtl8125_ocp_read(tp, OCP_REG_FIRMWARE_MAJOR_VERSION, 4); ++ ++exit: ++ return ver; ++} ++ ++static int _rtl8125_check_dash(struct rtl8125_private *tp) ++{ ++ if (!tp->AllowAccessDashOcp) ++ return 0; ++ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return 0; ++ ++ if (rtl8125_ocp_read(tp, 0x128, 1) & BIT_0) ++ return 1; ++ ++ return 0; ++} ++ ++static int rtl8125_check_dash(struct rtl8125_private *tp) ++{ ++ if (HW_DASH_SUPPORT_DASH(tp) && _rtl8125_check_dash(tp)) { ++ u32 ver = rtl8125_get_dash_fw_ver(tp); ++ if (!(ver == 0 || ver == 0xffffffff)) ++ return 1; ++ } ++ ++ return 0; ++} ++ ++static int rtl8125_wait_dash_fw_ready(struct rtl8125_private *tp) ++{ ++ int rc = -1; ++ int timeout; ++ ++ if (!tp->DASH) ++ goto out; ++ ++ for (timeout = 0; timeout < 10; timeout++) { ++ fsleep(10000); ++ if (rtl8125_ocp_read(tp, 0x124, 1) & BIT_0) { ++ rc = 1; ++ goto out; ++ } ++ } ++ ++ rc = 0; ++ ++out: ++ return rc; ++} ++ ++static void ++rtl8125_notify_dash_oob_cmac(struct rtl8125_private *tp, u32 cmd) ++{ ++ u32 val; ++ ++ if (!HW_DASH_SUPPORT_CMAC(tp)) ++ return; ++ ++ rtl8125_ocp_write(tp, 0x180, 4, cmd); ++ val = rtl8125_ocp_read(tp, 0x30, 4); ++ val |= BIT_0; ++ rtl8125_ocp_write(tp, 0x30, 4, val); ++} ++ ++static void ++rtl8125_notify_dash_oob_ipc2(struct rtl8125_private *tp, u32 cmd) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ rtl8125_ocp_write(tp, IB2SOC_DATA, 4, cmd); ++ rtl8125_ocp_write(tp, IB2SOC_CMD, 4, 0x00); ++ rtl8125_ocp_write(tp, IB2SOC_SET, 4, 0x01); ++} ++ ++static void ++rtl8125_notify_dash_oob(struct rtl8125_private *tp, u32 cmd) ++{ ++ if (HW_DASH_SUPPORT_CMAC(tp)) ++ return rtl8125_notify_dash_oob_cmac(tp, cmd); ++ else if (HW_DASH_SUPPORT_IPC2(tp)) ++ return rtl8125_notify_dash_oob_ipc2(tp, cmd); ++ else ++ return; ++} ++ ++static void rtl8125_driver_start(struct rtl8125_private *tp) ++{ ++ if (!tp->AllowAccessDashOcp) ++ return; ++ ++ rtl8125_notify_dash_oob(tp, OOB_CMD_DRIVER_START); ++ ++ rtl8125_wait_dash_fw_ready(tp); ++} ++ ++static void rtl8125_driver_stop(struct rtl8125_private *tp) ++{ ++ if (!tp->AllowAccessDashOcp) ++ return; ++ ++ rtl8125_notify_dash_oob(tp, OOB_CMD_DRIVER_STOP); ++ ++ rtl8125_wait_dash_fw_ready(tp); ++} ++ ++void rtl8125_ephy_write(struct rtl8125_private *tp, int RegAddr, int value) ++{ ++ int i; ++ ++ RTL_W32(tp, EPHYAR, ++ EPHYAR_Write | ++ (RegAddr & EPHYAR_Reg_Mask_v2) << EPHYAR_Reg_shift | ++ (value & EPHYAR_Data_Mask)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed EPHY write */ ++ if (!(RTL_R32(tp, EPHYAR) & EPHYAR_Flag)) ++ break; ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++} ++ ++u16 rtl8125_ephy_read(struct rtl8125_private *tp, int RegAddr) ++{ ++ int i; ++ u16 value = 0xffff; ++ ++ RTL_W32(tp, EPHYAR, ++ EPHYAR_Read | (RegAddr & EPHYAR_Reg_Mask_v2) << 
EPHYAR_Reg_shift); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed EPHY read */ ++ if (RTL_R32(tp, EPHYAR) & EPHYAR_Flag) { ++ value = (u16) (RTL_R32(tp, EPHYAR) & EPHYAR_Data_Mask); ++ break; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return value; ++} ++ ++static void ClearAndSetPCIePhyBit(struct rtl8125_private *tp, u8 addr, u16 clearmask, u16 setmask) ++{ ++ u16 EphyValue; ++ ++ EphyValue = rtl8125_ephy_read(tp, addr); ++ EphyValue &= ~clearmask; ++ EphyValue |= setmask; ++ rtl8125_ephy_write(tp, addr, EphyValue); ++} ++ ++static void ClearPCIePhyBit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ ClearAndSetPCIePhyBit(tp, ++ addr, ++ mask, ++ 0); ++} ++ ++static void SetPCIePhyBit(struct rtl8125_private *tp, u8 addr, u16 mask) ++{ ++ ClearAndSetPCIePhyBit(tp, ++ addr, ++ 0, ++ mask); ++} ++ ++static u32 ++rtl8125_csi_other_fun_read(struct rtl8125_private *tp, ++ u8 multi_fun_sel_bit, ++ u32 addr) ++{ ++ u32 cmd; ++ int i; ++ u32 value = 0xffffffff; ++ ++ cmd = CSIAR_Read | CSIAR_ByteEn << CSIAR_ByteEn_shift | (addr & CSIAR_Addr_Mask); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ multi_fun_sel_bit = 0; ++ ++ if (multi_fun_sel_bit > 7) ++ goto exit; ++ ++ cmd |= multi_fun_sel_bit << 16; ++ ++ RTL_W32(tp, CSIAR, cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed CSI read */ ++ if (RTL_R32(tp, CSIAR) & CSIAR_Flag) { ++ value = (u32)RTL_R32(tp, CSIDR); ++ break; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++exit: ++ return value; ++} ++ ++static void ++rtl8125_csi_other_fun_write(struct rtl8125_private *tp, ++ u8 multi_fun_sel_bit, ++ u32 addr, ++ u32 value) ++{ ++ u32 cmd; ++ int i; ++ ++ RTL_W32(tp, CSIDR, value); ++ cmd = CSIAR_Write | CSIAR_ByteEn << CSIAR_ByteEn_shift | (addr & CSIAR_Addr_Mask); ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ multi_fun_sel_bit = 0; ++ ++ if (multi_fun_sel_bit > 7) ++ return; ++ ++ cmd |= multi_fun_sel_bit << 16; ++ ++ RTL_W32(tp, CSIAR, cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed CSI write */ ++ if (!(RTL_R32(tp, CSIAR) & CSIAR_Flag)) ++ break; ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++} ++ ++static u32 ++rtl8125_csi_read(struct rtl8125_private *tp, ++ u32 addr) ++{ ++ u8 multi_fun_sel_bit; ++ ++ multi_fun_sel_bit = 0; ++ ++ return rtl8125_csi_other_fun_read(tp, multi_fun_sel_bit, addr); ++} ++ ++static void ++rtl8125_csi_write(struct rtl8125_private *tp, ++ u32 addr, ++ u32 value) ++{ ++ u8 multi_fun_sel_bit; ++ ++ multi_fun_sel_bit = 0; ++ ++ rtl8125_csi_other_fun_write(tp, multi_fun_sel_bit, addr, value); ++} ++ ++static u8 ++rtl8125_csi_fun0_read_byte(struct rtl8125_private *tp, ++ u32 addr) ++{ ++ u8 RetVal = 0; ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) { ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ pci_read_config_byte(pdev, addr, &RetVal); ++ } else { ++ u32 TmpUlong; ++ u16 RegAlignAddr; ++ u8 ShiftByte; ++ ++ RegAlignAddr = addr & ~(0x3); ++ ShiftByte = addr & (0x3); ++ TmpUlong = rtl8125_csi_other_fun_read(tp, 0, RegAlignAddr); ++ TmpUlong >>= (8*ShiftByte); ++ RetVal = (u8)TmpUlong; ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return RetVal; ++} ++ ++static void ++rtl8125_csi_fun0_write_byte(struct rtl8125_private *tp, ++ u32 addr, ++ u8 value) ++{ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) { ++ struct pci_dev *pdev = 
tp->pci_dev; ++ ++ pci_write_config_byte(pdev, addr, value); ++ } else { ++ u32 TmpUlong; ++ u16 RegAlignAddr; ++ u8 ShiftByte; ++ ++ RegAlignAddr = addr & ~(0x3); ++ ShiftByte = addr & (0x3); ++ TmpUlong = rtl8125_csi_other_fun_read(tp, 0, RegAlignAddr); ++ TmpUlong &= ~(0xFF << (8*ShiftByte)); ++ TmpUlong |= (value << (8*ShiftByte)); ++ rtl8125_csi_other_fun_write(tp, 0, RegAlignAddr, TmpUlong); ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++} ++ ++u32 rtl8125_eri_read_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, int type, const u32 base_address) ++{ ++ int i, val_shift, shift = 0; ++ u32 value1 = 0, value2 = 0, mask; ++ u32 eri_cmd; ++ const u32 transformed_base_address = ((base_address & 0x00FFF000) << 6) | (base_address & 0x000FFF); ++ ++ if (len > 4 || len <= 0) ++ return -1; ++ ++ while (len > 0) { ++ val_shift = addr % ERIAR_Addr_Align; ++ addr = addr & ~0x3; ++ ++ eri_cmd = ERIAR_Read | ++ transformed_base_address | ++ type << ERIAR_Type_shift | ++ ERIAR_ByteEn << ERIAR_ByteEn_shift | ++ (addr & 0x0FFF); ++ if (addr & 0xF000) { ++ u32 tmp; ++ ++ tmp = addr & 0xF000; ++ tmp >>= 12; ++ eri_cmd |= (tmp << 20) & 0x00F00000; ++ } ++ ++ RTL_W32(tp, ERIAR, eri_cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed ERI read */ ++ if (RTL_R32(tp, ERIAR) & ERIAR_Flag) ++ break; ++ } ++ ++ if (len == 1) mask = (0xFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 2) mask = (0xFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 3) mask = (0xFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else mask = (0xFFFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ ++ value1 = RTL_R32(tp, ERIDR) & mask; ++ value2 |= (value1 >> val_shift * 8) << shift * 8; ++ ++ if (len <= 4 - val_shift) { ++ len = 0; ++ } else { ++ len -= (4 - val_shift); ++ shift = 4 - val_shift; ++ addr += 4; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return value2; ++} ++ ++u32 rtl8125_eri_read(struct rtl8125_private *tp, int addr, int len, int type) ++{ ++ return rtl8125_eri_read_with_oob_base_address(tp, addr, len, type, 0); ++} ++ ++int rtl8125_eri_write_with_oob_base_address(struct rtl8125_private *tp, int addr, int len, u32 value, int type, const u32 base_address) ++{ ++ int i, val_shift, shift = 0; ++ u32 value1 = 0, mask; ++ u32 eri_cmd; ++ const u32 transformed_base_address = ((base_address & 0x00FFF000) << 6) | (base_address & 0x000FFF); ++ ++ if (len > 4 || len <= 0) ++ return -1; ++ ++ while (len > 0) { ++ val_shift = addr % ERIAR_Addr_Align; ++ addr = addr & ~0x3; ++ ++ if (len == 1) mask = (0xFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 2) mask = (0xFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else if (len == 3) mask = (0xFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ else mask = (0xFFFFFFFF << (val_shift * 8)) & 0xFFFFFFFF; ++ ++ value1 = rtl8125_eri_read_with_oob_base_address(tp, addr, 4, type, base_address) & ~mask; ++ value1 |= ((value << val_shift * 8) >> shift * 8); ++ ++ RTL_W32(tp, ERIDR, value1); ++ ++ eri_cmd = ERIAR_Write | ++ transformed_base_address | ++ type << ERIAR_Type_shift | ++ ERIAR_ByteEn << ERIAR_ByteEn_shift | ++ (addr & 0x0FFF); ++ if (addr & 0xF000) { ++ u32 tmp; ++ ++ tmp = addr & 0xF000; ++ tmp >>= 12; ++ eri_cmd |= (tmp << 20) & 0x00F00000; ++ } ++ ++ RTL_W32(tp, ERIAR, eri_cmd); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ /* Check if the RTL8125 has completed ERI write */ ++ if (!(RTL_R32(tp, ERIAR) & 
ERIAR_Flag)) ++ break; ++ } ++ ++ if (len <= 4 - val_shift) { ++ len = 0; ++ } else { ++ len -= (4 - val_shift); ++ shift = 4 - val_shift; ++ addr += 4; ++ } ++ } ++ ++ udelay(R8125_CHANNEL_EXIT_DELAY_TIME); ++ ++ return 0; ++} ++ ++int rtl8125_eri_write(struct rtl8125_private *tp, int addr, int len, u32 value, int type) ++{ ++ return rtl8125_eri_write_with_oob_base_address(tp, addr, len, value, type, NO_BASE_ADDRESS); ++} ++ ++static void ++rtl8125_enable_rxdvgate(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) | BIT_3); ++} ++ ++static void ++rtl8125_disable_rxdvgate(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) & ~BIT_3); ++} ++ ++static u8 ++rtl8125_is_gpio_low(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 gpio_low = FALSE; ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ if (!(rtl8125_mac_ocp_read(tp, 0xDC04) & BIT_13)) ++ gpio_low = TRUE; ++ break; ++ } ++ ++ if (gpio_low) ++ dprintk("gpio is low.\n"); ++ ++ return gpio_low; ++} ++ ++static u8 ++rtl8125_is_phy_disable_mode_enabled(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 phy_disable_mode_enabled = FALSE; ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ if (RTL_R8(tp, 0xF2) & BIT_5) ++ phy_disable_mode_enabled = TRUE; ++ break; ++ } ++ ++ if (phy_disable_mode_enabled) ++ dprintk("phy disable mode enabled.\n"); ++ ++ return phy_disable_mode_enabled; ++} ++ ++static u8 ++rtl8125_is_in_phy_disable_mode(struct net_device *dev) ++{ ++ u8 in_phy_disable_mode = FALSE; ++ ++ if (rtl8125_is_phy_disable_mode_enabled(dev) && rtl8125_is_gpio_low(dev)) ++ in_phy_disable_mode = TRUE; ++ ++ if (in_phy_disable_mode) ++ dprintk("Hardware is in phy disable mode.\n"); ++ ++ return in_phy_disable_mode; ++} ++ ++static bool ++rtl8125_stop_all_request(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) | StopReq); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ for (i = 0; i < 20; i++) { ++ udelay(10); ++ if (!(RTL_R8(tp, ChipCmd) & StopReq)) ++ break; ++ } ++ ++ if (i == 20) ++ return false; ++ break; ++ default: ++ udelay(200); ++ break; ++ } ++ ++ return true; ++} ++ ++static void ++rtl8125_clear_stop_all_request(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, ChipCmd, RTL_R8(tp, ChipCmd) & (CmdTxEnb | CmdRxEnb)); ++} ++ ++void ++rtl8125_wait_txrx_fifo_empty(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ /* Txfifo_empty require StopReq been set */ ++ for (i = 0; i < 3000; i++) { ++ udelay(50); ++ if ((RTL_R8(tp, MCUCmd_reg) & (Txfifo_empty | Rxfifo_empty)) == (Txfifo_empty | Rxfifo_empty)) ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ for (i = 0; i < 3000; i++) { ++ udelay(50); ++ if ((RTL_R16(tp, IntrMitigate) & (BIT_0 | BIT_1 | BIT_8)) == (BIT_0 | BIT_1 | BIT_8)) ++ break; ++ } ++ break; ++ } ++} ++ ++#ifdef ENABLE_DASH_SUPPORT ++ ++static inline void ++rtl8125_enable_dash2_interrupt(struct rtl8125_private *tp) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ if (!tp->DASH) ++ 
return; ++ ++ rtl8125_set_ipc2_soc_imr_bit(tp, RISC_IPC2_INTR); ++} ++ ++static inline void ++rtl8125_disable_dash2_interrupt(struct rtl8125_private *tp) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ rtl8125_clear_ipc2_soc_imr_bit(tp, RISC_IPC2_INTR); ++} ++#endif ++ ++void ++rtl8125_enable_hw_linkchg_interrupt(struct rtl8125_private *tp) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 7: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V7_LINKCHG); ++ break; ++ case 5: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V5_LINKCHG); ++ break; ++ case 4: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V4_LINKCHG); ++ break; ++ case 2: ++ case 3: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V2_LINKCHG); ++ break; ++ case 1: ++ RTL_W32(tp, tp->imr_reg[0], LinkChg | RTL_R32(tp, tp->imr_reg[0])); ++ break; ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (tp->DASH) ++ rtl8125_enable_dash2_interrupt(tp); ++#endif ++} ++ ++static inline void ++rtl8125_enable_hw_interrupt(struct rtl8125_private *tp) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 2: ++ case 3: ++ case 4: ++ case 5: ++ case 7: ++ RTL_W32(tp, IMR_V2_SET_REG_8125, tp->intr_mask); ++ break; ++ case 1: ++ RTL_W32(tp, tp->imr_reg[0], tp->intr_mask); ++ ++ if (R8125_MULTI_RX_Q(tp)) { ++ int i; ++ for (i=1; i<tp->num_rx_rings; i++) ++ RTL_W16(tp, tp->imr_reg[i], other_q_intr_mask); ++ } ++ break; ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (tp->DASH) ++ rtl8125_enable_dash2_interrupt(tp); ++#endif ++} ++ ++static inline void rtl8125_clear_hw_isr_v2(struct rtl8125_private *tp, ++ u32 message_id) ++{ ++ RTL_W32(tp, ISR_V2_8125, BIT(message_id)); ++} ++ ++static inline void ++rtl8125_disable_hw_interrupt(struct rtl8125_private *tp) ++{ ++ if (tp->HwCurrIsrVer > 1) { ++ RTL_W32(tp, IMR_V2_CLEAR_REG_8125, 0xFFFFFFFF); ++ if (tp->HwCurrIsrVer > 3) ++ RTL_W32(tp, IMR_V4_L2_CLEAR_REG_8125, 0xFFFFFFFF); ++ } else { ++ RTL_W32(tp, tp->imr_reg[0], 0x0000); ++ ++ if (R8125_MULTI_RX_Q(tp)) { ++ int i; ++ for (i=1; i<tp->num_rx_rings; i++) ++ RTL_W16(tp, tp->imr_reg[i], 0); ++ } ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_disable_dash2_interrupt(tp); ++#endif ++} ++ ++static inline void ++rtl8125_switch_to_hw_interrupt(struct rtl8125_private *tp) ++{ ++ RTL_W32(tp, TIMER_INT0_8125, 0x0000); ++ ++ rtl8125_enable_hw_interrupt(tp); ++} ++ ++static inline void ++rtl8125_switch_to_timer_interrupt(struct rtl8125_private *tp) ++{ ++ if (tp->use_timer_interrupt) { ++ RTL_W32(tp, TIMER_INT0_8125, timer_count); ++ RTL_W32(tp, TCTR0_8125, timer_count); ++ RTL_W32(tp, tp->imr_reg[0], tp->timer_intr_mask); ++ } else { ++ rtl8125_switch_to_hw_interrupt(tp); ++ } ++} ++ ++static void ++rtl8125_irq_mask_and_ack(struct rtl8125_private *tp) ++{ ++ rtl8125_disable_hw_interrupt(tp); ++ ++ if (tp->HwCurrIsrVer > 1) { ++ RTL_W32(tp, ISR_V2_8125, 0xFFFFFFFF); ++ if (tp->HwCurrIsrVer > 3) ++ RTL_W32(tp, ISR_V4_L2_8125, 0xFFFFFFFF); ++ } else { ++ RTL_W32(tp, tp->isr_reg[0], RTL_R32(tp, tp->isr_reg[0])); ++ ++ if (R8125_MULTI_RX_Q(tp)) { ++ int i; ++ for (i=1; i<tp->num_rx_rings; i++) ++ RTL_W16(tp, tp->isr_reg[i], RTL_R16(tp, tp->isr_reg[i])); ++ } ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_clear_ipc2_isr(tp); ++#endif ++} ++ ++static void ++rtl8125_disable_rx_packet_filter(struct rtl8125_private *tp) ++{ ++ ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) & ++ ~(AcceptErr | AcceptRunt |AcceptBroadcast | AcceptMulticast | ++ AcceptMyPhys | AcceptAllPhys)); ++} ++ ++static void ++rtl8125_nic_reset(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++
rtl8125_disable_rx_packet_filter(tp); ++ ++ rtl8125_enable_rxdvgate(dev); ++ ++ rtl8125_stop_all_request(dev); ++ ++ rtl8125_wait_txrx_fifo_empty(dev); ++ ++ rtl8125_clear_stop_all_request(dev); ++ ++ /* Soft reset the chip. */ ++ RTL_W8(tp, ChipCmd, CmdReset); ++ ++ /* Check that the chip has finished the reset. */ ++ for (i = 100; i > 0; i--) { ++ udelay(100); ++ if ((RTL_R8(tp, ChipCmd) & CmdReset) == 0) ++ break; ++ } ++ ++ /* reset rcr */ ++ RTL_W32(tp, RxConfig, (RX_DMA_BURST_512 << RxCfgDMAShift)); ++} ++ ++static void ++rtl8125_hw_set_interrupt_type(struct rtl8125_private *tp, u8 isr_ver) ++{ ++ u8 tmp; ++ ++ if (tp->HwSuppIsrVer < 2) ++ return; ++ ++ tmp = RTL_R8(tp, INT_CFG0_8125); ++ ++ switch (tp->HwSuppIsrVer) { ++ case 7: ++ tmp &= ~INT_CFG0_AVOID_MISS_INTR; ++ fallthrough; ++ case 4: ++ case 5: ++ if (tp->HwSuppIsrVer == 7) ++ tmp &= ~INT_CFG0_AUTO_CLEAR_IMR; ++ else ++ tmp &= ~INT_CFG0_MSIX_ENTRY_NUM_MODE; ++ fallthrough; ++ case 2: ++ case 3: ++ tmp &= ~(INT_CFG0_ENABLE_8125); ++ if (isr_ver > 1) ++ tmp |= INT_CFG0_ENABLE_8125; ++ break; ++ default: ++ return; ++ } ++ ++ RTL_W8(tp, INT_CFG0_8125, tmp); ++} ++ ++static void ++rtl8125_hw_clear_timer_int(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W32(tp, TIMER_INT0_8125, 0x0000); ++ RTL_W32(tp, TIMER_INT1_8125, 0x0000); ++ RTL_W32(tp, TIMER_INT2_8125, 0x0000); ++ RTL_W32(tp, TIMER_INT3_8125, 0x0000); ++} ++ ++static void ++rtl8125_hw_clear_int_miti(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ switch (tp->HwSuppIntMitiVer) { ++ case 3: ++ case 6: ++ //IntMITI_0-IntMITI_31 ++ for (i=0xA00; i<0xB00; i+=4) ++ RTL_W32(tp, i, 0x0000); ++ break; ++ case 4: ++ case 5: ++ //IntMITI_0-IntMITI_15 ++ for (i = 0xA00; i < 0xA80; i += 4) ++ RTL_W32(tp, i, 0x0000); ++ ++ if (tp->HwSuppIntMitiVer == 5) ++ RTL_W8(tp, INT_CFG0_8125, RTL_R8(tp, INT_CFG0_8125) & ++ ~(INT_CFG0_TIMEOUT0_BYPASS_8125 | ++ INT_CFG0_MITIGATION_BYPASS_8125 | ++ INT_CFG0_RDU_BYPASS_8126)); ++ else ++ RTL_W8(tp, INT_CFG0_8125, RTL_R8(tp, INT_CFG0_8125) & ++ ~(INT_CFG0_TIMEOUT0_BYPASS_8125 | INT_CFG0_MITIGATION_BYPASS_8125)); ++ ++ RTL_W16(tp, INT_CFG1_8125, 0x0000); ++ break; ++ } ++} ++ ++static bool ++rtl8125_vec_2_tx_q_num( ++ struct rtl8125_private *tp, ++ u32 messageId, ++ u32 *qnum ++) ++{ ++ u32 whichQ = 0xffffffff; ++ bool rc = false; ++ ++ switch (tp->HwSuppIsrVer) { ++ case 2: ++ if (messageId == 0x10) ++ whichQ = 0; ++ else if (messageId == 0x12 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 3: ++ case 4: ++ if (messageId == 0x00) ++ whichQ = 0; ++ else if (messageId == 0x01 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 5: ++ if (messageId == 0x10) ++ whichQ = 0; ++ else if (messageId == 0x11 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 6: ++ if (messageId == 0x08) ++ whichQ = 0; ++ else if (messageId == 0x09 && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ case 7: ++ if (messageId == 0x1B) ++ whichQ = 0; ++ else if (messageId == 0x1C && tp->num_tx_rings > 1) ++ whichQ = 1; ++ break; ++ } ++ ++ if (whichQ != 0xffffffff) { ++ *qnum = whichQ; ++ rc = true; ++ } ++ ++ return rc; ++} ++ ++static bool ++rtl8125_vec_2_rx_q_num( ++ struct rtl8125_private *tp, ++ u32 messageId, ++ u32 *qnum ++) ++{ ++ u32 whichQ = 0xffffffff; ++ bool rc = false; ++ ++ switch (tp->HwSuppIsrVer) { ++ case 2: ++ case 3: ++ case 4: ++ case 5: ++ case 6: ++ case 7: ++ if (messageId < tp->HwSuppNumRxQueues) ++ whichQ = messageId; ++ break; ++ } ++ ++ if 
(whichQ != 0xffffffff) { ++ *qnum = whichQ; ++ rc = true; ++ } ++ ++ return rc; ++} ++ ++void ++rtl8125_hw_set_timer_int(struct rtl8125_private *tp, ++ u32 message_id, ++ u8 timer_intmiti_val) ++{ ++ u32 qnum; ++ ++ switch (tp->HwSuppIntMitiVer) { ++ case 4: ++ case 5: ++ case 6: ++#ifdef ENABLE_LIB_SUPPORT ++ if (message_id < R8125_MAX_RX_QUEUES_VEC_V3) ++ timer_intmiti_val = 0; ++#else ++ if ((tp->HwCurrIsrVer == 2) && (message_id < R8125_MAX_RX_QUEUES_VEC_V3)) ++ timer_intmiti_val = 0; ++#endif //ENABLE_LIB_SUPPORT ++ //ROK ++ if (rtl8125_vec_2_rx_q_num(tp, message_id, &qnum)) ++ RTL_W8(tp,INT_MITI_V2_0_RX + 8 * qnum, timer_intmiti_val); ++ //TOK ++ if (rtl8125_vec_2_tx_q_num(tp, message_id, &qnum)) ++ RTL_W8(tp,INT_MITI_V2_0_TX + 8 * qnum, timer_intmiti_val); ++ break; ++ } ++} ++ ++void ++rtl8125_hw_reset(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_lib_reset_prepare(tp); ++ ++ /* Disable interrupts */ ++ rtl8125_irq_mask_and_ack(tp); ++ ++ rtl8125_hw_clear_timer_int(dev); ++ ++ rtl8125_nic_reset(dev); ++} ++ ++static unsigned int ++rtl8125_xmii_reset_pending(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ unsigned int retval; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ retval = rtl8125_mdio_read(tp, MII_BMCR) & BMCR_RESET; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return retval; ++} ++ ++static unsigned int ++_rtl8125_xmii_link_ok(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 status; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (status == UINT_MAX) ++ return 0; ++ ++ return (status & LinkStatus) ? 1 : 0; ++} ++ ++static unsigned int ++rtl8125_xmii_link_ok(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned int link_state; ++ ++ link_state = _rtl8125_xmii_link_ok(dev); ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp) && ++ link_state == R8125_LINK_STATE_ON) ++ return rtl8125_fiber_link_ok(dev); ++#else ++ (void)tp; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ return link_state; ++} ++ ++static int ++rtl8125_wait_phy_reset_complete(struct rtl8125_private *tp) ++{ ++ int i, val; ++ ++ for (i = 0; i < 2500; i++) { ++ val = rtl8125_mdio_read(tp, MII_BMCR) & BMCR_RESET; ++ if (!val) ++ return 0; ++ ++ mdelay(1); ++ } ++ ++ return -1; ++} ++ ++static void ++rtl8125_xmii_reset_enable(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int ret; ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ rtl8125_mdio_write(tp, MII_ADVERTISE, rtl8125_mdio_read(tp, MII_ADVERTISE) & ++ ~(ADVERTISE_10HALF | ADVERTISE_10FULL | ++ ADVERTISE_100HALF | ADVERTISE_100FULL)); ++ rtl8125_mdio_write(tp, MII_CTRL1000, rtl8125_mdio_read(tp, MII_CTRL1000) & ++ ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL)); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA5D4, rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D4) & ++ ~RTK_ADVERTISE_2500FULL); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_RESET | BMCR_ANENABLE); ++ ++ ret = rtl8125_wait_phy_reset_complete(tp); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ if (ret != 0 && netif_msg_link(tp)) ++ printk(KERN_ERR "%s: PHY reset failed.\n", dev->name); ++} ++ ++void ++rtl8125_init_ring_indexes(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ 
struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ ring->dirty_tx = ring->cur_tx = 0; ++ ring->NextHwDesCloPtr = 0; ++ ring->BeginHwDesCloPtr = 0; ++ ring->index = i; ++ ring->priv = tp; ++ ring->netdev = tp->dev; ++ ++ /* reset BQL for queue */ ++ netdev_tx_reset_queue(txring_txq(ring)); ++ } ++ ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ring->dirty_rx = ring->cur_rx = 0; ++ ring->index = i; ++ ring->priv = tp; ++ ring->netdev = tp->dev; ++ } ++ ++#ifdef ENABLE_LIB_SUPPORT ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ struct rtl8125_ring *ring = &tp->lib_tx_ring[i]; ++ ring->direction = RTL8125_CH_DIR_TX; ++ ring->queue_num = i; ++ ring->private = tp; ++ } ++ ++ for (i = 0; i < tp->HwSuppNumRxQueues; i++) { ++ struct rtl8125_ring *ring = &tp->lib_rx_ring[i]; ++ ring->direction = RTL8125_CH_DIR_RX; ++ ring->queue_num = i; ++ ring->private = tp; ++ } ++#endif ++} ++ ++static void ++rtl8125_issue_offset_99_event(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE09A, rtl8125_mac_ocp_read(tp, 0xE09A) | BIT_0); ++} ++ ++#ifdef ENABLE_DASH_SUPPORT ++static void ++rtl8125_check_and_enable_dash_interrupt(struct rtl8125_private *tp) ++{ ++ if (!HW_DASH_SUPPORT_IPC2(tp)) ++ return; ++ ++ if (!tp->DASH) ++ return; ++ ++ // ++ // even disconnected, enable dash interrupt mask bits for in-band/out-band communication ++ // ++ rtl8125_enable_dash2_interrupt(tp); ++ if (tp->HwCurrIsrVer > 1) { ++ RTL_W32(tp, IMR_V2_SET_REG_8125, ISRIMR_V4_LAYER2_INTR_STS); ++ RTL_W32(tp, IMR_V4_L2_SET_REG_8125, ISRIMR_V4_L2_IPC2); ++ } else { ++ RTL_W16(tp, tp->imr_reg[0], ISRIMR_DASH_INTR_EN); ++ } ++} ++#endif ++ ++static int rtl8125_enable_eee_plus(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE080, rtl8125_mac_ocp_read(tp, 0xE080)|BIT_1); ++ ++ return 0; ++} ++ ++static int rtl8125_disable_eee_plus(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE080, rtl8125_mac_ocp_read(tp, 0xE080)&~BIT_1); ++ ++ return 0; ++} ++ ++static void rtl8125_enable_double_vlan(struct rtl8125_private *tp) ++{ ++ RTL_W16(tp, DOUBLE_VLAN_CONFIG, 0xf002); ++} ++ ++static void rtl8125_disable_double_vlan(struct rtl8125_private *tp) ++{ ++ RTL_W16(tp, DOUBLE_VLAN_CONFIG, 0); ++} ++ ++static void ++rtl8125_set_pfm_patch(struct rtl8125_private *tp, bool enable) ++{ ++ if (!tp->RequiredPfmPatch) ++ goto exit; ++ ++ if (enable) { ++ rtl8125_set_mac_ocp_bit(tp, 0xD3F0, BIT_0); ++ rtl8125_set_mac_ocp_bit(tp, 0xD3F2, BIT_0); ++ rtl8125_set_mac_ocp_bit(tp, 0xE85A, BIT_6); ++ } else { ++ rtl8125_clear_mac_ocp_bit(tp, 0xD3F0, BIT_0); ++ rtl8125_clear_mac_ocp_bit(tp, 0xD3F2, BIT_0); ++ rtl8125_clear_mac_ocp_bit(tp, 0xE85A, BIT_6); ++ } ++ ++exit: ++ return; ++} ++ ++static void ++rtl8125_link_on_patch(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ u32 status; ++ ++ rtl8125_hw_config(dev); ++ ++ if ((tp->mcfg == CFG_METHOD_2) && ++ netif_running(dev)) { ++ if (rtl8125_get_phy_status(tp)&FullDup) ++ RTL_W32(tp, TxConfig, (RTL_R32(tp, TxConfig) | (BIT_24 | BIT_25)) & ~BIT_19); ++ else ++ RTL_W32(tp, TxConfig, (RTL_R32(tp, TxConfig) | BIT_25) & ~(BIT_19 | BIT_24)); ++ } ++ ++ status = rtl8125_get_phy_status(tp); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ if (status & _10bps) ++ rtl8125_enable_eee_plus(tp); ++ 
break; ++ default: ++ break; ++ } ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, (status & _10bps) ? 1 : 0); ++ ++ rtl8125_hw_start(dev); ++ ++ netif_carrier_on(dev); ++ ++ netif_tx_wake_all_queues(dev); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ tp->phy_reg_aner = rtl8125_mdio_read(tp, MII_EXPANSION); ++ tp->phy_reg_anlpar = rtl8125_mdio_read(tp, MII_LPA); ++ tp->phy_reg_gbsr = rtl8125_mdio_read(tp, MII_STAT1000); ++ tp->phy_reg_status_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D6); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer == 3) ++ rtl8125_set_phy_local_time(tp); ++#endif // ENABLE_PTP_SUPPORT ++} ++ ++static void ++rtl8125_link_down_patch(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ tp->phy_reg_aner = 0; ++ tp->phy_reg_anlpar = 0; ++ tp->phy_reg_gbsr = 0; ++ tp->phy_reg_status_2500 = 0; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ rtl8125_disable_eee_plus(tp); ++ break; ++ default: ++ break; ++ } ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, 1); ++ ++ netif_carrier_off(dev); ++ ++ netif_tx_disable(dev); ++ ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_tx_clear(tp); ++ ++ rtl8125_rx_clear(tp); ++ ++ rtl8125_init_ring(dev); ++ ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ ++ //rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_check_and_enable_dash_interrupt(tp); ++#endif ++} ++ ++static void ++_rtl8125_check_link_status(struct net_device *dev, unsigned int link_state) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (link_state != R8125_LINK_STATE_OFF && ++ link_state != R8125_LINK_STATE_ON) ++ link_state = tp->link_ok(dev); ++ ++ if (link_state == R8125_LINK_STATE_ON) { ++ rtl8125_link_on_patch(dev); ++ ++ if (netif_msg_ifup(tp)) ++ printk(KERN_INFO PFX "%s: link up\n", dev->name); ++ } else { ++ if (netif_msg_ifdown(tp)) ++ printk(KERN_INFO PFX "%s: link down\n", dev->name); ++ ++ rtl8125_link_down_patch(dev); ++ } ++} ++ ++static void ++rtl8125_check_link_status(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned int link_status_on; ++ ++ tp->resume_not_chg_speed = 0; ++ ++ link_status_on = tp->link_ok(dev); ++ if (netif_carrier_ok(dev) == link_status_on) ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ else ++ _rtl8125_check_link_status(dev, link_status_on); ++} ++ ++static bool ++rtl8125_is_autoneg_mode_valid(u32 autoneg) ++{ ++ switch(autoneg) { ++ case AUTONEG_ENABLE: ++ case AUTONEG_DISABLE: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++rtl8125_is_speed_mode_valid(u32 speed) ++{ ++ switch(speed) { ++ case SPEED_2500: ++ case SPEED_1000: ++ case SPEED_100: ++ case SPEED_10: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static bool ++rtl8125_is_duplex_mode_valid(u8 duplex) ++{ ++ switch(duplex) { ++ case DUPLEX_FULL: ++ case DUPLEX_HALF: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++static void ++rtl8125_set_link_option(struct rtl8125_private *tp, ++ u8 autoneg, ++ u32 speed, ++ u8 duplex, ++ enum rtl8125_fc_mode fc) ++{ ++ u64 adv; ++ ++ if (!rtl8125_is_speed_mode_valid(speed)) ++ 
speed = SPEED_2500; ++ ++ if (!rtl8125_is_duplex_mode_valid(duplex)) ++ duplex = DUPLEX_FULL; ++ ++ if (!rtl8125_is_autoneg_mode_valid(autoneg)) ++ autoneg = AUTONEG_ENABLE; ++ ++ speed = min(speed, tp->HwSuppMaxPhyLinkSpeed); ++ ++ adv = 0; ++ switch(speed) { ++ case SPEED_2500: ++ adv |= ADVERTISED_2500baseX_Full; ++ fallthrough; ++ default: ++ adv |= (ADVERTISED_10baseT_Half | ADVERTISED_10baseT_Full | ++ ADVERTISED_100baseT_Half | ADVERTISED_100baseT_Full | ++ ADVERTISED_1000baseT_Half | ADVERTISED_1000baseT_Full); ++ break; ++ } ++ ++ tp->autoneg = autoneg; ++ tp->speed = speed; ++ tp->duplex = duplex; ++ tp->advertising = adv; ++ tp->fcpause = fc; ++} ++ ++/* ++static void ++rtl8125_enable_ocp_phy_power_saving(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 val; ++ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) { ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC416); ++ if (val != 0x0050) { ++ rtl8125_set_phy_mcu_patch_request(tp); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0050); ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ } ++ } ++} ++*/ ++ ++static void ++rtl8125_disable_ocp_phy_power_saving(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 val; ++ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) { ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC416); ++ if (val != 0x0500) { ++ rtl8125_set_phy_mcu_patch_request(tp); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xC416, 0x0500); ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ } ++ } ++} ++ ++static void ++rtl8125_wait_ll_share_fifo_ready(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ for (i = 0; i < 10; i++) { ++ udelay(100); ++ if (RTL_R16(tp, 0xD2) & BIT_9) ++ break; ++ } ++} ++ ++static void ++rtl8125_disable_pci_offset_99(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xE032, rtl8125_mac_ocp_read(tp, 0xE032) & ~(BIT_0 | BIT_1)); ++ ++ rtl8125_csi_fun0_write_byte(tp, 0x99, 0x00); ++} ++ ++static void ++rtl8125_enable_pci_offset_99(struct rtl8125_private *tp) ++{ ++ u32 csi_tmp; ++ ++ rtl8125_csi_fun0_write_byte(tp, 0x99, tp->org_pci_offset_99); ++ ++ csi_tmp = rtl8125_mac_ocp_read(tp, 0xE032); ++ csi_tmp &= ~(BIT_0 | BIT_1); ++ if (tp->org_pci_offset_99 & (BIT_5 | BIT_6)) ++ csi_tmp |= BIT_1; ++ if (tp->org_pci_offset_99 & BIT_2) ++ csi_tmp |= BIT_0; ++ rtl8125_mac_ocp_write(tp, 0xE032, csi_tmp); ++} ++ ++static void ++rtl8125_init_pci_offset_99(struct rtl8125_private *tp) ++{ ++ rtl8125_mac_ocp_write(tp, 0xCDD0, 0x9003); ++ rtl8125_set_mac_ocp_bit(tp, 0xE034, (BIT_15 | BIT_14)); ++ rtl8125_mac_ocp_write(tp, 0xCDD2, 0x889C); ++ rtl8125_mac_ocp_write(tp, 0xCDD8, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDD4, 0x8C30); ++ rtl8125_mac_ocp_write(tp, 0xCDDA, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDD6, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDDC, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDE8, 0x883E); ++ rtl8125_mac_ocp_write(tp, 0xCDEA, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDEC, 0x889C); ++ rtl8125_mac_ocp_write(tp, 0xCDEE, 0x9003); ++ rtl8125_mac_ocp_write(tp, 0xCDF0, 0x8C09); ++ rtl8125_mac_ocp_write(tp, 0xCDF2, 0x9003); ++ rtl8125_set_mac_ocp_bit(tp, 0xE032, BIT_14); ++ rtl8125_set_mac_ocp_bit(tp, 0xE0A2, BIT_0); ++ ++ rtl8125_enable_pci_offset_99(tp); ++} ++ ++static void 
++rtl8125_disable_pci_offset_180(struct rtl8125_private *tp) ++{ ++ rtl8125_clear_mac_ocp_bit(tp, 0xE092, 0x00FF); ++} ++ ++static void ++rtl8125_enable_pci_offset_180(struct rtl8125_private *tp) ++{ ++ rtl8125_clear_mac_ocp_bit(tp, 0xE094, 0xFF00); ++ ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE092, 0x00FF, BIT_2); ++} ++ ++static void ++rtl8125_init_pci_offset_180(struct rtl8125_private *tp) ++{ ++ rtl8125_enable_pci_offset_180(tp); ++} ++ ++static void ++rtl8125_set_pci_99_exit_driver_para(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->org_pci_offset_99 & BIT_2) ++ rtl8125_issue_offset_99_event(tp); ++ rtl8125_disable_pci_offset_99(tp); ++} ++ ++static void ++rtl8125_enable_cfg9346_write(struct rtl8125_private *tp) ++{ ++ RTL_W8(tp, Cfg9346, RTL_R8(tp, Cfg9346) | Cfg9346_Unlock); ++} ++ ++static void ++rtl8125_disable_cfg9346_write(struct rtl8125_private *tp) ++{ ++ RTL_W8(tp, Cfg9346, RTL_R8(tp, Cfg9346) & ~Cfg9346_Unlock); ++} ++ ++static void ++rtl8125_enable_exit_l1_mask(struct rtl8125_private *tp) ++{ ++ //(1)ERI(0xD4)(OCP 0xC0AC).bit[7:12]=6'b111111, L1 Mask ++ rtl8125_set_mac_ocp_bit(tp, 0xC0AC, (BIT_7 | BIT_8 | BIT_9 | BIT_10 | BIT_11 | BIT_12)); ++} ++ ++static void ++rtl8125_disable_exit_l1_mask(struct rtl8125_private *tp) ++{ ++ //(1)ERI(0xD4)(OCP 0xC0AC).bit[7:12]=6'b000000, L1 Mask ++ rtl8125_clear_mac_ocp_bit(tp, 0xC0AC, (BIT_7 | BIT_8 | BIT_9 | BIT_10 | BIT_11 | BIT_12)); ++} ++ ++static void ++rtl8125_enable_extend_tally_couter(struct rtl8125_private *tp) ++{ ++ switch (tp->HwSuppExtendTallyCounterVer) { ++ case 1: ++ rtl8125_set_mac_ocp_bit(tp, 0xEA84, (BIT_1 | BIT_0)); ++ break; ++ } ++} ++ ++static void ++rtl8125_disable_extend_tally_couter(struct rtl8125_private *tp) ++{ ++ switch (tp->HwSuppExtendTallyCounterVer) { ++ case 1: ++ rtl8125_clear_mac_ocp_bit(tp, 0xEA84, (BIT_1 | BIT_0)); ++ break; ++ } ++} ++ ++static void ++rtl8125_enable_force_clkreq(struct rtl8125_private *tp, bool enable) ++{ ++ if (enable) ++ RTL_W8(tp, 0xF1, RTL_R8(tp, 0xF1) | BIT_7); ++ else ++ RTL_W8(tp, 0xF1, RTL_R8(tp, 0xF1) & ~BIT_7); ++} ++ ++static void ++rtl8125_enable_aspm_clkreq_lock(struct rtl8125_private *tp, bool enable) ++{ ++ bool unlock_cfg_wr; ++ ++ if ((RTL_R8(tp, Cfg9346) & Cfg9346_EEM_MASK) == Cfg9346_Unlock) ++ unlock_cfg_wr = false; ++ else ++ unlock_cfg_wr = true; ++ ++ if (unlock_cfg_wr) ++ rtl8125_enable_cfg9346_write(tp); ++ ++ if (enable) { ++ RTL_W8(tp, Config2, RTL_R8(tp, Config2) | BIT_7); ++ RTL_W8(tp, Config5, RTL_R8(tp, Config5) | BIT_0); ++ } else { ++ RTL_W8(tp, Config2, RTL_R8(tp, Config2) & ~BIT_7); ++ RTL_W8(tp, Config5, RTL_R8(tp, Config5) & ~BIT_0); ++ } ++ ++ if (unlock_cfg_wr) ++ rtl8125_disable_cfg9346_write(tp); ++} ++ ++static void ++rtl8125_set_reg_oobs_en_sel(struct rtl8125_private *tp, bool enable) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ if (enable) ++ rtl8125_set_mac_ocp_bit(tp, 0xD434, BIT_1); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xD434, BIT_1); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_hw_d3_para(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W16(tp, RxMaxSize, RX_BUF_SIZE); ++ ++ rtl8125_enable_force_clkreq(tp, 0); ++ rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ ++ rtl8125_disable_exit_l1_mask(tp); ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ rtl8125_set_realwow_d3_para(dev); ++#endif ++ ++ rtl8125_set_pci_99_exit_driver_para(dev); ++ ++ /*disable ocp phy power 
saving*/ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ rtl8125_disable_ocp_phy_power_saving(dev); ++ ++ rtl8125_disable_rxdvgate(dev); ++ ++ rtl8125_disable_extend_tally_couter(tp); ++ ++ rtl8125_set_reg_oobs_en_sel(tp, false); ++} ++ ++static void ++rtl8125_enable_magic_packet(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ rtl8125_mac_ocp_write(tp, 0xC0B6, rtl8125_mac_ocp_read(tp, 0xC0B6) | BIT_0); ++ break; ++ } ++} ++static void ++rtl8125_disable_magic_packet(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ rtl8125_mac_ocp_write(tp, 0xC0B6, rtl8125_mac_ocp_read(tp, 0xC0B6) & ~BIT_0); ++ break; ++ } ++} ++ ++static void ++rtl8125_enable_linkchg_wakeup(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppLinkChgWakeUpVer) { ++ case 3: ++ RTL_W8(tp, Config3, RTL_R8(tp, Config3) | LinkUp); ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE0C6, (BIT_5 | BIT_3 | BIT_2), (BIT_4 | BIT_1 | BIT_0)); ++ break; ++ } ++} ++ ++static void ++rtl8125_disable_linkchg_wakeup(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppLinkChgWakeUpVer) { ++ case 3: ++ RTL_W8(tp, Config3, RTL_R8(tp, Config3) & ~LinkUp); ++ if (!(rtl8125_mac_ocp_read(tp, 0xE0C6) & BIT_0)) ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE0C6, (BIT_5 | BIT_3 | BIT_2 | BIT_1), BIT_4); ++ break; ++ } ++} ++ ++#define WAKE_ANY (WAKE_PHY | WAKE_MAGIC | WAKE_UCAST | WAKE_BCAST | WAKE_MCAST) ++ ++static u32 ++rtl8125_get_hw_wol(struct rtl8125_private *tp) ++{ ++ u8 options; ++ u32 csi_tmp; ++ u32 wol_opts = 0; ++ ++ if (disable_wol_support) ++ goto out; ++ ++ options = RTL_R8(tp, Config1); ++ if (!(options & PMEnable)) ++ goto out; ++ ++ options = RTL_R8(tp, Config3); ++ if (options & LinkUp) ++ wol_opts |= WAKE_PHY; ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ csi_tmp = rtl8125_mac_ocp_read(tp, 0xC0B6); ++ if (csi_tmp & BIT_0) ++ wol_opts |= WAKE_MAGIC; ++ break; ++ } ++ ++ options = RTL_R8(tp, Config5); ++ if (options & UWF) ++ wol_opts |= WAKE_UCAST; ++ if (options & BWF) ++ wol_opts |= WAKE_BCAST; ++ if (options & MWF) ++ wol_opts |= WAKE_MCAST; ++ ++out: ++ return wol_opts; ++} ++ ++static void ++rtl8125_enable_d0_speedup(struct rtl8125_private *tp) ++{ ++ u16 clearmask; ++ u16 setmask; ++ ++ if (FALSE == HW_SUPPORT_D0_SPEED_UP(tp)) ++ return; ++ ++ if (tp->D0SpeedUpSpeed == D0_SPEED_UP_SPEED_DISABLE) ++ return; ++ ++ if (tp->HwSuppD0SpeedUpVer == 1 || tp->HwSuppD0SpeedUpVer == 2) { ++ //speed up speed ++ clearmask = (BIT_10 | BIT_9 | BIT_8 | BIT_7); ++ if (tp->D0SpeedUpSpeed == D0_SPEED_UP_SPEED_2500) ++ setmask = BIT_7; ++ else ++ setmask = 0; ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE10A, clearmask, setmask); ++ ++ //speed up flowcontrol ++ clearmask = (BIT_15 | BIT_14); ++ if (tp->HwSuppD0SpeedUpVer == 2) ++ clearmask |= BIT_13; ++ ++ if (tp->fcpause == rtl8125_fc_full) { ++ setmask = (BIT_15 | BIT_14); ++ if (tp->HwSuppD0SpeedUpVer == 2) ++ setmask |= BIT_13; ++ } else ++ setmask = 0; ++ rtl8125_clear_set_mac_ocp_bit(tp, 0xE860, clearmask, setmask); ++ } ++ ++ RTL_W8(tp, 0xD0, RTL_R8(tp, 0xD0) | BIT_3); ++} ++ ++static void ++rtl8125_disable_d0_speedup(struct rtl8125_private *tp) ++{ ++ if (FALSE == HW_SUPPORT_D0_SPEED_UP(tp)) ++ return; ++ ++ RTL_W8(tp, 0xD0, 
RTL_R8(tp, 0xD0) & ~BIT_3); ++} ++ ++static void ++rtl8125_set_hw_wol(struct net_device *dev, u32 wolopts) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i,tmp; ++ static struct { ++ u32 opt; ++ u16 reg; ++ u8 mask; ++ } cfg[] = { ++ { WAKE_PHY, Config3, LinkUp }, ++ { WAKE_UCAST, Config5, UWF }, ++ { WAKE_BCAST, Config5, BWF }, ++ { WAKE_MCAST, Config5, MWF }, ++ { WAKE_ANY, Config5, LanWake }, ++ { WAKE_MAGIC, Config3, MagicPacket }, ++ }; ++ ++ switch (tp->HwSuppMagicPktVer) { ++ case WAKEUP_MAGIC_PACKET_V3: ++ default: ++ tmp = ARRAY_SIZE(cfg) - 1; ++ ++ if (wolopts & WAKE_MAGIC) ++ rtl8125_enable_magic_packet(dev); ++ else ++ rtl8125_disable_magic_packet(dev); ++ break; ++ } ++ ++ rtl8125_enable_cfg9346_write(tp); ++ ++ for (i = 0; i < tmp; i++) { ++ u8 options = RTL_R8(tp, cfg[i].reg) & ~cfg[i].mask; ++ if (wolopts & cfg[i].opt) ++ options |= cfg[i].mask; ++ RTL_W8(tp, cfg[i].reg, options); ++ } ++ ++ switch (tp->HwSuppLinkChgWakeUpVer) { ++ case 3: ++ if (wolopts & WAKE_PHY) ++ rtl8125_enable_linkchg_wakeup(dev); ++ else ++ rtl8125_disable_linkchg_wakeup(dev); ++ break; ++ } ++ ++ rtl8125_disable_cfg9346_write(tp); ++} ++ ++static void ++rtl8125_phy_restart_nway(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_ANENABLE | BMCR_ANRESTART); ++} ++ ++static void ++rtl8125_phy_setup_force_mode(struct net_device *dev, u32 speed, u8 duplex) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 bmcr_true_force = 0; ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ if ((speed == SPEED_10) && (duplex == DUPLEX_HALF)) { ++ bmcr_true_force = BMCR_SPEED10; ++ } else if ((speed == SPEED_10) && (duplex == DUPLEX_FULL)) { ++ bmcr_true_force = BMCR_SPEED10 | BMCR_FULLDPLX; ++ } else if ((speed == SPEED_100) && (duplex == DUPLEX_HALF)) { ++ bmcr_true_force = BMCR_SPEED100; ++ } else if ((speed == SPEED_100) && (duplex == DUPLEX_FULL)) { ++ bmcr_true_force = BMCR_SPEED100 | BMCR_FULLDPLX; ++ } else { ++ netif_err(tp, drv, dev, "Failed to set phy force mode!\n"); ++ return; ++ } ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, bmcr_true_force); ++} ++ ++static void ++rtl8125_set_pci_pme(struct rtl8125_private *tp, int set) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ u16 pmc; ++ ++ if (!pdev->pm_cap) ++ return; ++ ++ pci_read_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, &pmc); ++ pmc |= PCI_PM_CTRL_PME_STATUS; ++ if (set) ++ pmc |= PCI_PM_CTRL_PME_ENABLE; ++ else ++ pmc &= ~PCI_PM_CTRL_PME_ENABLE; ++ pci_write_config_word(pdev, pdev->pm_cap + PCI_PM_CTRL, pmc); ++} ++ ++static void ++rtl8125_enable_giga_lite(struct rtl8125_private *tp, u64 adv) ++{ ++ if (adv & ADVERTISED_1000baseT_Full) ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA428, BIT_9); ++ else ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_9); ++ ++ if (adv & ADVERTISED_2500baseX_Full) ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA5EA, BIT_0); ++ else ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5EA, BIT_0); ++} ++ ++static void ++rtl8125_disable_giga_lite(struct rtl8125_private *tp) ++{ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_9); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5EA, BIT_0); ++} ++ ++static int ++rtl8125_set_wol_link_speed(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ int auto_nego = 0; ++ int giga_ctrl; ++ int ctrl_2500; ++ u64 adv; ++ u16 anlpar; ++ 
u16 gbsr; ++ u16 status_2500; ++ u16 aner; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (tp->autoneg != AUTONEG_ENABLE) ++ goto exit; ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ auto_nego = rtl8125_mdio_read(tp, MII_ADVERTISE); ++ auto_nego &= ~(ADVERTISE_10HALF | ADVERTISE_10FULL ++ | ADVERTISE_100HALF | ADVERTISE_100FULL); ++ ++ giga_ctrl = rtl8125_mdio_read(tp, MII_CTRL1000); ++ giga_ctrl &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL); ++ ++ ctrl_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D4); ++ ctrl_2500 &= ~RTK_ADVERTISE_2500FULL; ++ ++ aner = tp->phy_reg_aner; ++ anlpar = tp->phy_reg_anlpar; ++ gbsr = tp->phy_reg_gbsr; ++ status_2500 = tp->phy_reg_status_2500; ++ if (tp->link_ok(dev)) { ++ aner = rtl8125_mdio_read(tp, MII_EXPANSION); ++ anlpar = rtl8125_mdio_read(tp, MII_LPA); ++ gbsr = rtl8125_mdio_read(tp, MII_STAT1000); ++ status_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D6); ++ } ++ ++ adv = tp->advertising; ++ if ((aner | anlpar | gbsr | status_2500) == 0) { ++ int auto_nego_tmp = 0; ++ if (adv & ADVERTISED_10baseT_Half) ++ auto_nego_tmp |= ADVERTISE_10HALF; ++ if (adv & ADVERTISED_10baseT_Full) ++ auto_nego_tmp |= ADVERTISE_10FULL; ++ if (adv & ADVERTISED_100baseT_Half) ++ auto_nego_tmp |= ADVERTISE_100HALF; ++ if (adv & ADVERTISED_100baseT_Full) ++ auto_nego_tmp |= ADVERTISE_100FULL; ++ ++ if (auto_nego_tmp == 0) ++ goto exit; ++ ++ auto_nego |= auto_nego_tmp; ++ goto skip_check_lpa; ++ } ++ if (!(aner & EXPANSION_NWAY)) ++ goto exit; ++ ++ if ((adv & ADVERTISED_10baseT_Half) && (anlpar & LPA_10HALF)) ++ auto_nego |= ADVERTISE_10HALF; ++ else if ((adv & ADVERTISED_10baseT_Full) && (anlpar & LPA_10FULL)) ++ auto_nego |= ADVERTISE_10FULL; ++ else if ((adv & ADVERTISED_100baseT_Half) && (anlpar & LPA_100HALF)) ++ auto_nego |= ADVERTISE_100HALF; ++ else if ((adv & ADVERTISED_100baseT_Full) && (anlpar & LPA_100FULL)) ++ auto_nego |= ADVERTISE_100FULL; ++ else if (adv & ADVERTISED_1000baseT_Half && (gbsr & LPA_1000HALF)) ++ giga_ctrl |= ADVERTISE_1000HALF; ++ else if (adv & ADVERTISED_1000baseT_Full && (gbsr & LPA_1000FULL)) ++ giga_ctrl |= ADVERTISE_1000FULL; ++ else if (adv & ADVERTISED_2500baseX_Full && (status_2500 & RTK_LPA_ADVERTISE_2500FULL)) ++ ctrl_2500 |= RTK_ADVERTISE_2500FULL; ++ else ++ goto exit; ++ ++skip_check_lpa: ++ if (tp->DASH) ++ auto_nego |= (ADVERTISE_100FULL | ADVERTISE_100HALF | ADVERTISE_10HALF | ADVERTISE_10FULL); ++ ++#ifdef CONFIG_DOWN_SPEED_100 ++ auto_nego |= (ADVERTISE_100FULL | ADVERTISE_100HALF | ADVERTISE_10HALF | ADVERTISE_10FULL); ++#endif ++ ++ rtl8125_mdio_write(tp, MII_ADVERTISE, auto_nego); ++ rtl8125_mdio_write(tp, MII_CTRL1000, giga_ctrl); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA5D4, ctrl_2500); ++ ++ rtl8125_disable_giga_lite(tp); ++ ++ rtl8125_phy_restart_nway(dev); ++ ++exit: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return auto_nego; ++} ++ ++static bool ++rtl8125_keep_wol_link_speed(struct net_device *dev, u8 from_suspend) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (from_suspend && tp->link_ok(dev) && (tp->wol_opts & WAKE_PHY)) ++ return 1; ++ ++ if (!from_suspend && tp->resume_not_chg_speed) ++ return 1; ++ ++ return 0; ++} ++static void ++rtl8125_powerdown_pll(struct net_device *dev, u8 from_suspend) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* Reboot not set wol link speed */ ++ if (system_state == SYSTEM_RESTART) ++ return; ++ ++ tp->check_keep_link_speed = 0; ++ if (tp->wol_enabled == WOL_ENABLED || tp->DASH || tp->EnableKCPOffload) { ++ int auto_nego; ++ 
++ rtl8125_set_hw_wol(dev, tp->wol_opts); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ RTL_W8(tp, Config2, RTL_R8(tp, Config2) | PMSTS_En); ++ rtl8125_disable_cfg9346_write(tp); ++ ++ /* Enable the PME and clear the status */ ++ rtl8125_set_pci_pme(tp, 1); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ return; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ if (rtl8125_keep_wol_link_speed(dev, from_suspend)) { ++ tp->check_keep_link_speed = 1; ++ } else { ++ if (tp->D0SpeedUpSpeed != D0_SPEED_UP_SPEED_DISABLE) { ++ rtl8125_enable_d0_speedup(tp); ++ tp->check_keep_link_speed = 1; ++ } ++ ++ auto_nego = rtl8125_set_wol_link_speed(dev); ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, ++ (auto_nego & (ADVERTISE_10HALF | ADVERTISE_10FULL)) ? ++ 1 : 0); ++ } ++ ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | AcceptBroadcast | AcceptMulticast | AcceptMyPhys); ++ ++ return; ++ } ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ return; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ if (tp->DASH) ++ return; ++ ++ rtl8125_phy_power_down(dev); ++ ++ if (!tp->HwIcVerUnknown) ++ RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) & ~BIT_7); ++ ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) & ~BIT_6); ++} ++ ++static void rtl8125_powerup_pll(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ RTL_W8(tp, PMCH, RTL_R8(tp, PMCH) | BIT_7 | BIT_6); ++ ++ if (tp->resume_not_chg_speed) ++ return; ++ ++ rtl8125_phy_power_up(dev); ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static void ++rtl8125_get_wol(struct net_device *dev, ++ struct ethtool_wolinfo *wol) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 options; ++ ++ wol->wolopts = 0; ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT || disable_wol_support) { ++ wol->supported = 0; ++ return; ++ } else { ++ wol->supported = WAKE_ANY; ++ } ++ ++ options = RTL_R8(tp, Config1); ++ if (!(options & PMEnable)) ++ return; ++ ++ wol->wolopts = tp->wol_opts; ++} ++ ++static int ++rtl8125_set_wol(struct net_device *dev, ++ struct ethtool_wolinfo *wol) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT || disable_wol_support) ++ return -EOPNOTSUPP; ++ ++ tp->wol_opts = wol->wolopts; ++ ++ tp->wol_enabled = (tp->wol_opts) ? 
WOL_ENABLED : WOL_DISABLED; ++ ++ device_set_wakeup_enable(tp_to_dev(tp), wol->wolopts); ++ ++ return 0; ++} ++ ++static void ++rtl8125_get_drvinfo(struct net_device *dev, ++ struct ethtool_drvinfo *info) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_fw *rtl_fw = tp->rtl_fw; ++ ++ strscpy(info->driver, MODULENAME, sizeof(info->driver)); ++ strscpy(info->version, RTL8125_VERSION, sizeof(info->version)); ++ strscpy(info->bus_info, pci_name(tp->pci_dev), sizeof(info->bus_info)); ++ info->regdump_len = R8125_REGS_DUMP_SIZE; ++ info->eedump_len = tp->eeprom_len; ++ BUILD_BUG_ON(sizeof(info->fw_version) < sizeof(rtl_fw->version)); ++ if (rtl_fw) ++ strscpy(info->fw_version, rtl_fw->version, ++ sizeof(info->fw_version)); ++} ++ ++static int ++rtl8125_get_regs_len(struct net_device *dev) ++{ ++ return R8125_REGS_DUMP_SIZE; ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static void ++rtl8125_set_d0_speedup_speed(struct rtl8125_private *tp) ++{ ++ if (FALSE == HW_SUPPORT_D0_SPEED_UP(tp)) ++ return; ++ ++ tp->D0SpeedUpSpeed = D0_SPEED_UP_SPEED_DISABLE; ++ if (tp->autoneg == AUTONEG_ENABLE) { ++ if (tp->speed == SPEED_2500) ++ tp->D0SpeedUpSpeed = D0_SPEED_UP_SPEED_2500; ++ else if (tp->speed == SPEED_1000) ++ tp->D0SpeedUpSpeed = D0_SPEED_UP_SPEED_1000; ++ } ++} ++ ++static int ++rtl8125_set_speed_xmii(struct net_device *dev, ++ u8 autoneg, ++ u32 speed, ++ u8 duplex, ++ u64 adv) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int auto_nego = 0; ++ int giga_ctrl = 0; ++ int ctrl_2500 = 0; ++ int rc = -EINVAL; ++ ++ if (!rtl8125_is_speed_mode_valid(speed)) { ++ speed = SPEED_2500; ++ duplex = DUPLEX_FULL; ++ adv |= tp->advertising; ++ } ++ ++ if (eee_giga_lite && (autoneg == AUTONEG_ENABLE)) ++ rtl8125_enable_giga_lite(tp, adv); ++ else ++ rtl8125_disable_giga_lite(tp); ++ ++ giga_ctrl = rtl8125_mdio_read(tp, MII_CTRL1000); ++ giga_ctrl &= ~(ADVERTISE_1000HALF | ADVERTISE_1000FULL); ++ ctrl_2500 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D4); ++ ctrl_2500 &= ~RTK_ADVERTISE_2500FULL; ++ ++ if (autoneg == AUTONEG_ENABLE) { ++ /*n-way force*/ ++ auto_nego = rtl8125_mdio_read(tp, MII_ADVERTISE); ++ auto_nego &= ~(ADVERTISE_10HALF | ADVERTISE_10FULL | ++ ADVERTISE_100HALF | ADVERTISE_100FULL | ++ ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM); ++ ++ if (adv & ADVERTISED_10baseT_Half) ++ auto_nego |= ADVERTISE_10HALF; ++ if (adv & ADVERTISED_10baseT_Full) ++ auto_nego |= ADVERTISE_10FULL; ++ if (adv & ADVERTISED_100baseT_Half) ++ auto_nego |= ADVERTISE_100HALF; ++ if (adv & ADVERTISED_100baseT_Full) ++ auto_nego |= ADVERTISE_100FULL; ++ if (adv & ADVERTISED_1000baseT_Half) ++ giga_ctrl |= ADVERTISE_1000HALF; ++ if (adv & ADVERTISED_1000baseT_Full) ++ giga_ctrl |= ADVERTISE_1000FULL; ++ if (adv & ADVERTISED_2500baseX_Full) ++ ctrl_2500 |= RTK_ADVERTISE_2500FULL; ++ ++ //flow control ++ if (tp->fcpause == rtl8125_fc_full) ++ auto_nego |= ADVERTISE_PAUSE_CAP | ADVERTISE_PAUSE_ASYM; ++ ++ tp->phy_auto_nego_reg = auto_nego; ++ tp->phy_1000_ctrl_reg = giga_ctrl; ++ ++ tp->phy_2500_ctrl_reg = ctrl_2500; ++ ++ rtl8125_mdio_write(tp, 0x1f, 0x0000); ++ rtl8125_mdio_write(tp, MII_ADVERTISE, auto_nego); ++ rtl8125_mdio_write(tp, MII_CTRL1000, giga_ctrl); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA5D4, ctrl_2500); ++ rtl8125_phy_restart_nway(dev); ++ } else { ++ /*true force*/ ++ if (speed == SPEED_10 || speed == SPEED_100) ++ rtl8125_phy_setup_force_mode(dev, speed, duplex); ++ else ++ goto out; ++ } ++ ++ tp->autoneg = autoneg; ++ tp->speed = speed; ++ tp->duplex = 
duplex; ++ tp->advertising = adv; ++ ++ rtl8125_set_d0_speedup_speed(tp); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ rtl8125_hw_fiber_phy_config(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ rc = 0; ++out: ++ return rc; ++} ++ ++static int ++rtl8125_set_speed(struct net_device *dev, ++ u8 autoneg, ++ u32 speed, ++ u8 duplex, ++ u64 adv) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret; ++ ++ if (tp->resume_not_chg_speed) ++ return 0; ++ ++ ret = tp->set_speed(dev, autoneg, speed, duplex, adv); ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static int ++rtl8125_set_settings(struct net_device *dev, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ struct ethtool_cmd *cmd ++#else ++ const struct ethtool_link_ksettings *cmd ++#endif ++ ) ++{ ++ int ret; ++ u8 autoneg; ++ u32 speed; ++ u8 duplex; ++ u64 supported = 0, advertising = 0; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ autoneg = cmd->autoneg; ++ speed = cmd->speed; ++ duplex = cmd->duplex; ++ supported = cmd->supported; ++ advertising = cmd->advertising; ++#else ++ const struct ethtool_link_settings *base = &cmd->base; ++ autoneg = base->autoneg; ++ speed = base->speed; ++ duplex = base->duplex; ++ ethtool_convert_link_mode_to_legacy_u32((u32*)&supported, ++ cmd->link_modes.supported); ++ ethtool_convert_link_mode_to_legacy_u32((u32*)&advertising, ++ cmd->link_modes.advertising); ++ if (test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.supported)) ++ supported |= ADVERTISED_2500baseX_Full; ++ if (test_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.advertising)) ++ advertising |= ADVERTISED_2500baseX_Full; ++#endif ++ if (advertising & ~supported) ++ return -EINVAL; ++ ++ ret = rtl8125_set_speed(dev, autoneg, speed, duplex, advertising); ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++static u32 ++rtl8125_get_tx_csum(struct net_device *dev) ++{ ++ u32 ret; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ret = ((dev->features & NETIF_F_IP_CSUM) != 0); ++#else ++ ret = ((dev->features & (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM)) != 0); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++ return ret; ++} ++ ++static u32 ++rtl8125_get_rx_csum(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 ret; ++ ++ ret = tp->cp_cmd & RxChkSum; ++ ++ return ret; ++} ++ ++static int ++rtl8125_set_tx_csum(struct net_device *dev, ++ u32 data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ return -EOPNOTSUPP; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ if (data) ++ dev->features |= NETIF_F_IP_CSUM; ++ else ++ dev->features &= ~NETIF_F_IP_CSUM; ++#else ++ if (data) ++ dev->features |= (NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++ else ++ dev->features &= ~(NETIF_F_IP_CSUM | NETIF_F_IPV6_CSUM); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++ return 0; ++} ++ ++static int ++rtl8125_set_rx_csum(struct net_device *dev, ++ u32 data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ return -EOPNOTSUPP; ++ ++ if (data) ++ tp->cp_cmd |= RxChkSum; ++ else ++ tp->cp_cmd &= ~RxChkSum; ++ ++ RTL_W16(tp, CPlusCmd, tp->cp_cmd); ++ ++ return 0; ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static u32 ++rtl8125_rx_desc_opts1(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case 
RX_DESC_RING_TYPE_3: ++ return READ_ONCE(((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts1); ++ case RX_DESC_RING_TYPE_4: ++ return READ_ONCE(((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts1); ++ default: ++ return READ_ONCE(desc->opts1); ++ } ++} ++ ++static u32 ++rtl8125_rx_desc_opts2(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return ((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts2; ++ case RX_DESC_RING_TYPE_4: ++ return ((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts2; ++ default: ++ return desc->opts2; ++ } ++} ++ ++#ifdef CONFIG_R8125_VLAN ++ ++static void ++rtl8125_clear_rx_desc_opts2(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ ((struct RxDescV3 *)desc)->RxDescNormalDDWord4.opts2 = 0; ++ break; ++ case RX_DESC_RING_TYPE_4: ++ ((struct RxDescV4 *)desc)->RxDescNormalDDWord2.opts2 = 0; ++ break; ++ default: ++ desc->opts2 = 0; ++ break; ++ } ++} ++ ++static inline u32 ++rtl8125_tx_vlan_tag(struct rtl8125_private *tp, ++ struct sk_buff *skb) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ return (tp->vlgrp && vlan_tx_tag_present(skb)) ? ++ TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(4,0,0) ++ return (vlan_tx_tag_present(skb)) ? ++ TxVlanTag | swab16(vlan_tx_tag_get(skb)) : 0x00; ++#else ++ return (skb_vlan_tag_present(skb)) ? ++ TxVlanTag | swab16(skb_vlan_tag_get(skb)) : 0x00; ++#endif ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++static void ++rtl8125_vlan_rx_register(struct net_device *dev, ++ struct vlan_group *grp) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->vlgrp = grp; ++ ++ if (tp->vlgrp) { ++ tp->rtl8125_rx_config |= (EnableInnerVlan | EnableOuterVlan); ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) | (EnableInnerVlan | EnableOuterVlan)) ++ } else { ++ tp->rtl8125_rx_config &= ~(EnableInnerVlan | EnableOuterVlan); ++ RTL_W32(tp, RxConfig, RTL_R32(tp, RxConfig) & ~(EnableInnerVlan | EnableOuterVlan)) ++ } ++} ++ ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++static void ++rtl8125_vlan_rx_kill_vid(struct net_device *dev, ++ unsigned short vid) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) ++ if (tp->vlgrp) ++ tp->vlgrp->vlan_devices[vid] = NULL; ++#else ++ vlan_group_set_device(tp->vlgrp, vid, NULL); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,21) ++} ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ ++static int ++rtl8125_rx_vlan_skb(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ struct sk_buff *skb) ++{ ++ u32 opts2 = le32_to_cpu(rtl8125_rx_desc_opts2(tp, desc)); ++ int ret = -1; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ if (tp->vlgrp && (opts2 & RxVlanTag)) { ++ rtl8125_rx_hwaccel_skb(skb, tp->vlgrp, ++ swab16(opts2 & 0xffff)); ++ ret = 0; ++ } ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(3,10,0) ++ if (opts2 & RxVlanTag) ++ __vlan_hwaccel_put_tag(skb, swab16(opts2 & 0xffff)); ++#else ++ if (opts2 & RxVlanTag) ++ __vlan_hwaccel_put_tag(skb, htons(ETH_P_8021Q), swab16(opts2 & 0xffff)); ++#endif ++ ++ rtl8125_clear_rx_desc_opts2(tp, desc); ++ return ret; ++} ++ ++#else /* !CONFIG_R8125_VLAN */ ++ ++static inline u32 ++rtl8125_tx_vlan_tag(struct rtl8125_private *tp, ++ struct sk_buff *skb) ++{ ++ return 0; ++} ++ ++static int ++rtl8125_rx_vlan_skb(struct rtl8125_private *tp, ++ struct RxDesc 
*desc, ++ struct sk_buff *skb) ++{ ++ return -1; ++} ++ ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++ ++static netdev_features_t rtl8125_fix_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ if (dev->mtu > MSS_MAX || dev->mtu > ETH_DATA_LEN) ++ features &= ~NETIF_F_ALL_TSO; ++#ifndef CONFIG_R8125_VLAN ++ features &= ~NETIF_F_ALL_CSUM; ++#endif ++ ++ return features; ++} ++ ++static int rtl8125_hw_set_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 rx_config; ++ ++ rx_config = RTL_R32(tp, RxConfig); ++ if (features & NETIF_F_RXALL) { ++ tp->rtl8125_rx_config |= (AcceptErr | AcceptRunt); ++ rx_config |= (AcceptErr | AcceptRunt); ++ } else { ++ tp->rtl8125_rx_config &= ~(AcceptErr | AcceptRunt); ++ rx_config &= ~(AcceptErr | AcceptRunt); ++ } ++ ++ if (features & NETIF_F_HW_VLAN_RX) { ++ tp->rtl8125_rx_config |= (EnableInnerVlan | EnableOuterVlan); ++ rx_config |= (EnableInnerVlan | EnableOuterVlan); ++ } else { ++ tp->rtl8125_rx_config &= ~(EnableInnerVlan | EnableOuterVlan); ++ rx_config &= ~(EnableInnerVlan | EnableOuterVlan); ++ } ++ ++ RTL_W32(tp, RxConfig, rx_config); ++ ++ if (features & NETIF_F_RXCSUM) ++ tp->cp_cmd |= RxChkSum; ++ else ++ tp->cp_cmd &= ~RxChkSum; ++ ++ RTL_W16(tp, CPlusCmd, tp->cp_cmd); ++ RTL_R16(tp, CPlusCmd); ++ ++ return 0; ++} ++ ++static int rtl8125_set_features(struct net_device *dev, ++ netdev_features_t features) ++{ ++ features &= NETIF_F_RXALL | NETIF_F_RXCSUM | NETIF_F_HW_VLAN_RX; ++ ++ rtl8125_hw_set_features(dev, features); ++ ++ return 0; ++} ++ ++#endif ++ ++static u8 rtl8125_get_mdi_status(struct rtl8125_private *tp) ++{ ++ if (!tp->link_ok(tp->dev)) ++ return ETH_TP_MDI_INVALID; ++ ++ if (rtl8125_mdio_direct_read_phy_ocp(tp, 0xA444) & BIT_1) ++ return ETH_TP_MDI; ++ else ++ return ETH_TP_MDI_X; ++} ++ ++static void rtl8125_gset_xmii(struct net_device *dev, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ struct ethtool_cmd *cmd ++#else ++ struct ethtool_link_ksettings *cmd ++#endif ++ ) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 aner = tp->phy_reg_aner; ++ u16 anlpar = tp->phy_reg_anlpar; ++ u16 gbsr = tp->phy_reg_gbsr; ++ u16 status_2500 = tp->phy_reg_status_2500; ++ u64 lpa_adv = 0; ++ u32 status; ++ u8 autoneg, duplex; ++ u32 speed = 0; ++ u16 bmcr; ++ u64 supported, advertising; ++ unsigned long flags; ++ u8 report_lpa = 0; ++ ++ supported = SUPPORTED_10baseT_Half | ++ SUPPORTED_10baseT_Full | ++ SUPPORTED_100baseT_Half | ++ SUPPORTED_100baseT_Full | ++ SUPPORTED_1000baseT_Full | ++ SUPPORTED_2500baseX_Full | ++ SUPPORTED_Autoneg | ++ SUPPORTED_TP | ++ SUPPORTED_Pause | ++ SUPPORTED_Asym_Pause; ++ ++ if (!HW_SUPP_PHY_LINK_SPEED_2500M(tp)) ++ supported &= ~SUPPORTED_2500baseX_Full; ++ ++ advertising = tp->advertising; ++ if (tp->phy_auto_nego_reg || tp->phy_1000_ctrl_reg || ++ tp->phy_2500_ctrl_reg) { ++ advertising = 0; ++ if (tp->phy_auto_nego_reg & ADVERTISE_10HALF) ++ advertising |= ADVERTISED_10baseT_Half; ++ if (tp->phy_auto_nego_reg & ADVERTISE_10FULL) ++ advertising |= ADVERTISED_10baseT_Full; ++ if (tp->phy_auto_nego_reg & ADVERTISE_100HALF) ++ advertising |= ADVERTISED_100baseT_Half; ++ if (tp->phy_auto_nego_reg & ADVERTISE_100FULL) ++ advertising |= ADVERTISED_100baseT_Full; ++ if (tp->phy_1000_ctrl_reg & ADVERTISE_1000FULL) ++ advertising |= ADVERTISED_1000baseT_Full; ++ if (tp->phy_2500_ctrl_reg & RTK_ADVERTISE_2500FULL) ++ advertising |= ADVERTISED_2500baseX_Full; ++ } ++ ++ 
r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ bmcr = rtl8125_mdio_read(tp, MII_BMCR); ++ if (bmcr & BMCR_ANENABLE) { ++ autoneg = AUTONEG_ENABLE; ++ advertising |= ADVERTISED_Autoneg; ++ } else { ++ autoneg = AUTONEG_DISABLE; ++ } ++ ++ advertising |= ADVERTISED_TP; ++ ++ status = rtl8125_get_phy_status(tp); ++ if (netif_running(dev) && (status & LinkStatus)) ++ report_lpa = 1; ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp) && ++ rtl8125_fiber_link_ok(dev) != R8125_LINK_STATE_ON) ++ report_lpa = 0; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ if (report_lpa) { ++ /*link on*/ ++ speed = rtl8125_convert_link_speed(status); ++ ++ if (status & TxFlowCtrl) ++ advertising |= ADVERTISED_Asym_Pause; ++ ++ if (status & RxFlowCtrl) ++ advertising |= ADVERTISED_Pause; ++ ++ duplex = ((status & (_1000bpsF | _2500bpsF)) || ++ (status & FullDup)) ? ++ DUPLEX_FULL : DUPLEX_HALF; ++ ++ /*link partner*/ ++ if (aner & EXPANSION_NWAY) ++ lpa_adv |= ADVERTISED_Autoneg; ++ if (anlpar & LPA_10HALF) ++ lpa_adv |= ADVERTISED_10baseT_Half; ++ if (anlpar & LPA_10FULL) ++ lpa_adv |= ADVERTISED_10baseT_Full; ++ if (anlpar & LPA_100HALF) ++ lpa_adv |= ADVERTISED_100baseT_Half; ++ if (anlpar & LPA_100FULL) ++ lpa_adv |= ADVERTISED_100baseT_Full; ++ if (anlpar & LPA_PAUSE_CAP) ++ lpa_adv |= ADVERTISED_Pause; ++ if (anlpar & LPA_PAUSE_ASYM) ++ lpa_adv |= ADVERTISED_Asym_Pause; ++ if (gbsr & LPA_1000HALF) ++ lpa_adv |= ADVERTISED_1000baseT_Half; ++ if (gbsr & LPA_1000FULL) ++ lpa_adv |= ADVERTISED_1000baseT_Full; ++ if (status_2500 & RTK_LPA_ADVERTISE_2500FULL) ++ lpa_adv |= ADVERTISED_2500baseX_Full; ++ } else { ++ /*link down*/ ++ speed = SPEED_UNKNOWN; ++ duplex = DUPLEX_UNKNOWN; ++ lpa_adv = 0; ++ } ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ cmd->supported = (u32)supported; ++ cmd->advertising = (u32)advertising; ++ cmd->autoneg = autoneg; ++ cmd->speed = speed; ++ cmd->duplex = duplex; ++ cmd->port = PORT_TP; ++ cmd->lp_advertising = (u32)lpa_adv; ++ cmd->eth_tp_mdix = rtl8125_get_mdi_status(tp); ++#else ++ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.supported, ++ supported); ++ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.advertising, ++ advertising); ++ ethtool_convert_legacy_u32_to_link_mode(cmd->link_modes.lp_advertising, ++ lpa_adv); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++ if (supported & SUPPORTED_2500baseX_Full) { ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, ++ cmd->link_modes.supported, 0); ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.supported, 1); ++ } ++ if (advertising & ADVERTISED_2500baseX_Full) { ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, ++ cmd->link_modes.advertising, 0); ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.advertising, 1); ++ } ++ if (report_lpa) { ++ if (lpa_adv & ADVERTISED_2500baseX_Full) { ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseX_Full_BIT, ++ cmd->link_modes.lp_advertising, 0); ++ linkmode_mod_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, ++ cmd->link_modes.lp_advertising, 1); ++ } ++ } ++#endif ++ cmd->base.autoneg = autoneg; ++ cmd->base.speed = speed; ++ cmd->base.duplex = duplex; ++ cmd->base.port = PORT_TP; ++ cmd->base.eth_tp_mdix = rtl8125_get_mdi_status(tp); ++#endif ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static int ++rtl8125_get_settings(struct net_device *dev, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ struct 
ethtool_cmd *cmd ++#else ++ struct ethtool_link_ksettings *cmd ++#endif ++ ) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->get_settings(dev, cmd); ++ ++ return 0; ++} ++ ++static void rtl8125_get_regs(struct net_device *dev, struct ethtool_regs *regs, ++ void *p) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ unsigned int i; ++ u8 *data = p; ++ ++ if (regs->len < R8125_REGS_DUMP_SIZE) ++ return /* -EINVAL */; ++ ++ memset(p, 0, regs->len); ++ ++ for (i = 0; i < R8125_MAC_REGS_SIZE; i++) ++ *data++ = readb(ioaddr + i); ++ data = (u8*)p + 256; ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ for (i = 0; i < R8125_PHY_REGS_SIZE/2; i++) { ++ *(u16*)data = rtl8125_mdio_read(tp, i); ++ data += 2; ++ } ++ data = (u8*)p + 256 * 2; ++ ++ for (i = 0; i < R8125_EPHY_REGS_SIZE/2; i++) { ++ *(u16*)data = rtl8125_ephy_read(tp, i); ++ data += 2; ++ } ++ data = (u8*)p + 256 * 3; ++ ++ for (i = 0; i < R8125_ERI_REGS_SIZE; i+=4) { ++ *(u32*)data = rtl8125_eri_read(tp, i , 4, ERIAR_ExGMAC); ++ data += 4; ++ } ++} ++ ++static void rtl8125_get_pauseparam(struct net_device *dev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ pause->autoneg = (tp->autoneg ? AUTONEG_ENABLE : AUTONEG_DISABLE); ++ if (tp->fcpause == rtl8125_fc_rx_pause) ++ pause->rx_pause = 1; ++ else if (tp->fcpause == rtl8125_fc_tx_pause) ++ pause->tx_pause = 1; ++ else if (tp->fcpause == rtl8125_fc_full) { ++ pause->rx_pause = 1; ++ pause->tx_pause = 1; ++ } ++} ++ ++static int rtl8125_set_pauseparam(struct net_device *dev, ++ struct ethtool_pauseparam *pause) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ enum rtl8125_fc_mode newfc; ++ ++ if (pause->tx_pause || pause->rx_pause) ++ newfc = rtl8125_fc_full; ++ else ++ newfc = rtl8125_fc_none; ++ ++ if (tp->fcpause != newfc) { ++ tp->fcpause = newfc; ++ ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ } ++ ++ return 0; ++} ++ ++static u32 ++rtl8125_get_msglevel(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ return tp->msg_enable; ++} ++ ++static void ++rtl8125_set_msglevel(struct net_device *dev, ++ u32 value) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->msg_enable = value; ++} ++ ++static const char rtl8125_gstrings[][ETH_GSTRING_LEN] = { ++ /* legacy */ ++ "tx_packets", ++ "rx_packets", ++ "tx_errors", ++ "rx_errors", ++ "rx_missed", ++ "align_errors", ++ "tx_single_collisions", ++ "tx_multi_collisions", ++ "unicast", ++ "broadcast", ++ "multicast", ++ "tx_aborted", ++ "tx_underrun", ++ ++ /* extended */ ++ "tx_octets", ++ "rx_octets", ++ "rx_multicast64", ++ "tx_unicast64", ++ "tx_broadcast64", ++ "tx_multicast64", ++ "tx_pause_on", ++ "tx_pause_off", ++ "tx_pause_all", ++ "tx_deferred", ++ "tx_late_collision", ++ "tx_all_collision", ++ "tx_aborted32", ++ "align_errors32", ++ "rx_frame_too_long", ++ "rx_runt", ++ "rx_pause_on", ++ "rx_pause_off", ++ "rx_pause_all", ++ "rx_unknown_opcode", ++ "rx_mac_error", ++ "tx_underrun32", ++ "rx_mac_missed", ++ "rx_tcam_dropped", ++ "tdu", ++ "rdu", ++}; ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static int rtl8125_get_stats_count(struct net_device *dev) ++{ ++ return ARRAY_SIZE(rtl8125_gstrings); ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++#else ++static int rtl8125_get_sset_count(struct net_device *dev, int 
sset) ++{ ++ switch (sset) { ++ case ETH_SS_STATS: ++ return ARRAY_SIZE(rtl8125_gstrings); ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++#endif ++ ++static void ++rtl8125_set_ring_size(struct rtl8125_private *tp, u32 rx, u32 tx) ++{ ++ int i; ++ ++ for (i = 0; i < R8125_MAX_RX_QUEUES; i++) ++ tp->rx_ring[i].num_rx_desc = rx; ++ ++ for (i = 0; i < R8125_MAX_TX_QUEUES; i++) ++ tp->tx_ring[i].num_tx_desc = tx; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++static void rtl8125_get_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring, ++ struct kernel_ethtool_ringparam *kernel_ring, ++ struct netlink_ext_ack *extack) ++#else ++static void rtl8125_get_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ ring->rx_max_pending = MAX_NUM_TX_DESC; ++ ring->tx_max_pending = MAX_NUM_RX_DESC; ++ ring->rx_pending = tp->rx_ring[0].num_rx_desc; ++ ring->tx_pending = tp->tx_ring[0].num_tx_desc; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++static int rtl8125_set_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring, ++ struct kernel_ethtool_ringparam *kernel_ring, ++ struct netlink_ext_ack *extack) ++#else ++static int rtl8125_set_ringparam(struct net_device *dev, ++ struct ethtool_ringparam *ring) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 new_rx_count, new_tx_count; ++ int rc = 0; ++ ++ if ((ring->rx_mini_pending) || (ring->rx_jumbo_pending)) ++ return -EINVAL; ++ ++ new_tx_count = clamp_t(u32, ring->tx_pending, ++ MIN_NUM_TX_DESC, MAX_NUM_TX_DESC); ++ ++ new_rx_count = clamp_t(u32, ring->rx_pending, ++ MIN_NUM_RX_DESC, MAX_NUM_RX_DESC); ++ ++ if ((new_rx_count == tp->rx_ring[0].num_rx_desc) && ++ (new_tx_count == tp->tx_ring[0].num_tx_desc)) { ++ /* nothing to do */ ++ return 0; ++ } ++ ++ if (netif_running(dev)) { ++ rtl8125_wait_for_quiescence(dev); ++ rtl8125_close(dev); ++ } ++ ++ rtl8125_set_ring_size(tp, new_rx_count, new_tx_count); ++ ++ if (netif_running(dev)) ++ rc = rtl8125_open(dev); ++ ++ return rc; ++} ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static void ++rtl8125_get_ethtool_stats(struct net_device *dev, ++ struct ethtool_stats *stats, ++ u64 *data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters; ++ dma_addr_t paddr; ++ ++ ASSERT_RTNL(); ++ ++ counters = tp->tally_vaddr; ++ paddr = tp->tally_paddr; ++ if (!counters) ++ return; ++ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ data[0] = le64_to_cpu(counters->tx_packets); ++ data[1] = le64_to_cpu(counters->rx_packets); ++ data[2] = le64_to_cpu(counters->tx_errors); ++ data[3] = le32_to_cpu(counters->rx_errors); ++ data[4] = le16_to_cpu(counters->rx_missed); ++ data[5] = le16_to_cpu(counters->align_errors); ++ data[6] = le32_to_cpu(counters->tx_one_collision); ++ data[7] = le32_to_cpu(counters->tx_multi_collision); ++ data[8] = le64_to_cpu(counters->rx_unicast); ++ data[9] = le64_to_cpu(counters->rx_broadcast); ++ data[10] = le32_to_cpu(counters->rx_multicast); ++ data[11] = le16_to_cpu(counters->tx_aborted); ++ data[12] = le16_to_cpu(counters->tx_underrun); ++ ++ data[13] = le64_to_cpu(counters->tx_octets); ++ data[14] = le64_to_cpu(counters->rx_octets); ++ data[15] = le64_to_cpu(counters->rx_multicast64); ++ data[16] = 
le64_to_cpu(counters->tx_unicast64); ++ data[17] = le64_to_cpu(counters->tx_broadcast64); ++ data[18] = le64_to_cpu(counters->tx_multicast64); ++ data[19] = le32_to_cpu(counters->tx_pause_on); ++ data[20] = le32_to_cpu(counters->tx_pause_off); ++ data[21] = le32_to_cpu(counters->tx_pause_all); ++ data[22] = le32_to_cpu(counters->tx_deferred); ++ data[23] = le32_to_cpu(counters->tx_late_collision); ++ data[24] = le32_to_cpu(counters->tx_all_collision); ++ data[25] = le32_to_cpu(counters->tx_aborted32); ++ data[26] = le32_to_cpu(counters->align_errors32); ++ data[27] = le32_to_cpu(counters->rx_frame_too_long); ++ data[28] = le32_to_cpu(counters->rx_runt); ++ data[29] = le32_to_cpu(counters->rx_pause_on); ++ data[30] = le32_to_cpu(counters->rx_pause_off); ++ data[31] = le32_to_cpu(counters->rx_pause_all); ++ data[32] = le32_to_cpu(counters->rx_unknown_opcode); ++ data[33] = le32_to_cpu(counters->rx_mac_error); ++ data[34] = le32_to_cpu(counters->tx_underrun32); ++ data[35] = le32_to_cpu(counters->rx_mac_missed); ++ data[36] = le32_to_cpu(counters->rx_tcam_dropped); ++ data[37] = le32_to_cpu(counters->tdu); ++ data[38] = le32_to_cpu(counters->rdu); ++} ++ ++static void ++rtl8125_get_strings(struct net_device *dev, ++ u32 stringset, ++ u8 *data) ++{ ++ switch (stringset) { ++ case ETH_SS_STATS: ++ memcpy(data, rtl8125_gstrings, sizeof(rtl8125_gstrings)); ++ break; ++ } ++} ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static int rtl_get_eeprom_len(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ return tp->eeprom_len; ++} ++ ++static int rtl_get_eeprom(struct net_device *dev, struct ethtool_eeprom *eeprom, u8 *buf) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i,j,ret; ++ int start_w, end_w; ++ int VPD_addr, VPD_data; ++ u32 *eeprom_buff; ++ u16 tmp; ++ ++ if (tp->eeprom_type == EEPROM_TYPE_NONE) { ++ dev_printk(KERN_DEBUG, tp_to_dev(tp), "Detect none EEPROM\n"); ++ return -EOPNOTSUPP; ++ } else if (eeprom->len == 0 || (eeprom->offset+eeprom->len) > tp->eeprom_len) { ++ dev_printk(KERN_DEBUG, tp_to_dev(tp), "Invalid parameter\n"); ++ return -EINVAL; ++ } ++ ++ VPD_addr = 0xD2; ++ VPD_data = 0xD4; ++ ++ start_w = eeprom->offset >> 2; ++ end_w = (eeprom->offset + eeprom->len - 1) >> 2; ++ ++ eeprom_buff = kmalloc(sizeof(u32)*(end_w - start_w + 1), GFP_KERNEL); ++ if (!eeprom_buff) ++ return -ENOMEM; ++ ++ rtl8125_enable_cfg9346_write(tp); ++ ret = -EFAULT; ++ for (i=start_w; i<=end_w; i++) { ++ pci_write_config_word(tp->pci_dev, VPD_addr, (u16)i*4); ++ ret = -EFAULT; ++ for (j = 0; j < 10; j++) { ++ udelay(400); ++ pci_read_config_word(tp->pci_dev, VPD_addr, &tmp); ++ if (tmp&0x8000) { ++ ret = 0; ++ break; ++ } ++ } ++ ++ if (ret) ++ break; ++ ++ pci_read_config_dword(tp->pci_dev, VPD_data, &eeprom_buff[i-start_w]); ++ } ++ rtl8125_disable_cfg9346_write(tp); ++ ++ if (!ret) ++ memcpy(buf, (u8 *)eeprom_buff + (eeprom->offset & 3), eeprom->len); ++ ++ kfree(eeprom_buff); ++ ++ return ret; ++} ++ ++#undef ethtool_op_get_link ++#define ethtool_op_get_link _kc_ethtool_op_get_link ++static u32 _kc_ethtool_op_get_link(struct net_device *dev) ++{ ++ return netif_carrier_ok(dev) ? 
1 : 0; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++#undef ethtool_op_get_sg ++#define ethtool_op_get_sg _kc_ethtool_op_get_sg ++static u32 _kc_ethtool_op_get_sg(struct net_device *dev) ++{ ++#ifdef NETIF_F_SG ++ return (dev->features & NETIF_F_SG) != 0; ++#else ++ return 0; ++#endif ++} ++ ++#undef ethtool_op_set_sg ++#define ethtool_op_set_sg _kc_ethtool_op_set_sg ++static int _kc_ethtool_op_set_sg(struct net_device *dev, u32 data) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ return -EOPNOTSUPP; ++ ++#ifdef NETIF_F_SG ++ if (data) ++ dev->features |= NETIF_F_SG; ++ else ++ dev->features &= ~NETIF_F_SG; ++#endif ++ ++ return 0; ++} ++#endif ++ ++static void ++rtl8125_set_eee_lpi_timer(struct rtl8125_private *tp) ++{ ++ u16 dev_lpi_timer; ++ ++ dev_lpi_timer = tp->eee.tx_lpi_timer; ++ ++ RTL_W16(tp, EEE_TXIDLE_TIMER_8125, dev_lpi_timer); ++} ++ ++static bool rtl8125_is_adv_eee_enabled(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ //case CFG_METHOD_10: ++ //case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ //case CFG_METHOD_13: ++ if (rtl8125_mdio_direct_read_phy_ocp(tp, 0xA430) & BIT_15) ++ return true; ++ break; ++ default: ++ break; ++ } ++ ++ return false; ++} ++ ++static void _rtl8125_disable_adv_eee(struct rtl8125_private *tp) ++{ ++ bool lock; ++ ++ if (rtl8125_is_adv_eee_enabled(tp)) ++ lock = true; ++ else ++ lock = false; ++ ++ if (lock) ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_mac_ocp_bit(tp, 0xE052, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA442, BIT_12 | BIT_13); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA430, BIT_15); ++ ++ if (lock) ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void rtl8125_disable_adv_eee(struct rtl8125_private *tp) ++{ ++ rtl8125_oob_mutex_lock(tp); ++ ++ _rtl8125_disable_adv_eee(tp); ++ ++ rtl8125_oob_mutex_unlock(tp); ++} ++ ++static int rtl8125_enable_eee(struct rtl8125_private *tp) ++{ ++ struct ethtool_keee *eee = &tp->eee; ++ u16 eee_adv_cap1_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t(eee->advertised); ++ u16 eee_adv_cap2_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap2_t(eee->advertised); ++ int ret; ++ ++ ret = 0; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_set_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ rtl8125_set_mac_ocp_bit(tp, 0xEB62, (BIT_2|BIT_1)); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA5D0, ++ MDIO_EEE_100TX | MDIO_EEE_1000T, ++ eee_adv_cap1_t); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D4, MDIO_EEE_2_5GT); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rtl8125_set_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA5D0, ++ MDIO_EEE_100TX | MDIO_EEE_1000T, ++ eee_adv_cap1_t); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA6D4, ++ MDIO_EEE_2_5GT, ++ eee_adv_cap2_t); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ 
rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ /*Advanced EEE*/ ++ rtl8125_disable_adv_eee(tp); ++ ++ return ret; ++} ++ ++static int rtl8125_disable_eee(struct rtl8125_private *tp) ++{ ++ int ret; ++ ++ ret = 0; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_clear_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ rtl8125_clear_mac_ocp_bit(tp, 0xEB62, (BIT_2|BIT_1)); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D0, (MDIO_EEE_100TX | MDIO_EEE_1000T)); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D4, BIT_0); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rtl8125_clear_mac_ocp_bit(tp, 0xE040, (BIT_1|BIT_0)); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA432, BIT_4); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D0, (MDIO_EEE_100TX | MDIO_EEE_1000T)); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D4, MDIO_EEE_2_5GT); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA6D8, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA428, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4A2, BIT_9); ++ break; ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ /*Advanced EEE*/ ++ rtl8125_disable_adv_eee(tp); ++ ++ return ret; ++} ++ ++static int rtl_nway_reset(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret, bmcr; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ return -EBUSY; ++ ++ /* if autoneg is off, it's an error */ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ bmcr = rtl8125_mdio_read(tp, MII_BMCR); ++ ++ if (bmcr & BMCR_ANENABLE) { ++ bmcr |= BMCR_ANRESTART; ++ rtl8125_mdio_write(tp, MII_BMCR, bmcr); ++ ret = 0; ++ } else { ++ ret = -EINVAL; ++ } ++ ++ return ret; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) ++static u32 ++rtl8125_device_lpi_t_to_ethtool_lpi_t(struct rtl8125_private *tp , u32 lpi_timer) ++{ ++ u32 to_us; ++ u32 status; ++ ++ to_us = lpi_timer * 80; ++ status = rtl8125_get_phy_status(tp); ++ if (status & LinkStatus) { ++ /*link on*/ ++ //2.5G : lpi_timer * 3.2ns ++ //Giga: lpi_timer * 8ns ++ //100M : lpi_timer * 80ns ++ if (status & _2500bpsF) ++ to_us = (lpi_timer * 32) / 10; ++ else if (status & _1000bpsF) ++ to_us = lpi_timer * 8; ++ } ++ ++ //ns to us ++ to_us /= 1000; ++ ++ return to_us; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) ++static void ++rtl8125_adv_to_linkmode(unsigned long *mode, u64 adv) ++{ ++ linkmode_zero(mode); ++ ++ if (adv & ADVERTISED_10baseT_Half) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Half_BIT, mode); ++ if (adv & ADVERTISED_10baseT_Full) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_10baseT_Full_BIT, mode); ++ if (adv & ADVERTISED_100baseT_Half) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Half_BIT, mode); ++ if (adv & ADVERTISED_100baseT_Full) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, mode); ++ if (adv & ADVERTISED_1000baseT_Half) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Half_BIT, mode); ++ if (adv & ADVERTISED_1000baseT_Full) ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, mode); ++ if (adv & ADVERTISED_2500baseX_Full) ++ 
linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, mode); ++} ++ ++static int ++rtl_ethtool_get_eee(struct net_device *net, struct ethtool_keee *edata) ++{ ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(common); ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_keee *eee = &tp->eee; ++ unsigned long flags; ++ u32 tx_lpi_timer; ++ u16 val; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ return -EBUSY; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* Get LP advertisement EEE */ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D2); ++ mii_eee_cap1_mod_linkmode_t(edata->lp_advertised, val); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA6D0); ++ mii_eee_cap2_mod_linkmode_sup_t(edata->lp_advertised, val); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ /* Get EEE Tx LPI timer*/ ++ tx_lpi_timer = rtl8125_device_lpi_t_to_ethtool_lpi_t(tp, eee->tx_lpi_timer); ++ ++ val = rtl8125_mac_ocp_read(tp, 0xE040); ++ val &= BIT_1 | BIT_0; ++ ++ edata->eee_enabled = !!val; ++ linkmode_copy(edata->supported, eee->supported); ++ linkmode_copy(edata->advertised, eee->advertised); ++ edata->tx_lpi_enabled = edata->eee_enabled; ++ edata->tx_lpi_timer = tx_lpi_timer; ++ linkmode_and(common, edata->advertised, edata->lp_advertised); ++ edata->eee_active = !linkmode_empty(common); ++ ++ return 0; ++} ++ ++static int ++rtl_ethtool_set_eee(struct net_device *net, struct ethtool_keee *edata) ++{ ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(advertising); ++ __ETHTOOL_DECLARE_LINK_MODE_MASK(tmp); ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_keee *eee = &tp->eee; ++ unsigned long flags; ++ int rc = 0; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (!HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp) || ++ tp->DASH) { ++ rc = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (unlikely(tp->rtk_enable_diag)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Diag Enabled\n"); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ if (tp->autoneg != AUTONEG_ENABLE) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE requires autoneg\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ if (edata->tx_lpi_enabled) { ++ if (edata->tx_lpi_timer > tp->max_jumbo_frame_size || ++ edata->tx_lpi_timer < ETH_MIN_MTU) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Valid LPI timer range is %d to %d. 
\n", ++ ETH_MIN_MTU, tp->max_jumbo_frame_size); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ */ ++ ++ rtl8125_adv_to_linkmode(advertising, tp->advertising); ++ if (linkmode_empty(edata->advertised)) { ++ linkmode_and(edata->advertised, advertising, eee->supported); ++ } else if (linkmode_andnot(tmp, edata->advertised, advertising)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised must be a subset of autoneg advertised speeds\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (linkmode_andnot(tmp, edata->advertised, eee->supported)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised must be a subset of support \n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ //tp->eee.eee_enabled = edata->eee_enabled; ++ //tp->eee_adv_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t(edata->advertised); ++ ++ linkmode_copy(eee->advertised, edata->advertised); ++ //eee->tx_lpi_enabled = edata->tx_lpi_enabled; ++ //eee->tx_lpi_timer = edata->tx_lpi_timer; ++ eee->eee_enabled = edata->eee_enabled; ++ ++ if (eee->eee_enabled) ++ rtl8125_enable_eee(tp); ++ else ++ rtl8125_disable_eee(tp); ++ ++ rtl_nway_reset(net); ++ ++out: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return rc; ++} ++#else ++static int ++rtl_ethtool_get_eee(struct net_device *net, struct ethtool_eee *edata) ++{ ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_eee *eee = &tp->eee; ++ u32 lp, adv, tx_lpi_timer, supported = 0; ++ unsigned long flags; ++ u16 val; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ return -EBUSY; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* Get Supported EEE */ ++ //val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5C4); ++ //supported = mmd_eee_cap_to_ethtool_sup_t(val); ++ supported = eee->supported; ++ ++ /* Get advertisement EEE */ ++ adv = eee->advertised; ++ ++ /* Get LP advertisement EEE */ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA5D2); ++ lp = mmd_eee_adv_to_ethtool_adv_t(val); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA6D0); ++ if (val & RTK_LPA_EEE_ADVERTISE_2500FULL) ++ lp |= ADVERTISED_2500baseX_Full; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ /* Get EEE Tx LPI timer*/ ++ tx_lpi_timer = rtl8125_device_lpi_t_to_ethtool_lpi_t(tp, eee->tx_lpi_timer); ++ ++ val = rtl8125_mac_ocp_read(tp, 0xE040); ++ val &= BIT_1 | BIT_0; ++ ++ edata->eee_enabled = !!val; ++ edata->eee_active = !!(supported & adv & lp); ++ edata->supported = supported; ++ edata->advertised = adv; ++ edata->lp_advertised = lp; ++ edata->tx_lpi_enabled = edata->eee_enabled; ++ edata->tx_lpi_timer = tx_lpi_timer; ++ ++ return 0; ++} ++ ++static int ++rtl_ethtool_set_eee(struct net_device *net, struct ethtool_eee *edata) ++{ ++ struct rtl8125_private *tp = netdev_priv(net); ++ struct ethtool_eee *eee = &tp->eee; ++ unsigned long flags; ++ u64 advertising; ++ int rc = 0; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (!HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp) || ++ tp->DASH) { ++ rc = -EOPNOTSUPP; ++ goto out; ++ } ++ ++ if (unlikely(tp->rtk_enable_diag)) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Diag Enabled\n"); ++ rc = -EBUSY; ++ goto out; ++ } ++ ++ if (tp->autoneg != AUTONEG_ENABLE) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE requires autoneg\n"); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ /* ++ if (edata->tx_lpi_enabled) { ++ if (edata->tx_lpi_timer > tp->max_jumbo_frame_size || ++ edata->tx_lpi_timer < ETH_MIN_MTU) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "Valid LPI timer range is %d to %d. 
\n", ++ ETH_MIN_MTU, tp->max_jumbo_frame_size); ++ rc = -EINVAL; ++ goto out; ++ } ++ } ++ */ ++ ++ advertising = tp->advertising; ++ if (!edata->advertised) { ++ edata->advertised = advertising & eee->supported; ++ } else if (edata->advertised & ~advertising) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised %x must be a subset of autoneg advertised speeds %llu\n", ++ edata->advertised, advertising); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ if (edata->advertised & ~eee->supported) { ++ dev_printk(KERN_WARNING, tp_to_dev(tp), "EEE advertised %x must be a subset of support %x\n", ++ edata->advertised, eee->supported); ++ rc = -EINVAL; ++ goto out; ++ } ++ ++ //tp->eee.eee_enabled = edata->eee_enabled; ++ //tp->eee_adv_t = rtl8125_ethtool_adv_to_mmd_eee_adv_cap1_t(edata->advertised); ++ ++ eee->advertised = edata->advertised; ++ //eee->tx_lpi_enabled = edata->tx_lpi_enabled; ++ //eee->tx_lpi_timer = edata->tx_lpi_timer; ++ eee->eee_enabled = edata->eee_enabled; ++ ++ if (eee->eee_enabled) ++ rtl8125_enable_eee(tp); ++ else ++ rtl8125_disable_eee(tp); ++ ++ rtl_nway_reset(net); ++ ++out: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return rc; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,9,0) */ ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++static void rtl8125_get_channels(struct net_device *dev, ++ struct ethtool_channels *channel) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ channel->max_rx = tp->HwSuppNumRxQueues; ++ channel->max_tx = tp->HwSuppNumTxQueues; ++ channel->rx_count = tp->num_rx_rings; ++ channel->tx_count = tp->num_tx_rings; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) */ ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++static const struct ethtool_ops rtl8125_ethtool_ops = { ++ .get_drvinfo = rtl8125_get_drvinfo, ++ .get_regs_len = rtl8125_get_regs_len, ++ .get_link = ethtool_op_get_link, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ .get_ringparam = rtl8125_get_ringparam, ++ .set_ringparam = rtl8125_set_ringparam, ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++ .get_settings = rtl8125_get_settings, ++ .set_settings = rtl8125_set_settings, ++#else ++ .get_link_ksettings = rtl8125_get_settings, ++ .set_link_ksettings = rtl8125_set_settings, ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,6,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ .get_pauseparam = rtl8125_get_pauseparam, ++ .set_pauseparam = rtl8125_set_pauseparam, ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,0) ++ .get_msglevel = rtl8125_get_msglevel, ++ .set_msglevel = rtl8125_set_msglevel, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ .get_rx_csum = rtl8125_get_rx_csum, ++ .set_rx_csum = rtl8125_set_rx_csum, ++ .get_tx_csum = rtl8125_get_tx_csum, ++ .set_tx_csum = rtl8125_set_tx_csum, ++ .get_sg = ethtool_op_get_sg, ++ .set_sg = ethtool_op_set_sg, ++#ifdef NETIF_F_TSO ++ .get_tso = ethtool_op_get_tso, ++ .set_tso = ethtool_op_set_tso, ++#endif //NETIF_F_TSO ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,3,0) ++ .get_regs = rtl8125_get_regs, ++ .get_wol = rtl8125_get_wol, ++ .set_wol = rtl8125_set_wol, ++ .get_strings = rtl8125_get_strings, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ++ .get_stats_count = rtl8125_get_stats_count, ++#else ++ .get_sset_count = rtl8125_get_sset_count, ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,33) ++ .get_ethtool_stats = rtl8125_get_ethtool_stats, ++#if 
LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) ++#ifdef ETHTOOL_GPERMADDR ++ .get_perm_addr = ethtool_op_get_perm_addr, ++#endif //ETHTOOL_GPERMADDR ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,23) ++ .get_eeprom = rtl_get_eeprom, ++ .get_eeprom_len = rtl_get_eeprom_len, ++#ifdef ENABLE_RSS_SUPPORT ++ .get_rxnfc = rtl8125_get_rxnfc, ++ .set_rxnfc = rtl8125_set_rxnfc, ++ .get_rxfh_indir_size = rtl8125_rss_indir_size, ++ .get_rxfh_key_size = rtl8125_get_rxfh_key_size, ++ .get_rxfh = rtl8125_get_rxfh, ++ .set_rxfh = rtl8125_set_rxfh, ++#endif //ENABLE_RSS_SUPPORT ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) ++#ifdef ENABLE_PTP_SUPPORT ++ .get_ts_info = rtl8125_get_ts_info, ++#else ++ .get_ts_info = ethtool_op_get_ts_info, ++#endif //ENABLE_PTP_SUPPORT ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,5,0) ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) ++ .get_eee = rtl_ethtool_get_eee, ++ .set_eee = rtl_ethtool_set_eee, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,6,0) */ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++ .get_channels = rtl8125_get_channels, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) */ ++ .nway_reset = rtl_nway_reset, ++ ++}; ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ ++static void rtl8125_get_mac_version(struct rtl8125_private *tp) ++{ ++ u32 reg,val32; ++ u32 ICVerID; ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ val32 = RTL_R32(tp, TxConfig); ++ reg = val32 & 0x7c800000; ++ ICVerID = val32 & 0x00700000; ++ ++ switch (reg) { ++ case 0x60800000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_2; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_3; ++ } else { ++ tp->mcfg = CFG_METHOD_3; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x64000000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_4; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_5; ++ } else { ++ tp->mcfg = CFG_METHOD_5; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x68000000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_8; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_9; ++ } else { ++ tp->mcfg = CFG_METHOD_9; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x68800000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_10; ++ } else if (ICVerID == 0x100000) { ++ tp->mcfg = CFG_METHOD_11; ++ } else { ++ tp->mcfg = CFG_METHOD_11; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ case 0x70800000: ++ if (ICVerID == 0x00000000) { ++ tp->mcfg = CFG_METHOD_12; ++ } else { ++ tp->mcfg = CFG_METHOD_12; ++ tp->HwIcVerUnknown = TRUE; ++ } ++ ++ tp->efuse_ver = EFUSE_SUPPORT_V4; ++ break; ++ default: ++ printk("unknown chip version (%x)\n",reg); ++ tp->mcfg = CFG_METHOD_DEFAULT; ++ tp->HwIcVerUnknown = TRUE; ++ tp->efuse_ver = EFUSE_NOT_SUPPORT; ++ break; ++ } ++ ++ if (pdev->device == 0x8162) { ++ if (tp->mcfg == CFG_METHOD_3) ++ tp->mcfg = CFG_METHOD_6; ++ else if (tp->mcfg == CFG_METHOD_5) ++ tp->mcfg = CFG_METHOD_7; ++ else if (tp->mcfg == CFG_METHOD_11) ++ tp->mcfg = CFG_METHOD_13; ++ } ++} ++ ++static void ++rtl8125_print_mac_version(struct rtl8125_private *tp) ++{ ++ int i; ++ for (i = ARRAY_SIZE(rtl_chip_info) - 1; i >= 0; i--) { ++ if (tp->mcfg == rtl_chip_info[i].mcfg) { ++ dprintk("Realtek %s Ethernet controller mcfg = %04d\n", ++ MODULENAME, rtl_chip_info[i].mcfg); ++ return; ++ } ++ } ++ ++ dprintk("mac_version 
== Unknown\n"); ++} ++ ++static void ++rtl8125_tally_counter_addr_fill(struct rtl8125_private *tp) ++{ ++ if (!tp->tally_paddr) ++ return; ++ ++ RTL_W32(tp, CounterAddrHigh, (u64)tp->tally_paddr >> 32); ++ RTL_W32(tp, CounterAddrLow, (u64)tp->tally_paddr & (DMA_BIT_MASK(32))); ++} ++ ++static void ++rtl8125_tally_counter_clear(struct rtl8125_private *tp) ++{ ++ if (!tp->tally_paddr) ++ return; ++ ++ RTL_W32(tp, CounterAddrHigh, (u64)tp->tally_paddr >> 32); ++ RTL_W32(tp, CounterAddrLow, ((u64)tp->tally_paddr & (DMA_BIT_MASK(32))) | CounterReset); ++} ++ ++static void ++rtl8125_clear_phy_ups_reg(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA466, BIT_0); ++ break; ++ }; ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA468, BIT_3 | BIT_1); ++} ++ ++static int ++rtl8125_is_ups_resume(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ return (rtl8125_mac_ocp_read(tp, 0xD42C) & BIT_8); ++} ++ ++static void ++rtl8125_clear_ups_resume_bit(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_clear_mac_ocp_bit(tp, 0xD42C, BIT_8); ++} ++ ++static u8 ++rtl8125_get_phy_state(struct rtl8125_private *tp) ++{ ++ return (rtl8125_mdio_direct_read_phy_ocp(tp, 0xA420) & 0x7); ++} ++ ++static bool ++rtl8125_wait_phy_state_ready(struct rtl8125_private *tp, u16 state, ++ u32 ms) ++{ ++ u16 tmp_state; ++ u32 wait_cnt; ++ bool ready; ++ u32 i; ++ ++ if (ms >= 1000) ++ wait_cnt = ms / 1000; ++ else ++ wait_cnt = 100; ++ ++ i = 0; ++ do { ++ tmp_state = rtl8125_get_phy_state(tp); ++ mdelay(1); ++ i++; ++ } while ((i < wait_cnt) && (tmp_state != state)); ++ ++ ready = (i == wait_cnt && tmp_state != state) ? 
FALSE : TRUE; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(i == wait_cnt); ++#endif ++ return ready; ++} ++ ++static void ++rtl8125_wait_phy_ups_resume(struct net_device *dev, u16 PhyState) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ for (i=0; i< 100; i++) { ++ if (rtl8125_get_phy_state(tp) == PhyState) ++ break; ++ else ++ mdelay(1); ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,18) ++ WARN_ON_ONCE(i == 100); ++#endif ++} ++ ++static void ++rtl8125_set_mcu_d3_stack(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x45E0); ++ break; ++ case CFG_METHOD_9: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4782); ++ break; ++ case CFG_METHOD_10: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4836); ++ break; ++ case CFG_METHOD_11: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4848); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_mac_ocp_write(tp, 0xD018, 0xD116); ++ rtl8125_mac_ocp_write(tp, 0xD116, 0x4C76); ++ break; ++ default: ++ return; ++ } ++} ++ ++static void ++_rtl8125_enable_now_is_oob(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppNowIsOobVer == 1) ++ RTL_W8(tp, MCUCmd_reg, RTL_R8(tp, MCUCmd_reg) | Now_is_oob); ++} ++ ++void ++rtl8125_enable_now_is_oob(struct rtl8125_private *tp) ++{ ++ rtl8125_set_mcu_d3_stack(tp); ++ _rtl8125_enable_now_is_oob(tp); ++} ++ ++void ++rtl8125_disable_now_is_oob(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppNowIsOobVer == 1) ++ RTL_W8(tp, MCUCmd_reg, RTL_R8(tp, MCUCmd_reg) & ~Now_is_oob); ++} ++ ++static void ++rtl8125_exit_oob(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 data16; ++ ++ rtl8125_disable_rx_packet_filter(tp); ++ ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_start(tp); ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ rtl8125_realwow_hw_init(dev); ++#else ++ //Disable realwow function ++ rtl8125_mac_ocp_write(tp, 0xC0BC, 0x00FF); ++#endif //ENABLE_REALWOW_SUPPORT ++ ++ rtl8125_nic_reset(dev); ++ ++ rtl8125_disable_now_is_oob(tp); ++ ++ data16 = rtl8125_mac_ocp_read(tp, 0xE8DE) & ~BIT_14; ++ rtl8125_mac_ocp_write(tp, 0xE8DE, data16); ++ rtl8125_wait_ll_share_fifo_ready(dev); ++ ++ rtl8125_mac_ocp_write(tp, 0xC0AA, 0x07D0); ++#ifdef ENABLE_LIB_SUPPORT ++ rtl8125_mac_ocp_write(tp, 0xC0A6, 0x04E2); ++#else ++ rtl8125_mac_ocp_write(tp, 0xC0A6, 0x01B5); ++#endif ++ rtl8125_mac_ocp_write(tp, 0xC01E, 0x5555); ++ ++ rtl8125_wait_ll_share_fifo_ready(dev); ++ ++ //wait ups resume (phy state 2) ++ if (rtl8125_is_ups_resume(dev)) { ++ rtl8125_wait_phy_ups_resume(dev, 2); ++ rtl8125_clear_ups_resume_bit(dev); ++ rtl8125_clear_phy_ups_reg(dev); ++ } ++} ++ ++void ++rtl8125_hw_disable_mac_mcu_bps(struct net_device *dev) ++{ ++ u16 regAddr; ++ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0000); ++ ++ for (regAddr = 0xFC28; regAddr < 0xFC48; regAddr += 2) { ++ rtl8125_mac_ocp_write(tp, regAddr, 0x0000); ++ } ++ ++ fsleep(3000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x0000); ++} ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++static void ++rtl8125_switch_mac_mcu_ram_code_page(struct rtl8125_private *tp, u16 page) ++{ ++ u16 tmpUshort; ++ ++ page &= (BIT_1 | BIT_0); ++ tmpUshort = rtl8125_mac_ocp_read(tp, 0xE446); ++ tmpUshort &= ~(BIT_1 | 
BIT_0); ++ tmpUshort |= page; ++ rtl8125_mac_ocp_write(tp, 0xE446, tmpUshort); ++} ++ ++static void ++_rtl8125_set_hw_mcu_patch_code_ver(struct rtl8125_private *tp, u64 ver) ++{ ++ int i; ++ ++ /* Switch to page 2 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 2); ++ ++ for (i = 0; i < 8; i += 2) { ++ rtl8125_mac_ocp_write(tp, 0xF9F8 + 6 - i, (u16)ver); ++ ver >>= 16; ++ } ++ ++ /* Switch back to page 0 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 0); ++} ++ ++static void ++rtl8125_set_hw_mcu_patch_code_ver(struct rtl8125_private *tp, u64 ver) ++{ ++ _rtl8125_set_hw_mcu_patch_code_ver(tp, ver); ++ ++ tp->hw_mcu_patch_code_ver = ver; ++} ++ ++static u64 ++rtl8125_get_hw_mcu_patch_code_ver(struct rtl8125_private *tp) ++{ ++ u64 ver; ++ int i; ++ ++ /* Switch to page 2 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 2); ++ ++ ver = 0; ++ for (i = 0; i < 8; i += 2) { ++ ver <<= 16; ++ ver |= rtl8125_mac_ocp_read(tp, 0xF9F8 + i); ++ } ++ ++ /* Switch back to page 0 */ ++ rtl8125_switch_mac_mcu_ram_code_page(tp, 0); ++ ++ return ver; ++} ++ ++static u64 ++rtl8125_get_bin_mcu_patch_code_ver(const u16 *entry, u16 entry_cnt) ++{ ++ u64 ver; ++ int i; ++ ++ if (entry == NULL || entry_cnt == 0 || entry_cnt < 4) ++ return 0; ++ ++ ver = 0; ++ for (i = 0; i < 4; i++) { ++ ver <<= 16; ++ ver |= entry[entry_cnt - 4 + i]; ++ } ++ ++ return ver; ++} ++ ++static void ++_rtl8125_write_mac_mcu_ram_code(struct rtl8125_private *tp, const u16 *entry, u16 entry_cnt) ++{ ++ u16 i; ++ ++ for (i = 0; i < entry_cnt; i++) ++ rtl8125_mac_ocp_write(tp, 0xF800 + i * 2, entry[i]); ++} ++ ++static void ++_rtl8125_write_mac_mcu_ram_code_with_page(struct rtl8125_private *tp, const u16 *entry, u16 entry_cnt, u16 page_size) ++{ ++ u16 i; ++ u16 offset; ++ ++ if (page_size == 0) ++ return; ++ ++ for (i = 0; i < entry_cnt; i++) { ++ offset = i % page_size; ++ if (offset == 0) { ++ u16 page = (i / page_size); ++ rtl8125_switch_mac_mcu_ram_code_page(tp, page); ++ } ++ rtl8125_mac_ocp_write(tp, 0xF800 + offset * 2, entry[i]); ++ } ++} ++ ++static void ++rtl8125_write_mac_mcu_ram_code(struct rtl8125_private *tp, const u16 *entry, u16 entry_cnt) ++{ ++ if (FALSE == HW_SUPPORT_MAC_MCU(tp)) ++ return; ++ ++ if (entry == NULL || entry_cnt == 0) ++ return; ++ ++ if (tp->MacMcuPageSize > 0) ++ _rtl8125_write_mac_mcu_ram_code_with_page(tp, entry, entry_cnt, tp->MacMcuPageSize); ++ else ++ _rtl8125_write_mac_mcu_ram_code(tp, entry, entry_cnt); ++ ++ if (tp->bin_mcu_patch_code_ver > 0) ++ rtl8125_set_hw_mcu_patch_code_ver(tp, tp->bin_mcu_patch_code_ver); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125a_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE062, 0xE072, 0xE074, 0xE079, 0xE07B, 0xE0E4, 0xE0ED, 0xE0EF, ++ 0xE0FA, 0xE105, 0xE116, 0xE11C, 0xE121, 0xE126, 0xE12A, 0xB400, 0xB401, ++ 0xB402, 0xB403, 0xB404, 0xB405, 0xC03F, 0x7206, 0x49AE, 0xF1FE, 0xC13C, ++ 0x9904, 0xC13B, 0x9906, 0x7206, 0x49AE, 0xF1FE, 0x7200, 0x49A0, 0xF10D, ++ 0xC534, 0xC133, 0xC238, 0xC338, 0xE817, 0xC337, 0xE815, 0xC336, 0xE813, ++ 0xC335, 0xE811, 0xE01B, 0xC129, 0xC22D, 0xC528, 0xC32C, 0xE80B, 0xC526, ++ 0xC32A, 0xE808, 0xC524, 0xC328, 0xE805, 0xC522, 0xC326, 0xE802, 0xE00C, ++ 0x740E, 0x49CE, 0xF1FE, 0x9908, 0x9D0A, 0x9A0C, 0x9B0E, 0x740E, 0x49CE, ++ 0xF1FE, 0xFF80, 0xB005, 0xB004, 0xB003, 0xB002, 0xB001, 0xB000, 0xC604, ++ 0xC002, 0xB800, 0x3044, 0xE000, 0xE8E0, 0xF128, 0x0002, 0xFFFF, 0x10EC, ++ 0x816A, 0x816F, 0x8164, 0x816D, 0xF000, 0x8001, 0x8002, 0x8003, 0x8004, ++ 0xC60F, 
0x73C4, 0x49B3, 0xF106, 0x73C2, 0xC608, 0xB406, 0xC609, 0xFF80, ++ 0xC605, 0xB406, 0xC605, 0xFF80, 0x0544, 0x0568, 0xE906, 0xCDE8, 0xC602, ++ 0xBE00, 0x0000, 0x48C1, 0x48C2, 0x9C46, 0xC402, 0xBC00, 0x0A12, 0xC602, ++ 0xBE00, 0x0EBA, 0x1501, 0xF02A, 0x1500, 0xF15D, 0xC661, 0x75C8, 0x49D5, ++ 0xF00A, 0x49D6, 0xF008, 0x49D7, 0xF006, 0x49D8, 0xF004, 0x75D2, 0x49D9, ++ 0xF150, 0xC553, 0x77A0, 0x75C8, 0x4855, 0x4856, 0x4857, 0x4858, 0x48DA, ++ 0x48DB, 0x49FE, 0xF002, 0x485A, 0x49FF, 0xF002, 0x485B, 0x9DC8, 0x75D2, ++ 0x4859, 0x9DD2, 0xC643, 0x75C0, 0x49D4, 0xF033, 0x49D1, 0xF137, 0xE030, ++ 0xC63A, 0x75C8, 0x49D5, 0xF00E, 0x49D6, 0xF00C, 0x49D7, 0xF00A, 0x49D8, ++ 0xF008, 0x75D2, 0x49D9, 0xF005, 0xC62E, 0x75C0, 0x49D7, 0xF125, 0xC528, ++ 0x77A0, 0xC627, 0x75C8, 0x4855, 0x4856, 0x4857, 0x4858, 0x48DA, 0x48DB, ++ 0x49FE, 0xF002, 0x485A, 0x49FF, 0xF002, 0x485B, 0x9DC8, 0x75D2, 0x4859, ++ 0x9DD2, 0xC616, 0x75C0, 0x4857, 0x9DC0, 0xC613, 0x75C0, 0x49DA, 0xF003, ++ 0x49D1, 0xF107, 0xC60B, 0xC50E, 0x48D9, 0x9DC0, 0x4859, 0x9DC0, 0xC608, ++ 0xC702, 0xBF00, 0x3AE0, 0xE860, 0xB400, 0xB5D4, 0xE908, 0xE86C, 0x1200, ++ 0xC409, 0x6780, 0x48F1, 0x8F80, 0xC404, 0xC602, 0xBE00, 0x10AA, 0xC010, ++ 0xEA7C, 0xC602, 0xBE00, 0x0000, 0x740A, 0x4846, 0x4847, 0x9C0A, 0xC607, ++ 0x74C0, 0x48C6, 0x9CC0, 0xC602, 0xBE00, 0x13FE, 0xE054, 0x72CA, 0x4826, ++ 0x4827, 0x9ACA, 0xC607, 0x72C0, 0x48A6, 0x9AC0, 0xC602, 0xBE00, 0x07DC, ++ 0xE054, 0xC60F, 0x74C4, 0x49CC, 0xF109, 0xC60C, 0x74CA, 0x48C7, 0x9CCA, ++ 0xC609, 0x74C0, 0x4846, 0x9CC0, 0xC602, 0xBE00, 0x2480, 0xE092, 0xE0C0, ++ 0xE054, 0x7420, 0x48C0, 0x9C20, 0x7444, 0xC602, 0xBE00, 0x12F8, 0x1BFF, ++ 0x46EB, 0x1BFF, 0xC102, 0xB900, 0x0D5A, 0x1BFF, 0x46EB, 0x1BFF, 0xC102, ++ 0xB900, 0x0E2A, 0xC104, 0xC202, 0xBA00, 0x21DE, 0xD116, 0xC602, 0xBE00, ++ 0x0000, 0x6486, 0x0119, 0x0606, 0x1327 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x0540); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x0A06); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x0EB8); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x3A5C); ++ rtl8125_mac_ocp_write(tp, 0xFC34, 0x10A8); ++ rtl8125_mac_ocp_write(tp, 0xFC40, 0x0D54); ++ rtl8125_mac_ocp_write(tp, 0xFC42, 0x0E24); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x307A); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125b_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE01B, 0xE026, 0xE037, 0xE03D, 0xE057, 0xE05B, 0xE060, 0xE0B6, ++ 0xE103, 0xE14C, 0xE150, 0xE153, 0xE156, 0xE158, 0xE15A, 0x740A, 0x4846, ++ 0x4847, 0x9C0A, 0xC607, 0x74C0, 0x48C6, 0x9CC0, 0xC602, 0xBE00, 0x13F0, ++ 0xE054, 0x72CA, 0x4826, 0x4827, 0x9ACA, 0xC607, 0x72C0, 0x48A6, 0x9AC0, ++ 0xC602, 0xBE00, 0x081C, 0xE054, 0xC60F, 0x74C4, 0x49CC, 0xF109, 0xC60C, ++ 0x74CA, 0x48C7, 0x9CCA, 0xC609, 0x74C0, 0x4846, 0x9CC0, 0xC602, 0xBE00, ++ 0x2494, 0xE092, 0xE0C0, 0xE054, 0x7420, 0x48C0, 0x9C20, 0x7444, 0xC602, ++ 0xBE00, 0x12DC, 0x733A, 0x21B5, 0x25BC, 0x1304, 0xF111, 0x1B12, 0x1D2A, ++ 0x3168, 0x3ADA, 0x31AB, 0x1A00, 0x9AC0, 0x1300, 0xF1FB, 0x7620, 0x236E, ++ 0x276F, 0x1A3C, 0x22A1, 0x41B5, 0x9EE2, 0x76E4, 0x486F, 0x9EE4, 0xC602, ++ 0xBE00, 0x4A26, 0x733A, 0x49BB, 0xC602, 0xBE00, 0x47A2, 
0x48C1, 0x48C2, ++ 0x9C46, 0xC402, 0xBC00, 0x0A52, 0xC74B, 0x76E2, 0xC54A, 0x402E, 0xF034, ++ 0x76E0, 0x402E, 0xF006, 0xC703, 0xC403, 0xBC00, 0xC0BC, 0x0980, 0x76F0, ++ 0x1601, 0xF023, 0xC741, 0x1E04, 0x9EE0, 0x1E40, 0x9EE4, 0xC63D, 0x9EE8, ++ 0xC73D, 0x76E0, 0x4863, 0x9EE0, 0xC73A, 0x76E0, 0x48EA, 0x48EB, 0x9EE0, ++ 0xC736, 0x1E01, 0x9EE2, 0xC72D, 0x76E0, 0x486F, 0x9EE0, 0xC72D, 0x76E0, ++ 0x48E3, 0x9EE0, 0xC728, 0x1E0E, 0x9EE0, 0xC71D, 0x1E01, 0x9EE4, 0xE00D, ++ 0x1E00, 0x9EF0, 0x1E05, 0xC715, 0x9EE0, 0xE00A, 0x1E00, 0x9EE2, 0xC614, ++ 0x75CC, 0x48D2, 0x9DCC, 0x1E04, 0xC70B, 0x9EE0, 0xB000, 0xB001, 0xB002, ++ 0xB003, 0xB004, 0xB005, 0xB006, 0xB007, 0xFFC0, 0xE428, 0xD3C0, 0xBEEF, ++ 0x473E, 0xDC46, 0xE0CC, 0xE84E, 0xC0A2, 0x0100, 0xC010, 0xE85A, 0xE812, ++ 0xC0B4, 0xC5F4, 0x74A0, 0xC6F3, 0x4026, 0xF107, 0x74A2, 0xC6EF, 0x4026, ++ 0xF107, 0xC6ED, 0xBE00, 0x753A, 0xC602, 0xBE00, 0x462E, 0x7520, 0x49DE, ++ 0xF102, 0xE7F9, 0xC6A1, 0x67C6, 0x7520, 0x22D2, 0x26DD, 0x1500, 0xF002, ++ 0xE7F1, 0x7532, 0x26D5, 0x0530, 0x0D6C, 0xC42D, 0x308D, 0x7540, 0x4025, ++ 0xF11E, 0x7542, 0x4025, 0xF11B, 0x7544, 0x4025, 0xF118, 0xC423, 0x7546, ++ 0x4025, 0xF114, 0x7548, 0x4025, 0xF111, 0x754A, 0x4025, 0xF10E, 0xC5C0, ++ 0xC4C0, 0x9CA2, 0xC6C0, 0x75CC, 0x4852, 0x9DCC, 0xC6B8, 0x1D7D, 0x9DC2, ++ 0x1D01, 0x9DC0, 0xE7C9, 0xC40B, 0x7546, 0x4025, 0xF1FC, 0x7548, 0x4025, ++ 0xF1F9, 0x754A, 0x4025, 0xF1F6, 0xE7C0, 0xFFFF, 0xEEEE, 0xC2A6, 0x7340, ++ 0xC2A5, 0x4013, 0xF013, 0xC2AC, 0x7340, 0x4835, 0x9B40, 0xC240, 0x7358, ++ 0x48B7, 0x48B2, 0x9B58, 0x7346, 0x48B7, 0x48B2, 0x9B46, 0x7340, 0x48B7, ++ 0x48B2, 0x9B40, 0xE012, 0xC29A, 0x7340, 0x48B5, 0x9B40, 0xC22E, 0x7358, ++ 0x4837, 0x4832, 0x9B58, 0x7346, 0x4837, 0x4832, 0x9B46, 0x7340, 0x4837, ++ 0x4832, 0x9B40, 0xC283, 0x7340, 0x49BF, 0xF010, 0xC21B, 0x7344, 0x1300, ++ 0xF104, 0x1B00, 0xC217, 0x9B40, 0x1B01, 0xC213, 0x9B44, 0xC213, 0x734C, ++ 0x48B7, 0x9B4C, 0xE008, 0xC20C, 0x1B00, 0x9B44, 0xC20B, 0x734C, 0x4837, ++ 0x9B4C, 0xC204, 0xC302, 0xBB00, 0x2230, 0xE092, 0xD3C0, 0xE428, 0xDC46, ++ 0xC104, 0xC202, 0xBA00, 0x21F8, 0xD116, 0x49D1, 0xC602, 0xBE00, 0x3E7A, ++ 0x49D1, 0xC602, 0xBE00, 0x3EDA, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0xC602, 0xBE00, 0x0000, 0x6637, 0x0119, 0x0604, 0x1203 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x13E6); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x0812); ++ rtl8125_mac_ocp_write(tp, 0xFC2C, 0x248C); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x12DA); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x4A20); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x47A0); ++ //rtl8125_mac_ocp_write(tp, 0xFC34, 0x0A46); ++ //rtl8125_mac_ocp_write(tp, 0xFC36, 0x097E); ++ //rtl8125_mac_ocp_write(tp, 0xFC38, 0x462C); ++ //rtl8125_mac_ocp_write(tp, 0xFC3A, 0x222E); ++ rtl8125_mac_ocp_write(tp, 0xFC3C, 0x21F6); ++ rtl8125_mac_ocp_write(tp, 0xFC3E, 0x3E78); ++ rtl8125_mac_ocp_write(tp, 0xFC40, 0x3ED8); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x1C7B); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125bp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE027, 0xE04A, 0xE04D, 0xE050, 0xE052, 0xE054, 0xE056, ++ 0xE058, 0xE05A, 0xE05C, 
0xE05E, 0xE060, 0xE062, 0xE064, 0x1BC8, 0x46EB, ++ 0xC302, 0xBB00, 0x0F14, 0xC211, 0x400A, 0xF00A, 0xC20F, 0x400A, 0xF007, ++ 0x73A4, 0xC20C, 0x400A, 0xF102, 0x48B0, 0x9B20, 0x1B00, 0x9BA0, 0xC602, ++ 0xBE00, 0x4364, 0xE6E0, 0xE6E2, 0xC01C, 0xB406, 0x1000, 0xF016, 0xC61F, ++ 0x400E, 0xF012, 0x218E, 0x25BE, 0x1300, 0xF007, 0x7340, 0xC618, 0x400E, ++ 0xF102, 0x48B0, 0x8320, 0xB400, 0x2402, 0x1000, 0xF003, 0x7342, 0x8322, ++ 0xB000, 0xE007, 0x7322, 0x9B42, 0x7320, 0x9B40, 0x0300, 0x0300, 0xB006, ++ 0xC302, 0xBB00, 0x413E, 0xE6E0, 0xC01C, 0x49D1, 0xC602, 0xBE00, 0x3F94, ++ 0x49D1, 0xC602, 0xBE00, 0x4030, 0xC602, 0xBE00, 0x3FDA, 0xC102, 0xB900, ++ 0x401A, 0xC102, 0xB900, 0x0000, 0xC002, 0xB800, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, ++ 0x0000, 0x6936, 0x0A18, 0x0C02, 0x0D21 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x0f10); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x435c); ++ rtl8125_mac_ocp_write(tp, 0xFC2C, 0x4112); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x3F92); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x402E); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x3FD6); ++ rtl8125_mac_ocp_write(tp, 0xFC34, 0x4018); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x007F); ++} ++ ++static void ++rtl8125_set_mac_mcu_8125bp_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE033, 0xE046, 0xE04A, 0xE04D, 0xE050, 0xE054, 0xE056, 0xE058, ++ 0xE05A, 0xE05C, 0xE05E, 0xE060, 0xE062, 0xE064, 0xE066, 0xB406, 0x1000, ++ 0xF016, 0xC61F, 0x400E, 0xF012, 0x218E, 0x25BE, 0x1300, 0xF007, 0x7340, ++ 0xC618, 0x400E, 0xF102, 0x48B0, 0x8320, 0xB400, 0x2402, 0x1000, 0xF003, ++ 0x7342, 0x8322, 0xB000, 0xE007, 0x7322, 0x9B42, 0x7320, 0x9B40, 0x0300, ++ 0x0300, 0xB006, 0xC302, 0xBB00, 0x4168, 0xE6E0, 0xC01C, 0xC211, 0x400A, ++ 0xF00A, 0xC20F, 0x400A, 0xF007, 0x73A4, 0xC20C, 0x400A, 0xF102, 0x48B0, ++ 0x9B20, 0x1B00, 0x9BA0, 0xC602, 0xBE00, 0x4392, 0xE6E0, 0xE6E2, 0xC01C, ++ 0x4166, 0x9CF6, 0xC002, 0xB800, 0x143C, 0x49D1, 0xC602, 0xBE00, 0x3FC4, ++ 0x49D1, 0xC602, 0xBE00, 0x405A, 0xC104, 0xC202, 0xBA00, 0x22E6, 0xD116, ++ 0xC602, 0xBE00, 0x0000, 0xC102, 0xB900, 0x0000, 0xC002, 0xB800, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0x6936, 0x0119, 0x030E, 0x0B18 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x413C); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x438A); ++ rtl8125_mac_ocp_write(tp, 0xFC2C, 0x143A); ++ rtl8125_mac_ocp_write(tp, 0xFC2E, 0x3FC2); ++ rtl8125_mac_ocp_write(tp, 0xFC30, 0x4058); ++ rtl8125_mac_ocp_write(tp, 0xFC32, 0x22E4); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x003F); ++} 
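Aside: every rtl8125_set_mac_mcu_* helper above follows the same pattern — pack the last four u16 words of the bin patch array into a 64-bit version, compare it with the version the hardware reports from page 2 at OCP 0xF9F8..0xF9FE (page select is the low two bits of 0xE446), and rewrite the MCU RAM code at 0xF800 only when the two differ. The sketch below is a minimal, self-contained illustration of that flow under those assumptions; ocp_regs, ocp_read16()/ocp_write16() and maybe_load_patch() are hypothetical stand-ins for the driver's rtl8125_mac_ocp_* accessors and are not part of this patch.

        #include <stdint.h>
        #include <stddef.h>
        #include <stdio.h>

        /* Fake OCP register space so the sketch is self-contained; the driver
         * talks to real hardware via rtl8125_mac_ocp_read()/_write(). */
        static uint16_t ocp_regs[0x10000];
        static uint16_t ocp_read16(uint32_t reg)              { return ocp_regs[reg]; }
        static void     ocp_write16(uint32_t reg, uint16_t v) { ocp_regs[reg] = v; }

        /* The 64-bit patch version sits in the last four words of the bin array,
         * most-significant word first. */
        static uint64_t bin_patch_ver(const uint16_t *code, size_t cnt)
        {
                uint64_t ver = 0;

                if (!code || cnt < 4)
                        return 0;
                for (int i = 0; i < 4; i++)
                        ver = (ver << 16) | code[cnt - 4 + i];
                return ver;
        }

        /* Hardware keeps the same value in page 2 at 0xF9F8..0xF9FE. */
        static uint64_t hw_patch_ver(void)
        {
                uint64_t ver = 0;

                ocp_write16(0xE446, (ocp_read16(0xE446) & ~0x3u) | 2);  /* page 2 */
                for (int i = 0; i < 8; i += 2)
                        ver = (ver << 16) | ocp_read16(0xF9F8 + i);
                ocp_write16(0xE446, ocp_read16(0xE446) & ~0x3u);        /* page 0 */
                return ver;
        }

        /* Rewrite the MCU RAM code at 0xF800 only when the versions differ
         * (non-paged variant, cf. _rtl8125_write_mac_mcu_ram_code above). */
        static void maybe_load_patch(const uint16_t *code, size_t cnt)
        {
                if (hw_patch_ver() == bin_patch_ver(code, cnt))
                        return;
                for (size_t i = 0; i < cnt; i++)
                        ocp_write16(0xF800 + (uint32_t)(i * 2), code[i]);
                /* ...the driver would then store the bin version back into 0xF9F8..0xF9FE. */
        }

        int main(void)
        {
                static const uint16_t demo[] = { 0xC602, 0xBE00, 0x0000,
                                                 0x6936, 0x0119, 0x030E, 0x0B18 };

                maybe_load_patch(demo, sizeof(demo) / sizeof(demo[0]));
                printf("bin patch version: 0x%016llx\n",
                       (unsigned long long)bin_patch_ver(demo, sizeof(demo) / sizeof(demo[0])));
                return 0;
        }

The version gate is why each CFG_METHOD handler can be called unconditionally from rtl8125_hw_mac_mcu_config(): when the NIC already carries the same patch revision, only the jump-vector writes to 0xFC26..0xFC48 are repeated and the RAM upload is skipped.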
++ ++static void ++rtl8125_set_mac_mcu_8125d_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE018, 0xE01A, 0xE01C, 0xE01E, 0xE020, 0xE022, 0xE024, ++ 0xE026, 0xE028, 0xE02A, 0xE02C, 0xE02E, 0xE030, 0xE032, 0x4166, 0x9CF6, ++ 0xC002, 0xB800, 0x14A4, 0xC104, 0xC202, 0xBA00, 0x2378, 0xD116, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, ++ 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x6938, ++ 0x0A19, 0x030E, 0x0B2B ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x14A2); ++ rtl8125_mac_ocp_write(tp, 0xFC2A, 0x2376); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0003); ++} ++ ++static void 
++rtl8125_set_mac_mcu_8125d_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE016, 0xE018, 0xE01A, 0xE01C, 0xE01E, 0xE020, 0xE022, ++ 0xE024, 0xE026, 0xE028, 0xE02A, 0xE02C, 0xE02E, 0xE030, 0xC104, 0xC202, ++ 0xBA00, 0x2384, 0xD116, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 
0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, ++ 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x0000, 0x6938, ++ 0x0A19, 0x030E, 0x0B2F ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x2382); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0001); ++} ++ ++ ++static void ++rtl8125_set_mac_mcu_8125cp_1(struct net_device *dev) ++{ ++ struct 
rtl8125_private *tp = netdev_priv(dev); ++ static const u16 mcu_patch_code[] = { ++ 0xE010, 0xE014, 0xE016, 0xE018, 0xE01A, 0xE01C, 0xE01E, 0xE020, 0xE022, ++ 0xE024, 0xE026, 0xE028, 0xE02A, 0xE02C, 0xE02E, 0xE030, 0xC104, 0xC202, ++ 0xBA00, 0x2438, 0xD116, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, 0xC602, 0xBE00, 0x0000, ++ 0xC602, 0xBE00, 0x0000, 0x7023, 0x0019, 0x031A, 0x0E20 ++ }; ++ ++ /* Get BIN mac mcu patch code version */ ++ tp->bin_mcu_patch_code_ver = rtl8125_get_bin_mcu_patch_code_ver(mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ if (tp->hw_mcu_patch_code_ver != tp->bin_mcu_patch_code_ver) ++ rtl8125_write_mac_mcu_ram_code(tp, mcu_patch_code, ARRAY_SIZE(mcu_patch_code)); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC26, 0x8000); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC28, 0x2436); ++ ++ rtl8125_mac_ocp_write(tp, 0xFC48, 0x0001); ++} ++ ++static void ++rtl8125_hw_mac_mcu_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (tp->NotWrMcuPatchCode == TRUE) ++ return; ++ ++ rtl8125_hw_disable_mac_mcu_bps(dev); ++ ++ /* Get H/W mac mcu patch code version */ ++ tp->hw_mcu_patch_code_ver = rtl8125_get_hw_mcu_patch_code_ver(tp); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_set_mac_mcu_8125a_2(dev); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_set_mac_mcu_8125b_2(dev); ++ break; ++ case CFG_METHOD_8: ++ rtl8125_set_mac_mcu_8125bp_1(dev); ++ break; ++ case CFG_METHOD_9: ++ rtl8125_set_mac_mcu_8125bp_2(dev); ++ break; ++ case CFG_METHOD_10: ++ rtl8125_set_mac_mcu_8125d_1(dev); ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ rtl8125_set_mac_mcu_8125d_2(dev); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_set_mac_mcu_8125cp_1(dev); ++ break; ++ case CFG_METHOD_2: ++ case CFG_METHOD_4: ++ /* no mac mcu patch code */ ++ break; ++ default: ++ break; ++ } ++} ++#endif ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++static void rtl8125_release_firmware(struct rtl8125_private *tp) ++{ ++ if (tp->rtl_fw) { ++ rtl8125_fw_release_firmware(tp->rtl_fw); ++ kfree(tp->rtl_fw); ++ tp->rtl_fw = NULL; ++ } ++} ++ ++static void rtl8125_apply_firmware(struct rtl8125_private *tp) ++{ ++ unsigned long flags; ++ ++ /* TODO: release firmware if rtl_fw_write_firmware signals failure. */ ++ if (tp->rtl_fw) { ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_fw_write_firmware(tp, tp->rtl_fw); ++ /* At least one firmware doesn't reset tp->ocp_base. 
*/ ++ tp->ocp_base = OCP_STD_PHY_BASE; ++ ++ /* PHY soft reset may still be in progress */ ++ //phy_read_poll_timeout(tp->phydev, MII_BMCR, val, ++ // !(val & BMCR_RESET), ++ // 50000, 600000, true); ++ rtl8125_wait_phy_reset_complete(tp); ++ ++ tp->hw_ram_code_ver = rtl8125_get_hw_phy_mcu_code_ver(tp); ++ tp->sw_ram_code_ver = tp->hw_ram_code_ver; ++ tp->HwHasWrRamCodeToMicroP = TRUE; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ } ++} ++#endif ++ ++static void ++rtl8125_hw_init(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 csi_tmp; ++ ++ rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ rtl8125_enable_force_clkreq(tp, 0); ++ ++ rtl8125_set_reg_oobs_en_sel(tp, true); ++ ++ //Disable UPS ++ rtl8125_mac_ocp_write(tp, 0xD40A, rtl8125_mac_ocp_read(tp, 0xD40A) & ~(BIT_4)); ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++ if (!tp->rtl_fw) ++ rtl8125_hw_mac_mcu_config(dev); ++#endif ++ ++ /*disable ocp phy power saving*/ ++ if (tp->mcfg == CFG_METHOD_2 || ++ tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ rtl8125_disable_ocp_phy_power_saving(dev); ++ ++ //Set PCIE uncorrectable error status mask pcie 0x108 ++ csi_tmp = rtl8125_csi_read(tp, 0x108); ++ csi_tmp |= BIT_20; ++ rtl8125_csi_write(tp, 0x108, csi_tmp); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ rtl8125_disable_linkchg_wakeup(dev); ++ rtl8125_disable_cfg9346_write(tp); ++ rtl8125_disable_magic_packet(dev); ++ rtl8125_disable_d0_speedup(tp); ++ rtl8125_set_pci_pme(tp, 0); ++ if (s0_magic_packet == 1) ++ rtl8125_enable_magic_packet(dev); ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ if (tp->rtl_fw && !tp->resume_not_chg_speed) ++ rtl8125_apply_firmware(tp); ++#endif ++} ++ ++static void ++rtl8125_hw_ephy_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_ephy_write(tp, 0x01, 0xA812); ++ rtl8125_ephy_write(tp, 0x09, 0x520C); ++ rtl8125_ephy_write(tp, 0x04, 0xD000); ++ rtl8125_ephy_write(tp, 0x0D, 0xF702); ++ rtl8125_ephy_write(tp, 0x0A, 0x8653); ++ rtl8125_ephy_write(tp, 0x06, 0x001E); ++ rtl8125_ephy_write(tp, 0x08, 0x3595); ++ rtl8125_ephy_write(tp, 0x20, 0x9455); ++ rtl8125_ephy_write(tp, 0x21, 0x99FF); ++ rtl8125_ephy_write(tp, 0x02, 0x6046); ++ rtl8125_ephy_write(tp, 0x29, 0xFE00); ++ rtl8125_ephy_write(tp, 0x23, 0xAB62); ++ ++ rtl8125_ephy_write(tp, 0x41, 0xA80C); ++ rtl8125_ephy_write(tp, 0x49, 0x520C); ++ rtl8125_ephy_write(tp, 0x44, 0xD000); ++ rtl8125_ephy_write(tp, 0x4D, 0xF702); ++ rtl8125_ephy_write(tp, 0x4A, 0x8653); ++ rtl8125_ephy_write(tp, 0x46, 0x001E); ++ rtl8125_ephy_write(tp, 0x48, 0x3595); ++ rtl8125_ephy_write(tp, 0x60, 0x9455); ++ rtl8125_ephy_write(tp, 0x61, 0x99FF); ++ rtl8125_ephy_write(tp, 0x42, 0x6046); ++ rtl8125_ephy_write(tp, 0x69, 0xFE00); ++ rtl8125_ephy_write(tp, 0x63, 0xAB62); ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_ephy_write(tp, 0x04, 0xD000); ++ rtl8125_ephy_write(tp, 0x0A, 0x8653); ++ rtl8125_ephy_write(tp, 0x23, 0xAB66); ++ rtl8125_ephy_write(tp, 0x20, 0x9455); ++ rtl8125_ephy_write(tp, 0x21, 0x99FF); ++ rtl8125_ephy_write(tp, 0x29, 0xFE04); ++ ++ rtl8125_ephy_write(tp, 0x44, 0xD000); ++ rtl8125_ephy_write(tp, 0x4A, 0x8653); ++ rtl8125_ephy_write(tp, 0x63, 0xAB66); ++ rtl8125_ephy_write(tp, 0x60, 0x9455); ++ rtl8125_ephy_write(tp, 0x61, 0x99FF); ++ rtl8125_ephy_write(tp, 0x69, 0xFE04); ++ ++ ClearAndSetPCIePhyBit(tp, ++ 0x2A, ++ (BIT_14 | BIT_13 | BIT_12), ++ (BIT_13 | BIT_12)); ++ ClearPCIePhyBit(tp, 0x19, BIT_6); ++ SetPCIePhyBit(tp, 0x1B, (BIT_11 | 
BIT_10 | BIT_9)); ++ ClearPCIePhyBit(tp, 0x1B, (BIT_14 | BIT_13 | BIT_12)); ++ rtl8125_ephy_write(tp, 0x02, 0x6042); ++ rtl8125_ephy_write(tp, 0x06, 0x0014); ++ ++ ClearAndSetPCIePhyBit(tp, ++ 0x6A, ++ (BIT_14 | BIT_13 | BIT_12), ++ (BIT_13 | BIT_12)); ++ ClearPCIePhyBit(tp, 0x59, BIT_6); ++ SetPCIePhyBit(tp, 0x5B, (BIT_11 | BIT_10 | BIT_9)); ++ ClearPCIePhyBit(tp, 0x5B, (BIT_14 | BIT_13 | BIT_12)); ++ rtl8125_ephy_write(tp, 0x42, 0x6042); ++ rtl8125_ephy_write(tp, 0x46, 0x0014); ++ break; ++ case CFG_METHOD_4: ++ rtl8125_ephy_write(tp, 0x06, 0x001F); ++ rtl8125_ephy_write(tp, 0x0A, 0xB66B); ++ rtl8125_ephy_write(tp, 0x01, 0xA852); ++ rtl8125_ephy_write(tp, 0x24, 0x0008); ++ rtl8125_ephy_write(tp, 0x2F, 0x6052); ++ rtl8125_ephy_write(tp, 0x0D, 0xF716); ++ rtl8125_ephy_write(tp, 0x20, 0xD477); ++ rtl8125_ephy_write(tp, 0x21, 0x4477); ++ rtl8125_ephy_write(tp, 0x22, 0x0013); ++ rtl8125_ephy_write(tp, 0x23, 0xBB66); ++ rtl8125_ephy_write(tp, 0x0B, 0xA909); ++ rtl8125_ephy_write(tp, 0x29, 0xFF04); ++ rtl8125_ephy_write(tp, 0x1B, 0x1EA0); ++ ++ rtl8125_ephy_write(tp, 0x46, 0x001F); ++ rtl8125_ephy_write(tp, 0x4A, 0xB66B); ++ rtl8125_ephy_write(tp, 0x41, 0xA84A); ++ rtl8125_ephy_write(tp, 0x64, 0x000C); ++ rtl8125_ephy_write(tp, 0x6F, 0x604A); ++ rtl8125_ephy_write(tp, 0x4D, 0xF716); ++ rtl8125_ephy_write(tp, 0x60, 0xD477); ++ rtl8125_ephy_write(tp, 0x61, 0x4477); ++ rtl8125_ephy_write(tp, 0x62, 0x0013); ++ rtl8125_ephy_write(tp, 0x63, 0xBB66); ++ rtl8125_ephy_write(tp, 0x4B, 0xA909); ++ rtl8125_ephy_write(tp, 0x69, 0xFF04); ++ rtl8125_ephy_write(tp, 0x5B, 0x1EA0); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_ephy_write(tp, 0x0B, 0xA908); ++ rtl8125_ephy_write(tp, 0x1E, 0x20EB); ++ rtl8125_ephy_write(tp, 0x22, 0x0023); ++ rtl8125_ephy_write(tp, 0x02, 0x60C2); ++ rtl8125_ephy_write(tp, 0x29, 0xFF00); ++ ++ rtl8125_ephy_write(tp, 0x4B, 0xA908); ++ rtl8125_ephy_write(tp, 0x5E, 0x28EB); ++ rtl8125_ephy_write(tp, 0x62, 0x0023); ++ rtl8125_ephy_write(tp, 0x42, 0x60C2); ++ rtl8125_ephy_write(tp, 0x69, 0xFF00); ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ /* nothing to do */ ++ break; ++ } ++} ++ ++static u16 ++rtl8125_get_hw_phy_mcu_code_ver(struct rtl8125_private *tp) ++{ ++ u16 hw_ram_code_ver; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801E); ++ hw_ram_code_ver = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA438); ++ ++ return hw_ram_code_ver; ++} ++ ++static int ++rtl8125_check_hw_phy_mcu_code_ver(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ tp->hw_ram_code_ver = rtl8125_get_hw_phy_mcu_code_ver(tp); ++ ++ if (tp->hw_ram_code_ver == tp->sw_ram_code_ver) { ++ tp->HwHasWrRamCodeToMicroP = TRUE; ++ return 1; ++ } else { ++ tp->HwHasWrRamCodeToMicroP = FALSE; ++ return 0; ++ } ++} ++ ++bool ++rtl8125_set_phy_mcu_patch_request(struct rtl8125_private *tp) ++{ ++ u16 gphy_val; ++ u16 WaitCount; ++ bool bSuccess = TRUE; ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB820, BIT_4); ++ ++ WaitCount = 0; ++ do { ++ gphy_val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB800); ++ udelay(100); ++ WaitCount++; ++ } while (!(gphy_val & BIT_6) && (WaitCount < 1000)); ++ ++ if (!(gphy_val & BIT_6) && (WaitCount == 1000)) ++ bSuccess = FALSE; ++ ++ if (!bSuccess) ++ dprintk("rtl8125_set_phy_mcu_patch_request fail.\n"); ++ ++ return bSuccess; ++} ++ ++bool ++rtl8125_clear_phy_mcu_patch_request(struct rtl8125_private *tp) ++{ ++ u16 gphy_val; ++ u16 WaitCount; ++ 
bool bSuccess = TRUE; ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB820, BIT_4); ++ ++ WaitCount = 0; ++ do { ++ gphy_val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB800); ++ udelay(100); ++ WaitCount++; ++ } while ((gphy_val & BIT_6) && (WaitCount < 1000)); ++ ++ if ((gphy_val & BIT_6) && (WaitCount == 1000)) ++ bSuccess = FALSE; ++ ++ if (!bSuccess) ++ dprintk("rtl8125_clear_phy_mcu_patch_request fail.\n"); ++ ++ return bSuccess; ++} ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++static void ++rtl8125_write_hw_phy_mcu_code_ver(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, tp->sw_ram_code_ver); ++ tp->hw_ram_code_ver = tp->sw_ram_code_ver; ++} ++ ++static void ++rtl8125_acquire_phy_mcu_patch_key_lock(struct rtl8125_private *tp) ++{ ++ u16 PatchKey; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ PatchKey = 0x8600; ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ PatchKey = 0x8601; ++ break; ++ case CFG_METHOD_4: ++ PatchKey = 0x3700; ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ PatchKey = 0x3701; ++ break; ++ default: ++ return; ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, PatchKey); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xB82E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0001); ++} ++ ++static void ++rtl8125_release_phy_mcu_patch_key_lock(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB82E, BIT_0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_set_phy_mcu_ram_code(struct net_device *dev, const u16 *ramcode, u16 codesize) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 i; ++ u16 addr; ++ u16 val; ++ ++ if (ramcode == NULL || codesize % 2) { ++ goto out; ++ } ++ ++ for (i = 0; i < codesize; i += 2) { ++ addr = ramcode[i]; ++ val = ramcode[i + 1]; ++ if (addr == 0xFFFF && val == 0xFFFF) { ++ break; ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, addr, val); ++ } ++ ++out: ++ return; ++} ++ ++static void ++rtl8125_enable_phy_disable_mode(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) | BIT_5); ++ break; ++ } ++ ++ dprintk("enable phy disable mode.\n"); ++} ++ ++static void ++rtl8125_disable_phy_disable_mode(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ switch (tp->HwSuppCheckPhyDisableModeVer) { ++ case 3: ++ RTL_W8(tp, 0xF2, RTL_R8(tp, 0xF2) & ~BIT_5); ++ break; ++ } ++ ++ mdelay(1); ++ ++ dprintk("disable phy disable mode.\n"); ++} ++ ++static void ++rtl8125_set_hw_phy_before_init_phy_mcu(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u16 PhyRegValue; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBF86, 0x9000); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ ++ PhyRegValue = rtl8125_mdio_direct_read_phy_ocp(tp, 
0xBF86); ++ PhyRegValue &= (BIT_1 | BIT_0); ++ if (PhyRegValue != 0) ++ dprintk("PHY watch dog not clear, value = 0x%x \n", PhyRegValue); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBD86, 0x1010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBD88, 0x1010); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD4E, ++ BIT_11 | BIT_10, ++ BIT_11); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF46, ++ BIT_11 | BIT_10 | BIT_9 | BIT_8, ++ BIT_10 | BIT_9 | BIT_8); ++ break; ++ } ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125a_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_acquire_phy_mcu_patch_key_lock(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB820, BIT_7); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8021); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x802f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x803d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8042); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8051); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8051); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa088); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a50); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1a3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x401a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40c2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f8b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a6c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8080); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd019); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1a2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x401a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f8b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 
0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a84); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8970); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c07); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0901); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcf09); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd705); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xceff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf0a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1213); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8401); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8580); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1253); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd064); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd181); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4018); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd706); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2c59); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x804d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc60f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc605); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA026); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA022); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10f4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1252); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1206); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA004); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a78); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a60); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a4f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3f00); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 
0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8066); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x807c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8089); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x808e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80b2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80c2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62db); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x655c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0505); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0509); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x653c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0506); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0505); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0506); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0509); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x050c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0508); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0304); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd73e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x614a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0508); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0321); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0346); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8208); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x609d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x001a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x001a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x607d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ab); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaa0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x017b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a05); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x017b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaa0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a05); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60fd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa50f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaa0f); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0231); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0503); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a05); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0231); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0221); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01ce); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA088); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0169); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA086); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA084); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x000d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA082); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0308); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA080); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x029f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA090); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x007f); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8017); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8029); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8054); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x805a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8064); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80a7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9430); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9480); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb408); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd120); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd057); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb80); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9906); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0567); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb94); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8406); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8dff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0773); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb91); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd139); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd140); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07dc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa2a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4045); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa180); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07ec); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f74); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07dc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07c0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fa7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0481); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x94bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x870c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa00a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa280); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8220); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x078e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb92); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd140); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd150); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd703); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6121); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x61a2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6223); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d20); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d30); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf005); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d40); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4046); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07f7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f74); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0742); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3ad4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0537); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa70c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9402); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x890c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x064b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0642); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0686); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0788); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA108); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x047b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA106); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x065c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA104); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0769); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA102); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0565); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x06f9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ff); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8530); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3caf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8593); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9caf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x85a5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5afb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe083); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfb0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x020d); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x021b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86d7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbe0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x83fc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1b10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xda02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xdd02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5afb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe083); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfd0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x020d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x021b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x10bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86dd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbe0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x83fe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1b10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf2f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2cac); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0286); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x65af); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x212b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x022c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86b6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf21); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cd1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8710); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x870d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8716); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x871f); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x871c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8728); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8725); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbad); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x281c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1302); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2202); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2b02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae1a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd101); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1302); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2202); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2b02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd101); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3402); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3102); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3d02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3a02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4302); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4c02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4902); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2e02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4602); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf87); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4f02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ab7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf35); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7ff8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfaef); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x69bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86fb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86fe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86e9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86ec); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfbbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x025a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7bf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86ef); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86f2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86f5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cbf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x86f8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0262); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7cef); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x96fe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfc04); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf8fa); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef69); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf202); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbf86); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf802); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6273); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef96); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfefc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0420); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb540); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x53b5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4086); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb540); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb9b5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40c8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb03a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbac8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb13a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xba77); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffbd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2677); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd28); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffbd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8bd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2640); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbd28); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc8bd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x28bb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa430); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x98b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1eba); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb01e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xdcb0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e98); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbab0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9edc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x98b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1eba); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb11e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xdcb1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e98); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbab1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9edc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11b0); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e22); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb01e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x33b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e11); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x22b0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9e33); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb09e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e22); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb11e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x33b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1e11); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x22b1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9e33); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb19e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb85e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2f71); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb860); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x20d9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb862); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2109); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb864); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x34e7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb878); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x000f); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB820, BIT_7); ++ ++ ++ rtl8125_release_phy_mcu_patch_key_lock(tp); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125a_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125a_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125a_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_acquire_phy_mcu_patch_key_lock(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB820, BIT_7); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x808b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x808f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8093); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8097); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x809d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80a1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80aa); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x607b); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x42da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf01e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x615b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf01c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14a4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x14bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f2e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf034); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac11); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa410); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4779); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf034); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac22); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa420); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4559); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf023); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac44); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa440); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4339); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd719); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4118); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac88); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa480); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xce00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4119); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xac0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1444); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf001); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1456); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd718); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fac); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc48f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x141b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd504); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x121a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd0b4); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1bb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0898); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd0b4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1bb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a0e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd064); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd18a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0b7e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x401c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd501); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa804); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8804); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x053b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0648); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc520); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa201); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x252d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1646); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd708); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1646); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0308); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA026); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0307); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1645); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA022); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0647); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x053a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0b7c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA004); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0a0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0896); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11a1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xff00); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8015); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x801a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xad02); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x02d7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ed); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0509); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xc100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x008f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA08A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA088); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA086); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA084); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA082); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x008d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA080); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00eb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA090); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0103); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA016); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8014); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8018); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8051); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8055); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8072); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x80dc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfffd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfffd); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa70c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x9402); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x890c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8840); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa380); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x066e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb91); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd139); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd140); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa2a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4085); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa180); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8280); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07f0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5f74); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x82a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0c0f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e0); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x066e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd158); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd04d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03d4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x94bc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x870c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8380); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd10d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fb4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa190); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa00a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa280); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa404); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa220); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd130); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5fb4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xbb80); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1c4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd074); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa301); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x604b); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa90c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0556); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xcb92); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4063); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd116); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd119); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd040); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd703); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x60a0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6241); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x63e2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6583); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf054); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d10); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d50); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf02a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d20); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf021); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d60); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf01c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d30); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d70); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf00e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x611e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x40da); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d40); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf005); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d80); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x405d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa720); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5ff4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa008); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd704); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4046); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07fb); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd703); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f6f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f4e); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f2d); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7f0c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x800a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0cf0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0d00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07e8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8010); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa740); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0743); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd702); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7fb5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd701); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3ad4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0556); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8610); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x066e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd1f5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xd049); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x1800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01ec); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x01ea); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x06a9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA10A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x078a); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA108); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03d2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA106); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x067f); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA104); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0665); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA102); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xA110); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00fc); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87c); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8530); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb87e); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x3caf); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8545); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf85); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x45af); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8545); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xee82); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf900); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0103); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xaf03); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb7f8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe0a6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00e1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa601); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xef01); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x58f0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa080); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x37a1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8402);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae16);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa185);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x02ae);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x11a1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8702);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae0c);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xa188);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x02ae);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x07a1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8902);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae02);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xae1c);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe0b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6901);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe4b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e5);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe0b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e1);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6901);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xe4b4);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x62e5);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xb463);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xfc04);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb85e);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03b3);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb860);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb862);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb864);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xffff);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0xb878);
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0001);
++
++
++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB820, BIT_7);
++
++
++ rtl8125_release_phy_mcu_patch_key_lock(tp);
++}
++
++static void
++rtl8125_set_phy_mcu_8125a_2(struct net_device *dev)
++{
++ struct rtl8125_private *tp = netdev_priv(dev);
++
++ rtl8125_set_phy_mcu_patch_request(tp);
++
++ rtl8125_real_set_phy_mcu_8125a_2(dev);
++
++ rtl8125_clear_phy_mcu_patch_request(tp);
++}
++
++static const u16 phy_mcu_ram_code_8125b_1[] = {
++ 0xa436, 0x8024, 0xa438, 0x3700, 0xa436, 0xB82E, 0xa438, 0x0001,
++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012,
++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010,
++ 0xa438, 0x1800, 0xa438, 0x8025, 0xa438, 0x1800, 0xa438, 0x803a,
++ 0xa438, 0x1800, 0xa438, 0x8044, 0xa438, 0x1800, 0xa438, 0x8083,
++ 0xa438, 0x1800, 0xa438, 0x808d, 0xa438, 0x1800, 0xa438, 0x808d,
++ 0xa438, 0x1800, 0xa438, 0x808d, 0xa438, 0xd712, 0xa438, 0x4077,
++ 0xa438, 0xd71e, 0xa438, 0x4159, 0xa438, 0xd71e, 0xa438, 0x6099,
++ 0xa438, 0x7f44, 0xa438, 0x1800, 0xa438, 0x1a14, 0xa438, 0x9040,
++ 0xa438, 0x9201, 0xa438, 0x1800, 0xa438, 0x1b1a, 0xa438, 0xd71e,
++ 0xa438, 0x2425, 0xa438, 0x1a14, 0xa438, 0xd71f, 0xa438, 0x3ce5,
++ 0xa438, 0x1afb, 0xa438, 0x1800, 0xa438, 0x1b00, 0xa438, 0xd712,
++ 0xa438, 0x4077, 0xa438, 0xd71e, 0xa438, 0x4159,
0xa438, 0xd71e, ++ 0xa438, 0x60b9, 0xa438, 0x2421, 0xa438, 0x1c17, 0xa438, 0x1800, ++ 0xa438, 0x1a14, 0xa438, 0x9040, 0xa438, 0x1800, 0xa438, 0x1c2c, ++ 0xa438, 0xd71e, 0xa438, 0x2425, 0xa438, 0x1a14, 0xa438, 0xd71f, ++ 0xa438, 0x3ce5, 0xa438, 0x1c0f, 0xa438, 0x1800, 0xa438, 0x1c13, ++ 0xa438, 0xd702, 0xa438, 0xd501, 0xa438, 0x6072, 0xa438, 0x8401, ++ 0xa438, 0xf002, 0xa438, 0xa401, 0xa438, 0x1000, 0xa438, 0x146e, ++ 0xa438, 0x1800, 0xa438, 0x0b77, 0xa438, 0xd703, 0xa438, 0x665d, ++ 0xa438, 0x653e, 0xa438, 0x641f, 0xa438, 0xd700, 0xa438, 0x62c4, ++ 0xa438, 0x6185, 0xa438, 0x6066, 0xa438, 0x1800, 0xa438, 0x165a, ++ 0xa438, 0xc101, 0xa438, 0xcb00, 0xa438, 0x1000, 0xa438, 0x1945, ++ 0xa438, 0xd700, 0xa438, 0x7fa6, 0xa438, 0x1800, 0xa438, 0x807d, ++ 0xa438, 0xc102, 0xa438, 0xcb00, 0xa438, 0x1000, 0xa438, 0x1945, ++ 0xa438, 0xd700, 0xa438, 0x2569, 0xa438, 0x8058, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc104, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd700, 0xa438, 0x7fa4, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc120, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd703, 0xa438, 0x7fbf, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc140, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd703, 0xa438, 0x7fbe, 0xa438, 0x1800, ++ 0xa438, 0x807d, 0xa438, 0xc180, 0xa438, 0xcb00, 0xa438, 0x1000, ++ 0xa438, 0x1945, 0xa438, 0xd703, 0xa438, 0x7fbd, 0xa438, 0xc100, ++ 0xa438, 0xcb00, 0xa438, 0xd708, 0xa438, 0x6018, 0xa438, 0x1800, ++ 0xa438, 0x165a, 0xa438, 0x1000, 0xa438, 0x14f6, 0xa438, 0xd014, ++ 0xa438, 0xd1e3, 0xa438, 0x1000, 0xa438, 0x1356, 0xa438, 0xd705, ++ 0xa438, 0x5fbe, 0xa438, 0x1800, 0xa438, 0x1559, 0xa436, 0xA026, ++ 0xa438, 0xffff, 0xa436, 0xA024, 0xa438, 0xffff, 0xa436, 0xA022, ++ 0xa438, 0xffff, 0xa436, 0xA020, 0xa438, 0x1557, 0xa436, 0xA006, ++ 0xa438, 0x1677, 0xa436, 0xA004, 0xa438, 0x0b75, 0xa436, 0xA002, ++ 0xa438, 0x1c17, 0xa436, 0xA000, 0xa438, 0x1b04, 0xa436, 0xA008, ++ 0xa438, 0x1f00, 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x817f, 0xa438, 0x1800, 0xa438, 0x82ab, ++ 0xa438, 0x1800, 0xa438, 0x83f8, 0xa438, 0x1800, 0xa438, 0x8444, ++ 0xa438, 0x1800, 0xa438, 0x8454, 0xa438, 0x1800, 0xa438, 0x8459, ++ 0xa438, 0x1800, 0xa438, 0x8465, 0xa438, 0xcb11, 0xa438, 0xa50c, ++ 0xa438, 0x8310, 0xa438, 0xd701, 0xa438, 0x4076, 0xa438, 0x0c03, ++ 0xa438, 0x0903, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d00, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d00, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0x1000, 0xa438, 0x0a4d, ++ 0xa438, 0xcb12, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5f84, 0xa438, 0xd102, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd701, ++ 0xa438, 0x60f3, 0xa438, 0xd413, 0xa438, 0x1000, 0xa438, 0x0a37, ++ 0xa438, 0xd410, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xcb13, ++ 0xa438, 0xa108, 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8108, ++ 0xa438, 0xa00a, 0xa438, 0xa910, 0xa438, 0xa780, 0xa438, 0xd14a, ++ 0xa438, 0xd048, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd701, ++ 0xa438, 0x6255, 0xa438, 0xd700, 0xa438, 0x5f74, 0xa438, 0x6326, ++ 0xa438, 0xd702, 0xa438, 0x5f07, 0xa438, 0x800a, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0x0c03, ++ 0xa438, 0x0902, 0xa438, 0xffe2, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5fab, 0xa438, 0xba08, 
0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8b, 0xa438, 0x9a08, ++ 0xa438, 0x800a, 0xa438, 0xd702, 0xa438, 0x6535, 0xa438, 0xd40d, ++ 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xcb14, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0xa00a, ++ 0xa438, 0xa780, 0xa438, 0xd14a, 0xa438, 0xd048, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0x6206, ++ 0xa438, 0xd702, 0xa438, 0x5f47, 0xa438, 0x800a, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0x0c03, ++ 0xa438, 0x0902, 0xa438, 0x1800, 0xa438, 0x8064, 0xa438, 0x800a, ++ 0xa438, 0xd40e, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0xd701, 0xa438, 0x6073, 0xa438, 0xd701, ++ 0xa438, 0x4216, 0xa438, 0xa004, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8004, 0xa438, 0xa001, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8001, 0xa438, 0xd120, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0x8504, ++ 0xa438, 0xcb21, 0xa438, 0xa301, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5f9f, 0xa438, 0x8301, 0xa438, 0xd704, ++ 0xa438, 0x40e0, 0xa438, 0xd196, 0xa438, 0xd04d, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb22, ++ 0xa438, 0x1000, 0xa438, 0x0a6d, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa640, 0xa438, 0x9503, 0xa438, 0x8910, 0xa438, 0x8720, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d01, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0x0c1f, 0xa438, 0x0f14, 0xa438, 0xcb23, ++ 0xa438, 0x8fc0, 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xaf40, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0x0cc0, 0xa438, 0x0f80, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xafc0, 0xa438, 0x1000, ++ 0xa438, 0x0a25, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd701, ++ 0xa438, 0x5dee, 0xa438, 0xcb24, 0xa438, 0x8f1f, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd701, 0xa438, 0x7f6e, 0xa438, 0xa111, ++ 0xa438, 0xa215, 0xa438, 0xa401, 0xa438, 0x8404, 0xa438, 0xa720, ++ 0xa438, 0xcb25, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8640, ++ 0xa438, 0x9503, 0xa438, 0x1000, 0xa438, 0x0b43, 0xa438, 0x1000, ++ 0xa438, 0x0b86, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0xcb26, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5f82, 0xa438, 0x8111, 0xa438, 0x8205, ++ 0xa438, 0x8404, 0xa438, 0xcb27, 0xa438, 0xd404, 0xa438, 0x1000, ++ 0xa438, 0x0a37, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xa710, 0xa438, 0xa104, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8104, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0xa120, ++ 0xa438, 0xaa0f, 0xa438, 0x8110, 0xa438, 0xa284, 0xa438, 0xa404, ++ 0xa438, 0xa00a, 0xa438, 0xd193, 0xa438, 0xd046, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb28, ++ 0xa438, 0xa110, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fa8, 0xa438, 0x8110, 0xa438, 0x8284, 
0xa438, 0xa404, ++ 0xa438, 0x800a, 0xa438, 0x8710, 0xa438, 0xb804, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f82, 0xa438, 0x9804, ++ 0xa438, 0xcb29, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5f85, 0xa438, 0xa710, 0xa438, 0xb820, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f65, 0xa438, 0x9820, ++ 0xa438, 0xcb2a, 0xa438, 0xa190, 0xa438, 0xa284, 0xa438, 0xa404, ++ 0xa438, 0xa00a, 0xa438, 0xd13d, 0xa438, 0xd04a, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x8149, ++ 0xa438, 0xa220, 0xa438, 0xd1a0, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x8151, ++ 0xa438, 0xd702, 0xa438, 0x5f51, 0xa438, 0xcb2f, 0xa438, 0xa302, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd708, 0xa438, 0x5f63, ++ 0xa438, 0xd411, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0x8302, ++ 0xa438, 0xd409, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5fa3, 0xa438, 0x8190, 0xa438, 0x82a4, 0xa438, 0x8404, ++ 0xa438, 0x800a, 0xa438, 0xb808, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7fa3, 0xa438, 0x9808, 0xa438, 0x1800, ++ 0xa438, 0x0433, 0xa438, 0xcb15, 0xa438, 0xa508, 0xa438, 0xd700, ++ 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0xf003, ++ 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0x1000, 0xa438, 0x0a7d, ++ 0xa438, 0x1000, 0xa438, 0x0a4d, 0xa438, 0xa301, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5f9f, 0xa438, 0x8301, ++ 0xa438, 0xd704, 0xa438, 0x40e0, 0xa438, 0xd115, 0xa438, 0xd04f, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0xd413, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xcb16, ++ 0xa438, 0x1000, 0xa438, 0x0a6d, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa640, 0xa438, 0x9503, 0xa438, 0x8720, 0xa438, 0xd17a, ++ 0xa438, 0xd04c, 0xa438, 0x0c1f, 0xa438, 0x0f14, 0xa438, 0xcb17, ++ 0xa438, 0x8fc0, 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xaf40, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0x0cc0, 0xa438, 0x0f80, ++ 0xa438, 0x1000, 0xa438, 0x0a25, 0xa438, 0xafc0, 0xa438, 0x1000, ++ 0xa438, 0x0a25, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd701, ++ 0xa438, 0x61ce, 0xa438, 0xd700, 0xa438, 0x5db4, 0xa438, 0xcb18, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8640, 0xa438, 0x9503, ++ 0xa438, 0xa720, 0xa438, 0x1000, 0xa438, 0x0b43, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xffd6, 0xa438, 0x8f1f, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd701, 0xa438, 0x7f8e, 0xa438, 0xa131, ++ 0xa438, 0xaa0f, 0xa438, 0xa2d5, 0xa438, 0xa407, 0xa438, 0xa720, ++ 0xa438, 0x8310, 0xa438, 0xa308, 0xa438, 0x8308, 0xa438, 0xcb19, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8640, 0xa438, 0x9503, ++ 0xa438, 0x1000, 0xa438, 0x0b43, 0xa438, 0x1000, 0xa438, 0x0b86, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xb920, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, 0xa438, 0x9920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8c, ++ 0xa438, 0xcb1a, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5f82, 0xa438, 0x8111, 0xa438, 0x82c5, 0xa438, 0xa404, ++ 0xa438, 0x8402, 0xa438, 0xb804, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7f82, 0xa438, 0x9804, 0xa438, 0xcb1b, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5f85, ++ 0xa438, 0xa710, 0xa438, 0xb820, 0xa438, 0x1000, 
0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7f65, 0xa438, 0x9820, 0xa438, 0xcb1c, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0xa110, 0xa438, 0xa284, 0xa438, 0xa404, ++ 0xa438, 0x8402, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fa8, 0xa438, 0xcb1d, 0xa438, 0xa180, 0xa438, 0xa402, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa8, ++ 0xa438, 0xa220, 0xa438, 0xd1f5, 0xa438, 0xd049, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x8221, ++ 0xa438, 0xd702, 0xa438, 0x5f51, 0xa438, 0xb920, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, 0xa438, 0x9920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8c, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fa3, ++ 0xa438, 0xa504, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d00, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d00, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xa00a, 0xa438, 0x8190, ++ 0xa438, 0x82a4, 0xa438, 0x8402, 0xa438, 0xa404, 0xa438, 0xb808, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7fa3, ++ 0xa438, 0x9808, 0xa438, 0xcb2b, 0xa438, 0xcb2c, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5f84, 0xa438, 0xd14a, ++ 0xa438, 0xd048, 0xa438, 0xa780, 0xa438, 0xcb2d, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5f94, 0xa438, 0x6208, ++ 0xa438, 0xd702, 0xa438, 0x5f27, 0xa438, 0x800a, 0xa438, 0xa004, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8004, 0xa438, 0xa001, ++ 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, 0xa438, 0x0c03, ++ 0xa438, 0x0902, 0xa438, 0xa00a, 0xa438, 0xffe9, 0xa438, 0xcb2e, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0xa190, 0xa438, 0xa284, 0xa438, 0xa406, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa8, ++ 0xa438, 0xa220, 0xa438, 0xd1a0, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x3444, 0xa438, 0x827d, ++ 0xa438, 0xd702, 0xa438, 0x5f51, 0xa438, 0xcb2f, 0xa438, 0xa302, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd708, 0xa438, 0x5f63, ++ 0xa438, 0xd411, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0x8302, ++ 0xa438, 0xd409, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5fa3, 0xa438, 0x8190, 0xa438, 0x82a4, 0xa438, 0x8406, ++ 0xa438, 0x800a, 0xa438, 0xb808, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7fa3, 0xa438, 0x9808, 0xa438, 0x1800, ++ 0xa438, 0x0433, 0xa438, 0xcb30, 0xa438, 0x8380, 0xa438, 0xcb31, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5f86, ++ 0xa438, 0x9308, 0xa438, 0xb204, 0xa438, 0xb301, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd701, 0xa438, 0x5fa2, 0xa438, 0xb302, ++ 0xa438, 0x9204, 0xa438, 0xcb32, 0xa438, 0xd408, 0xa438, 0x1000, ++ 0xa438, 0x0a37, 0xa438, 0xd141, 0xa438, 0xd043, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd704, ++ 0xa438, 0x4ccc, 0xa438, 0xd700, 0xa438, 0x4c81, 0xa438, 0xd702, ++ 0xa438, 0x609e, 0xa438, 0xd1e5, 0xa438, 0xd04d, 0xa438, 0xf003, ++ 0xa438, 0xd1e5, 0xa438, 0xd04d, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd700, 
0xa438, 0x6083, ++ 0xa438, 0x0c1f, 0xa438, 0x0d01, 0xa438, 0xf003, 0xa438, 0x0c1f, ++ 0xa438, 0x0d01, 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0x8710, ++ 0xa438, 0xa108, 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8108, ++ 0xa438, 0xa203, 0xa438, 0x8120, 0xa438, 0x8a0f, 0xa438, 0xa111, ++ 0xa438, 0x8204, 0xa438, 0xa140, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8140, 0xa438, 0xd17a, 0xa438, 0xd04b, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xa204, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa7, ++ 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x5fac, 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x7f8c, 0xa438, 0xd404, 0xa438, 0x1000, ++ 0xa438, 0x0a37, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xa710, 0xa438, 0x8101, ++ 0xa438, 0x8201, 0xa438, 0xa104, 0xa438, 0x1000, 0xa438, 0x0a42, ++ 0xa438, 0x8104, 0xa438, 0xa120, 0xa438, 0xaa0f, 0xa438, 0x8110, ++ 0xa438, 0xa284, 0xa438, 0xa404, 0xa438, 0xa00a, 0xa438, 0xd193, ++ 0xa438, 0xd047, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0xa110, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fa8, 0xa438, 0xa180, 0xa438, 0xd13d, ++ 0xa438, 0xd04a, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0xf024, 0xa438, 0xa710, 0xa438, 0xa00a, ++ 0xa438, 0x8190, 0xa438, 0x8204, 0xa438, 0xa280, 0xa438, 0xa404, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fa7, ++ 0xa438, 0x8710, 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5fac, 0xa438, 0x9920, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f8c, 0xa438, 0x800a, ++ 0xa438, 0x8190, 0xa438, 0x8284, 0xa438, 0x8406, 0xa438, 0xd700, ++ 0xa438, 0x4121, 0xa438, 0xd701, 0xa438, 0x60f3, 0xa438, 0xd1e5, ++ 0xa438, 0xd04d, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x8710, 0xa438, 0xa00a, 0xa438, 0x8190, ++ 0xa438, 0x8204, 0xa438, 0xa280, 0xa438, 0xa404, 0xa438, 0xb920, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x5fac, ++ 0xa438, 0x9920, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, ++ 0xa438, 0x7f8c, 0xa438, 0xcb33, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd71f, 0xa438, 0x5f85, 0xa438, 0xa710, 0xa438, 0xb820, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd71f, 0xa438, 0x7f65, ++ 0xa438, 0x9820, 0xa438, 0xcb34, 0xa438, 0xa00a, 0xa438, 0xa190, ++ 0xa438, 0xa284, 0xa438, 0xa404, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fa9, 0xa438, 0xd701, 0xa438, 0x6853, ++ 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, 0xa438, 0x0d00, ++ 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d00, 0xa438, 0x1000, ++ 0xa438, 0x0a7d, 0xa438, 0x8190, 0xa438, 0x8284, 0xa438, 0xcb35, ++ 0xa438, 0xd407, 0xa438, 0x1000, 0xa438, 0x0a37, 0xa438, 0x8110, ++ 0xa438, 0x8204, 0xa438, 0xa280, 0xa438, 0xa00a, 0xa438, 0xd704, ++ 0xa438, 0x4215, 0xa438, 0xa304, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fb8, 0xa438, 0xd1c3, 0xa438, 0xd043, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x8304, 0xa438, 0xd700, 0xa438, 0x4109, 0xa438, 0xf01e, ++ 0xa438, 0xcb36, 0xa438, 0xd412, 0xa438, 0x1000, 0xa438, 0x0a37, ++ 0xa438, 0xd700, 0xa438, 0x6309, 0xa438, 0xd702, 0xa438, 0x42c7, ++ 0xa438, 0x800a, 0xa438, 0x8180, 0xa438, 0x8280, 0xa438, 0x8404, ++ 0xa438, 0xa004, 0xa438, 0x1000, 0xa438, 0x0a42, 
0xa438, 0x8004, ++ 0xa438, 0xa001, 0xa438, 0x1000, 0xa438, 0x0a42, 0xa438, 0x8001, ++ 0xa438, 0x0c03, 0xa438, 0x0902, 0xa438, 0xa00a, 0xa438, 0xd14a, ++ 0xa438, 0xd048, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0xd700, 0xa438, 0x6083, 0xa438, 0x0c1f, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c1f, 0xa438, 0x0d02, ++ 0xa438, 0x1000, 0xa438, 0x0a7d, 0xa438, 0xcc55, 0xa438, 0xcb37, ++ 0xa438, 0xa00a, 0xa438, 0xa190, 0xa438, 0xa2a4, 0xa438, 0xa404, ++ 0xa438, 0xd700, 0xa438, 0x6041, 0xa438, 0xa402, 0xa438, 0xd13d, ++ 0xa438, 0xd04a, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, ++ 0xa438, 0x5fa9, 0xa438, 0xd702, 0xa438, 0x5f71, 0xa438, 0xcb38, ++ 0xa438, 0x8224, 0xa438, 0xa288, 0xa438, 0x8180, 0xa438, 0xa110, ++ 0xa438, 0xa404, 0xa438, 0x800a, 0xa438, 0xd700, 0xa438, 0x6041, ++ 0xa438, 0x8402, 0xa438, 0xd415, 0xa438, 0x1000, 0xa438, 0x0a37, ++ 0xa438, 0xd13d, 0xa438, 0xd04a, 0xa438, 0x1000, 0xa438, 0x0a5e, ++ 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb39, 0xa438, 0xa00a, ++ 0xa438, 0xa190, 0xa438, 0xa2a0, 0xa438, 0xa404, 0xa438, 0xd700, ++ 0xa438, 0x6041, 0xa438, 0xa402, 0xa438, 0xd17a, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x0a5e, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x1800, 0xa438, 0x0560, 0xa438, 0xa111, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0xd3f5, ++ 0xa438, 0xd219, 0xa438, 0x1000, 0xa438, 0x0c31, 0xa438, 0xd708, ++ 0xa438, 0x5fa5, 0xa438, 0xa215, 0xa438, 0xd30e, 0xa438, 0xd21a, ++ 0xa438, 0x1000, 0xa438, 0x0c31, 0xa438, 0xd708, 0xa438, 0x63e9, ++ 0xa438, 0xd708, 0xa438, 0x5f65, 0xa438, 0xd708, 0xa438, 0x7f36, ++ 0xa438, 0xa004, 0xa438, 0x1000, 0xa438, 0x0c35, 0xa438, 0x8004, ++ 0xa438, 0xa001, 0xa438, 0x1000, 0xa438, 0x0c35, 0xa438, 0x8001, ++ 0xa438, 0xd708, 0xa438, 0x4098, 0xa438, 0xd102, 0xa438, 0x9401, ++ 0xa438, 0xf003, 0xa438, 0xd103, 0xa438, 0xb401, 0xa438, 0x1000, ++ 0xa438, 0x0c27, 0xa438, 0xa108, 0xa438, 0x1000, 0xa438, 0x0c35, ++ 0xa438, 0x8108, 0xa438, 0x8110, 0xa438, 0x8294, 0xa438, 0xa202, ++ 0xa438, 0x1800, 0xa438, 0x0bdb, 0xa438, 0xd39c, 0xa438, 0xd210, ++ 0xa438, 0x1000, 0xa438, 0x0c31, 0xa438, 0xd708, 0xa438, 0x5fa5, ++ 0xa438, 0xd39c, 0xa438, 0xd210, 0xa438, 0x1000, 0xa438, 0x0c31, ++ 0xa438, 0xd708, 0xa438, 0x5fa5, 0xa438, 0x1000, 0xa438, 0x0c31, ++ 0xa438, 0xd708, 0xa438, 0x29b5, 0xa438, 0x840e, 0xa438, 0xd708, ++ 0xa438, 0x5f4a, 0xa438, 0x0c1f, 0xa438, 0x1014, 0xa438, 0x1000, ++ 0xa438, 0x0c31, 0xa438, 0xd709, 0xa438, 0x7fa4, 0xa438, 0x901f, ++ 0xa438, 0x1800, 0xa438, 0x0c23, 0xa438, 0xcb43, 0xa438, 0xa508, ++ 0xa438, 0xd701, 0xa438, 0x3699, 0xa438, 0x844a, 0xa438, 0xa504, ++ 0xa438, 0xa190, 0xa438, 0xa2a0, 0xa438, 0xa404, 0xa438, 0xa00a, ++ 0xa438, 0xd700, 0xa438, 0x2109, 0xa438, 0x05ea, 0xa438, 0xa402, ++ 0xa438, 0x1800, 0xa438, 0x05ea, 0xa438, 0xcb90, 0xa438, 0x0cf0, ++ 0xa438, 0x0ca0, 0xa438, 0x1800, 0xa438, 0x06db, 0xa438, 0xd1ff, ++ 0xa438, 0xd052, 0xa438, 0xa508, 0xa438, 0x8718, 0xa438, 0xa00a, ++ 0xa438, 0xa190, 0xa438, 0xa2a0, 0xa438, 0xa404, 0xa438, 0x0cf0, ++ 0xa438, 0x0c50, 0xa438, 0x1800, 0xa438, 0x09ef, 0xa438, 0x1000, ++ 0xa438, 0x0a5e, 0xa438, 0xd704, 0xa438, 0x2e70, 0xa438, 0x06da, ++ 0xa438, 0xd700, 0xa438, 0x5f55, 0xa438, 0xa90c, 0xa438, 0x1800, ++ 0xa438, 0x0645, 0xa436, 0xA10E, 0xa438, 0x0644, 0xa436, 0xA10C, ++ 0xa438, 0x09e9, 0xa436, 0xA10A, 0xa438, 0x06da, 0xa436, 0xA108, ++ 0xa438, 0x05e1, 0xa436, 0xA106, 0xa438, 0x0be4, 0xa436, 0xA104, ++ 0xa438, 0x0435, 0xa436, 0xA102, 0xa438, 0x0141, 
0xa436, 0xA100, ++ 0xa438, 0x026d, 0xa436, 0xA110, 0xa438, 0x00ff, 0xa436, 0xb87c, ++ 0xa438, 0x85fe, 0xa436, 0xb87e, 0xa438, 0xaf86, 0xa438, 0x16af, ++ 0xa438, 0x8699, 0xa438, 0xaf86, 0xa438, 0xe5af, 0xa438, 0x86f9, ++ 0xa438, 0xaf87, 0xa438, 0x7aaf, 0xa438, 0x883a, 0xa438, 0xaf88, ++ 0xa438, 0x58af, 0xa438, 0x8b6c, 0xa438, 0xd48b, 0xa438, 0x7c02, ++ 0xa438, 0x8644, 0xa438, 0x2c00, 0xa438, 0x503c, 0xa438, 0xffd6, ++ 0xa438, 0xac27, 0xa438, 0x18e1, 0xa438, 0x82fe, 0xa438, 0xad28, ++ 0xa438, 0x0cd4, 0xa438, 0x8b84, 0xa438, 0x0286, 0xa438, 0x442c, ++ 0xa438, 0x003c, 0xa438, 0xac27, 0xa438, 0x06ee, 0xa438, 0x8299, ++ 0xa438, 0x01ae, 0xa438, 0x04ee, 0xa438, 0x8299, 0xa438, 0x00af, ++ 0xa438, 0x23dc, 0xa438, 0xf9fa, 0xa438, 0xcefa, 0xa438, 0xfbef, ++ 0xa438, 0x79fb, 0xa438, 0xc4bf, 0xa438, 0x8b76, 0xa438, 0x026c, ++ 0xa438, 0x6dac, 0xa438, 0x2804, 0xa438, 0xd203, 0xa438, 0xae02, ++ 0xa438, 0xd201, 0xa438, 0xbdd8, 0xa438, 0x19d9, 0xa438, 0xef94, ++ 0xa438, 0x026c, 0xa438, 0x6d78, 0xa438, 0x03ef, 0xa438, 0x648a, ++ 0xa438, 0x0002, 0xa438, 0xbdd8, 0xa438, 0x19d9, 0xa438, 0xef94, ++ 0xa438, 0x026c, 0xa438, 0x6d78, 0xa438, 0x03ef, 0xa438, 0x7402, ++ 0xa438, 0x72cd, 0xa438, 0xac50, 0xa438, 0x02ef, 0xa438, 0x643a, ++ 0xa438, 0x019f, 0xa438, 0xe4ef, 0xa438, 0x4678, 0xa438, 0x03ac, ++ 0xa438, 0x2002, 0xa438, 0xae02, 0xa438, 0xd0ff, 0xa438, 0xffef, ++ 0xa438, 0x97ff, 0xa438, 0xfec6, 0xa438, 0xfefd, 0xa438, 0x041f, ++ 0xa438, 0x771f, 0xa438, 0x221c, 0xa438, 0x450d, 0xa438, 0x481f, ++ 0xa438, 0x00ac, 0xa438, 0x7f04, 0xa438, 0x1a94, 0xa438, 0xae08, ++ 0xa438, 0x1a94, 0xa438, 0xac7f, 0xa438, 0x03d7, 0xa438, 0x0100, ++ 0xa438, 0xef46, 0xa438, 0x0d48, 0xa438, 0x1f00, 0xa438, 0x1c45, ++ 0xa438, 0xef69, 0xa438, 0xef57, 0xa438, 0xef74, 0xa438, 0x0272, ++ 0xa438, 0xe8a7, 0xa438, 0xffff, 0xa438, 0x0d1a, 0xa438, 0x941b, ++ 0xa438, 0x979e, 0xa438, 0x072d, 0xa438, 0x0100, 0xa438, 0x1a64, ++ 0xa438, 0xef76, 0xa438, 0xef97, 0xa438, 0x0d98, 0xa438, 0xd400, ++ 0xa438, 0xff1d, 0xa438, 0x941a, 0xa438, 0x89cf, 0xa438, 0x1a75, ++ 0xa438, 0xaf74, 0xa438, 0xf9bf, 0xa438, 0x8b79, 0xa438, 0x026c, ++ 0xa438, 0x6da1, 0xa438, 0x0005, 0xa438, 0xe180, 0xa438, 0xa0ae, ++ 0xa438, 0x03e1, 0xa438, 0x80a1, 0xa438, 0xaf26, 0xa438, 0x9aac, ++ 0xa438, 0x284d, 0xa438, 0xe08f, 0xa438, 0xffef, 0xa438, 0x10c0, ++ 0xa438, 0xe08f, 0xa438, 0xfe10, 0xa438, 0x1b08, 0xa438, 0xa000, ++ 0xa438, 0x04c8, 0xa438, 0xaf40, 0xa438, 0x67c8, 0xa438, 0xbf8b, ++ 0xa438, 0x8c02, 0xa438, 0x6c4e, 0xa438, 0xc4bf, 0xa438, 0x8b8f, ++ 0xa438, 0x026c, 0xa438, 0x6def, 0xa438, 0x74e0, 0xa438, 0x830c, ++ 0xa438, 0xad20, 0xa438, 0x0302, 0xa438, 0x74ac, 0xa438, 0xccef, ++ 0xa438, 0x971b, 0xa438, 0x76ad, 0xa438, 0x5f02, 0xa438, 0xae13, ++ 0xa438, 0xef69, 0xa438, 0xef30, 0xa438, 0x1b32, 0xa438, 0xc4ef, ++ 0xa438, 0x46e4, 0xa438, 0x8ffb, 0xa438, 0xe58f, 0xa438, 0xfce7, ++ 0xa438, 0x8ffd, 0xa438, 0xcc10, 0xa438, 0x11ae, 0xa438, 0xb8d1, ++ 0xa438, 0x00a1, 0xa438, 0x1f03, 0xa438, 0xaf40, 0xa438, 0x4fbf, ++ 0xa438, 0x8b8c, 0xa438, 0x026c, 0xa438, 0x4ec4, 0xa438, 0xbf8b, ++ 0xa438, 0x8f02, 0xa438, 0x6c6d, 0xa438, 0xef74, 0xa438, 0xe083, ++ 0xa438, 0x0cad, 0xa438, 0x2003, 0xa438, 0x0274, 0xa438, 0xaccc, ++ 0xa438, 0xef97, 0xa438, 0x1b76, 0xa438, 0xad5f, 0xa438, 0x02ae, ++ 0xa438, 0x04ef, 0xa438, 0x69ef, 0xa438, 0x3111, 0xa438, 0xaed1, ++ 0xa438, 0x0287, 0xa438, 0x80af, 0xa438, 0x2293, 0xa438, 0xf8f9, ++ 0xa438, 0xfafb, 0xa438, 0xef59, 0xa438, 0xe080, 0xa438, 0x13ad, ++ 0xa438, 0x252f, 0xa438, 0xbf88, 0xa438, 0x2802, 0xa438, 0x6c6d, ++ 0xa438, 0xef64, 0xa438, 0x1f44, 0xa438, 0xe18f, 
0xa438, 0xb91b, ++ 0xa438, 0x64ad, 0xa438, 0x4f1d, 0xa438, 0xd688, 0xa438, 0x2bd7, ++ 0xa438, 0x882e, 0xa438, 0x0274, 0xa438, 0x73ad, 0xa438, 0x5008, ++ 0xa438, 0xbf88, 0xa438, 0x3102, 0xa438, 0x737c, 0xa438, 0xae03, ++ 0xa438, 0x0287, 0xa438, 0xd0bf, 0xa438, 0x882b, 0xa438, 0x0273, ++ 0xa438, 0x73e0, 0xa438, 0x824c, 0xa438, 0xf621, 0xa438, 0xe482, ++ 0xa438, 0x4cbf, 0xa438, 0x8834, 0xa438, 0x0273, 0xa438, 0x7cef, ++ 0xa438, 0x95ff, 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf8f9, ++ 0xa438, 0xfafb, 0xa438, 0xef79, 0xa438, 0xbf88, 0xa438, 0x1f02, ++ 0xa438, 0x737c, 0xa438, 0x1f22, 0xa438, 0xac32, 0xa438, 0x31ef, ++ 0xa438, 0x12bf, 0xa438, 0x8822, 0xa438, 0x026c, 0xa438, 0x4ed6, ++ 0xa438, 0x8fba, 0xa438, 0x1f33, 0xa438, 0xac3c, 0xa438, 0x1eef, ++ 0xa438, 0x13bf, 0xa438, 0x8837, 0xa438, 0x026c, 0xa438, 0x4eef, ++ 0xa438, 0x96d8, 0xa438, 0x19d9, 0xa438, 0xbf88, 0xa438, 0x2502, ++ 0xa438, 0x6c4e, 0xa438, 0xbf88, 0xa438, 0x2502, 0xa438, 0x6c4e, ++ 0xa438, 0x1616, 0xa438, 0x13ae, 0xa438, 0xdf12, 0xa438, 0xaecc, ++ 0xa438, 0xbf88, 0xa438, 0x1f02, 0xa438, 0x7373, 0xa438, 0xef97, ++ 0xa438, 0xfffe, 0xa438, 0xfdfc, 0xa438, 0x0466, 0xa438, 0xac88, ++ 0xa438, 0x54ac, 0xa438, 0x88f0, 0xa438, 0xac8a, 0xa438, 0x92ac, ++ 0xa438, 0xbadd, 0xa438, 0xac6c, 0xa438, 0xeeac, 0xa438, 0x6cff, ++ 0xa438, 0xad02, 0xa438, 0x99ac, 0xa438, 0x0030, 0xa438, 0xac88, ++ 0xa438, 0xd4c3, 0xa438, 0x5000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x00b4, 0xa438, 0xecee, ++ 0xa438, 0x8298, 0xa438, 0x00af, 0xa438, 0x1412, 0xa438, 0xf8bf, ++ 0xa438, 0x8b5d, 0xa438, 0x026c, 0xa438, 0x6d58, 0xa438, 0x03e1, ++ 0xa438, 0x8fb8, 0xa438, 0x2901, 0xa438, 0xe58f, 0xa438, 0xb8a0, ++ 0xa438, 0x0049, 0xa438, 0xef47, 0xa438, 0xe483, 0xa438, 0x02e5, ++ 0xa438, 0x8303, 0xa438, 0xbfc2, 0xa438, 0x5f1a, 0xa438, 0x95f7, ++ 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00d8, 0xa438, 0xf605, ++ 0xa438, 0x1f11, 0xa438, 0xef60, 0xa438, 0xbf8b, 0xa438, 0x3002, ++ 0xa438, 0x6c4e, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c6d, ++ 0xa438, 0xf728, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0xf628, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0x0c64, 0xa438, 0xef46, 0xa438, 0xbf8b, 0xa438, 0x6002, ++ 0xa438, 0x6c4e, 0xa438, 0x0289, 0xa438, 0x9902, 0xa438, 0x3920, ++ 0xa438, 0xaf89, 0xa438, 0x96a0, 0xa438, 0x0149, 0xa438, 0xef47, ++ 0xa438, 0xe483, 0xa438, 0x04e5, 0xa438, 0x8305, 0xa438, 0xbfc2, ++ 0xa438, 0x5f1a, 0xa438, 0x95f7, 0xa438, 0x05ee, 0xa438, 0xffd2, ++ 0xa438, 0x00d8, 0xa438, 0xf605, 0xa438, 0x1f11, 0xa438, 0xef60, ++ 0xa438, 0xbf8b, 0xa438, 0x3002, 0xa438, 0x6c4e, 0xa438, 0xbf8b, ++ 0xa438, 0x3302, 0xa438, 0x6c6d, 0xa438, 0xf729, 0xa438, 0xbf8b, ++ 0xa438, 0x3302, 0xa438, 0x6c4e, 0xa438, 0xf629, 0xa438, 0xbf8b, ++ 0xa438, 0x3302, 0xa438, 0x6c4e, 0xa438, 0x0c64, 0xa438, 0xef46, ++ 0xa438, 0xbf8b, 0xa438, 0x6302, 0xa438, 0x6c4e, 0xa438, 0x0289, ++ 0xa438, 0x9902, 0xa438, 0x3920, 0xa438, 0xaf89, 0xa438, 0x96a0, ++ 0xa438, 0x0249, 0xa438, 0xef47, 0xa438, 0xe483, 0xa438, 0x06e5, ++ 0xa438, 0x8307, 0xa438, 0xbfc2, 0xa438, 0x5f1a, 0xa438, 0x95f7, ++ 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00d8, 0xa438, 0xf605, ++ 0xa438, 0x1f11, 0xa438, 0xef60, 0xa438, 0xbf8b, 0xa438, 0x3002, ++ 0xa438, 0x6c4e, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c6d, ++ 0xa438, 0xf72a, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0xf62a, 0xa438, 0xbf8b, 0xa438, 0x3302, 0xa438, 0x6c4e, ++ 0xa438, 0x0c64, 0xa438, 0xef46, 0xa438, 0xbf8b, 
0xa438, 0x6602, ++ 0xa438, 0x6c4e, 0xa438, 0x0289, 0xa438, 0x9902, 0xa438, 0x3920, ++ 0xa438, 0xaf89, 0xa438, 0x96ef, 0xa438, 0x47e4, 0xa438, 0x8308, ++ 0xa438, 0xe583, 0xa438, 0x09bf, 0xa438, 0xc25f, 0xa438, 0x1a95, ++ 0xa438, 0xf705, 0xa438, 0xeeff, 0xa438, 0xd200, 0xa438, 0xd8f6, ++ 0xa438, 0x051f, 0xa438, 0x11ef, 0xa438, 0x60bf, 0xa438, 0x8b30, ++ 0xa438, 0x026c, 0xa438, 0x4ebf, 0xa438, 0x8b33, 0xa438, 0x026c, ++ 0xa438, 0x6df7, 0xa438, 0x2bbf, 0xa438, 0x8b33, 0xa438, 0x026c, ++ 0xa438, 0x4ef6, 0xa438, 0x2bbf, 0xa438, 0x8b33, 0xa438, 0x026c, ++ 0xa438, 0x4e0c, 0xa438, 0x64ef, 0xa438, 0x46bf, 0xa438, 0x8b69, ++ 0xa438, 0x026c, 0xa438, 0x4e02, 0xa438, 0x8999, 0xa438, 0x0239, ++ 0xa438, 0x20af, 0xa438, 0x8996, 0xa438, 0xaf39, 0xa438, 0x1ef8, ++ 0xa438, 0xf9fa, 0xa438, 0xe08f, 0xa438, 0xb838, 0xa438, 0x02ad, ++ 0xa438, 0x2702, 0xa438, 0xae03, 0xa438, 0xaf8b, 0xa438, 0x201f, ++ 0xa438, 0x66ef, 0xa438, 0x65bf, 0xa438, 0xc21f, 0xa438, 0x1a96, ++ 0xa438, 0xf705, 0xa438, 0xeeff, 0xa438, 0xd200, 0xa438, 0xdaf6, ++ 0xa438, 0x05bf, 0xa438, 0xc22f, 0xa438, 0x1a96, 0xa438, 0xf705, ++ 0xa438, 0xeeff, 0xa438, 0xd200, 0xa438, 0xdbf6, 0xa438, 0x05ef, ++ 0xa438, 0x021f, 0xa438, 0x110d, 0xa438, 0x42bf, 0xa438, 0x8b3c, ++ 0xa438, 0x026c, 0xa438, 0x4eef, 0xa438, 0x021b, 0xa438, 0x031f, ++ 0xa438, 0x110d, 0xa438, 0x42bf, 0xa438, 0x8b36, 0xa438, 0x026c, ++ 0xa438, 0x4eef, 0xa438, 0x021a, 0xa438, 0x031f, 0xa438, 0x110d, ++ 0xa438, 0x42bf, 0xa438, 0x8b39, 0xa438, 0x026c, 0xa438, 0x4ebf, ++ 0xa438, 0xc23f, 0xa438, 0x1a96, 0xa438, 0xf705, 0xa438, 0xeeff, ++ 0xa438, 0xd200, 0xa438, 0xdaf6, 0xa438, 0x05bf, 0xa438, 0xc24f, ++ 0xa438, 0x1a96, 0xa438, 0xf705, 0xa438, 0xeeff, 0xa438, 0xd200, ++ 0xa438, 0xdbf6, 0xa438, 0x05ef, 0xa438, 0x021f, 0xa438, 0x110d, ++ 0xa438, 0x42bf, 0xa438, 0x8b45, 0xa438, 0x026c, 0xa438, 0x4eef, ++ 0xa438, 0x021b, 0xa438, 0x031f, 0xa438, 0x110d, 0xa438, 0x42bf, ++ 0xa438, 0x8b3f, 0xa438, 0x026c, 0xa438, 0x4eef, 0xa438, 0x021a, ++ 0xa438, 0x031f, 0xa438, 0x110d, 0xa438, 0x42bf, 0xa438, 0x8b42, ++ 0xa438, 0x026c, 0xa438, 0x4eef, 0xa438, 0x56d0, 0xa438, 0x201f, ++ 0xa438, 0x11bf, 0xa438, 0x8b4e, 0xa438, 0x026c, 0xa438, 0x4ebf, ++ 0xa438, 0x8b48, 0xa438, 0x026c, 0xa438, 0x4ebf, 0xa438, 0x8b4b, ++ 0xa438, 0x026c, 0xa438, 0x4ee1, 0xa438, 0x8578, 0xa438, 0xef03, ++ 0xa438, 0x480a, 0xa438, 0x2805, 0xa438, 0xef20, 0xa438, 0x1b01, ++ 0xa438, 0xad27, 0xa438, 0x3f1f, 0xa438, 0x44e0, 0xa438, 0x8560, ++ 0xa438, 0xe185, 0xa438, 0x61bf, 0xa438, 0x8b51, 0xa438, 0x026c, ++ 0xa438, 0x4ee0, 0xa438, 0x8566, 0xa438, 0xe185, 0xa438, 0x67bf, ++ 0xa438, 0x8b54, 0xa438, 0x026c, 0xa438, 0x4ee0, 0xa438, 0x856c, ++ 0xa438, 0xe185, 0xa438, 0x6dbf, 0xa438, 0x8b57, 0xa438, 0x026c, ++ 0xa438, 0x4ee0, 0xa438, 0x8572, 0xa438, 0xe185, 0xa438, 0x73bf, ++ 0xa438, 0x8b5a, 0xa438, 0x026c, 0xa438, 0x4ee1, 0xa438, 0x8fb8, ++ 0xa438, 0x5900, 0xa438, 0xf728, 0xa438, 0xe58f, 0xa438, 0xb8af, ++ 0xa438, 0x8b2c, 0xa438, 0xe185, 0xa438, 0x791b, 0xa438, 0x21ad, ++ 0xa438, 0x373e, 0xa438, 0x1f44, 0xa438, 0xe085, 0xa438, 0x62e1, ++ 0xa438, 0x8563, 0xa438, 0xbf8b, 0xa438, 0x5102, 0xa438, 0x6c4e, ++ 0xa438, 0xe085, 0xa438, 0x68e1, 0xa438, 0x8569, 0xa438, 0xbf8b, ++ 0xa438, 0x5402, 0xa438, 0x6c4e, 0xa438, 0xe085, 0xa438, 0x6ee1, ++ 0xa438, 0x856f, 0xa438, 0xbf8b, 0xa438, 0x5702, 0xa438, 0x6c4e, ++ 0xa438, 0xe085, 0xa438, 0x74e1, 0xa438, 0x8575, 0xa438, 0xbf8b, ++ 0xa438, 0x5a02, 0xa438, 0x6c4e, 0xa438, 0xe18f, 0xa438, 0xb859, ++ 0xa438, 0x00f7, 0xa438, 0x28e5, 0xa438, 0x8fb8, 0xa438, 0xae4a, ++ 0xa438, 0x1f44, 0xa438, 0xe085, 0xa438, 0x64e1, 
0xa438, 0x8565, ++ 0xa438, 0xbf8b, 0xa438, 0x5102, 0xa438, 0x6c4e, 0xa438, 0xe085, ++ 0xa438, 0x6ae1, 0xa438, 0x856b, 0xa438, 0xbf8b, 0xa438, 0x5402, ++ 0xa438, 0x6c4e, 0xa438, 0xe085, 0xa438, 0x70e1, 0xa438, 0x8571, ++ 0xa438, 0xbf8b, 0xa438, 0x5702, 0xa438, 0x6c4e, 0xa438, 0xe085, ++ 0xa438, 0x76e1, 0xa438, 0x8577, 0xa438, 0xbf8b, 0xa438, 0x5a02, ++ 0xa438, 0x6c4e, 0xa438, 0xe18f, 0xa438, 0xb859, 0xa438, 0x00f7, ++ 0xa438, 0x28e5, 0xa438, 0x8fb8, 0xa438, 0xae0c, 0xa438, 0xe18f, ++ 0xa438, 0xb839, 0xa438, 0x04ac, 0xa438, 0x2f04, 0xa438, 0xee8f, ++ 0xa438, 0xb800, 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf0ac, ++ 0xa438, 0x8efc, 0xa438, 0xac8c, 0xa438, 0xf0ac, 0xa438, 0xfaf0, ++ 0xa438, 0xacf8, 0xa438, 0xf0ac, 0xa438, 0xf6f0, 0xa438, 0xad00, ++ 0xa438, 0xf0ac, 0xa438, 0xfef0, 0xa438, 0xacfc, 0xa438, 0xf0ac, ++ 0xa438, 0xf4f0, 0xa438, 0xacf2, 0xa438, 0xf0ac, 0xa438, 0xf0f0, ++ 0xa438, 0xacb0, 0xa438, 0xf0ac, 0xa438, 0xaef0, 0xa438, 0xacac, ++ 0xa438, 0xf0ac, 0xa438, 0xaaf0, 0xa438, 0xacee, 0xa438, 0xf0b0, ++ 0xa438, 0x24f0, 0xa438, 0xb0a4, 0xa438, 0xf0b1, 0xa438, 0x24f0, ++ 0xa438, 0xb1a4, 0xa438, 0xee8f, 0xa438, 0xb800, 0xa438, 0xd400, ++ 0xa438, 0x00af, 0xa438, 0x3976, 0xa438, 0x66ac, 0xa438, 0xeabb, ++ 0xa438, 0xa430, 0xa438, 0x6e50, 0xa438, 0x6e53, 0xa438, 0x6e56, ++ 0xa438, 0x6e59, 0xa438, 0x6e5c, 0xa438, 0x6e5f, 0xa438, 0x6e62, ++ 0xa438, 0x6e65, 0xa438, 0xd9ac, 0xa438, 0x70f0, 0xa438, 0xac6a, ++ 0xa436, 0xb85e, 0xa438, 0x23b7, 0xa436, 0xb860, 0xa438, 0x74db, ++ 0xa436, 0xb862, 0xa438, 0x268c, 0xa436, 0xb864, 0xa438, 0x3FE5, ++ 0xa436, 0xb886, 0xa438, 0x2250, 0xa436, 0xb888, 0xa438, 0x140e, ++ 0xa436, 0xb88a, 0xa438, 0x3696, 0xa436, 0xb88c, 0xa438, 0x3973, ++ 0xa436, 0xb838, 0xa438, 0x00ff, 0xb820, 0x0010, 0xa436, 0x8464, ++ 0xa438, 0xaf84, 0xa438, 0x7caf, 0xa438, 0x8485, 0xa438, 0xaf85, ++ 0xa438, 0x13af, 0xa438, 0x851e, 0xa438, 0xaf85, 0xa438, 0xb9af, ++ 0xa438, 0x8684, 0xa438, 0xaf87, 0xa438, 0x01af, 0xa438, 0x8701, ++ 0xa438, 0xac38, 0xa438, 0x03af, 0xa438, 0x38bb, 0xa438, 0xaf38, ++ 0xa438, 0xc302, 0xa438, 0x4618, 0xa438, 0xbf85, 0xa438, 0x0a02, ++ 0xa438, 0x54b7, 0xa438, 0xbf85, 0xa438, 0x1002, 0xa438, 0x54c0, ++ 0xa438, 0xd400, 0xa438, 0x0fbf, 0xa438, 0x8507, 0xa438, 0x024f, ++ 0xa438, 0x48bf, 0xa438, 0x8504, 0xa438, 0x024f, 0xa438, 0x6759, ++ 0xa438, 0xf0a1, 0xa438, 0x3008, 0xa438, 0xbf85, 0xa438, 0x0d02, ++ 0xa438, 0x54c0, 0xa438, 0xae06, 0xa438, 0xbf85, 0xa438, 0x0d02, ++ 0xa438, 0x54b7, 0xa438, 0xbf85, 0xa438, 0x0402, 0xa438, 0x4f67, ++ 0xa438, 0xa183, 0xa438, 0x02ae, 0xa438, 0x15a1, 0xa438, 0x8502, ++ 0xa438, 0xae10, 0xa438, 0x59f0, 0xa438, 0xa180, 0xa438, 0x16bf, ++ 0xa438, 0x8501, 0xa438, 0x024f, 0xa438, 0x67a1, 0xa438, 0x381b, ++ 0xa438, 0xae0b, 0xa438, 0xe18f, 0xa438, 0xffbf, 0xa438, 0x84fe, ++ 0xa438, 0x024f, 0xa438, 0x48ae, 0xa438, 0x17bf, 0xa438, 0x84fe, ++ 0xa438, 0x0254, 0xa438, 0xb7bf, 0xa438, 0x84fb, 0xa438, 0x0254, ++ 0xa438, 0xb7ae, 0xa438, 0x09a1, 0xa438, 0x5006, 0xa438, 0xbf84, ++ 0xa438, 0xfb02, 0xa438, 0x54c0, 0xa438, 0xaf04, 0xa438, 0x4700, ++ 0xa438, 0xad34, 0xa438, 0xfdad, 0xa438, 0x0670, 0xa438, 0xae14, ++ 0xa438, 0xf0a6, 0xa438, 0x00b8, 0xa438, 0xbd32, 0xa438, 0x30bd, ++ 0xa438, 0x30aa, 0xa438, 0xbd2c, 0xa438, 0xccbd, 0xa438, 0x2ca1, ++ 0xa438, 0x0705, 0xa438, 0xec80, 0xa438, 0xaf40, 0xa438, 0xf7af, ++ 0xa438, 0x40f5, 0xa438, 0xd101, 0xa438, 0xbf85, 0xa438, 0xa402, ++ 0xa438, 0x4f48, 0xa438, 0xbf85, 0xa438, 0xa702, 0xa438, 0x54c0, ++ 0xa438, 0xd10f, 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, ++ 0xa438, 0x024d, 0xa438, 0x6abf, 0xa438, 0x85ad, 
0xa438, 0x024f, ++ 0xa438, 0x67bf, 0xa438, 0x8ff7, 0xa438, 0xddbf, 0xa438, 0x85b0, ++ 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ff8, 0xa438, 0xddbf, ++ 0xa438, 0x85b3, 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ff9, ++ 0xa438, 0xddbf, 0xa438, 0x85b6, 0xa438, 0x024f, 0xa438, 0x67bf, ++ 0xa438, 0x8ffa, 0xa438, 0xddd1, 0xa438, 0x00bf, 0xa438, 0x85aa, ++ 0xa438, 0x024f, 0xa438, 0x4802, 0xa438, 0x4d6a, 0xa438, 0xbf85, ++ 0xa438, 0xad02, 0xa438, 0x4f67, 0xa438, 0xbf8f, 0xa438, 0xfbdd, ++ 0xa438, 0xbf85, 0xa438, 0xb002, 0xa438, 0x4f67, 0xa438, 0xbf8f, ++ 0xa438, 0xfcdd, 0xa438, 0xbf85, 0xa438, 0xb302, 0xa438, 0x4f67, ++ 0xa438, 0xbf8f, 0xa438, 0xfddd, 0xa438, 0xbf85, 0xa438, 0xb602, ++ 0xa438, 0x4f67, 0xa438, 0xbf8f, 0xa438, 0xfedd, 0xa438, 0xbf85, ++ 0xa438, 0xa702, 0xa438, 0x54b7, 0xa438, 0xbf85, 0xa438, 0xa102, ++ 0xa438, 0x54b7, 0xa438, 0xaf3c, 0xa438, 0x2066, 0xa438, 0xb800, ++ 0xa438, 0xb8bd, 0xa438, 0x30ee, 0xa438, 0xbd2c, 0xa438, 0xb8bd, ++ 0xa438, 0x7040, 0xa438, 0xbd86, 0xa438, 0xc8bd, 0xa438, 0x8640, ++ 0xa438, 0xbd88, 0xa438, 0xc8bd, 0xa438, 0x8802, 0xa438, 0x1929, ++ 0xa438, 0xa202, 0xa438, 0x02ae, 0xa438, 0x03a2, 0xa438, 0x032e, ++ 0xa438, 0xd10f, 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, ++ 0xa438, 0xe18f, 0xa438, 0xf7bf, 0xa438, 0x85ad, 0xa438, 0x024f, ++ 0xa438, 0x48e1, 0xa438, 0x8ff8, 0xa438, 0xbf85, 0xa438, 0xb002, ++ 0xa438, 0x4f48, 0xa438, 0xe18f, 0xa438, 0xf9bf, 0xa438, 0x85b3, ++ 0xa438, 0x024f, 0xa438, 0x48e1, 0xa438, 0x8ffa, 0xa438, 0xbf85, ++ 0xa438, 0xb602, 0xa438, 0x4f48, 0xa438, 0xae2c, 0xa438, 0xd100, ++ 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, 0xa438, 0xe18f, ++ 0xa438, 0xfbbf, 0xa438, 0x85ad, 0xa438, 0x024f, 0xa438, 0x48e1, ++ 0xa438, 0x8ffc, 0xa438, 0xbf85, 0xa438, 0xb002, 0xa438, 0x4f48, ++ 0xa438, 0xe18f, 0xa438, 0xfdbf, 0xa438, 0x85b3, 0xa438, 0x024f, ++ 0xa438, 0x48e1, 0xa438, 0x8ffe, 0xa438, 0xbf85, 0xa438, 0xb602, ++ 0xa438, 0x4f48, 0xa438, 0xbf86, 0xa438, 0x7e02, 0xa438, 0x4f67, ++ 0xa438, 0xa100, 0xa438, 0x02ae, 0xa438, 0x25a1, 0xa438, 0x041d, ++ 0xa438, 0xe18f, 0xa438, 0xf1bf, 0xa438, 0x8675, 0xa438, 0x024f, ++ 0xa438, 0x48e1, 0xa438, 0x8ff2, 0xa438, 0xbf86, 0xa438, 0x7802, ++ 0xa438, 0x4f48, 0xa438, 0xe18f, 0xa438, 0xf3bf, 0xa438, 0x867b, ++ 0xa438, 0x024f, 0xa438, 0x48ae, 0xa438, 0x29a1, 0xa438, 0x070b, ++ 0xa438, 0xae24, 0xa438, 0xbf86, 0xa438, 0x8102, 0xa438, 0x4f67, ++ 0xa438, 0xad28, 0xa438, 0x1be1, 0xa438, 0x8ff4, 0xa438, 0xbf86, ++ 0xa438, 0x7502, 0xa438, 0x4f48, 0xa438, 0xe18f, 0xa438, 0xf5bf, ++ 0xa438, 0x8678, 0xa438, 0x024f, 0xa438, 0x48e1, 0xa438, 0x8ff6, ++ 0xa438, 0xbf86, 0xa438, 0x7b02, 0xa438, 0x4f48, 0xa438, 0xaf09, ++ 0xa438, 0x8420, 0xa438, 0xbc32, 0xa438, 0x20bc, 0xa438, 0x3e76, ++ 0xa438, 0xbc08, 0xa438, 0xfda6, 0xa438, 0x1a00, 0xa438, 0xb64e, ++ 0xa438, 0xd101, 0xa438, 0xbf85, 0xa438, 0xa402, 0xa438, 0x4f48, ++ 0xa438, 0xbf85, 0xa438, 0xa702, 0xa438, 0x54c0, 0xa438, 0xd10f, ++ 0xa438, 0xbf85, 0xa438, 0xaa02, 0xa438, 0x4f48, 0xa438, 0x024d, ++ 0xa438, 0x6abf, 0xa438, 0x85ad, 0xa438, 0x024f, 0xa438, 0x67bf, ++ 0xa438, 0x8ff7, 0xa438, 0xddbf, 0xa438, 0x85b0, 0xa438, 0x024f, ++ 0xa438, 0x67bf, 0xa438, 0x8ff8, 0xa438, 0xddbf, 0xa438, 0x85b3, ++ 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ff9, 0xa438, 0xddbf, ++ 0xa438, 0x85b6, 0xa438, 0x024f, 0xa438, 0x67bf, 0xa438, 0x8ffa, ++ 0xa438, 0xddd1, 0xa438, 0x00bf, 0xa438, 0x85aa, 0xa438, 0x024f, ++ 0xa438, 0x4802, 0xa438, 0x4d6a, 0xa438, 0xbf85, 0xa438, 0xad02, ++ 0xa438, 0x4f67, 0xa438, 0xbf8f, 0xa438, 0xfbdd, 0xa438, 0xbf85, ++ 0xa438, 0xb002, 0xa438, 0x4f67, 0xa438, 0xbf8f, 
0xa438, 0xfcdd, ++ 0xa438, 0xbf85, 0xa438, 0xb302, 0xa438, 0x4f67, 0xa438, 0xbf8f, ++ 0xa438, 0xfddd, 0xa438, 0xbf85, 0xa438, 0xb602, 0xa438, 0x4f67, ++ 0xa438, 0xbf8f, 0xa438, 0xfedd, 0xa438, 0xbf85, 0xa438, 0xa702, ++ 0xa438, 0x54b7, 0xa438, 0xaf00, 0xa438, 0x8800, 0xa436, 0xb818, ++ 0xa438, 0x38b8, 0xa436, 0xb81a, 0xa438, 0x0444, 0xa436, 0xb81c, ++ 0xa438, 0x40ee, 0xa436, 0xb81e, 0xa438, 0x3C1A, 0xa436, 0xb850, ++ 0xa438, 0x0981, 0xa436, 0xb852, 0xa438, 0x0085, 0xa436, 0xb878, ++ 0xa438, 0xffff, 0xa436, 0xb884, 0xa438, 0xffff, 0xa436, 0xb832, ++ 0xa438, 0x003f, 0xa436, 0x0000, 0xa438, 0x0000, 0xa436, 0xB82E, ++ 0xa438, 0x0000, 0xa436, 0x8024, 0xa438, 0x0000, 0xb820, 0x0000, ++ 0xa436, 0x801E, 0xa438, 0x0021, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125b_2[] = { ++ 0xa436, 0x8024, 0xa438, 0x3701, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x801a, 0xa438, 0x1800, 0xa438, 0x803f, ++ 0xa438, 0x1800, 0xa438, 0x8045, 0xa438, 0x1800, 0xa438, 0x8067, ++ 0xa438, 0x1800, 0xa438, 0x806d, 0xa438, 0x1800, 0xa438, 0x8071, ++ 0xa438, 0x1800, 0xa438, 0x80b1, 0xa438, 0xd093, 0xa438, 0xd1c4, ++ 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd704, 0xa438, 0x5fbc, ++ 0xa438, 0xd504, 0xa438, 0xc9f1, 0xa438, 0x1800, 0xa438, 0x0fc9, ++ 0xa438, 0xbb50, 0xa438, 0xd505, 0xa438, 0xa202, 0xa438, 0xd504, ++ 0xa438, 0x8c0f, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1519, ++ 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd75e, 0xa438, 0x5fae, ++ 0xa438, 0x9b50, 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd75e, ++ 0xa438, 0x7fae, 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd707, ++ 0xa438, 0x40a7, 0xa438, 0xd719, 0xa438, 0x4071, 0xa438, 0x1800, ++ 0xa438, 0x1557, 0xa438, 0xd719, 0xa438, 0x2f70, 0xa438, 0x803b, ++ 0xa438, 0x2f73, 0xa438, 0x156a, 0xa438, 0x5e70, 0xa438, 0x1800, ++ 0xa438, 0x155d, 0xa438, 0xd505, 0xa438, 0xa202, 0xa438, 0xd500, ++ 0xa438, 0xffed, 0xa438, 0xd709, 0xa438, 0x4054, 0xa438, 0xa788, ++ 0xa438, 0xd70b, 0xa438, 0x1800, 0xa438, 0x172a, 0xa438, 0xc0c1, ++ 0xa438, 0xc0c0, 0xa438, 0xd05a, 0xa438, 0xd1ba, 0xa438, 0xd701, ++ 0xa438, 0x2529, 0xa438, 0x022a, 0xa438, 0xd0a7, 0xa438, 0xd1b9, ++ 0xa438, 0xa208, 0xa438, 0x1000, 0xa438, 0x080e, 0xa438, 0xd701, ++ 0xa438, 0x408b, 0xa438, 0x1000, 0xa438, 0x0a65, 0xa438, 0xf003, ++ 0xa438, 0x1000, 0xa438, 0x0a6b, 0xa438, 0xd701, 0xa438, 0x1000, ++ 0xa438, 0x0920, 0xa438, 0x1000, 0xa438, 0x0915, 0xa438, 0x1000, ++ 0xa438, 0x0909, 0xa438, 0x228f, 0xa438, 0x804e, 0xa438, 0x9801, ++ 0xa438, 0xd71e, 0xa438, 0x5d61, 0xa438, 0xd701, 0xa438, 0x1800, ++ 0xa438, 0x022a, 0xa438, 0x2005, 0xa438, 0x091a, 0xa438, 0x3bd9, ++ 0xa438, 0x0919, 0xa438, 0x1800, 0xa438, 0x0916, 0xa438, 0xd090, ++ 0xa438, 0xd1c9, 0xa438, 0x1800, 0xa438, 0x1064, 0xa438, 0xd096, ++ 0xa438, 0xd1a9, 0xa438, 0xd503, 0xa438, 0xa104, 0xa438, 0x0c07, ++ 0xa438, 0x0902, 0xa438, 0xd500, 0xa438, 0xbc10, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa201, 0xa438, 0x8201, 0xa438, 0xce00, ++ 0xa438, 0xd500, 0xa438, 0xc484, 0xa438, 0xd503, 0xa438, 0xcc02, ++ 0xa438, 0xcd0d, 0xa438, 0xaf01, 0xa438, 0xd500, 0xa438, 0xd703, ++ 0xa438, 0x4371, 0xa438, 0xbd08, 0xa438, 0x1000, 0xa438, 0x135c, ++ 0xa438, 0xd75e, 0xa438, 0x5fb3, 0xa438, 0xd503, 0xa438, 0xd0f5, ++ 0xa438, 0xd1c6, 0xa438, 0x0cf0, 0xa438, 0x0e50, 0xa438, 0xd704, ++ 0xa438, 0x401c, 0xa438, 0xd0f5, 0xa438, 0xd1c6, 0xa438, 0x0cf0, ++ 0xa438, 0x0ea0, 0xa438, 0x401c, 0xa438, 0xd07b, 0xa438, 0xd1c5, ++ 0xa438, 
0x8ef0, 0xa438, 0x401c, 0xa438, 0x9d08, 0xa438, 0x1000, ++ 0xa438, 0x135c, 0xa438, 0xd75e, 0xa438, 0x7fb3, 0xa438, 0x1000, ++ 0xa438, 0x135c, 0xa438, 0xd75e, 0xa438, 0x5fad, 0xa438, 0x1000, ++ 0xa438, 0x14c5, 0xa438, 0xd703, 0xa438, 0x3181, 0xa438, 0x80af, ++ 0xa438, 0x60ad, 0xa438, 0x1000, 0xa438, 0x135c, 0xa438, 0xd703, ++ 0xa438, 0x5fba, 0xa438, 0x1800, 0xa438, 0x0cc7, 0xa438, 0xa802, ++ 0xa438, 0xa301, 0xa438, 0xa801, 0xa438, 0xc004, 0xa438, 0xd710, ++ 0xa438, 0x4000, 0xa438, 0x1800, 0xa438, 0x1e79, 0xa436, 0xA026, ++ 0xa438, 0x1e78, 0xa436, 0xA024, 0xa438, 0x0c93, 0xa436, 0xA022, ++ 0xa438, 0x1062, 0xa436, 0xA020, 0xa438, 0x0915, 0xa436, 0xA006, ++ 0xa438, 0x020a, 0xa436, 0xA004, 0xa438, 0x1726, 0xa436, 0xA002, ++ 0xa438, 0x1542, 0xa436, 0xA000, 0xa438, 0x0fc7, 0xa436, 0xA008, ++ 0xa438, 0xff00, 0xa436, 0xA016, 0xa438, 0x0010, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x801d, 0xa438, 0x1800, 0xa438, 0x802c, ++ 0xa438, 0x1800, 0xa438, 0x802c, 0xa438, 0x1800, 0xa438, 0x802c, ++ 0xa438, 0x1800, 0xa438, 0x802c, 0xa438, 0x1800, 0xa438, 0x802c, ++ 0xa438, 0x1800, 0xa438, 0x802c, 0xa438, 0xd700, 0xa438, 0x6090, ++ 0xa438, 0x60d1, 0xa438, 0xc95c, 0xa438, 0xf007, 0xa438, 0x60b1, ++ 0xa438, 0xc95a, 0xa438, 0xf004, 0xa438, 0xc956, 0xa438, 0xf002, ++ 0xa438, 0xc94e, 0xa438, 0x1800, 0xa438, 0x00cd, 0xa438, 0xd700, ++ 0xa438, 0x6090, 0xa438, 0x60d1, 0xa438, 0xc95c, 0xa438, 0xf007, ++ 0xa438, 0x60b1, 0xa438, 0xc95a, 0xa438, 0xf004, 0xa438, 0xc956, ++ 0xa438, 0xf002, 0xa438, 0xc94e, 0xa438, 0x1000, 0xa438, 0x022a, ++ 0xa438, 0x1800, 0xa438, 0x0132, 0xa436, 0xA08E, 0xa438, 0xffff, ++ 0xa436, 0xA08C, 0xa438, 0xffff, 0xa436, 0xA08A, 0xa438, 0xffff, ++ 0xa436, 0xA088, 0xa438, 0xffff, 0xa436, 0xA086, 0xa438, 0xffff, ++ 0xa436, 0xA084, 0xa438, 0xffff, 0xa436, 0xA082, 0xa438, 0x012f, ++ 0xa436, 0xA080, 0xa438, 0x00cc, 0xa436, 0xA090, 0xa438, 0x0103, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8020, 0xa438, 0x1800, 0xa438, 0x802a, 0xa438, 0x1800, ++ 0xa438, 0x8035, 0xa438, 0x1800, 0xa438, 0x803c, 0xa438, 0x1800, ++ 0xa438, 0x803c, 0xa438, 0x1800, 0xa438, 0x803c, 0xa438, 0x1800, ++ 0xa438, 0x803c, 0xa438, 0xd107, 0xa438, 0xd042, 0xa438, 0xa404, ++ 0xa438, 0x1000, 0xa438, 0x09df, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x8280, 0xa438, 0xd700, 0xa438, 0x6065, 0xa438, 0xd125, ++ 0xa438, 0xf002, 0xa438, 0xd12b, 0xa438, 0xd040, 0xa438, 0x1800, ++ 0xa438, 0x077f, 0xa438, 0x0cf0, 0xa438, 0x0c50, 0xa438, 0xd104, ++ 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0aa8, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x1800, 0xa438, 0x0a2e, 0xa438, 0xcb9b, ++ 0xa438, 0xd110, 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0b7b, ++ 0xa438, 0x1000, 0xa438, 0x09df, 0xa438, 0xd700, 0xa438, 0x5fb4, ++ 0xa438, 0x1800, 0xa438, 0x081b, 0xa438, 0x1000, 0xa438, 0x09df, ++ 0xa438, 0xd704, 0xa438, 0x7fb8, 0xa438, 0xa718, 0xa438, 0x1800, ++ 0xa438, 0x074e, 0xa436, 0xA10E, 0xa438, 0xffff, 0xa436, 0xA10C, ++ 0xa438, 0xffff, 0xa436, 0xA10A, 0xa438, 0xffff, 0xa436, 0xA108, ++ 0xa438, 0xffff, 0xa436, 0xA106, 0xa438, 0x074d, 0xa436, 0xA104, ++ 0xa438, 0x0818, 0xa436, 0xA102, 0xa438, 0x0a2c, 0xa436, 0xA100, ++ 0xa438, 0x077e, 0xa436, 0xA110, 0xa438, 0x000f, 0xa436, 0xb87c, ++ 0xa438, 0x8625, 0xa436, 0xb87e, 0xa438, 0xaf86, 0xa438, 0x3daf, ++ 0xa438, 0x8689, 0xa438, 0xaf88, 0xa438, 0x69af, 0xa438, 0x8887, ++ 0xa438, 0xaf88, 0xa438, 0x9caf, 0xa438, 0x88be, 0xa438, 0xaf88, ++ 0xa438, 
0xbeaf, 0xa438, 0x88be, 0xa438, 0xbf86, 0xa438, 0x49d7, ++ 0xa438, 0x0040, 0xa438, 0x0277, 0xa438, 0x7daf, 0xa438, 0x2727, ++ 0xa438, 0x0000, 0xa438, 0x7205, 0xa438, 0x0000, 0xa438, 0x7208, ++ 0xa438, 0x0000, 0xa438, 0x71f3, 0xa438, 0x0000, 0xa438, 0x71f6, ++ 0xa438, 0x0000, 0xa438, 0x7229, 0xa438, 0x0000, 0xa438, 0x722c, ++ 0xa438, 0x0000, 0xa438, 0x7217, 0xa438, 0x0000, 0xa438, 0x721a, ++ 0xa438, 0x0000, 0xa438, 0x721d, 0xa438, 0x0000, 0xa438, 0x7211, ++ 0xa438, 0x0000, 0xa438, 0x7220, 0xa438, 0x0000, 0xa438, 0x7214, ++ 0xa438, 0x0000, 0xa438, 0x722f, 0xa438, 0x0000, 0xa438, 0x7223, ++ 0xa438, 0x0000, 0xa438, 0x7232, 0xa438, 0x0000, 0xa438, 0x7226, ++ 0xa438, 0xf8f9, 0xa438, 0xfae0, 0xa438, 0x85b3, 0xa438, 0x3802, ++ 0xa438, 0xad27, 0xa438, 0x02ae, 0xa438, 0x03af, 0xa438, 0x8830, ++ 0xa438, 0x1f66, 0xa438, 0xef65, 0xa438, 0xbfc2, 0xa438, 0x1f1a, ++ 0xa438, 0x96f7, 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00da, ++ 0xa438, 0xf605, 0xa438, 0xbfc2, 0xa438, 0x2f1a, 0xa438, 0x96f7, ++ 0xa438, 0x05ee, 0xa438, 0xffd2, 0xa438, 0x00db, 0xa438, 0xf605, ++ 0xa438, 0xef02, 0xa438, 0x1f11, 0xa438, 0x0d42, 0xa438, 0xbf88, ++ 0xa438, 0x4202, 0xa438, 0x6e7d, 0xa438, 0xef02, 0xa438, 0x1b03, ++ 0xa438, 0x1f11, 0xa438, 0x0d42, 0xa438, 0xbf88, 0xa438, 0x4502, ++ 0xa438, 0x6e7d, 0xa438, 0xef02, 0xa438, 0x1a03, 0xa438, 0x1f11, ++ 0xa438, 0x0d42, 0xa438, 0xbf88, 0xa438, 0x4802, 0xa438, 0x6e7d, ++ 0xa438, 0xbfc2, 0xa438, 0x3f1a, 0xa438, 0x96f7, 0xa438, 0x05ee, ++ 0xa438, 0xffd2, 0xa438, 0x00da, 0xa438, 0xf605, 0xa438, 0xbfc2, ++ 0xa438, 0x4f1a, 0xa438, 0x96f7, 0xa438, 0x05ee, 0xa438, 0xffd2, ++ 0xa438, 0x00db, 0xa438, 0xf605, 0xa438, 0xef02, 0xa438, 0x1f11, ++ 0xa438, 0x0d42, 0xa438, 0xbf88, 0xa438, 0x4b02, 0xa438, 0x6e7d, ++ 0xa438, 0xef02, 0xa438, 0x1b03, 0xa438, 0x1f11, 0xa438, 0x0d42, ++ 0xa438, 0xbf88, 0xa438, 0x4e02, 0xa438, 0x6e7d, 0xa438, 0xef02, ++ 0xa438, 0x1a03, 0xa438, 0x1f11, 0xa438, 0x0d42, 0xa438, 0xbf88, ++ 0xa438, 0x5102, 0xa438, 0x6e7d, 0xa438, 0xef56, 0xa438, 0xd020, ++ 0xa438, 0x1f11, 0xa438, 0xbf88, 0xa438, 0x5402, 0xa438, 0x6e7d, ++ 0xa438, 0xbf88, 0xa438, 0x5702, 0xa438, 0x6e7d, 0xa438, 0xbf88, ++ 0xa438, 0x5a02, 0xa438, 0x6e7d, 0xa438, 0xe185, 0xa438, 0xa0ef, ++ 0xa438, 0x0348, 0xa438, 0x0a28, 0xa438, 0x05ef, 0xa438, 0x201b, ++ 0xa438, 0x01ad, 0xa438, 0x2735, 0xa438, 0x1f44, 0xa438, 0xe085, ++ 0xa438, 0x88e1, 0xa438, 0x8589, 0xa438, 0xbf88, 0xa438, 0x5d02, ++ 0xa438, 0x6e7d, 0xa438, 0xe085, 0xa438, 0x8ee1, 0xa438, 0x858f, ++ 0xa438, 0xbf88, 0xa438, 0x6002, 0xa438, 0x6e7d, 0xa438, 0xe085, ++ 0xa438, 0x94e1, 0xa438, 0x8595, 0xa438, 0xbf88, 0xa438, 0x6302, ++ 0xa438, 0x6e7d, 0xa438, 0xe085, 0xa438, 0x9ae1, 0xa438, 0x859b, ++ 0xa438, 0xbf88, 0xa438, 0x6602, 0xa438, 0x6e7d, 0xa438, 0xaf88, ++ 0xa438, 0x3cbf, 0xa438, 0x883f, 0xa438, 0x026e, 0xa438, 0x9cad, ++ 0xa438, 0x2835, 0xa438, 0x1f44, 0xa438, 0xe08f, 0xa438, 0xf8e1, ++ 0xa438, 0x8ff9, 0xa438, 0xbf88, 0xa438, 0x5d02, 0xa438, 0x6e7d, ++ 0xa438, 0xe08f, 0xa438, 0xfae1, 0xa438, 0x8ffb, 0xa438, 0xbf88, ++ 0xa438, 0x6002, 0xa438, 0x6e7d, 0xa438, 0xe08f, 0xa438, 0xfce1, ++ 0xa438, 0x8ffd, 0xa438, 0xbf88, 0xa438, 0x6302, 0xa438, 0x6e7d, ++ 0xa438, 0xe08f, 0xa438, 0xfee1, 0xa438, 0x8fff, 0xa438, 0xbf88, ++ 0xa438, 0x6602, 0xa438, 0x6e7d, 0xa438, 0xaf88, 0xa438, 0x3ce1, ++ 0xa438, 0x85a1, 0xa438, 0x1b21, 0xa438, 0xad37, 0xa438, 0x341f, ++ 0xa438, 0x44e0, 0xa438, 0x858a, 0xa438, 0xe185, 0xa438, 0x8bbf, ++ 0xa438, 0x885d, 0xa438, 0x026e, 0xa438, 0x7de0, 0xa438, 0x8590, ++ 0xa438, 0xe185, 0xa438, 0x91bf, 0xa438, 0x8860, 0xa438, 0x026e, ++ 0xa438, 
0x7de0, 0xa438, 0x8596, 0xa438, 0xe185, 0xa438, 0x97bf, ++ 0xa438, 0x8863, 0xa438, 0x026e, 0xa438, 0x7de0, 0xa438, 0x859c, ++ 0xa438, 0xe185, 0xa438, 0x9dbf, 0xa438, 0x8866, 0xa438, 0x026e, ++ 0xa438, 0x7dae, 0xa438, 0x401f, 0xa438, 0x44e0, 0xa438, 0x858c, ++ 0xa438, 0xe185, 0xa438, 0x8dbf, 0xa438, 0x885d, 0xa438, 0x026e, ++ 0xa438, 0x7de0, 0xa438, 0x8592, 0xa438, 0xe185, 0xa438, 0x93bf, ++ 0xa438, 0x8860, 0xa438, 0x026e, 0xa438, 0x7de0, 0xa438, 0x8598, ++ 0xa438, 0xe185, 0xa438, 0x99bf, 0xa438, 0x8863, 0xa438, 0x026e, ++ 0xa438, 0x7de0, 0xa438, 0x859e, 0xa438, 0xe185, 0xa438, 0x9fbf, ++ 0xa438, 0x8866, 0xa438, 0x026e, 0xa438, 0x7dae, 0xa438, 0x0ce1, ++ 0xa438, 0x85b3, 0xa438, 0x3904, 0xa438, 0xac2f, 0xa438, 0x04ee, ++ 0xa438, 0x85b3, 0xa438, 0x00af, 0xa438, 0x39d9, 0xa438, 0x22ac, ++ 0xa438, 0xeaf0, 0xa438, 0xacf6, 0xa438, 0xf0ac, 0xa438, 0xfaf0, ++ 0xa438, 0xacf8, 0xa438, 0xf0ac, 0xa438, 0xfcf0, 0xa438, 0xad00, ++ 0xa438, 0xf0ac, 0xa438, 0xfef0, 0xa438, 0xacf0, 0xa438, 0xf0ac, ++ 0xa438, 0xf4f0, 0xa438, 0xacf2, 0xa438, 0xf0ac, 0xa438, 0xb0f0, ++ 0xa438, 0xacae, 0xa438, 0xf0ac, 0xa438, 0xacf0, 0xa438, 0xacaa, ++ 0xa438, 0xa100, 0xa438, 0x0ce1, 0xa438, 0x8ff7, 0xa438, 0xbf88, ++ 0xa438, 0x8402, 0xa438, 0x6e7d, 0xa438, 0xaf26, 0xa438, 0xe9e1, ++ 0xa438, 0x8ff6, 0xa438, 0xbf88, 0xa438, 0x8402, 0xa438, 0x6e7d, ++ 0xa438, 0xaf26, 0xa438, 0xf520, 0xa438, 0xac86, 0xa438, 0xbf88, ++ 0xa438, 0x3f02, 0xa438, 0x6e9c, 0xa438, 0xad28, 0xa438, 0x03af, ++ 0xa438, 0x3324, 0xa438, 0xad38, 0xa438, 0x03af, 0xa438, 0x32e6, ++ 0xa438, 0xaf32, 0xa438, 0xfbee, 0xa438, 0x826a, 0xa438, 0x0002, ++ 0xa438, 0x88a6, 0xa438, 0xaf04, 0xa438, 0x78f8, 0xa438, 0xfaef, ++ 0xa438, 0x69e0, 0xa438, 0x8015, 0xa438, 0xad20, 0xa438, 0x06bf, ++ 0xa438, 0x88bb, 0xa438, 0x0275, 0xa438, 0xb1ef, 0xa438, 0x96fe, ++ 0xa438, 0xfc04, 0xa438, 0x00b8, 0xa438, 0x7a00, 0xa436, 0xb87c, ++ 0xa438, 0x8ff6, 0xa436, 0xb87e, 0xa438, 0x0705, 0xa436, 0xb87c, ++ 0xa438, 0x8ff8, 0xa436, 0xb87e, 0xa438, 0x19cc, 0xa436, 0xb87c, ++ 0xa438, 0x8ffa, 0xa436, 0xb87e, 0xa438, 0x28e3, 0xa436, 0xb87c, ++ 0xa438, 0x8ffc, 0xa436, 0xb87e, 0xa438, 0x1047, 0xa436, 0xb87c, ++ 0xa438, 0x8ffe, 0xa436, 0xb87e, 0xa438, 0x0a45, 0xa436, 0xb85e, ++ 0xa438, 0x271E, 0xa436, 0xb860, 0xa438, 0x3846, 0xa436, 0xb862, ++ 0xa438, 0x26E6, 0xa436, 0xb864, 0xa438, 0x32E3, 0xa436, 0xb886, ++ 0xa438, 0x0474, 0xa436, 0xb888, 0xa438, 0xffff, 0xa436, 0xb88a, ++ 0xa438, 0xffff, 0xa436, 0xb88c, 0xa438, 0xffff, 0xa436, 0xb838, ++ 0xa438, 0x001f, 0xb820, 0x0010, 0xa436, 0x846e, 0xa438, 0xaf84, ++ 0xa438, 0x86af, 0xa438, 0x8690, 0xa438, 0xaf86, 0xa438, 0xa4af, ++ 0xa438, 0x8934, 0xa438, 0xaf89, 0xa438, 0x60af, 0xa438, 0x897e, ++ 0xa438, 0xaf89, 0xa438, 0xa9af, 0xa438, 0x89a9, 0xa438, 0xee82, ++ 0xa438, 0x5f00, 0xa438, 0x0284, 0xa438, 0x90af, 0xa438, 0x0441, ++ 0xa438, 0xf8e0, 0xa438, 0x8ff3, 0xa438, 0xa000, 0xa438, 0x0502, ++ 0xa438, 0x84a4, 0xa438, 0xae06, 0xa438, 0xa001, 0xa438, 0x0302, ++ 0xa438, 0x84c8, 0xa438, 0xfc04, 0xa438, 0xf8f9, 0xa438, 0xef59, ++ 0xa438, 0xe080, 0xa438, 0x15ad, 0xa438, 0x2702, 0xa438, 0xae03, ++ 0xa438, 0xaf84, 0xa438, 0xc3bf, 0xa438, 0x53ca, 0xa438, 0x0252, ++ 0xa438, 0xc8ad, 0xa438, 0x2807, 0xa438, 0x0285, 0xa438, 0x2cee, ++ 0xa438, 0x8ff3, 0xa438, 0x01ef, 0xa438, 0x95fd, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xfaef, 0xa438, 0x69bf, 0xa438, 0x53ca, ++ 0xa438, 0x0252, 0xa438, 0xc8ac, 0xa438, 0x2822, 0xa438, 0xd480, ++ 0xa438, 0x00bf, 0xa438, 0x8684, 0xa438, 0x0252, 0xa438, 0xa9bf, ++ 0xa438, 0x8687, 0xa438, 0x0252, 0xa438, 0xa9bf, 0xa438, 0x868a, ++ 0xa438, 
0x0252, 0xa438, 0xa9bf, 0xa438, 0x868d, 0xa438, 0x0252, ++ 0xa438, 0xa9ee, 0xa438, 0x8ff3, 0xa438, 0x00af, 0xa438, 0x8526, ++ 0xa438, 0xe08f, 0xa438, 0xf4e1, 0xa438, 0x8ff5, 0xa438, 0xe28f, ++ 0xa438, 0xf6e3, 0xa438, 0x8ff7, 0xa438, 0x1b45, 0xa438, 0xac27, ++ 0xa438, 0x0eee, 0xa438, 0x8ff4, 0xa438, 0x00ee, 0xa438, 0x8ff5, ++ 0xa438, 0x0002, 0xa438, 0x852c, 0xa438, 0xaf85, 0xa438, 0x26e0, ++ 0xa438, 0x8ff4, 0xa438, 0xe18f, 0xa438, 0xf52c, 0xa438, 0x0001, ++ 0xa438, 0xe48f, 0xa438, 0xf4e5, 0xa438, 0x8ff5, 0xa438, 0xef96, ++ 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf8f9, 0xa438, 0xef59, ++ 0xa438, 0xbf53, 0xa438, 0x2202, 0xa438, 0x52c8, 0xa438, 0xa18b, ++ 0xa438, 0x02ae, 0xa438, 0x03af, 0xa438, 0x85da, 0xa438, 0xbf57, ++ 0xa438, 0x7202, 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xf8e5, ++ 0xa438, 0x8ff9, 0xa438, 0xbf57, 0xa438, 0x7502, 0xa438, 0x52c8, ++ 0xa438, 0xe48f, 0xa438, 0xfae5, 0xa438, 0x8ffb, 0xa438, 0xbf57, ++ 0xa438, 0x7802, 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xfce5, ++ 0xa438, 0x8ffd, 0xa438, 0xbf57, 0xa438, 0x7b02, 0xa438, 0x52c8, ++ 0xa438, 0xe48f, 0xa438, 0xfee5, 0xa438, 0x8fff, 0xa438, 0xbf57, ++ 0xa438, 0x6c02, 0xa438, 0x52c8, 0xa438, 0xa102, 0xa438, 0x13ee, ++ 0xa438, 0x8ffc, 0xa438, 0x80ee, 0xa438, 0x8ffd, 0xa438, 0x00ee, ++ 0xa438, 0x8ffe, 0xa438, 0x80ee, 0xa438, 0x8fff, 0xa438, 0x00af, ++ 0xa438, 0x8599, 0xa438, 0xa101, 0xa438, 0x0cbf, 0xa438, 0x534c, ++ 0xa438, 0x0252, 0xa438, 0xc8a1, 0xa438, 0x0303, 0xa438, 0xaf85, ++ 0xa438, 0x77bf, 0xa438, 0x5322, 0xa438, 0x0252, 0xa438, 0xc8a1, ++ 0xa438, 0x8b02, 0xa438, 0xae03, 0xa438, 0xaf86, 0xa438, 0x64e0, ++ 0xa438, 0x8ff8, 0xa438, 0xe18f, 0xa438, 0xf9bf, 0xa438, 0x8684, ++ 0xa438, 0x0252, 0xa438, 0xa9e0, 0xa438, 0x8ffa, 0xa438, 0xe18f, ++ 0xa438, 0xfbbf, 0xa438, 0x8687, 0xa438, 0x0252, 0xa438, 0xa9e0, ++ 0xa438, 0x8ffc, 0xa438, 0xe18f, 0xa438, 0xfdbf, 0xa438, 0x868a, ++ 0xa438, 0x0252, 0xa438, 0xa9e0, 0xa438, 0x8ffe, 0xa438, 0xe18f, ++ 0xa438, 0xffbf, 0xa438, 0x868d, 0xa438, 0x0252, 0xa438, 0xa9af, ++ 0xa438, 0x867f, 0xa438, 0xbf53, 0xa438, 0x2202, 0xa438, 0x52c8, ++ 0xa438, 0xa144, 0xa438, 0x3cbf, 0xa438, 0x547b, 0xa438, 0x0252, ++ 0xa438, 0xc8e4, 0xa438, 0x8ff8, 0xa438, 0xe58f, 0xa438, 0xf9bf, ++ 0xa438, 0x547e, 0xa438, 0x0252, 0xa438, 0xc8e4, 0xa438, 0x8ffa, ++ 0xa438, 0xe58f, 0xa438, 0xfbbf, 0xa438, 0x5481, 0xa438, 0x0252, ++ 0xa438, 0xc8e4, 0xa438, 0x8ffc, 0xa438, 0xe58f, 0xa438, 0xfdbf, ++ 0xa438, 0x5484, 0xa438, 0x0252, 0xa438, 0xc8e4, 0xa438, 0x8ffe, ++ 0xa438, 0xe58f, 0xa438, 0xffbf, 0xa438, 0x5322, 0xa438, 0x0252, ++ 0xa438, 0xc8a1, 0xa438, 0x4448, 0xa438, 0xaf85, 0xa438, 0xa7bf, ++ 0xa438, 0x5322, 0xa438, 0x0252, 0xa438, 0xc8a1, 0xa438, 0x313c, ++ 0xa438, 0xbf54, 0xa438, 0x7b02, 0xa438, 0x52c8, 0xa438, 0xe48f, ++ 0xa438, 0xf8e5, 0xa438, 0x8ff9, 0xa438, 0xbf54, 0xa438, 0x7e02, ++ 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xfae5, 0xa438, 0x8ffb, ++ 0xa438, 0xbf54, 0xa438, 0x8102, 0xa438, 0x52c8, 0xa438, 0xe48f, ++ 0xa438, 0xfce5, 0xa438, 0x8ffd, 0xa438, 0xbf54, 0xa438, 0x8402, ++ 0xa438, 0x52c8, 0xa438, 0xe48f, 0xa438, 0xfee5, 0xa438, 0x8fff, ++ 0xa438, 0xbf53, 0xa438, 0x2202, 0xa438, 0x52c8, 0xa438, 0xa131, ++ 0xa438, 0x03af, 0xa438, 0x85a7, 0xa438, 0xd480, 0xa438, 0x00bf, ++ 0xa438, 0x8684, 0xa438, 0x0252, 0xa438, 0xa9bf, 0xa438, 0x8687, ++ 0xa438, 0x0252, 0xa438, 0xa9bf, 0xa438, 0x868a, 0xa438, 0x0252, ++ 0xa438, 0xa9bf, 0xa438, 0x868d, 0xa438, 0x0252, 0xa438, 0xa9ef, ++ 0xa438, 0x95fd, 0xa438, 0xfc04, 0xa438, 0xf0d1, 0xa438, 0x2af0, ++ 0xa438, 0xd12c, 0xa438, 0xf0d1, 0xa438, 0x44f0, 0xa438, 0xd146, ++ 0xa438, 
0xbf86, 0xa438, 0xa102, 0xa438, 0x52c8, 0xa438, 0xbf86, ++ 0xa438, 0xa102, 0xa438, 0x52c8, 0xa438, 0xd101, 0xa438, 0xaf06, ++ 0xa438, 0xa570, 0xa438, 0xce42, 0xa438, 0xee83, 0xa438, 0xc800, ++ 0xa438, 0x0286, 0xa438, 0xba02, 0xa438, 0x8728, 0xa438, 0x0287, ++ 0xa438, 0xbe02, 0xa438, 0x87f9, 0xa438, 0x0288, 0xa438, 0xc3af, ++ 0xa438, 0x4771, 0xa438, 0xf8f9, 0xa438, 0xfafb, 0xa438, 0xef69, ++ 0xa438, 0xfae0, 0xa438, 0x8015, 0xa438, 0xad25, 0xa438, 0x45d2, ++ 0xa438, 0x0002, 0xa438, 0x8714, 0xa438, 0xac4f, 0xa438, 0x02ae, ++ 0xa438, 0x0bef, 0xa438, 0x46f6, 0xa438, 0x273c, 0xa438, 0x0400, ++ 0xa438, 0xab26, 0xa438, 0xae30, 0xa438, 0xe08f, 0xa438, 0xe9e1, ++ 0xa438, 0x8fea, 0xa438, 0x1b46, 0xa438, 0xab26, 0xa438, 0xef32, ++ 0xa438, 0x0c31, 0xa438, 0xbf8f, 0xa438, 0xe91a, 0xa438, 0x93d8, ++ 0xa438, 0x19d9, 0xa438, 0x1b46, 0xa438, 0xab0a, 0xa438, 0x19d8, ++ 0xa438, 0x19d9, 0xa438, 0x1b46, 0xa438, 0xaa02, 0xa438, 0xae0c, ++ 0xa438, 0xbf57, 0xa438, 0x1202, 0xa438, 0x58b1, 0xa438, 0xbf57, ++ 0xa438, 0x1202, 0xa438, 0x58a8, 0xa438, 0xfeef, 0xa438, 0x96ff, ++ 0xa438, 0xfefd, 0xa438, 0xfc04, 0xa438, 0xf8fb, 0xa438, 0xef79, ++ 0xa438, 0xa200, 0xa438, 0x08bf, 0xa438, 0x892e, 0xa438, 0x0252, ++ 0xa438, 0xc8ef, 0xa438, 0x64ef, 0xa438, 0x97ff, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xfafb, 0xa438, 0xef69, 0xa438, 0xfae0, ++ 0xa438, 0x8015, 0xa438, 0xad25, 0xa438, 0x50d2, 0xa438, 0x0002, ++ 0xa438, 0x878d, 0xa438, 0xac4f, 0xa438, 0x02ae, 0xa438, 0x0bef, ++ 0xa438, 0x46f6, 0xa438, 0x273c, 0xa438, 0x1000, 0xa438, 0xab31, ++ 0xa438, 0xae29, 0xa438, 0xe08f, 0xa438, 0xede1, 0xa438, 0x8fee, ++ 0xa438, 0x1b46, 0xa438, 0xab1f, 0xa438, 0xa200, 0xa438, 0x04ef, ++ 0xa438, 0x32ae, 0xa438, 0x02d3, 0xa438, 0x010c, 0xa438, 0x31bf, ++ 0xa438, 0x8fed, 0xa438, 0x1a93, 0xa438, 0xd819, 0xa438, 0xd91b, ++ 0xa438, 0x46ab, 0xa438, 0x0e19, 0xa438, 0xd819, 0xa438, 0xd91b, ++ 0xa438, 0x46aa, 0xa438, 0x0612, 0xa438, 0xa205, 0xa438, 0xc0ae, ++ 0xa438, 0x0cbf, 0xa438, 0x5712, 0xa438, 0x0258, 0xa438, 0xb1bf, ++ 0xa438, 0x5712, 0xa438, 0x0258, 0xa438, 0xa8fe, 0xa438, 0xef96, ++ 0xa438, 0xfffe, 0xa438, 0xfdfc, 0xa438, 0x04f8, 0xa438, 0xfbef, ++ 0xa438, 0x79a2, 0xa438, 0x0005, 0xa438, 0xbf89, 0xa438, 0x1fae, ++ 0xa438, 0x1ba2, 0xa438, 0x0105, 0xa438, 0xbf89, 0xa438, 0x22ae, ++ 0xa438, 0x13a2, 0xa438, 0x0205, 0xa438, 0xbf89, 0xa438, 0x25ae, ++ 0xa438, 0x0ba2, 0xa438, 0x0305, 0xa438, 0xbf89, 0xa438, 0x28ae, ++ 0xa438, 0x03bf, 0xa438, 0x892b, 0xa438, 0x0252, 0xa438, 0xc8ef, ++ 0xa438, 0x64ef, 0xa438, 0x97ff, 0xa438, 0xfc04, 0xa438, 0xf8f9, ++ 0xa438, 0xfaef, 0xa438, 0x69fa, 0xa438, 0xe080, 0xa438, 0x15ad, ++ 0xa438, 0x2628, 0xa438, 0xe081, 0xa438, 0xabe1, 0xa438, 0x81ac, ++ 0xa438, 0xef64, 0xa438, 0xbf57, 0xa438, 0x1802, 0xa438, 0x52c8, ++ 0xa438, 0x1b46, 0xa438, 0xaa0a, 0xa438, 0xbf57, 0xa438, 0x1b02, ++ 0xa438, 0x52c8, 0xa438, 0x1b46, 0xa438, 0xab0c, 0xa438, 0xbf57, ++ 0xa438, 0x1502, 0xa438, 0x58b1, 0xa438, 0xbf57, 0xa438, 0x1502, ++ 0xa438, 0x58a8, 0xa438, 0xfeef, 0xa438, 0x96fe, 0xa438, 0xfdfc, ++ 0xa438, 0x04f8, 0xa438, 0xf9ef, 0xa438, 0x59f9, 0xa438, 0xe080, ++ 0xa438, 0x15ad, 0xa438, 0x2622, 0xa438, 0xbf53, 0xa438, 0x2202, ++ 0xa438, 0x52c8, 0xa438, 0x3972, 0xa438, 0x9e10, 0xa438, 0xe083, ++ 0xa438, 0xc9ac, 0xa438, 0x2605, 0xa438, 0x0288, 0xa438, 0x2cae, ++ 0xa438, 0x0d02, 0xa438, 0x8870, 0xa438, 0xae08, 0xa438, 0xe283, ++ 0xa438, 0xc9f6, 0xa438, 0x36e6, 0xa438, 0x83c9, 0xa438, 0xfdef, ++ 0xa438, 0x95fd, 0xa438, 0xfc04, 0xa438, 0xf8f9, 0xa438, 0xfafb, ++ 0xa438, 0xef79, 0xa438, 0xfbbf, 0xa438, 0x5718, 0xa438, 0x0252, ++ 0xa438, 
0xc8ef, 0xa438, 0x64e2, 0xa438, 0x8fe5, 0xa438, 0xe38f, ++ 0xa438, 0xe61b, 0xa438, 0x659e, 0xa438, 0x10e4, 0xa438, 0x8fe5, ++ 0xa438, 0xe58f, 0xa438, 0xe6e2, 0xa438, 0x83c9, 0xa438, 0xf636, ++ 0xa438, 0xe683, 0xa438, 0xc9ae, 0xa438, 0x13e2, 0xa438, 0x83c9, ++ 0xa438, 0xf736, 0xa438, 0xe683, 0xa438, 0xc902, 0xa438, 0x5820, ++ 0xa438, 0xef57, 0xa438, 0xe68f, 0xa438, 0xe7e7, 0xa438, 0x8fe8, ++ 0xa438, 0xffef, 0xa438, 0x97ff, 0xa438, 0xfefd, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xfafb, 0xa438, 0xef79, 0xa438, 0xfbe2, ++ 0xa438, 0x8fe7, 0xa438, 0xe38f, 0xa438, 0xe8ef, 0xa438, 0x65e2, ++ 0xa438, 0x81b8, 0xa438, 0xe381, 0xa438, 0xb9ef, 0xa438, 0x7502, ++ 0xa438, 0x583b, 0xa438, 0xac50, 0xa438, 0x1abf, 0xa438, 0x5718, ++ 0xa438, 0x0252, 0xa438, 0xc8ef, 0xa438, 0x64e2, 0xa438, 0x8fe5, ++ 0xa438, 0xe38f, 0xa438, 0xe61b, 0xa438, 0x659e, 0xa438, 0x1ce4, ++ 0xa438, 0x8fe5, 0xa438, 0xe58f, 0xa438, 0xe6ae, 0xa438, 0x0cbf, ++ 0xa438, 0x5715, 0xa438, 0x0258, 0xa438, 0xb1bf, 0xa438, 0x5715, ++ 0xa438, 0x0258, 0xa438, 0xa8e2, 0xa438, 0x83c9, 0xa438, 0xf636, ++ 0xa438, 0xe683, 0xa438, 0xc9ff, 0xa438, 0xef97, 0xa438, 0xfffe, ++ 0xa438, 0xfdfc, 0xa438, 0x04f8, 0xa438, 0xf9fa, 0xa438, 0xef69, ++ 0xa438, 0xe080, 0xa438, 0x15ad, 0xa438, 0x264b, 0xa438, 0xbf53, ++ 0xa438, 0xca02, 0xa438, 0x52c8, 0xa438, 0xad28, 0xa438, 0x42bf, ++ 0xa438, 0x8931, 0xa438, 0x0252, 0xa438, 0xc8ef, 0xa438, 0x54bf, ++ 0xa438, 0x576c, 0xa438, 0x0252, 0xa438, 0xc8a1, 0xa438, 0x001b, ++ 0xa438, 0xbf53, 0xa438, 0x4c02, 0xa438, 0x52c8, 0xa438, 0xac29, ++ 0xa438, 0x0dac, 0xa438, 0x2805, 0xa438, 0xa302, 0xa438, 0x16ae, ++ 0xa438, 0x20a3, 0xa438, 0x0311, 0xa438, 0xae1b, 0xa438, 0xa304, ++ 0xa438, 0x0cae, 0xa438, 0x16a3, 0xa438, 0x0802, 0xa438, 0xae11, ++ 0xa438, 0xa309, 0xa438, 0x02ae, 0xa438, 0x0cbf, 0xa438, 0x5715, ++ 0xa438, 0x0258, 0xa438, 0xb1bf, 0xa438, 0x5715, 0xa438, 0x0258, ++ 0xa438, 0xa8ef, 0xa438, 0x96fe, 0xa438, 0xfdfc, 0xa438, 0x04f0, ++ 0xa438, 0xa300, 0xa438, 0xf0a3, 0xa438, 0x02f0, 0xa438, 0xa304, ++ 0xa438, 0xf0a3, 0xa438, 0x06f0, 0xa438, 0xa308, 0xa438, 0xf0a2, ++ 0xa438, 0x8074, 0xa438, 0xa600, 0xa438, 0xac4f, 0xa438, 0x02ae, ++ 0xa438, 0x0bef, 0xa438, 0x46f6, 0xa438, 0x273c, 0xa438, 0x1000, ++ 0xa438, 0xab1b, 0xa438, 0xae16, 0xa438, 0xe081, 0xa438, 0xabe1, ++ 0xa438, 0x81ac, 0xa438, 0x1b46, 0xa438, 0xab0c, 0xa438, 0xac32, ++ 0xa438, 0x04ef, 0xa438, 0x32ae, 0xa438, 0x02d3, 0xa438, 0x04af, ++ 0xa438, 0x486c, 0xa438, 0xaf48, 0xa438, 0x82af, 0xa438, 0x4888, ++ 0xa438, 0xe081, 0xa438, 0x9be1, 0xa438, 0x819c, 0xa438, 0xe28f, ++ 0xa438, 0xe3ad, 0xa438, 0x3009, 0xa438, 0x1f55, 0xa438, 0xe38f, ++ 0xa438, 0xe20c, 0xa438, 0x581a, 0xa438, 0x45e4, 0xa438, 0x83a6, ++ 0xa438, 0xe583, 0xa438, 0xa7af, 0xa438, 0x2a75, 0xa438, 0xe08f, ++ 0xa438, 0xe3ad, 0xa438, 0x201c, 0xa438, 0x1f44, 0xa438, 0xe18f, ++ 0xa438, 0xe10c, 0xa438, 0x44ef, 0xa438, 0x64e0, 0xa438, 0x8232, ++ 0xa438, 0xe182, 0xa438, 0x331b, 0xa438, 0x649f, 0xa438, 0x091f, ++ 0xa438, 0x44e1, 0xa438, 0x8fe2, 0xa438, 0x0c48, 0xa438, 0x1b54, ++ 0xa438, 0xe683, 0xa438, 0xa6e7, 0xa438, 0x83a7, 0xa438, 0xaf2b, ++ 0xa438, 0xd900, 0xa436, 0xb818, 0xa438, 0x043d, 0xa436, 0xb81a, ++ 0xa438, 0x06a3, 0xa436, 0xb81c, 0xa438, 0x476d, 0xa436, 0xb81e, ++ 0xa438, 0x4852, 0xa436, 0xb850, 0xa438, 0x2A69, 0xa436, 0xb852, ++ 0xa438, 0x2BD3, 0xa436, 0xb878, 0xa438, 0xffff, 0xa436, 0xb884, ++ 0xa438, 0xffff, 0xa436, 0xb832, 0xa438, 0x003f, 0xb844, 0xffff, ++ 0xa436, 0x8fe9, 0xa438, 0x0000, 0xa436, 0x8feb, 0xa438, 0x02fe, ++ 0xa436, 0x8fed, 0xa438, 0x0019, 0xa436, 0x8fef, 0xa438, 0x0bdb, ++ 0xa436, 
0x8ff1, 0xa438, 0x0ca4, 0xa436, 0x0000, 0xa438, 0x0000, ++ 0xa436, 0xB82E, 0xa438, 0x0000, 0xa436, 0x8024, 0xa438, 0x0000, ++ 0xa436, 0x801E, 0xa438, 0x0024, 0xb820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_1[] = { ++ 0xa436, 0x8023, 0xa438, 0x3800, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x8018, 0xa438, 0x1800, 0xa438, 0x8021, ++ 0xa438, 0x1800, 0xa438, 0x8029, 0xa438, 0x1800, 0xa438, 0x8031, ++ 0xa438, 0x1800, 0xa438, 0x8035, 0xa438, 0x1800, 0xa438, 0x819c, ++ 0xa438, 0x1800, 0xa438, 0x81e9, 0xa438, 0xd711, 0xa438, 0x6081, ++ 0xa438, 0x8904, 0xa438, 0x1800, 0xa438, 0x2021, 0xa438, 0xa904, ++ 0xa438, 0x1800, 0xa438, 0x2021, 0xa438, 0xd75f, 0xa438, 0x4083, ++ 0xa438, 0xd503, 0xa438, 0xa908, 0xa438, 0x87f0, 0xa438, 0x1000, ++ 0xa438, 0x17e0, 0xa438, 0x1800, 0xa438, 0x13c3, 0xa438, 0xd707, ++ 0xa438, 0x2005, 0xa438, 0x8027, 0xa438, 0xd75e, 0xa438, 0x1800, ++ 0xa438, 0x1434, 0xa438, 0x1800, 0xa438, 0x14a5, 0xa438, 0xc504, ++ 0xa438, 0xce20, 0xa438, 0xcf01, 0xa438, 0xd70a, 0xa438, 0x4005, ++ 0xa438, 0xcf02, 0xa438, 0x1800, 0xa438, 0x1c50, 0xa438, 0xa980, ++ 0xa438, 0xd500, 0xa438, 0x1800, 0xa438, 0x14f3, 0xa438, 0xd75e, ++ 0xa438, 0x67b1, 0xa438, 0xd504, 0xa438, 0xd71e, 0xa438, 0x65bb, ++ 0xa438, 0x63da, 0xa438, 0x61f9, 0xa438, 0x0cf0, 0xa438, 0x0c10, ++ 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0470, 0xa438, 0x0cf0, ++ 0xa438, 0x0430, 0xa438, 0x0cf0, 0xa438, 0x0410, 0xa438, 0xf02a, ++ 0xa438, 0x0cf0, 0xa438, 0x0c20, 0xa438, 0xd505, 0xa438, 0x0c0f, ++ 0xa438, 0x0804, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, ++ 0xa438, 0x0470, 0xa438, 0x0cf0, 0xa438, 0x0430, 0xa438, 0x0cf0, ++ 0xa438, 0x0420, 0xa438, 0xf01c, 0xa438, 0x0cf0, 0xa438, 0x0c40, ++ 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0470, 0xa438, 0x0cf0, ++ 0xa438, 0x0450, 0xa438, 0x0cf0, 0xa438, 0x0440, 0xa438, 0xf00e, ++ 0xa438, 0x0cf0, 0xa438, 0x0c80, 0xa438, 0xd505, 0xa438, 0x0c0f, ++ 0xa438, 0x0801, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, ++ 0xa438, 0x04b0, 0xa438, 0x0cf0, 0xa438, 0x0490, 0xa438, 0x0cf0, ++ 0xa438, 0x0480, 0xa438, 0xd501, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0xc48e, 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, ++ 0xa438, 0x5faf, 0xa438, 0xd504, 0xa438, 0x8e01, 0xa438, 0x8c0f, ++ 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x17e0, 0xa438, 0xd504, ++ 0xa438, 0xd718, 0xa438, 0x4074, 0xa438, 0x6195, 0xa438, 0xf005, ++ 0xa438, 0x60f5, 0xa438, 0x0c03, 0xa438, 0x0d00, 0xa438, 0xf009, ++ 0xa438, 0x0c03, 0xa438, 0x0d01, 0xa438, 0xf006, 0xa438, 0x0c03, ++ 0xa438, 0x0d02, 0xa438, 0xf003, 0xa438, 0x0c03, 0xa438, 0x0d03, ++ 0xa438, 0xd500, 0xa438, 0xd706, 0xa438, 0x2529, 0xa438, 0x809c, ++ 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, 0xa438, 0xf00f, ++ 0xa438, 0x431a, 0xa438, 0xf021, 0xa438, 0xd718, 0xa438, 0x617b, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0x1000, 0xa438, 0x1ad1, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf020, 0xa438, 0xf053, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0x1000, 0xa438, 0x1ad1, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf023, 0xa438, 0xf067, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0x1000, 0xa438, 0x1ad1, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf026, 
0xa438, 0xf07b, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0x1000, 0xa438, 0x1ad1, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf029, 0xa438, 0xf08f, ++ 0xa438, 0x1000, 0xa438, 0x8173, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x8188, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fae, ++ 0xa438, 0xf028, 0xa438, 0x1000, 0xa438, 0x8173, 0xa438, 0x1000, ++ 0xa438, 0x1a41, 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, ++ 0xa438, 0x8188, 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, ++ 0xa438, 0x5fae, 0xa438, 0xf039, 0xa438, 0x1000, 0xa438, 0x8173, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd73e, 0xa438, 0x7fb4, ++ 0xa438, 0x1000, 0xa438, 0x8188, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf04a, 0xa438, 0x1000, ++ 0xa438, 0x8173, 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd73e, ++ 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x8188, 0xa438, 0x1000, ++ 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf05b, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac01, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a2f, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac11, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa410, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf05d, 0xa438, 0x4b98, ++ 0xa438, 0xa808, 0xa438, 0xf05a, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac02, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a2f, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac22, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa420, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf03f, 0xa438, 0x47d8, 0xa438, 0xa804, 0xa438, 0xf03c, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac04, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a2f, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac44, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa440, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a41, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf021, 0xa438, 0x4418, ++ 0xa438, 0xa802, 0xa438, 0xf01e, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac08, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a2f, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac88, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa480, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a41, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf003, 0xa438, 0x4058, 0xa438, 0xa801, 0xa438, 0x1800, ++ 0xa438, 0x16ed, 0xa438, 0xd73e, 0xa438, 0xd505, 0xa438, 0x3088, ++ 0xa438, 0x817a, 0xa438, 0x6193, 0xa438, 0x6132, 0xa438, 0x60d1, ++ 0xa438, 0x3298, 0xa438, 0x8185, 0xa438, 0xf00a, 0xa438, 0xa808, ++ 0xa438, 0xf008, 0xa438, 0xa804, 0xa438, 0xf006, 0xa438, 0xa802, ++ 0xa438, 0xf004, 0xa438, 0xa801, 0xa438, 0xf002, 0xa438, 0xa80f, ++ 0xa438, 0xd500, 0xa438, 0x0800, 0xa438, 0xd505, 0xa438, 0xd75e, ++ 0xa438, 0x6211, 0xa438, 0xd71e, 0xa438, 0x619b, 0xa438, 0x611a, ++ 0xa438, 0x6099, 
0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xf009, ++ 0xa438, 0x0c0f, 0xa438, 0x0804, 0xa438, 0xf006, 0xa438, 0x0c0f, ++ 0xa438, 0x0802, 0xa438, 0xf003, 0xa438, 0x0c0f, 0xa438, 0x0801, ++ 0xa438, 0xd500, 0xa438, 0x0800, 0xa438, 0xd500, 0xa438, 0xc48d, ++ 0xa438, 0xd504, 0xa438, 0x8d03, 0xa438, 0xd701, 0xa438, 0x4045, ++ 0xa438, 0xad02, 0xa438, 0xd504, 0xa438, 0xd706, 0xa438, 0x2529, ++ 0xa438, 0x81ad, 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, ++ 0xa438, 0xf013, 0xa438, 0x441a, 0xa438, 0xf02d, 0xa438, 0xd718, ++ 0xa438, 0x61fb, 0xa438, 0xbb01, 0xa438, 0xd75e, 0xa438, 0x6171, ++ 0xa438, 0x0cf0, 0xa438, 0x0c10, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0x0cf0, 0xa438, 0x0410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xf02a, 0xa438, 0xbb02, ++ 0xa438, 0xd75e, 0xa438, 0x6171, 0xa438, 0x0cf0, 0xa438, 0x0c20, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0420, ++ 0xa438, 0xce00, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0804, ++ 0xa438, 0xf01c, 0xa438, 0xbb04, 0xa438, 0xd75e, 0xa438, 0x6171, ++ 0xa438, 0x0cf0, 0xa438, 0x0c40, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0x0cf0, 0xa438, 0x0440, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xf00e, 0xa438, 0xbb08, ++ 0xa438, 0xd75e, 0xa438, 0x6171, 0xa438, 0x0cf0, 0xa438, 0x0c80, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x0cf0, 0xa438, 0x0480, ++ 0xa438, 0xce00, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0801, ++ 0xa438, 0xd500, 0xa438, 0x1800, 0xa438, 0x1616, 0xa436, 0xA026, ++ 0xa438, 0xffff, 0xa436, 0xA024, 0xa438, 0x15d8, 0xa436, 0xA022, ++ 0xa438, 0x161f, 0xa436, 0xA020, 0xa438, 0x14f2, 0xa436, 0xA006, ++ 0xa438, 0x1c4f, 0xa436, 0xA004, 0xa438, 0x1433, 0xa436, 0xA002, ++ 0xa438, 0x13c1, 0xa436, 0xA000, 0xa438, 0x2020, 0xa436, 0xA008, ++ 0xa438, 0x7f00, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x07f8, 0xa436, 0xA014, 0xa438, 0xd04d, 0xa438, 0x8904, ++ 0xa438, 0x813C, 0xa438, 0xA13D, 0xa438, 0xcc01, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA152, 0xa438, 0x1384, ++ 0xa436, 0xA154, 0xa438, 0x1fa8, 0xa436, 0xA156, 0xa438, 0x218B, ++ 0xa436, 0xA158, 0xa438, 0x21B8, 0xa436, 0xA15A, 0xa438, 0x021c, ++ 0xa436, 0xA15C, 0xa438, 0x3fff, 0xa436, 0xA15E, 0xa438, 0x3fff, ++ 0xa436, 0xA160, 0xa438, 0x3fff, 0xa436, 0xA150, 0xa438, 0x001f, ++ 0xa436, 0xA016, 0xa438, 0x0010, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8013, 0xa438, 0x1800, 0xa438, 0x803a, 0xa438, 0x1800, ++ 0xa438, 0x8045, 0xa438, 0x1800, 0xa438, 0x8049, 0xa438, 0x1800, ++ 0xa438, 0x804d, 0xa438, 0x1800, 0xa438, 0x8059, 0xa438, 0x1800, ++ 0xa438, 0x805d, 0xa438, 0xc2ff, 0xa438, 0x1800, 0xa438, 0x0042, ++ 0xa438, 0x1000, 0xa438, 0x02e5, 0xa438, 0x1000, 0xa438, 0x02b4, ++ 0xa438, 0xd701, 0xa438, 0x40e3, 0xa438, 0xd700, 0xa438, 0x5f6c, ++ 0xa438, 0x1000, 0xa438, 0x8021, 0xa438, 0x1800, 0xa438, 0x0073, ++ 0xa438, 0x1800, 0xa438, 0x0084, 0xa438, 0xd701, 0xa438, 0x4061, ++ 0xa438, 0xba0f, 0xa438, 0xf004, 0xa438, 0x4060, 0xa438, 0x1000, ++ 0xa438, 0x802a, 0xa438, 0xba10, 0xa438, 0x0800, 0xa438, 0xd700, ++ 0xa438, 0x60bb, 0xa438, 0x611c, 0xa438, 0x0c0f, 0xa438, 0x1a01, ++ 0xa438, 0xf00a, 0xa438, 0x60fc, 0xa438, 0x0c0f, 0xa438, 0x1a02, ++ 0xa438, 0xf006, 0xa438, 0x0c0f, 0xa438, 0x1a04, 0xa438, 0xf003, ++ 0xa438, 0x0c0f, 0xa438, 0x1a08, 0xa438, 0x0800, 0xa438, 0x0c0f, ++ 0xa438, 0x0504, 0xa438, 0xad02, 0xa438, 0x1000, 0xa438, 0x02c0, ++ 0xa438, 0xd700, 0xa438, 0x5fac, 0xa438, 0x1000, 0xa438, 0x8021, ++ 0xa438, 0x1800, 
0xa438, 0x0139, 0xa438, 0x9a1f, 0xa438, 0x8bf0, ++ 0xa438, 0x1800, 0xa438, 0x02df, 0xa438, 0x9a1f, 0xa438, 0x9910, ++ 0xa438, 0x1800, 0xa438, 0x02d7, 0xa438, 0xad02, 0xa438, 0x8d01, ++ 0xa438, 0x9a1f, 0xa438, 0x9910, 0xa438, 0x9860, 0xa438, 0xcb00, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0x85f0, 0xa438, 0xd500, ++ 0xa438, 0x1800, 0xa438, 0x015c, 0xa438, 0x8580, 0xa438, 0x8d02, ++ 0xa438, 0x1800, 0xa438, 0x018f, 0xa438, 0x0c0f, 0xa438, 0x0503, ++ 0xa438, 0xad02, 0xa438, 0x1800, 0xa438, 0x00dd, 0xa436, 0xA08E, ++ 0xa438, 0x00db, 0xa436, 0xA08C, 0xa438, 0x018e, 0xa436, 0xA08A, ++ 0xa438, 0x015a, 0xa436, 0xA088, 0xa438, 0x02d6, 0xa436, 0xA086, ++ 0xa438, 0x02de, 0xa436, 0xA084, 0xa438, 0x0137, 0xa436, 0xA082, ++ 0xa438, 0x0071, 0xa436, 0xA080, 0xa438, 0x0041, 0xa436, 0xA090, ++ 0xa438, 0x00ff, 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, ++ 0xa438, 0x1ff8, 0xa436, 0xA014, 0xa438, 0x001c, 0xa438, 0xce15, ++ 0xa438, 0xd105, 0xa438, 0xa410, 0xa438, 0x8320, 0xa438, 0xFFD7, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA164, 0xa438, 0x0260, ++ 0xa436, 0xA166, 0xa438, 0x0add, 0xa436, 0xA168, 0xa438, 0x05CC, ++ 0xa436, 0xA16A, 0xa438, 0x05C5, 0xa436, 0xA16C, 0xa438, 0x0429, ++ 0xa436, 0xA16E, 0xa438, 0x07B6, 0xa436, 0xA170, 0xa438, 0x0259, ++ 0xa436, 0xA172, 0xa438, 0x3fff, 0xa436, 0xA162, 0xa438, 0x003f, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8023, 0xa438, 0x1800, 0xa438, 0x814c, 0xa438, 0x1800, ++ 0xa438, 0x8156, 0xa438, 0x1800, 0xa438, 0x815e, 0xa438, 0x1800, ++ 0xa438, 0x8210, 0xa438, 0x1800, 0xa438, 0x8221, 0xa438, 0x1800, ++ 0xa438, 0x822f, 0xa438, 0xa801, 0xa438, 0x9308, 0xa438, 0xb201, ++ 0xa438, 0xb301, 0xa438, 0xd701, 0xa438, 0x4000, 0xa438, 0xd2ff, ++ 0xa438, 0xb302, 0xa438, 0xd200, 0xa438, 0xb201, 0xa438, 0xb309, ++ 0xa438, 0xd701, 0xa438, 0x4000, 0xa438, 0xd2ff, 0xa438, 0xb302, ++ 0xa438, 0xd200, 0xa438, 0xa800, 0xa438, 0x1800, 0xa438, 0x0031, ++ 0xa438, 0xd700, 0xa438, 0x4543, 0xa438, 0xd71f, 0xa438, 0x40fe, ++ 0xa438, 0xd1b7, 0xa438, 0xd049, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0xa220, 0xa438, 0x8501, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x0c70, 0xa438, 0x0b00, ++ 0xa438, 0x0c07, 0xa438, 0x0604, 0xa438, 0x9503, 0xa438, 0xa510, ++ 0xa438, 0xce49, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x8520, ++ 0xa438, 0xa520, 0xa438, 0xa501, 0xa438, 0xd105, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd707, 0xa438, 0x6087, ++ 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0xffe9, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x8501, 0xa438, 0xd707, 0xa438, 0x5e08, ++ 0xa438, 0x8530, 0xa438, 0xba20, 0xa438, 0xf00c, 0xa438, 0xd700, ++ 0xa438, 0x4098, 0xa438, 0xd1ef, 0xa438, 0xd047, 0xa438, 0xf003, ++ 0xa438, 0xd1db, 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0x8980, 0xa438, 0xd702, ++ 0xa438, 0x6126, 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, ++ 0xa438, 0x6060, 0xa438, 0xd702, 0xa438, 0x6077, 0xa438, 0x8410, ++ 0xa438, 0xf002, 0xa438, 0xa410, 0xa438, 0xce02, 0xa438, 0x1000, ++ 0xa438, 0x10be, 0xa438, 0xcd81, 0xa438, 0xd412, 0xa438, 0x1000, ++ 0xa438, 0x1069, 0xa438, 0xcd82, 0xa438, 0xd40e, 0xa438, 0x1000, ++ 0xa438, 0x1069, 0xa438, 0xcd83, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x5fb4, 0xa438, 0xd702, 0xa438, 0x6c26, ++ 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, 0xa438, 0x6060, ++ 0xa438, 0xd702, 0xa438, 0x6b77, 0xa438, 0xa340, 0xa438, 0x0c06, ++ 0xa438, 0x0102, 
0xa438, 0xce01, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0xa240, 0xa438, 0xa902, 0xa438, 0xa204, 0xa438, 0xa280, ++ 0xa438, 0xa364, 0xa438, 0xab02, 0xa438, 0x8380, 0xa438, 0xa00a, ++ 0xa438, 0xcd8d, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x5fb5, 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x7fb4, 0xa438, 0x9920, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd71f, 0xa438, 0x6065, 0xa438, 0x7c74, ++ 0xa438, 0xfffb, 0xa438, 0xb820, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x7fa5, 0xa438, 0x9820, 0xa438, 0xa410, ++ 0xa438, 0x8902, 0xa438, 0xa120, 0xa438, 0xa380, 0xa438, 0xce02, ++ 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x8280, 0xa438, 0xa324, ++ 0xa438, 0xab02, 0xa438, 0xa00a, 0xa438, 0x8118, 0xa438, 0x863f, ++ 0xa438, 0x87fb, 0xa438, 0xcd8e, 0xa438, 0xd193, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10a3, ++ 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0xa280, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10a3, 0xa438, 0xd706, ++ 0xa438, 0x5f78, 0xa438, 0xa210, 0xa438, 0xd700, 0xa438, 0x6083, ++ 0xa438, 0xd101, 0xa438, 0xd047, 0xa438, 0xf003, 0xa438, 0xd160, ++ 0xa438, 0xd04b, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, ++ 0xa438, 0x10a3, 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10a3, 0xa438, 0xd706, ++ 0xa438, 0x5f79, 0xa438, 0x8120, 0xa438, 0xbb20, 0xa438, 0xf04c, ++ 0xa438, 0xa00a, 0xa438, 0xa340, 0xa438, 0x0c06, 0xa438, 0x0102, ++ 0xa438, 0xa240, 0xa438, 0xa290, 0xa438, 0xa324, 0xa438, 0xab02, ++ 0xa438, 0xd13e, 0xa438, 0xd05a, 0xa438, 0xd13e, 0xa438, 0xd06b, ++ 0xa438, 0xcd84, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x6079, 0xa438, 0xd700, 0xa438, 0x5f5c, 0xa438, 0xcd8a, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, 0xa438, 0x6079, ++ 0xa438, 0xd700, 0xa438, 0x5f5d, 0xa438, 0xcd8b, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xcd8c, 0xa438, 0xd700, 0xa438, 0x6050, ++ 0xa438, 0xab04, 0xa438, 0xd700, 0xa438, 0x4083, 0xa438, 0xd160, ++ 0xa438, 0xd04b, 0xa438, 0xf003, 0xa438, 0xd193, 0xa438, 0xd047, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd700, 0xa438, 0x5fbb, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x8410, 0xa438, 0xd71f, ++ 0xa438, 0x5f94, 0xa438, 0xb920, 0xa438, 0x1000, 0xa438, 0x109e, ++ 0xa438, 0xd71f, 0xa438, 0x7fb4, 0xa438, 0x9920, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd71f, 0xa438, 0x6105, 0xa438, 0x6054, ++ 0xa438, 0xfffb, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x5fb9, 0xa438, 0xfff0, 0xa438, 0xa410, 0xa438, 0xb820, ++ 0xa438, 0xcd85, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd71f, ++ 0xa438, 0x7fa5, 0xa438, 0x9820, 0xa438, 0xbb20, 0xa438, 0xd105, ++ 0xa438, 0xd042, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd706, ++ 0xa438, 0x5fbb, 0xa438, 0x5f85, 0xa438, 0xd700, 0xa438, 0x5f5b, ++ 0xa438, 0xd700, 0xa438, 0x6090, 0xa438, 0xd700, 0xa438, 0x4043, ++ 0xa438, 0xaa20, 0xa438, 0xcd86, 0xa438, 0xd700, 0xa438, 0x6083, ++ 0xa438, 0xd1c7, 0xa438, 0xd045, 0xa438, 0xf003, 0xa438, 0xd17a, ++ 0xa438, 0xd04b, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd700, ++ 0xa438, 0x5fbb, 0xa438, 0x0c18, 0xa438, 0x0108, 0xa438, 0x0c3f, ++ 0xa438, 0x0609, 0xa438, 0x0cfb, 0xa438, 0x0729, 0xa438, 0xa308, ++ 0xa438, 0x8320, 0xa438, 0xd105, 0xa438, 0xd042, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0x1800, ++ 0xa438, 0x08f7, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, ++ 0xa438, 0x10a3, 0xa438, 0xd700, 0xa438, 0x607b, 0xa438, 0xd700, ++ 0xa438, 0x5f2b, 
0xa438, 0x1800, 0xa438, 0x0a81, 0xa438, 0xd700, ++ 0xa438, 0x40bd, 0xa438, 0xd707, 0xa438, 0x4065, 0xa438, 0x1800, ++ 0xa438, 0x1121, 0xa438, 0x1800, 0xa438, 0x1124, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f80, 0xa438, 0x9503, 0xa438, 0xd705, ++ 0xa438, 0x641d, 0xa438, 0xd704, 0xa438, 0x62b2, 0xa438, 0xd702, ++ 0xa438, 0x4116, 0xa438, 0xce15, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, ++ 0xa438, 0xa00a, 0xa438, 0xd704, 0xa438, 0x4247, 0xa438, 0xd700, ++ 0xa438, 0x3691, 0xa438, 0x8183, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa570, 0xa438, 0x9503, 0xa438, 0xf00a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xaf40, 0xa438, 0x9503, 0xa438, 0x800a, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x1108, ++ 0xa438, 0xcd64, 0xa438, 0xd704, 0xa438, 0x3398, 0xa438, 0x8203, ++ 0xa438, 0xd71f, 0xa438, 0x620e, 0xa438, 0xd704, 0xa438, 0x6096, ++ 0xa438, 0xd705, 0xa438, 0x6051, 0xa438, 0xf004, 0xa438, 0xd705, ++ 0xa438, 0x605d, 0xa438, 0xf008, 0xa438, 0xd706, 0xa438, 0x609d, ++ 0xa438, 0xd705, 0xa438, 0x405f, 0xa438, 0xf003, 0xa438, 0xd700, ++ 0xa438, 0x58fb, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xc7aa, ++ 0xa438, 0x9503, 0xa438, 0xd71f, 0xa438, 0x6d2e, 0xa438, 0xd704, ++ 0xa438, 0x6096, 0xa438, 0xd705, 0xa438, 0x6051, 0xa438, 0xf005, ++ 0xa438, 0xd705, 0xa438, 0x607d, 0xa438, 0x1800, 0xa438, 0x0cc7, ++ 0xa438, 0xd706, 0xa438, 0x60bd, 0xa438, 0xd705, 0xa438, 0x407f, ++ 0xa438, 0x1800, 0xa438, 0x0e42, 0xa438, 0xd702, 0xa438, 0x40a4, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8e20, 0xa438, 0x9503, ++ 0xa438, 0xd702, 0xa438, 0x40a5, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8e40, 0xa438, 0x9503, 0xa438, 0xd705, 0xa438, 0x659d, ++ 0xa438, 0xd704, 0xa438, 0x62b2, 0xa438, 0xd702, 0xa438, 0x4116, ++ 0xa438, 0xce15, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0xa00a, ++ 0xa438, 0xd704, 0xa438, 0x4247, 0xa438, 0xd700, 0xa438, 0x3691, ++ 0xa438, 0x81de, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa570, ++ 0xa438, 0x9503, 0xa438, 0xf00a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xaf40, 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xd706, ++ 0xa438, 0x60e4, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x0cf0, ++ 0xa438, 0x07a0, 0xa438, 0x9503, 0xa438, 0xf005, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x87f0, 0xa438, 0x9503, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x1108, 0xa438, 0xcd61, ++ 0xa438, 0xd704, 0xa438, 0x3398, 0xa438, 0x8203, 0xa438, 0xd704, ++ 0xa438, 0x6096, 0xa438, 0xd705, 0xa438, 0x6051, 0xa438, 0xf005, ++ 0xa438, 0xd705, 0xa438, 0x607d, 0xa438, 0x1800, 0xa438, 0x0cc7, ++ 0xa438, 0xd71f, 0xa438, 0x61ce, 0xa438, 0xd706, 0xa438, 0x767d, ++ 0xa438, 0xd705, 0xa438, 0x563f, 0xa438, 0x1800, 0xa438, 0x0e42, ++ 0xa438, 0x800a, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae40, ++ 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0c47, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xaf80, 0xa438, 0x9503, 0xa438, 0x1800, ++ 0xa438, 0x0b5f, 0xa438, 0x607c, 0xa438, 0x1800, 0xa438, 0x027a, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae01, 0xa438, 0x9503, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd702, 0xa438, 0x5fa3, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8e01, 0xa438, 0x9503, ++ 0xa438, 0x1800, 0xa438, 0x027d, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0xd702, 0xa438, 0x40a5, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8e40, 
0xa438, 0x9503, 0xa438, 0xd73e, 0xa438, 0x6065, ++ 0xa438, 0x1800, 0xa438, 0x0cea, 0xa438, 0x1800, 0xa438, 0x0cf4, ++ 0xa438, 0xd701, 0xa438, 0x6fd1, 0xa438, 0xd71f, 0xa438, 0x6eee, ++ 0xa438, 0xd707, 0xa438, 0x4d0f, 0xa438, 0xd73e, 0xa438, 0x4cc5, ++ 0xa438, 0xd705, 0xa438, 0x4c99, 0xa438, 0xd704, 0xa438, 0x6c57, ++ 0xa438, 0xd702, 0xa438, 0x6c11, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8c20, 0xa438, 0xa608, 0xa438, 0x9503, 0xa438, 0xa201, ++ 0xa438, 0xa804, 0xa438, 0xd704, 0xa438, 0x40a7, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xa620, 0xa438, 0x9503, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xac40, 0xa438, 0x9503, 0xa438, 0x800a, ++ 0xa438, 0x8290, 0xa438, 0x8306, 0xa438, 0x8b02, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xce00, ++ 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0xcd99, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0x1000, 0xa438, 0x10cc, 0xa438, 0xd701, ++ 0xa438, 0x69f1, 0xa438, 0xd71f, 0xa438, 0x690e, 0xa438, 0xd73e, ++ 0xa438, 0x5ee6, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x87f0, ++ 0xa438, 0x9503, 0xa438, 0xce46, 0xa438, 0x1000, 0xa438, 0x10be, ++ 0xa438, 0xa00a, 0xa438, 0xd704, 0xa438, 0x40a7, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xa570, 0xa438, 0x9503, 0xa438, 0xcd9a, ++ 0xa438, 0xd700, 0xa438, 0x6078, 0xa438, 0xd700, 0xa438, 0x609a, ++ 0xa438, 0xd109, 0xa438, 0xd074, 0xa438, 0xf003, 0xa438, 0xd109, ++ 0xa438, 0xd075, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0x1000, ++ 0xa438, 0x10cc, 0xa438, 0xd701, 0xa438, 0x65b1, 0xa438, 0xd71f, ++ 0xa438, 0x64ce, 0xa438, 0xd700, 0xa438, 0x5efe, 0xa438, 0xce00, ++ 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8608, 0xa438, 0x8c40, 0xa438, 0x9503, 0xa438, 0x8201, ++ 0xa438, 0x800a, 0xa438, 0x8290, 0xa438, 0x8306, 0xa438, 0x8b02, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xc7aa, 0xa438, 0x8570, ++ 0xa438, 0x8d08, 0xa438, 0x9503, 0xa438, 0xcd9b, 0xa438, 0x1800, ++ 0xa438, 0x0c8b, 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd705, ++ 0xa438, 0x61d9, 0xa438, 0xd704, 0xa438, 0x4193, 0xa438, 0x800a, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae40, 0xa438, 0x9503, ++ 0xa438, 0x1800, 0xa438, 0x0c47, 0xa438, 0x1800, 0xa438, 0x0df8, ++ 0xa438, 0x1800, 0xa438, 0x8339, 0xa438, 0x0800, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8d08, 0xa438, 0x8f02, 0xa438, 0x8c40, ++ 0xa438, 0x9503, 0xa438, 0x8201, 0xa438, 0xa804, 0xa438, 0xd704, ++ 0xa438, 0x40a7, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa620, ++ 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x8290, 0xa438, 0x8306, ++ 0xa438, 0x8b02, 0xa438, 0x8010, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xaa03, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0xac20, 0xa438, 0xa608, 0xa438, 0x9503, ++ 0xa438, 0xce00, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0xcd95, ++ 0xa438, 0x1000, 0xa438, 0x109e, 0xa438, 0xd701, 0xa438, 0x7b91, ++ 0xa438, 0xd71f, 0xa438, 0x7aae, 0xa438, 0xd701, 0xa438, 0x7ab0, ++ 0xa438, 0xd704, 0xa438, 0x7ef3, 0xa438, 0xd701, 0xa438, 0x5eb3, ++ 0xa438, 0x84b0, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa608, ++ 0xa438, 0xc700, 0xa438, 0x9503, 0xa438, 0xce54, 0xa438, 0x1000, ++ 0xa438, 0x10be, 0xa438, 0xa290, 0xa438, 0xa304, 0xa438, 0xab02, ++ 0xa438, 0xd700, 0xa438, 0x6050, 0xa438, 0xab04, 0xa438, 0x0c38, ++ 0xa438, 0x0608, 0xa438, 0xaa0b, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8d01, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xae40, ++ 0xa438, 0x9503, 0xa438, 0xd702, 0xa438, 0x40a4, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8e20, 0xa438, 0x9503, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 
0xa438, 0x8c20, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6078, 0xa438, 0xd700, 0xa438, 0x609a, 0xa438, 0xd109, ++ 0xa438, 0xd074, 0xa438, 0xf003, 0xa438, 0xd109, 0xa438, 0xd075, ++ 0xa438, 0xd704, 0xa438, 0x62b2, 0xa438, 0xd702, 0xa438, 0x4116, ++ 0xa438, 0xce54, 0xa438, 0x1000, 0xa438, 0x10be, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0xa00a, ++ 0xa438, 0xd704, 0xa438, 0x4247, 0xa438, 0xd700, 0xa438, 0x3691, ++ 0xa438, 0x8326, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa570, ++ 0xa438, 0x9503, 0xa438, 0xf00a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xaf40, 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0x1000, ++ 0xa438, 0x109e, 0xa438, 0xd704, 0xa438, 0x60f3, 0xa438, 0xd71f, ++ 0xa438, 0x618e, 0xa438, 0xd700, 0xa438, 0x5b5e, 0xa438, 0x1800, ++ 0xa438, 0x0deb, 0xa438, 0x800a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xae40, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0c47, ++ 0xa438, 0x1800, 0xa438, 0x0df8, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8608, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0e2b, ++ 0xa436, 0xA10E, 0xa438, 0x0d14, 0xa436, 0xA10C, 0xa438, 0x0ce8, ++ 0xa436, 0xA10A, 0xa438, 0x0279, 0xa436, 0xA108, 0xa438, 0x0b19, ++ 0xa436, 0xA106, 0xa438, 0x111f, 0xa436, 0xA104, 0xa438, 0x0a7b, ++ 0xa436, 0xA102, 0xa438, 0x0ba3, 0xa436, 0xA100, 0xa438, 0x0022, ++ 0xa436, 0xA110, 0xa438, 0x00ff, 0xa436, 0xb87c, 0xa438, 0x859b, ++ 0xa436, 0xb87e, 0xa438, 0xaf85, 0xa438, 0xb3af, 0xa438, 0x863b, ++ 0xa438, 0xaf86, 0xa438, 0x4caf, 0xa438, 0x8688, 0xa438, 0xaf86, ++ 0xa438, 0xceaf, 0xa438, 0x8744, 0xa438, 0xaf87, 0xa438, 0x68af, ++ 0xa438, 0x8781, 0xa438, 0xbf5e, 0xa438, 0x7202, 0xa438, 0x5f7e, ++ 0xa438, 0xac28, 0xa438, 0x68e1, 0xa438, 0x84e6, 0xa438, 0xad28, ++ 0xa438, 0x09bf, 0xa438, 0x5e75, 0xa438, 0x025f, 0xa438, 0x7eac, ++ 0xa438, 0x2d59, 0xa438, 0xe18f, 0xa438, 0xebad, 0xa438, 0x2809, ++ 0xa438, 0xbf5e, 0xa438, 0x7502, 0xa438, 0x5f7e, 0xa438, 0xac2e, ++ 0xa438, 0x50e1, 0xa438, 0x84e6, 0xa438, 0xac28, 0xa438, 0x08bf, ++ 0xa438, 0x873e, 0xa438, 0x025f, 0xa438, 0x3cae, 0xa438, 0x06bf, ++ 0xa438, 0x873e, 0xa438, 0x025f, 0xa438, 0x33bf, 0xa438, 0x8741, ++ 0xa438, 0x025f, 0xa438, 0x33ee, 0xa438, 0x8fea, 0xa438, 0x02e1, ++ 0xa438, 0x84e4, 0xa438, 0xad28, 0xa438, 0x14e1, 0xa438, 0x8fe8, ++ 0xa438, 0xad28, 0xa438, 0x17e1, 0xa438, 0x84e5, 0xa438, 0x11e5, ++ 0xa438, 0x84e5, 0xa438, 0xa10c, 0xa438, 0x04ee, 0xa438, 0x84e5, ++ 0xa438, 0x0002, 0xa438, 0x4977, 0xa438, 0xee84, 0xa438, 0xdc03, ++ 0xa438, 0xae1d, 0xa438, 0xe18f, 0xa438, 0xe811, 0xa438, 0xe58f, ++ 0xa438, 0xe8ae, 0xa438, 0x14bf, 0xa438, 0x873e, 0xa438, 0x025f, ++ 0xa438, 0x3cbf, 0xa438, 0x8741, 0xa438, 0x025f, 0xa438, 0x3cee, ++ 0xa438, 0x8fea, 0xa438, 0x01ee, 0xa438, 0x84e4, 0xa438, 0x00af, ++ 0xa438, 0x50c1, 0xa438, 0x1f00, 0xa438, 0xbf5a, 0xa438, 0x6102, ++ 0xa438, 0x5f5f, 0xa438, 0xbf5a, 0xa438, 0x5e02, 0xa438, 0x5f3c, ++ 0xa438, 0xaf45, 0xa438, 0x7be0, 0xa438, 0x8012, 0xa438, 0xad23, ++ 0xa438, 0x141f, 0xa438, 0x001f, 0xa438, 0x22d1, 0xa438, 0x00bf, ++ 0xa438, 0x3fcf, 0xa438, 0x0261, 0xa438, 0x3412, 0xa438, 0xa204, ++ 0xa438, 0xf6ee, 0xa438, 0x8317, 0xa438, 0x00e0, 0xa438, 0x8012, ++ 0xa438, 0xad24, 0xa438, 0x141f, 0xa438, 0x001f, 0xa438, 0x22d1, ++ 0xa438, 0x00bf, 0xa438, 0x3fd7, 0xa438, 0x0261, 0xa438, 0x3412, ++ 0xa438, 0xa204, 0xa438, 0xf6ee, 0xa438, 0x8317, 0xa438, 0x00ef, ++ 0xa438, 0x96fe, 0xa438, 0xfdfc, 0xa438, 0xaf42, 0xa438, 0x9802, ++ 0xa438, 0x56ec, 0xa438, 0xf70b, 0xa438, 0xac13, 0xa438, 0x0fbf, ++ 0xa438, 0x5e75, 
0xa438, 0x025f, 0xa438, 0x7eac, 0xa438, 0x280c, ++ 0xa438, 0xe2ff, 0xa438, 0xcfad, 0xa438, 0x32ee, 0xa438, 0x0257, ++ 0xa438, 0x05af, 0xa438, 0x00a4, 0xa438, 0x0286, 0xa438, 0xaaae, ++ 0xa438, 0xeff8, 0xa438, 0xf9ef, 0xa438, 0x5902, 0xa438, 0x1fe1, ++ 0xa438, 0xbf59, 0xa438, 0x4d02, 0xa438, 0x5f3c, 0xa438, 0xac13, ++ 0xa438, 0x09bf, 0xa438, 0x5e75, 0xa438, 0x025f, 0xa438, 0x7ea1, ++ 0xa438, 0x00f4, 0xa438, 0xbf59, 0xa438, 0x4d02, 0xa438, 0x5f33, ++ 0xa438, 0xef95, 0xa438, 0xfdfc, 0xa438, 0x04bf, 0xa438, 0x5e72, ++ 0xa438, 0x025f, 0xa438, 0x7eac, 0xa438, 0x284a, 0xa438, 0xe184, ++ 0xa438, 0xe6ad, 0xa438, 0x2809, 0xa438, 0xbf5e, 0xa438, 0x7502, ++ 0xa438, 0x5f7e, 0xa438, 0xac2d, 0xa438, 0x3be1, 0xa438, 0x8feb, ++ 0xa438, 0xad28, 0xa438, 0x09bf, 0xa438, 0x5e75, 0xa438, 0x025f, ++ 0xa438, 0x7eac, 0xa438, 0x2e32, 0xa438, 0xe184, 0xa438, 0xe6ac, ++ 0xa438, 0x2808, 0xa438, 0xbf87, 0xa438, 0x3e02, 0xa438, 0x5f3c, ++ 0xa438, 0xae06, 0xa438, 0xbf87, 0xa438, 0x3e02, 0xa438, 0x5f33, ++ 0xa438, 0xbf87, 0xa438, 0x4102, 0xa438, 0x5f33, 0xa438, 0xee8f, ++ 0xa438, 0xea04, 0xa438, 0xbf5e, 0xa438, 0x4e02, 0xa438, 0x5f7e, ++ 0xa438, 0xad28, 0xa438, 0x1f02, 0xa438, 0x4b12, 0xa438, 0xae1a, ++ 0xa438, 0xbf87, 0xa438, 0x3e02, 0xa438, 0x5f3c, 0xa438, 0xbf87, ++ 0xa438, 0x4102, 0xa438, 0x5f3c, 0xa438, 0xee8f, 0xa438, 0xea03, ++ 0xa438, 0xbf5e, 0xa438, 0x2a02, 0xa438, 0x5f33, 0xa438, 0xee84, ++ 0xa438, 0xe701, 0xa438, 0xaf4a, 0xa438, 0x7444, 0xa438, 0xac0e, ++ 0xa438, 0x55ac, 0xa438, 0x0ebf, 0xa438, 0x5e75, 0xa438, 0x025f, ++ 0xa438, 0x7ead, 0xa438, 0x2d0b, 0xa438, 0xbf5e, 0xa438, 0x36e1, ++ 0xa438, 0x8fe9, 0xa438, 0x025f, 0xa438, 0x5fae, 0xa438, 0x09bf, ++ 0xa438, 0x5e36, 0xa438, 0xe184, 0xa438, 0xe102, 0xa438, 0x5f5f, ++ 0xa438, 0xee8f, 0xa438, 0xe800, 0xa438, 0xaf49, 0xa438, 0xcdbf, ++ 0xa438, 0x595c, 0xa438, 0x025f, 0xa438, 0x7ea1, 0xa438, 0x0203, ++ 0xa438, 0xaf87, 0xa438, 0x79d1, 0xa438, 0x00af, 0xa438, 0x877c, ++ 0xa438, 0xe181, 0xa438, 0x941f, 0xa438, 0x00af, 0xa438, 0x3ff7, ++ 0xa438, 0xac4e, 0xa438, 0x06ac, 0xa438, 0x4003, 0xa438, 0xaf24, ++ 0xa438, 0x97af, 0xa438, 0x2467, 0xa436, 0xb85e, 0xa438, 0x5082, ++ 0xa436, 0xb860, 0xa438, 0x4575, 0xa436, 0xb862, 0xa438, 0x425F, ++ 0xa436, 0xb864, 0xa438, 0x0096, 0xa436, 0xb886, 0xa438, 0x4A44, ++ 0xa436, 0xb888, 0xa438, 0x49c4, 0xa436, 0xb88a, 0xa438, 0x3FF2, ++ 0xa436, 0xb88c, 0xa438, 0x245C, 0xa436, 0xb838, 0xa438, 0x00ff, ++ 0xb820, 0x0010, 0xa436, 0x843d, 0xa438, 0xaf84, 0xa438, 0xa6af, ++ 0xa438, 0x8540, 0xa438, 0xaf85, 0xa438, 0xaeaf, 0xa438, 0x85b5, ++ 0xa438, 0xaf87, 0xa438, 0x7daf, 0xa438, 0x8784, 0xa438, 0xaf87, ++ 0xa438, 0x87af, 0xa438, 0x87e5, 0xa438, 0x0066, 0xa438, 0x0a03, ++ 0xa438, 0x6607, 0xa438, 0x2666, 0xa438, 0x1c00, 0xa438, 0x660d, ++ 0xa438, 0x0166, 0xa438, 0x1004, 0xa438, 0x6616, 0xa438, 0x0566, ++ 0xa438, 0x1f06, 0xa438, 0x6a5d, 0xa438, 0x2766, 0xa438, 0x1900, ++ 0xa438, 0x6625, 0xa438, 0x2466, 0xa438, 0x2820, 0xa438, 0x662b, ++ 0xa438, 0x2466, 0xa438, 0x4600, 0xa438, 0x664c, 0xa438, 0x0166, ++ 0xa438, 0x4902, 0xa438, 0x8861, 0xa438, 0x0388, 0xa438, 0x5e05, ++ 0xa438, 0x886d, 0xa438, 0x0588, 0xa438, 0x7005, 0xa438, 0x8873, ++ 0xa438, 0x0588, 0xa438, 0x7605, 0xa438, 0x8879, 0xa438, 0x0588, ++ 0xa438, 0x7c05, 0xa438, 0x887f, 0xa438, 0x0588, 0xa438, 0x8205, ++ 0xa438, 0x8885, 0xa438, 0x0588, 0xa438, 0x881e, 0xa438, 0x13ad, ++ 0xa438, 0x2841, 0xa438, 0xbf64, 0xa438, 0xf102, 0xa438, 0x6b9d, ++ 0xa438, 0xad28, 0xa438, 0x03af, 0xa438, 0x15fc, 0xa438, 0xbf65, ++ 0xa438, 0xcb02, 0xa438, 0x6b9d, 0xa438, 0x0d11, 0xa438, 0xf62f, ++ 0xa438, 0xef31, 
0xa438, 0xd202, 0xa438, 0xbf88, 0xa438, 0x6402, ++ 0xa438, 0x6b52, 0xa438, 0xe082, 0xa438, 0x020d, 0xa438, 0x01f6, ++ 0xa438, 0x271b, 0xa438, 0x03aa, 0xa438, 0x0182, 0xa438, 0xe082, ++ 0xa438, 0x010d, 0xa438, 0x01f6, 0xa438, 0x271b, 0xa438, 0x03aa, ++ 0xa438, 0x0782, 0xa438, 0xbf88, 0xa438, 0x6402, 0xa438, 0x6b5b, ++ 0xa438, 0xaf15, 0xa438, 0xf9bf, 0xa438, 0x65cb, 0xa438, 0x026b, ++ 0xa438, 0x9d0d, 0xa438, 0x11f6, 0xa438, 0x2fef, 0xa438, 0x31e0, ++ 0xa438, 0x8ff7, 0xa438, 0x0d01, 0xa438, 0xf627, 0xa438, 0x1b03, ++ 0xa438, 0xaa20, 0xa438, 0xe18f, 0xa438, 0xf4d0, 0xa438, 0x00bf, ++ 0xa438, 0x6587, 0xa438, 0x026b, 0xa438, 0x7ee1, 0xa438, 0x8ff5, ++ 0xa438, 0xbf65, 0xa438, 0x8a02, 0xa438, 0x6b7e, 0xa438, 0xe18f, ++ 0xa438, 0xf6bf, 0xa438, 0x6584, 0xa438, 0x026b, 0xa438, 0x7eaf, ++ 0xa438, 0x15fc, 0xa438, 0xe18f, 0xa438, 0xf1d0, 0xa438, 0x00bf, ++ 0xa438, 0x6587, 0xa438, 0x026b, 0xa438, 0x7ee1, 0xa438, 0x8ff2, ++ 0xa438, 0xbf65, 0xa438, 0x8a02, 0xa438, 0x6b7e, 0xa438, 0xe18f, ++ 0xa438, 0xf3bf, 0xa438, 0x6584, 0xa438, 0xaf15, 0xa438, 0xfcd1, ++ 0xa438, 0x07bf, 0xa438, 0x65ce, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x0cbf, 0xa438, 0x65d1, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x03bf, 0xa438, 0x885e, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x05bf, 0xa438, 0x8867, 0xa438, 0x026b, 0xa438, 0x7ed1, ++ 0xa438, 0x07bf, 0xa438, 0x886a, 0xa438, 0x026b, 0xa438, 0x7ebf, ++ 0xa438, 0x6a6c, 0xa438, 0x026b, 0xa438, 0x5b02, 0xa438, 0x62b5, ++ 0xa438, 0xbf6a, 0xa438, 0x0002, 0xa438, 0x6b5b, 0xa438, 0xbf64, ++ 0xa438, 0x4e02, 0xa438, 0x6b9d, 0xa438, 0xac28, 0xa438, 0x0bbf, ++ 0xa438, 0x6412, 0xa438, 0x026b, 0xa438, 0x9da1, 0xa438, 0x0502, ++ 0xa438, 0xaeec, 0xa438, 0xd104, 0xa438, 0xbf65, 0xa438, 0xce02, ++ 0xa438, 0x6b7e, 0xa438, 0xd104, 0xa438, 0xbf65, 0xa438, 0xd102, ++ 0xa438, 0x6b7e, 0xa438, 0xd102, 0xa438, 0xbf88, 0xa438, 0x6702, ++ 0xa438, 0x6b7e, 0xa438, 0xd104, 0xa438, 0xbf88, 0xa438, 0x6a02, ++ 0xa438, 0x6b7e, 0xa438, 0xaf62, 0xa438, 0x72f6, 0xa438, 0x0af6, ++ 0xa438, 0x09af, 0xa438, 0x34e3, 0xa438, 0x0285, 0xa438, 0xbe02, ++ 0xa438, 0x106c, 0xa438, 0xaf10, 0xa438, 0x6bf8, 0xa438, 0xfaef, ++ 0xa438, 0x69e0, 0xa438, 0x804c, 0xa438, 0xac25, 0xa438, 0x17e0, ++ 0xa438, 0x8040, 0xa438, 0xad25, 0xa438, 0x1a02, 0xa438, 0x85ed, ++ 0xa438, 0xe080, 0xa438, 0x40ac, 0xa438, 0x2511, 0xa438, 0xbf87, ++ 0xa438, 0x6502, 0xa438, 0x6b5b, 0xa438, 0xae09, 0xa438, 0x0287, ++ 0xa438, 0x2402, 0xa438, 0x875a, 0xa438, 0x0287, 0xa438, 0x4fef, ++ 0xa438, 0x96fe, 0xa438, 0xfc04, 0xa438, 0xf8e0, 0xa438, 0x8019, ++ 0xa438, 0xad20, 0xa438, 0x11e0, 0xa438, 0x8fe3, 0xa438, 0xac20, ++ 0xa438, 0x0502, 0xa438, 0x860a, 0xa438, 0xae03, 0xa438, 0x0286, ++ 0xa438, 0x7802, 0xa438, 0x86c1, 0xa438, 0x0287, 0xa438, 0x4ffc, ++ 0xa438, 0x04f8, 0xa438, 0xf9ef, 0xa438, 0x79fb, 0xa438, 0xbf87, ++ 0xa438, 0x6802, 0xa438, 0x6b9d, 0xa438, 0x5c20, 0xa438, 0x000d, ++ 0xa438, 0x4da1, 0xa438, 0x0151, 0xa438, 0xbf87, 0xa438, 0x6802, ++ 0xa438, 0x6b9d, 0xa438, 0x5c07, 0xa438, 0xffe3, 0xa438, 0x8fe4, ++ 0xa438, 0x1b31, 0xa438, 0x9f41, 0xa438, 0x0d48, 0xa438, 0xe38f, ++ 0xa438, 0xe51b, 0xa438, 0x319f, 0xa438, 0x38bf, 0xa438, 0x876b, ++ 0xa438, 0x026b, 0xa438, 0x9d5c, 0xa438, 0x07ff, 0xa438, 0xe38f, ++ 0xa438, 0xe61b, 0xa438, 0x319f, 0xa438, 0x280d, 0xa438, 0x48e3, ++ 0xa438, 0x8fe7, 0xa438, 0x1b31, 0xa438, 0x9f1f, 0xa438, 0xbf87, ++ 0xa438, 0x6e02, 0xa438, 0x6b9d, 0xa438, 0x5c07, 0xa438, 0xffe3, ++ 0xa438, 0x8fe8, 0xa438, 0x1b31, 0xa438, 0x9f0f, 0xa438, 0x0d48, ++ 0xa438, 0xe38f, 0xa438, 0xe91b, 0xa438, 0x319f, 0xa438, 0x06ee, ++ 0xa438, 0x8fe3, 
0xa438, 0x01ae, 0xa438, 0x04ee, 0xa438, 0x8fe3, ++ 0xa438, 0x00ff, 0xa438, 0xef97, 0xa438, 0xfdfc, 0xa438, 0x04f8, ++ 0xa438, 0xf9ef, 0xa438, 0x79fb, 0xa438, 0xbf87, 0xa438, 0x6802, ++ 0xa438, 0x6b9d, 0xa438, 0x5c20, 0xa438, 0x000d, 0xa438, 0x4da1, ++ 0xa438, 0x0020, 0xa438, 0xbf87, 0xa438, 0x6802, 0xa438, 0x6b9d, ++ 0xa438, 0x5c06, 0xa438, 0x000d, 0xa438, 0x49e3, 0xa438, 0x8fea, ++ 0xa438, 0x1b31, 0xa438, 0x9f0e, 0xa438, 0xbf87, 0xa438, 0x7102, ++ 0xa438, 0x6b5b, 0xa438, 0xbf87, 0xa438, 0x7702, 0xa438, 0x6b5b, ++ 0xa438, 0xae0c, 0xa438, 0xbf87, 0xa438, 0x7102, 0xa438, 0x6b52, ++ 0xa438, 0xbf87, 0xa438, 0x7702, 0xa438, 0x6b52, 0xa438, 0xee8f, ++ 0xa438, 0xe300, 0xa438, 0xffef, 0xa438, 0x97fd, 0xa438, 0xfc04, ++ 0xa438, 0xf8f9, 0xa438, 0xef79, 0xa438, 0xfbbf, 0xa438, 0x8768, ++ 0xa438, 0x026b, 0xa438, 0x9d5c, 0xa438, 0x2000, 0xa438, 0x0d4d, ++ 0xa438, 0xa101, 0xa438, 0x4abf, 0xa438, 0x8768, 0xa438, 0x026b, ++ 0xa438, 0x9d5c, 0xa438, 0x07ff, 0xa438, 0xe38f, 0xa438, 0xeb1b, ++ 0xa438, 0x319f, 0xa438, 0x3a0d, 0xa438, 0x48e3, 0xa438, 0x8fec, ++ 0xa438, 0x1b31, 0xa438, 0x9f31, 0xa438, 0xbf87, 0xa438, 0x6b02, ++ 0xa438, 0x6b9d, 0xa438, 0xe38f, 0xa438, 0xed1b, 0xa438, 0x319f, ++ 0xa438, 0x240d, 0xa438, 0x48e3, 0xa438, 0x8fee, 0xa438, 0x1b31, ++ 0xa438, 0x9f1b, 0xa438, 0xbf87, 0xa438, 0x6e02, 0xa438, 0x6b9d, ++ 0xa438, 0xe38f, 0xa438, 0xef1b, 0xa438, 0x319f, 0xa438, 0x0ebf, ++ 0xa438, 0x8774, 0xa438, 0x026b, 0xa438, 0x5bbf, 0xa438, 0x877a, ++ 0xa438, 0x026b, 0xa438, 0x5bae, 0xa438, 0x00ff, 0xa438, 0xef97, ++ 0xa438, 0xfdfc, 0xa438, 0x04f8, 0xa438, 0xef79, 0xa438, 0xfbe0, ++ 0xa438, 0x8019, 0xa438, 0xad20, 0xa438, 0x1cee, 0xa438, 0x8fe3, ++ 0xa438, 0x00bf, 0xa438, 0x8771, 0xa438, 0x026b, 0xa438, 0x52bf, ++ 0xa438, 0x8777, 0xa438, 0x026b, 0xa438, 0x52bf, 0xa438, 0x8774, ++ 0xa438, 0x026b, 0xa438, 0x52bf, 0xa438, 0x877a, 0xa438, 0x026b, ++ 0xa438, 0x52ff, 0xa438, 0xef97, 0xa438, 0xfc04, 0xa438, 0xf8e0, ++ 0xa438, 0x8040, 0xa438, 0xf625, 0xa438, 0xe480, 0xa438, 0x40fc, ++ 0xa438, 0x04f8, 0xa438, 0xe080, 0xa438, 0x4cf6, 0xa438, 0x25e4, ++ 0xa438, 0x804c, 0xa438, 0xfc04, 0xa438, 0x55a4, 0xa438, 0xbaf0, ++ 0xa438, 0xa64a, 0xa438, 0xf0a6, 0xa438, 0x4cf0, 0xa438, 0xa64e, ++ 0xa438, 0x66a4, 0xa438, 0xb655, 0xa438, 0xa4b6, 0xa438, 0x00ac, ++ 0xa438, 0x0e66, 0xa438, 0xac0e, 0xa438, 0xee80, 0xa438, 0x4c3a, ++ 0xa438, 0xaf07, 0xa438, 0xd0af, 0xa438, 0x26d0, 0xa438, 0xa201, ++ 0xa438, 0x0ebf, 0xa438, 0x663d, 0xa438, 0x026b, 0xa438, 0x52bf, ++ 0xa438, 0x6643, 0xa438, 0x026b, 0xa438, 0x52ae, 0xa438, 0x11bf, ++ 0xa438, 0x6643, 0xa438, 0x026b, 0xa438, 0x5bd4, 0xa438, 0x0054, ++ 0xa438, 0xb4fe, 0xa438, 0xbf66, 0xa438, 0x3d02, 0xa438, 0x6b5b, ++ 0xa438, 0xd300, 0xa438, 0x020d, 0xa438, 0xf6a2, 0xa438, 0x0405, ++ 0xa438, 0xe081, 0xa438, 0x47ae, 0xa438, 0x03e0, 0xa438, 0x8148, ++ 0xa438, 0xac23, 0xa438, 0x02ae, 0xa438, 0x0268, 0xa438, 0xf01a, ++ 0xa438, 0x10ad, 0xa438, 0x2f04, 0xa438, 0xd100, 0xa438, 0xae05, ++ 0xa438, 0xad2c, 0xa438, 0x02d1, 0xa438, 0x0f1f, 0xa438, 0x00a2, ++ 0xa438, 0x0407, 0xa438, 0x3908, 0xa438, 0xad2f, 0xa438, 0x02d1, ++ 0xa438, 0x0002, 0xa438, 0x0e1c, 0xa438, 0x2b01, 0xa438, 0xad3a, ++ 0xa438, 0xc9af, 0xa438, 0x0dee, 0xa438, 0xa000, 0xa438, 0x2702, ++ 0xa438, 0x1beb, 0xa438, 0xe18f, 0xa438, 0xe1ac, 0xa438, 0x2819, ++ 0xa438, 0xee8f, 0xa438, 0xe101, 0xa438, 0x1f44, 0xa438, 0xbf65, ++ 0xa438, 0x9302, 0xa438, 0x6b9d, 0xa438, 0xe58f, 0xa438, 0xe21f, ++ 0xa438, 0x44d1, 0xa438, 0x02bf, 0xa438, 0x6593, 0xa438, 0x026b, ++ 0xa438, 0x7ee0, 0xa438, 0x82b1, 0xa438, 0xae49, 0xa438, 0xa001, ++ 0xa438, 0x0502, 
0xa438, 0x1c4d, 0xa438, 0xae41, 0xa438, 0xa002, ++ 0xa438, 0x0502, 0xa438, 0x1c90, 0xa438, 0xae39, 0xa438, 0xa003, ++ 0xa438, 0x0502, 0xa438, 0x1c9d, 0xa438, 0xae31, 0xa438, 0xa004, ++ 0xa438, 0x0502, 0xa438, 0x1cbc, 0xa438, 0xae29, 0xa438, 0xa005, ++ 0xa438, 0x1e02, 0xa438, 0x1cc9, 0xa438, 0xe080, 0xa438, 0xdfac, ++ 0xa438, 0x2013, 0xa438, 0xac21, 0xa438, 0x10ac, 0xa438, 0x220d, ++ 0xa438, 0xe18f, 0xa438, 0xe2bf, 0xa438, 0x6593, 0xa438, 0x026b, ++ 0xa438, 0x7eee, 0xa438, 0x8fe1, 0xa438, 0x00ae, 0xa438, 0x08a0, ++ 0xa438, 0x0605, 0xa438, 0x021d, 0xa438, 0x07ae, 0xa438, 0x00e0, ++ 0xa438, 0x82b1, 0xa438, 0xaf1b, 0xa438, 0xe910, 0xa438, 0xbf4a, ++ 0xa438, 0x99bf, 0xa438, 0x4a00, 0xa438, 0xa86a, 0xa438, 0xfdad, ++ 0xa438, 0x5eca, 0xa438, 0xad5e, 0xa438, 0x88bd, 0xa438, 0x2c99, ++ 0xa438, 0xbd2c, 0xa438, 0x33bd, 0xa438, 0x3222, 0xa438, 0xbd32, ++ 0xa438, 0x11bd, 0xa438, 0x3200, 0xa438, 0xbd32, 0xa438, 0x77bd, ++ 0xa438, 0x3266, 0xa438, 0xbd32, 0xa438, 0x55bd, 0xa438, 0x3244, ++ 0xa438, 0xbd32, 0xa436, 0xb818, 0xa438, 0x15c5, 0xa436, 0xb81a, ++ 0xa438, 0x6255, 0xa436, 0xb81c, 0xa438, 0x34e1, 0xa436, 0xb81e, ++ 0xa438, 0x1068, 0xa436, 0xb850, 0xa438, 0x07cc, 0xa436, 0xb852, ++ 0xa438, 0x26ca, 0xa436, 0xb878, 0xa438, 0x0dbf, 0xa436, 0xb884, ++ 0xa438, 0x1BB1, 0xa436, 0xb832, 0xa438, 0x00ff, 0xa436, 0x0000, ++ 0xa438, 0x0000, 0xB82E, 0x0000, 0xa436, 0x8023, 0xa438, 0x0000, ++ 0xa436, 0x801E, 0xa438, 0x0031, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_2[] = { ++ 0xb892, 0x0000, 0xB88E, 0xC28F, 0xB890, 0x252D, 0xB88E, 0xC290, ++ 0xB890, 0xC924, 0xB88E, 0xC291, 0xB890, 0xC92E, 0xB88E, 0xC292, ++ 0xB890, 0xF626, 0xB88E, 0xC293, 0xB890, 0xF630, 0xB88E, 0xC294, ++ 0xB890, 0xA328, 0xB88E, 0xC295, 0xB890, 0xA332, 0xB88E, 0xC296, ++ 0xB890, 0xD72B, 0xB88E, 0xC297, 0xB890, 0xD735, 0xB88E, 0xC298, ++ 0xB890, 0x8A2E, 0xB88E, 0xC299, 0xB890, 0x8A38, 0xB88E, 0xC29A, ++ 0xB890, 0xBE32, 0xB88E, 0xC29B, 0xB890, 0xBE3C, 0xB88E, 0xC29C, ++ 0xB890, 0x7436, 0xB88E, 0xC29D, 0xB890, 0x7440, 0xB88E, 0xC29E, ++ 0xB890, 0xAD3B, 0xB88E, 0xC29F, 0xB890, 0xAD45, 0xB88E, 0xC2A0, ++ 0xB890, 0x6640, 0xB88E, 0xC2A1, 0xB890, 0x664A, 0xB88E, 0xC2A2, ++ 0xB890, 0xA646, 0xB88E, 0xC2A3, 0xB890, 0xA650, 0xB88E, 0xC2A4, ++ 0xB890, 0x624C, 0xB88E, 0xC2A5, 0xB890, 0x6256, 0xB88E, 0xC2A6, ++ 0xB890, 0xA453, 0xB88E, 0xC2A7, 0xB890, 0xA45D, 0xB88E, 0xC2A8, ++ 0xB890, 0x665A, 0xB88E, 0xC2A9, 0xB890, 0x6664, 0xB88E, 0xC2AA, ++ 0xB890, 0xAC62, 0xB88E, 0xC2AB, 0xB890, 0xAC6C, 0xB88E, 0xC2AC, ++ 0xB890, 0x746A, 0xB88E, 0xC2AD, 0xB890, 0x7474, 0xB88E, 0xC2AE, ++ 0xB890, 0xBCFA, 0xB88E, 0xC2AF, 0xB890, 0xBCFD, 0xB88E, 0xC2B0, ++ 0xB890, 0x79FF, 0xB88E, 0xC2B1, 0xB890, 0x7901, 0xB88E, 0xC2B2, ++ 0xB890, 0xF703, 0xB88E, 0xC2B3, 0xB890, 0xF706, 0xB88E, 0xC2B4, ++ 0xB890, 0x7408, 0xB88E, 0xC2B5, 0xB890, 0x740A, 0xB88E, 0xC2B6, ++ 0xB890, 0xF10C, 0xB88E, 0xC2B7, 0xB890, 0xF10F, 0xB88E, 0xC2B8, ++ 0xB890, 0x6F10, 0xB88E, 0xC2B9, 0xB890, 0x6F13, 0xB88E, 0xC2BA, ++ 0xB890, 0xEC15, 0xB88E, 0xC2BB, 0xB890, 0xEC18, 0xB88E, 0xC2BC, ++ 0xB890, 0x6A1A, 0xB88E, 0xC2BD, 0xB890, 0x6A1C, 0xB88E, 0xC2BE, ++ 0xB890, 0xE71E, 0xB88E, 0xC2BF, 0xB890, 0xE721, 0xB88E, 0xC2C0, ++ 0xB890, 0x6424, 0xB88E, 0xC2C1, 0xB890, 0x6425, 0xB88E, 0xC2C2, ++ 0xB890, 0xE228, 0xB88E, 0xC2C3, 0xB890, 0xE22A, 0xB88E, 0xC2C4, ++ 0xB890, 0x5F2B, 0xB88E, 0xC2C5, 0xB890, 0x5F2E, 0xB88E, 0xC2C6, ++ 0xB890, 0xDC31, 0xB88E, 0xC2C7, 0xB890, 0xDC33, 0xB88E, 0xC2C8, ++ 0xB890, 0x2035, 0xB88E, 0xC2C9, 0xB890, 0x2036, 0xB88E, 0xC2CA, ++ 0xB890, 0x9F3A, 0xB88E, 
0xC2CB, 0xB890, 0x9F3A, 0xB88E, 0xC2CC, ++ 0xB890, 0x4430, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_3[] = { ++ 0xa436, 0xacca, 0xa438, 0x0104, 0xa436, 0xaccc, 0xa438, 0x8000, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xfd47, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xe56f, 0xa436, 0xacd0, 0xa438, 0x01c0, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xed97, 0xa436, 0xacd0, 0xa438, 0x01c8, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xf5bf, 0xa436, 0xacd0, 0xa438, 0x01d0, ++ 0xa436, 0xacce, 0xa438, 0xfb07, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb0f, 0xa436, 0xacd0, 0xa438, 0x01d8, ++ 0xa436, 0xacce, 0xa438, 0xa087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0xa00f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0xa807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0xa88f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0xb027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0xb02f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0xb847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0xb84f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xfb17, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb1f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xa017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0xa01f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0xa837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0xa83f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0xb097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0xb05f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0xb857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0xb89f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xfb27, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb2f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x800f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x8807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x888f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x9027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x902f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x9847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x984f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xa0a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0xa8af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0xa067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0xa86f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb37, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb3f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x801f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x8837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x883f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x9097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x905f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 
0xa436, 0xacce, 0xa438, 0x9857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x989f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xb0b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0xb8bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0xb077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0xb87f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb47, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb4f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x600f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x6807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x688f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x7027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x702f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x7847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x784f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x80a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x88af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x8067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x886f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb57, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb5f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x601f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x6837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x683f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x7097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x705f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x7857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x789f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x90b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x98bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x9077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x987f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb67, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb6f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x400f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x4807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x488f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x5027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x502f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x5847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x584f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x60a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x68af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x6067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x686f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb77, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb7f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x401f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x4837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 
0xa436, 0xacce, 0xa438, 0x483f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x5097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x505f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x5857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x589f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x70b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x78bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x7077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x787f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb87, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb8f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x40a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x48af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x4067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x486f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb97, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb9f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x50b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x58bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x5077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x587f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfba7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbaf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x2067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x286f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfbb7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbbf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x3077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x387f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xfff8, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb47, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb4f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x600f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x6807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x688f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x7027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x702f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x7847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x784f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 
0xa436, 0xacce, 0xa438, 0x80a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x88af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x8067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x886f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb57, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb5f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x601f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x6837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x683f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x7097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x705f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x7857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x789f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x90b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x98bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x9077, 0xa436, 0xacd0, 0xa438, 0x1171, ++ 0xa436, 0xacce, 0xa438, 0x987f, 0xa436, 0xacd0, 0xa438, 0x1179, ++ 0xa436, 0xacca, 0xa438, 0x0004, 0xa436, 0xacc6, 0xa438, 0x0008, ++ 0xa436, 0xacc8, 0xa438, 0xc000, 0xa436, 0xacc6, 0xa438, 0x0015, ++ 0xa436, 0xacc8, 0xa438, 0xc043, 0xa436, 0xacc8, 0xa438, 0x0000, ++ 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_1_efuse[] = { ++ 0xB87C, 0x8014, 0xB87E, 0x90C0, 0xa436, 0x8023, 0xa438, 0x3800, ++ 0xa436, 0xB82E, 0xa438, 0x0001, 0xb820, 0x0010, 0xa436, 0x843d, ++ 0xa438, 0xaf84, 0xa438, 0x55af, 0xa438, 0x8458, 0xa438, 0xaf84, ++ 0xa438, 0x58af, 0xa438, 0x8458, 0xa438, 0xaf84, 0xa438, 0x58af, ++ 0xa438, 0x8458, 0xa438, 0xaf84, 0xa438, 0x58af, 0xa438, 0x8458, ++ 0xa438, 0xaf26, 0xa438, 0xd000, 0xa436, 0xb818, 0xa438, 0x26ca, ++ 0xa436, 0xb81a, 0xa438, 0xffff, 0xa436, 0xb81c, 0xa438, 0xffff, ++ 0xa436, 0xb81e, 0xa438, 0xffff, 0xa436, 0xb850, 0xa438, 0xffff, ++ 0xa436, 0xb852, 0xa438, 0xffff, 0xa436, 0xb878, 0xa438, 0xffff, ++ 0xa436, 0xb884, 0xa438, 0xffff, 0xa436, 0xb832, 0xa438, 0x0001, ++ 0xa436, 0x0000, 0xa438, 0x0000, 0xB82E, 0x0000, 0xa436, 0x8023, ++ 0xa438, 0x0000, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_2_1[] = { ++ 0xa436, 0x8023, 0xa438, 0x3801, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x808e, 0xa438, 0x1800, 0xa438, 0x80d6, ++ 0xa438, 0x1800, 0xa438, 0x81e2, 0xa438, 0x1800, 0xa438, 0x81e2, ++ 0xa438, 0x1800, 0xa438, 0x81e2, 0xa438, 0x1800, 0xa438, 0x81e2, ++ 0xa438, 0x1800, 0xa438, 0x81e2, 0xa438, 0xd500, 0xa438, 0xc48d, ++ 0xa438, 0xd504, 0xa438, 0x8d03, 0xa438, 0xd701, 0xa438, 0x4045, ++ 0xa438, 0xad02, 0xa438, 0xd504, 0xa438, 0xd706, 0xa438, 0x2529, ++ 0xa438, 0x8021, 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, ++ 0xa438, 0xf019, 0xa438, 0x459a, 0xa438, 0xf03f, 0xa438, 0xd718, ++ 0xa438, 0x62bb, 0xa438, 0xbb01, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c10, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0x8480, 0xa438, 0x8440, ++ 0xa438, 0x8420, 0xa438, 0xa410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0808, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xf03c, 0xa438, 0xbb02, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c20, 0xa438, 0xd501, 0xa438, 0xce01, 
++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0x8480, 0xa438, 0x8440, ++ 0xa438, 0xa420, 0xa438, 0x8410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0804, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xf028, 0xa438, 0xbb04, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c40, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0x8480, 0xa438, 0xa440, ++ 0xa438, 0x8420, 0xa438, 0x8410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xf014, 0xa438, 0xbb08, 0xa438, 0xd75e, 0xa438, 0x6231, ++ 0xa438, 0x0cf0, 0xa438, 0x0c80, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0xd70c, 0xa438, 0x6147, 0xa438, 0xa480, 0xa438, 0x8440, ++ 0xa438, 0x8420, 0xa438, 0x8410, 0xa438, 0xce00, 0xa438, 0xd505, ++ 0xa438, 0x0c0f, 0xa438, 0x0801, 0xa438, 0xf002, 0xa438, 0xa4f0, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0x1000, 0xa438, 0x1829, 0xa438, 0xd73e, 0xa438, 0x6074, ++ 0xa438, 0xd718, 0xa438, 0x5f2d, 0xa438, 0x1000, 0xa438, 0x81b7, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1829, ++ 0xa438, 0xd73e, 0xa438, 0x7f74, 0xa438, 0x1000, 0xa438, 0x81ce, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1829, ++ 0xa438, 0xd718, 0xa438, 0x5f6d, 0xa438, 0x1800, 0xa438, 0x1660, ++ 0xa438, 0xd75e, 0xa438, 0x68b1, 0xa438, 0xd504, 0xa438, 0xd71e, ++ 0xa438, 0x667b, 0xa438, 0x645a, 0xa438, 0x6239, 0xa438, 0x0cf0, ++ 0xa438, 0x0c10, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0808, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0x8480, 0xa438, 0x8440, 0xa438, 0x8420, 0xa438, 0xa410, ++ 0xa438, 0xf032, 0xa438, 0xa4f0, 0xa438, 0xf030, 0xa438, 0x0cf0, ++ 0xa438, 0x0c20, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0804, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0x8480, 0xa438, 0x8440, 0xa438, 0xa420, 0xa438, 0x8410, ++ 0xa438, 0xf022, 0xa438, 0xa4f0, 0xa438, 0xf020, 0xa438, 0x0cf0, ++ 0xa438, 0x0c40, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0802, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0x8480, 0xa438, 0xa440, 0xa438, 0x8420, 0xa438, 0x8410, ++ 0xa438, 0xf012, 0xa438, 0xa4f0, 0xa438, 0xf010, 0xa438, 0x0cf0, ++ 0xa438, 0x0c80, 0xa438, 0xd505, 0xa438, 0x0c0f, 0xa438, 0x0801, ++ 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xd70c, 0xa438, 0x60c7, ++ 0xa438, 0xa480, 0xa438, 0x8440, 0xa438, 0x8420, 0xa438, 0x8410, ++ 0xa438, 0xf002, 0xa438, 0xa4f0, 0xa438, 0x1800, 0xa438, 0x168c, ++ 0xa438, 0xd500, 0xa438, 0xd706, 0xa438, 0x2529, 0xa438, 0x80e0, ++ 0xa438, 0xd718, 0xa438, 0x607b, 0xa438, 0x40da, 0xa438, 0xf00f, ++ 0xa438, 0x431a, 0xa438, 0xf021, 0xa438, 0xd718, 0xa438, 0x617b, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1b1a, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf020, 0xa438, 0xf053, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0x1000, 0xa438, 0x1b1a, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf023, 0xa438, 0xf067, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0x1000, 0xa438, 0x1b1a, ++ 0xa438, 0xd718, 0xa438, 0x608e, 0xa438, 0xd73e, 0xa438, 0x5f34, ++ 0xa438, 0xf026, 0xa438, 0xf07b, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0x1000, 0xa438, 0x1b1a, 0xa438, 0xd718, 0xa438, 0x608e, ++ 0xa438, 0xd73e, 0xa438, 0x5f34, 0xa438, 0xf029, 0xa438, 0xf08f, ++ 0xa438, 0x1000, 0xa438, 0x81b7, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x81ce, ++ 
0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fae, ++ 0xa438, 0xf028, 0xa438, 0x1000, 0xa438, 0x81b7, 0xa438, 0x1000, ++ 0xa438, 0x1a8a, 0xa438, 0xd73e, 0xa438, 0x7fb4, 0xa438, 0x1000, ++ 0xa438, 0x81ce, 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, ++ 0xa438, 0x5fae, 0xa438, 0xf039, 0xa438, 0x1000, 0xa438, 0x81b7, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd73e, 0xa438, 0x7fb4, ++ 0xa438, 0x1000, 0xa438, 0x81ce, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf04a, 0xa438, 0x1000, ++ 0xa438, 0x81b7, 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd73e, ++ 0xa438, 0x7fb4, 0xa438, 0x1000, 0xa438, 0x81ce, 0xa438, 0x1000, ++ 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fae, 0xa438, 0xf05b, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac01, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a78, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac11, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa410, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf05d, 0xa438, 0x4b98, ++ 0xa438, 0xa808, 0xa438, 0xf05a, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac02, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a78, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac22, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa420, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf03f, 0xa438, 0x47d8, 0xa438, 0xa804, 0xa438, 0xf03c, ++ 0xa438, 0xd719, 0xa438, 0x4119, 0xa438, 0xd504, 0xa438, 0xac04, ++ 0xa438, 0xae01, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a78, ++ 0xa438, 0xf00a, 0xa438, 0xd719, 0xa438, 0x4118, 0xa438, 0xd504, ++ 0xa438, 0xac44, 0xa438, 0xd501, 0xa438, 0xce01, 0xa438, 0xa440, ++ 0xa438, 0xce00, 0xa438, 0xd500, 0xa438, 0x1000, 0xa438, 0x1a8a, ++ 0xa438, 0xd718, 0xa438, 0x5fb0, 0xa438, 0xd505, 0xa438, 0xd719, ++ 0xa438, 0x4079, 0xa438, 0xa80f, 0xa438, 0xf021, 0xa438, 0x4418, ++ 0xa438, 0xa802, 0xa438, 0xf01e, 0xa438, 0xd719, 0xa438, 0x4119, ++ 0xa438, 0xd504, 0xa438, 0xac08, 0xa438, 0xae01, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a78, 0xa438, 0xf00a, 0xa438, 0xd719, ++ 0xa438, 0x4118, 0xa438, 0xd504, 0xa438, 0xac88, 0xa438, 0xd501, ++ 0xa438, 0xce01, 0xa438, 0xa480, 0xa438, 0xce00, 0xa438, 0xd500, ++ 0xa438, 0x1000, 0xa438, 0x1a8a, 0xa438, 0xd718, 0xa438, 0x5fb0, ++ 0xa438, 0xd505, 0xa438, 0xd719, 0xa438, 0x4079, 0xa438, 0xa80f, ++ 0xa438, 0xf003, 0xa438, 0x4058, 0xa438, 0xa801, 0xa438, 0x1800, ++ 0xa438, 0x1736, 0xa438, 0xd73e, 0xa438, 0xd505, 0xa438, 0x3088, ++ 0xa438, 0x81c0, 0xa438, 0x61d3, 0xa438, 0x6172, 0xa438, 0x6111, ++ 0xa438, 0x60b0, 0xa438, 0xf00d, 0xa438, 0x3298, 0xa438, 0x81cb, ++ 0xa438, 0xf00a, 0xa438, 0xa808, 0xa438, 0xf008, 0xa438, 0xa804, ++ 0xa438, 0xf006, 0xa438, 0xa802, 0xa438, 0xf004, 0xa438, 0xa801, ++ 0xa438, 0xf002, 0xa438, 0xa80f, 0xa438, 0xd500, 0xa438, 0x0800, ++ 0xa438, 0xd505, 0xa438, 0xd75e, 0xa438, 0x6211, 0xa438, 0xd71e, ++ 0xa438, 0x619b, 0xa438, 0x611a, 0xa438, 0x6099, 0xa438, 0x0c0f, ++ 0xa438, 0x0808, 0xa438, 0xf009, 0xa438, 0x0c0f, 0xa438, 0x0804, ++ 0xa438, 0xf006, 0xa438, 0x0c0f, 0xa438, 0x0802, 0xa438, 0xf003, ++ 0xa438, 0x0c0f, 0xa438, 0x0801, 0xa438, 0xd500, 0xa438, 0x0800, ++ 0xa436, 0xA026, 0xa438, 0xffff, 0xa436, 0xA024, 0xa438, 0xffff, ++ 
0xa436, 0xA022, 0xa438, 0xffff, 0xa436, 0xA020, 0xa438, 0xffff, ++ 0xa436, 0xA006, 0xa438, 0xffff, 0xa436, 0xA004, 0xa438, 0x16ab, ++ 0xa436, 0xA002, 0xa438, 0x1663, 0xa436, 0xA000, 0xa438, 0x1608, ++ 0xa436, 0xA008, 0xa438, 0x0700, 0xa436, 0xA016, 0xa438, 0x0000, ++ 0xa436, 0xA012, 0xa438, 0x07f8, 0xa436, 0xA014, 0xa438, 0xcc01, ++ 0xa438, 0x20f6, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA152, ++ 0xa438, 0x021c, 0xa436, 0xA154, 0xa438, 0x2100, 0xa436, 0xA156, ++ 0xa438, 0x3fff, 0xa436, 0xA158, 0xa438, 0x3fff, 0xa436, 0xA15A, ++ 0xa438, 0x3fff, 0xa436, 0xA15C, 0xa438, 0x3fff, 0xa436, 0xA15E, ++ 0xa438, 0x3fff, 0xa436, 0xA160, 0xa438, 0x3fff, 0xa436, 0xA150, ++ 0xa438, 0x0003, 0xa436, 0xA016, 0xa438, 0x0010, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x8014, 0xa438, 0x1800, 0xa438, 0x803d, ++ 0xa438, 0x1800, 0xa438, 0x804a, 0xa438, 0x1800, 0xa438, 0x804e, ++ 0xa438, 0x1800, 0xa438, 0x8052, 0xa438, 0x1800, 0xa438, 0x8092, ++ 0xa438, 0x1800, 0xa438, 0x80a0, 0xa438, 0xc2ff, 0xa438, 0x9a40, ++ 0xa438, 0x1800, 0xa438, 0x0042, 0xa438, 0x1000, 0xa438, 0x02e5, ++ 0xa438, 0xba20, 0xa438, 0x1000, 0xa438, 0x02b4, 0xa438, 0xd701, ++ 0xa438, 0x4103, 0xa438, 0xd700, 0xa438, 0x5f6c, 0xa438, 0x1000, ++ 0xa438, 0x8024, 0xa438, 0x9a20, 0xa438, 0x1800, 0xa438, 0x0073, ++ 0xa438, 0x1800, 0xa438, 0x0084, 0xa438, 0xd701, 0xa438, 0x4061, ++ 0xa438, 0xba0f, 0xa438, 0xf004, 0xa438, 0x4060, 0xa438, 0x1000, ++ 0xa438, 0x802d, 0xa438, 0xba10, 0xa438, 0x0800, 0xa438, 0xd700, ++ 0xa438, 0x60bb, 0xa438, 0x611c, 0xa438, 0x0c0f, 0xa438, 0x1a01, ++ 0xa438, 0xf00a, 0xa438, 0x60fc, 0xa438, 0x0c0f, 0xa438, 0x1a02, ++ 0xa438, 0xf006, 0xa438, 0x0c0f, 0xa438, 0x1a04, 0xa438, 0xf003, ++ 0xa438, 0x0c0f, 0xa438, 0x1a08, 0xa438, 0x0800, 0xa438, 0x0c0f, ++ 0xa438, 0x0504, 0xa438, 0xad02, 0xa438, 0xd73e, 0xa438, 0x40f6, ++ 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5fac, ++ 0xa438, 0x1000, 0xa438, 0x8024, 0xa438, 0x1800, 0xa438, 0x0139, ++ 0xa438, 0x9a3f, 0xa438, 0x8bf0, 0xa438, 0x1800, 0xa438, 0x02df, ++ 0xa438, 0x9a3f, 0xa438, 0x9910, 0xa438, 0x1800, 0xa438, 0x02d7, ++ 0xa438, 0xad02, 0xa438, 0x8d01, 0xa438, 0x9a7f, 0xa438, 0x9910, ++ 0xa438, 0x9860, 0xa438, 0xcb00, 0xa438, 0xd501, 0xa438, 0xce01, ++ 0xa438, 0x85f0, 0xa438, 0xd500, 0xa438, 0x0c0f, 0xa438, 0x0505, ++ 0xa438, 0xb820, 0xa438, 0xc000, 0xa438, 0xc100, 0xa438, 0xc628, ++ 0xa438, 0xc700, 0xa438, 0xc801, 0xa438, 0xc91e, 0xa438, 0xc001, ++ 0xa438, 0x4019, 0xa438, 0xc6f8, 0xa438, 0xc702, 0xa438, 0xc809, ++ 0xa438, 0xc940, 0xa438, 0xc002, 0xa438, 0x4019, 0xa438, 0x1000, ++ 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x5fa7, 0xa438, 0xc010, ++ 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x5fa0, ++ 0xa438, 0xc020, 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, ++ 0xa438, 0x5fa1, 0xa438, 0x0c0f, 0xa438, 0x0506, 0xa438, 0xb840, ++ 0xa438, 0xc6ca, 0xa438, 0xc701, 0xa438, 0xc809, 0xa438, 0xc900, ++ 0xa438, 0xc001, 0xa438, 0x4019, 0xa438, 0xc6b8, 0xa438, 0xc700, ++ 0xa438, 0xc800, 0xa438, 0xc900, 0xa438, 0xc008, 0xa438, 0x4019, ++ 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x5fa5, ++ 0xa438, 0x8580, 0xa438, 0x8d02, 0xa438, 0x1800, 0xa438, 0x018f, ++ 0xa438, 0x1000, 0xa438, 0x02cc, 0xa438, 0xd700, 0xa438, 0x6124, ++ 0xa438, 0xd73e, 0xa438, 0x5f75, 0xa438, 0xd700, 0xa438, 0x5f2c, ++ 0xa438, 0x1000, 0xa438, 0x8024, 0xa438, 0x9a20, 0xa438, 0xfff5, ++ 0xa438, 0x1800, 0xa438, 0x00b8, 0xa438, 0x0c0f, 0xa438, 0x0503, ++ 
0xa438, 0xad02, 0xa438, 0x68c8, 0xa438, 0x1000, 0xa438, 0x02c0, ++ 0xa438, 0xd700, 0xa438, 0x6848, 0xa438, 0x604d, 0xa438, 0xfffb, ++ 0xa438, 0xd73e, 0xa438, 0x6082, 0xa438, 0x1000, 0xa438, 0x02a1, ++ 0xa438, 0x8a0f, 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, ++ 0xa438, 0x5fae, 0xa438, 0x1000, 0xa438, 0x02de, 0xa438, 0x1000, ++ 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5faf, 0xa438, 0x8d01, ++ 0xa438, 0x8b0f, 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, ++ 0xa438, 0x2a58, 0xa438, 0x80c5, 0xa438, 0x2a5b, 0xa438, 0x80cd, ++ 0xa438, 0x2b53, 0xa438, 0x80d9, 0xa438, 0xfff7, 0xa438, 0x1000, ++ 0xa438, 0x022a, 0xa438, 0x1000, 0xa438, 0x02e5, 0xa438, 0xba40, ++ 0xa438, 0x1000, 0xa438, 0x02fd, 0xa438, 0xf018, 0xa438, 0x1000, ++ 0xa438, 0x022a, 0xa438, 0x1000, 0xa438, 0x02e5, 0xa438, 0xba40, ++ 0xa438, 0x1000, 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5faa, ++ 0xa438, 0x1000, 0xa438, 0x02fd, 0xa438, 0xf00c, 0xa438, 0x1000, ++ 0xa438, 0x022a, 0xa438, 0x1000, 0xa438, 0x02fd, 0xa438, 0x1000, ++ 0xa438, 0x02c0, 0xa438, 0xd700, 0xa438, 0x5fab, 0xa438, 0x1000, ++ 0xa438, 0x02e5, 0xa438, 0xba40, 0xa438, 0x1000, 0xa438, 0x02c0, ++ 0xa438, 0xd700, 0xa438, 0x6088, 0xa438, 0xfffc, 0xa438, 0x1800, ++ 0xa438, 0x0120, 0xa438, 0x1800, 0xa438, 0x0122, 0xa436, 0xA08E, ++ 0xa438, 0x00db, 0xa436, 0xA08C, 0xa438, 0x00b4, 0xa436, 0xA08A, ++ 0xa438, 0x015a, 0xa436, 0xA088, 0xa438, 0x02d6, 0xa436, 0xA086, ++ 0xa438, 0x02de, 0xa436, 0xA084, 0xa438, 0x0137, 0xa436, 0xA082, ++ 0xa438, 0x0071, 0xa436, 0xA080, 0xa438, 0x0041, 0xa436, 0xA090, ++ 0xa438, 0x00ff, 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x801d, 0xa438, 0x1800, 0xa438, 0x808a, ++ 0xa438, 0x1800, 0xa438, 0x80a5, 0xa438, 0x1800, 0xa438, 0x80b8, ++ 0xa438, 0x1800, 0xa438, 0x8108, 0xa438, 0x1800, 0xa438, 0x810f, ++ 0xa438, 0x1800, 0xa438, 0x811b, 0xa438, 0x8980, 0xa438, 0xd702, ++ 0xa438, 0x6126, 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, ++ 0xa438, 0x6060, 0xa438, 0xd702, 0xa438, 0x6077, 0xa438, 0x1800, ++ 0xa438, 0x0c29, 0xa438, 0x1800, 0xa438, 0x0c2b, 0xa438, 0x1000, ++ 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x5fb4, 0xa438, 0xd702, ++ 0xa438, 0x6c46, 0xa438, 0xd704, 0xa438, 0x4063, 0xa438, 0xd702, ++ 0xa438, 0x6060, 0xa438, 0xd702, 0xa438, 0x6b97, 0xa438, 0xa340, ++ 0xa438, 0x0c06, 0xa438, 0x0102, 0xa438, 0xce01, 0xa438, 0x1000, ++ 0xa438, 0x117a, 0xa438, 0xa240, 0xa438, 0xa902, 0xa438, 0xa204, ++ 0xa438, 0xa280, 0xa438, 0xa364, 0xa438, 0xab02, 0xa438, 0x8380, ++ 0xa438, 0xa00a, 0xa438, 0xcd8d, 0xa438, 0x1000, 0xa438, 0x115a, ++ 0xa438, 0xd706, 0xa438, 0x5fb5, 0xa438, 0xb920, 0xa438, 0x1000, ++ 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x7fb4, 0xa438, 0x9920, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x6065, ++ 0xa438, 0x7c74, 0xa438, 0xfffb, 0xa438, 0xb820, 0xa438, 0x1000, ++ 0xa438, 0x115a, 0xa438, 0xd71f, 0xa438, 0x7fa5, 0xa438, 0x9820, ++ 0xa438, 0xa410, 0xa438, 0x8902, 0xa438, 0xa120, 0xa438, 0xa380, ++ 0xa438, 0xce02, 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x8280, ++ 0xa438, 0xa324, 0xa438, 0xab02, 0xa438, 0xa00a, 0xa438, 0x8118, ++ 0xa438, 0x863f, 0xa438, 0x87fb, 0xa438, 0xcd8e, 0xa438, 0xd193, ++ 0xa438, 0xd047, 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0x1000, ++ 0xa438, 0x115f, 0xa438, 0xd700, 0xa438, 0x5f7b, 0xa438, 0xa280, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0x1000, 0xa438, 0x115f, ++ 0xa438, 0xd706, 0xa438, 0x5f78, 0xa438, 0xa210, 0xa438, 0xd700, ++ 0xa438, 0x6083, 0xa438, 0xd101, 0xa438, 0xd047, 0xa438, 0xf003, ++ 
0xa438, 0xd160, 0xa438, 0xd04b, 0xa438, 0x1000, 0xa438, 0x115a, ++ 0xa438, 0x1000, 0xa438, 0x115f, 0xa438, 0xd700, 0xa438, 0x5f7b, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0x1000, 0xa438, 0x115f, ++ 0xa438, 0xd706, 0xa438, 0x5f79, 0xa438, 0x8120, 0xa438, 0xbb20, ++ 0xa438, 0x1800, 0xa438, 0x0c8b, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8f80, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0c3c, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa608, 0xa438, 0x9503, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8f80, 0xa438, 0x9503, ++ 0xa438, 0xd704, 0xa438, 0x6192, 0xa438, 0xd702, 0xa438, 0x4116, ++ 0xa438, 0xce04, 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0x1800, ++ 0xa438, 0x0b3d, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xaf40, ++ 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x0b48, 0xa438, 0xd704, ++ 0xa438, 0x6192, 0xa438, 0xd702, 0xa438, 0x4116, 0xa438, 0xce04, ++ 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8f40, 0xa438, 0x9503, 0xa438, 0x1800, 0xa438, 0x1269, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xaf40, 0xa438, 0x9503, ++ 0xa438, 0x1800, 0xa438, 0x1274, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa608, 0xa438, 0xc700, 0xa438, 0x9503, 0xa438, 0xce54, ++ 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0xa290, 0xa438, 0xa304, ++ 0xa438, 0xab02, 0xa438, 0xd700, 0xa438, 0x6050, 0xa438, 0xab04, ++ 0xa438, 0x0c38, 0xa438, 0x0608, 0xa438, 0xaa0b, 0xa438, 0xd702, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8d01, 0xa438, 0xae40, ++ 0xa438, 0x4044, 0xa438, 0x8e20, 0xa438, 0x9503, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8c20, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6078, 0xa438, 0xd700, 0xa438, 0x609a, 0xa438, 0xd109, ++ 0xa438, 0xd074, 0xa438, 0xf003, 0xa438, 0xd109, 0xa438, 0xd075, ++ 0xa438, 0x1000, 0xa438, 0x115a, 0xa438, 0xd704, 0xa438, 0x6252, ++ 0xa438, 0xd702, 0xa438, 0x4116, 0xa438, 0xce54, 0xa438, 0x1000, ++ 0xa438, 0x117a, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0x8f40, ++ 0xa438, 0x9503, 0xa438, 0xa00a, 0xa438, 0xd704, 0xa438, 0x41e7, ++ 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xa570, 0xa438, 0x9503, ++ 0xa438, 0xf00a, 0xa438, 0x0c03, 0xa438, 0x1502, 0xa438, 0xaf40, ++ 0xa438, 0x9503, 0xa438, 0x800a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0x8570, 0xa438, 0x9503, 0xa438, 0xd704, 0xa438, 0x60f3, ++ 0xa438, 0xd71f, 0xa438, 0x60ee, 0xa438, 0xd700, 0xa438, 0x5bbe, ++ 0xa438, 0x1800, 0xa438, 0x0e71, 0xa438, 0x1800, 0xa438, 0x0e7c, ++ 0xa438, 0x1800, 0xa438, 0x0e7e, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xaf80, 0xa438, 0x9503, 0xa438, 0xcd62, 0xa438, 0x1800, ++ 0xa438, 0x0bd2, 0xa438, 0x800a, 0xa438, 0x8530, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8d10, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6050, 0xa438, 0xaa20, 0xa438, 0x8306, 0xa438, 0x1800, ++ 0xa438, 0x0cb6, 0xa438, 0xd105, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0d8f, 0xa438, 0xd700, 0xa438, 0x5fbb, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8608, 0xa438, 0x9503, 0xa438, 0x1000, ++ 0xa438, 0x0d8f, 0xa438, 0xd704, 0xa438, 0x7fb6, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x87f0, 0xa438, 0x9503, 0xa438, 0xce88, ++ 0xa438, 0x1000, 0xa438, 0x117a, 0xa438, 0x0c03, 0xa438, 0x1502, ++ 0xa438, 0xa608, 0xa438, 0x9503, 0xa438, 0xd73e, 0xa438, 0x60a5, ++ 0xa438, 0xd705, 0xa438, 0x4071, 0xa438, 0x1800, 0xa438, 0x0d65, ++ 0xa438, 0x1800, 0xa438, 0x0d6f, 0xa436, 0xA10E, 0xa438, 0x0d58, ++ 0xa436, 0xA10C, 0xa438, 0x0cb5, 0xa436, 0xA10A, 0xa438, 0x0bd1, ++ 0xa436, 0xA108, 0xa438, 0x0e37, 0xa436, 0xA106, 0xa438, 0x1267, ++ 
0xa436, 0xA104, 0xa438, 0x0b3b, 0xa436, 0xA102, 0xa438, 0x0c38, ++ 0xa436, 0xA100, 0xa438, 0x0c24, 0xa436, 0xA110, 0xa438, 0x00ff, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x1ff8, ++ 0xa436, 0xA014, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa436, 0xA164, 0xa438, 0x0ceb, 0xa436, 0xA166, ++ 0xa438, 0x0e73, 0xa436, 0xA168, 0xa438, 0x0deb, 0xa436, 0xA16A, ++ 0xa438, 0x3fff, 0xa436, 0xA16C, 0xa438, 0x3fff, 0xa436, 0xA16E, ++ 0xa438, 0x3fff, 0xa436, 0xA170, 0xa438, 0x3fff, 0xa436, 0xA172, ++ 0xa438, 0x3fff, 0xa436, 0xA162, 0xa438, 0x0007, 0xa436, 0xb87c, ++ 0xa438, 0x85bf, 0xa436, 0xb87e, 0xa438, 0xaf85, 0xa438, 0xd7af, ++ 0xa438, 0x85fb, 0xa438, 0xaf86, 0xa438, 0x10af, 0xa438, 0x8638, ++ 0xa438, 0xaf86, 0xa438, 0x47af, 0xa438, 0x8647, 0xa438, 0xaf86, ++ 0xa438, 0x47af, 0xa438, 0x8647, 0xa438, 0xbf85, 0xa438, 0xf802, ++ 0xa438, 0x627f, 0xa438, 0xbf61, 0xa438, 0xc702, 0xa438, 0x627f, ++ 0xa438, 0xae0c, 0xa438, 0xbf85, 0xa438, 0xf802, 0xa438, 0x6276, ++ 0xa438, 0xbf61, 0xa438, 0xc702, 0xa438, 0x6276, 0xa438, 0xee85, ++ 0xa438, 0x4200, 0xa438, 0xaf1b, 0xa438, 0x2333, 0xa438, 0xa484, ++ 0xa438, 0xbf86, 0xa438, 0x0a02, 0xa438, 0x627f, 0xa438, 0xbf86, ++ 0xa438, 0x0d02, 0xa438, 0x627f, 0xa438, 0xaf1b, 0xa438, 0x8422, ++ 0xa438, 0xa484, 0xa438, 0x66ac, 0xa438, 0x0ef8, 0xa438, 0xfbef, ++ 0xa438, 0x79fb, 0xa438, 0xe080, 0xa438, 0x16ad, 0xa438, 0x230f, ++ 0xa438, 0xee85, 0xa438, 0x4200, 0xa438, 0x1f44, 0xa438, 0xbf86, ++ 0xa438, 0x30d7, 0xa438, 0x0008, 0xa438, 0x0264, 0xa438, 0xa3ff, ++ 0xa438, 0xef97, 0xa438, 0xfffc, 0xa438, 0x0485, 0xa438, 0xf861, ++ 0xa438, 0xc786, 0xa438, 0x0a86, 0xa438, 0x0de1, 0xa438, 0x8feb, ++ 0xa438, 0xe583, 0xa438, 0x20e1, 0xa438, 0x8fea, 0xa438, 0xe583, ++ 0xa438, 0x21af, 0xa438, 0x41a7, 0xa436, 0xb85e, 0xa438, 0x1b05, ++ 0xa436, 0xb860, 0xa438, 0x1b78, 0xa436, 0xb862, 0xa438, 0x1a08, ++ 0xa436, 0xb864, 0xa438, 0x419F, 0xa436, 0xb886, 0xa438, 0xffff, ++ 0xa436, 0xb888, 0xa438, 0xffff, 0xa436, 0xb88a, 0xa438, 0xffff, ++ 0xa436, 0xb88c, 0xa438, 0xffff, 0xa436, 0xb838, 0xa438, 0x000f, ++ 0xb820, 0x0010, 0xa436, 0x0000, 0xa438, 0x0000, 0xB82E, 0x0000, ++ 0xa436, 0x8023, 0xa438, 0x0000, 0xa436, 0x801E, 0xa438, 0x0013, ++ 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125d_2_2[] = { ++ 0xa436, 0xacca, 0xa438, 0x0104, 0xa436, 0xaccc, 0xa438, 0x8000, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xfd47, 0xa436, 0xacd0, 0xa438, 0x0fff, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xe56f, 0xa436, 0xacd0, 0xa438, 0x01c0, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xed97, 0xa436, 0xacd0, 0xa438, 0x01c8, ++ 0xa436, 0xacce, 0xa438, 0xffff, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xf5bf, 0xa436, 0xacd0, 0xa438, 0x01d0, ++ 0xa436, 0xacce, 0xa438, 0xfb07, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb0f, 0xa436, 0xacd0, 0xa438, 0x01d8, ++ 0xa436, 0xacce, 0xa438, 0xa087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0xa00f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0xa807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0xa88f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0xb027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0xb02f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0xb847, 0xa436, 
0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0xb84f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xfb17, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb1f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xa017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0xa01f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0xa837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0xa83f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0xb097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0xb05f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0xb857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0xb89f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xfb27, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb2f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x800f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x8807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x888f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x9027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x902f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x9847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x984f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0xa0a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0xa8af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0xa067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0xa86f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb37, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb3f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x8017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x801f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x8837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x883f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x9097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x905f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x9857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x989f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0xb0b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0xb8bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0xb077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0xb87f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb47, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb4f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x600f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x6807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x688f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x7027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x702f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x7847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x784f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x80a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x88af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x8067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x886f, 0xa436, 
0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb57, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb5f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x6017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x601f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x6837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x683f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x7097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x705f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x7857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x789f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x90b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x98bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x9077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x987f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb67, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb6f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4087, 0xa436, 0xacd0, 0xa438, 0x0180, ++ 0xa436, 0xacce, 0xa438, 0x400f, 0xa436, 0xacd0, 0xa438, 0x0108, ++ 0xa436, 0xacce, 0xa438, 0x4807, 0xa436, 0xacd0, 0xa438, 0x0100, ++ 0xa436, 0xacce, 0xa438, 0x488f, 0xa436, 0xacd0, 0xa438, 0x0188, ++ 0xa436, 0xacce, 0xa438, 0x5027, 0xa436, 0xacd0, 0xa438, 0x0120, ++ 0xa436, 0xacce, 0xa438, 0x502f, 0xa436, 0xacd0, 0xa438, 0x0128, ++ 0xa436, 0xacce, 0xa438, 0x5847, 0xa436, 0xacd0, 0xa438, 0x0140, ++ 0xa436, 0xacce, 0xa438, 0x584f, 0xa436, 0xacd0, 0xa438, 0x0148, ++ 0xa436, 0xacce, 0xa438, 0x60a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x68af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x6067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x686f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb77, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb7f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x4017, 0xa436, 0xacd0, 0xa438, 0x0110, ++ 0xa436, 0xacce, 0xa438, 0x401f, 0xa436, 0xacd0, 0xa438, 0x0118, ++ 0xa436, 0xacce, 0xa438, 0x4837, 0xa436, 0xacd0, 0xa438, 0x0130, ++ 0xa436, 0xacce, 0xa438, 0x483f, 0xa436, 0xacd0, 0xa438, 0x0138, ++ 0xa436, 0xacce, 0xa438, 0x5097, 0xa436, 0xacd0, 0xa438, 0x0190, ++ 0xa436, 0xacce, 0xa438, 0x505f, 0xa436, 0xacd0, 0xa438, 0x0158, ++ 0xa436, 0xacce, 0xa438, 0x5857, 0xa436, 0xacd0, 0xa438, 0x0150, ++ 0xa436, 0xacce, 0xa438, 0x589f, 0xa436, 0xacd0, 0xa438, 0x0198, ++ 0xa436, 0xacce, 0xa438, 0x70b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x78bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x7077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x787f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfb87, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb8f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x40a7, 0xa436, 0xacd0, 0xa438, 0x01a0, ++ 0xa436, 0xacce, 0xa438, 0x48af, 0xa436, 0xacd0, 0xa438, 0x01a8, ++ 0xa436, 0xacce, 0xa438, 0x4067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x486f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfb97, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfb9f, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x50b7, 0xa436, 0xacd0, 0xa438, 0x01b0, ++ 0xa436, 0xacce, 0xa438, 0x58bf, 0xa436, 0xacd0, 0xa438, 0x01b8, ++ 0xa436, 0xacce, 0xa438, 0x5077, 0xa436, 
0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x587f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfba7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbaf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x2067, 0xa436, 0xacd0, 0xa438, 0x0161, ++ 0xa436, 0xacce, 0xa438, 0x286f, 0xa436, 0xacd0, 0xa438, 0x0169, ++ 0xa436, 0xacce, 0xa438, 0xfbb7, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0xfbbf, 0xa436, 0xacd0, 0xa438, 0x07ff, ++ 0xa436, 0xacce, 0xa438, 0x3077, 0xa436, 0xacd0, 0xa438, 0x0171, ++ 0xa436, 0xacce, 0xa438, 0x387f, 0xa436, 0xacd0, 0xa438, 0x0179, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacce, 0xa438, 0xfff9, 0xa436, 0xacd0, 0xa438, 0x17ff, ++ 0xa436, 0xacca, 0xa438, 0x0004, 0xa436, 0xacc6, 0xa438, 0x0008, ++ 0xa436, 0xacc8, 0xa438, 0xc000, 0xa436, 0xacc8, 0xa438, 0x0000, ++ 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125bp_1_1[] = { ++ 0xa436, 0x8024, 0xa438, 0x3600, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x0000, 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, ++ 0xa438, 0x1800, 0xa438, 0x8014, 0xa438, 0x1800, 0xa438, 0x8018, ++ 0xa438, 0x1800, 0xa438, 0x801c, 0xa438, 0x1800, 0xa438, 0x8020, ++ 0xa438, 0x1800, 0xa438, 0x8024, 0xa438, 0x1800, 0xa438, 0x8028, ++ 0xa438, 0x1800, 0xa438, 0x8028, 0xa438, 0xdb20, 0xa438, 0xd501, ++ 0xa438, 0x1800, 0xa438, 0x034c, 0xa438, 0xdb10, 0xa438, 0xd501, ++ 0xa438, 0x1800, 0xa438, 0x032c, 0xa438, 0x8620, 0xa438, 0xa480, ++ 0xa438, 0x1800, 0xa438, 0x1cfe, 0xa438, 0xbf40, 0xa438, 0xd703, ++ 0xa438, 0x1800, 0xa438, 0x0ce9, 0xa438, 0x9c10, 0xa438, 0x9f40, ++ 0xa438, 0x1800, 0xa438, 0x137a, 0xa438, 0x9f20, 0xa438, 0x9f40, ++ 0xa438, 0x1800, 0xa438, 0x16c4, 0xa436, 0xA026, 0xa438, 0xffff, ++ 0xa436, 0xA024, 0xa438, 0xffff, 0xa436, 0xA022, 0xa438, 0x16c3, ++ 0xa436, 0xA020, 0xa438, 0x1379, 0xa436, 0xA006, 0xa438, 0x0ce8, ++ 0xa436, 0xA004, 0xa438, 0x1cfd, 0xa436, 0xA002, 0xa438, 0x032b, ++ 0xa436, 0xA000, 0xa438, 0x034b, 0xa436, 0xA008, 0xa438, 0x3f00, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x8018, 0xa438, 0x1800, 0xa438, 0x8021, 0xa438, 0x1800, ++ 0xa438, 0x802b, 0xa438, 0x1800, 0xa438, 0x8055, 0xa438, 0x1800, ++ 0xa438, 0x805a, 0xa438, 0x1800, 0xa438, 0x805e, 0xa438, 0x1800, ++ 0xa438, 0x8062, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0xcb11, ++ 0xa438, 0xd1b9, 0xa438, 0xd05b, 0xa438, 0x0000, 0xa438, 0x1800, ++ 0xa438, 0x0284, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0xd700, ++ 0xa438, 0x5fb4, 0xa438, 0x5f95, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x1800, 0xa438, 0x02b7, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0xcb21, 0xa438, 0x1000, 0xa438, 0x0b34, 0xa438, 0xd71f, ++ 0xa438, 0x5f5e, 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x0322, ++ 0xa438, 0xd700, 0xa438, 0xd113, 0xa438, 0xd040, 0xa438, 0x1000, ++ 0xa438, 0x0a57, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xd700, ++ 0xa438, 0x6065, 0xa438, 0xd122, 0xa438, 0xf002, 0xa438, 0xd122, ++ 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0b53, 0xa438, 0xa008, ++ 0xa438, 0xd704, 0xa438, 0x4052, 0xa438, 0xa002, 0xa438, 0xd704, ++ 0xa438, 0x4054, 0xa438, 0xa740, 0xa438, 0x1000, 0xa438, 0x0a57, ++ 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0xcb9b, 0xa438, 0xd110, ++ 0xa438, 0xd040, 0xa438, 0x1000, 0xa438, 0x0c01, 0xa438, 0x1000, ++ 0xa438, 0x0a57, 0xa438, 0xd700, 0xa438, 0x5fb4, 0xa438, 0x801a, ++ 0xa438, 0x1000, 
0xa438, 0x0a57, 0xa438, 0xd704, 0xa438, 0x7fb9, ++ 0xa438, 0x1800, 0xa438, 0x088d, 0xa438, 0xcb62, 0xa438, 0xd700, ++ 0xa438, 0x8880, 0xa438, 0x1800, 0xa438, 0x06cb, 0xa438, 0xbe02, ++ 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x002c, 0xa438, 0xbe04, ++ 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x002c, 0xa438, 0xbe08, ++ 0xa438, 0x0000, 0xa438, 0x1800, 0xa438, 0x002c, 0xa436, 0xA10E, ++ 0xa438, 0x802a, 0xa436, 0xA10C, 0xa438, 0x8026, 0xa436, 0xA10A, ++ 0xa438, 0x8022, 0xa436, 0xA108, 0xa438, 0x06ca, 0xa436, 0xA106, ++ 0xa438, 0x086f, 0xa436, 0xA104, 0xa438, 0x0321, 0xa436, 0xA102, ++ 0xa438, 0x02b5, 0xa436, 0xA100, 0xa438, 0x0283, 0xa436, 0xA110, ++ 0xa438, 0x001f, 0xb820, 0x0010, 0xb82e, 0x0000, 0xa436, 0x8024, ++ 0xa438, 0x0000, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125bp_1_2[] = { ++ 0xb892, 0x0000, 0xb88e, 0xC201, 0xb890, 0x2C01, 0xb890, 0xCD02, ++ 0xb890, 0x0602, 0xb890, 0x5502, 0xb890, 0xB903, 0xb890, 0x3303, ++ 0xb890, 0xC204, 0xb890, 0x6605, 0xb890, 0x1F05, 0xb890, 0xEE06, ++ 0xb890, 0xD207, 0xb890, 0xCC08, 0xb890, 0xDA09, 0xb890, 0xFF0B, ++ 0xb890, 0x380C, 0xb890, 0x87F3, 0xb88e, 0xC27F, 0xb890, 0x2B66, ++ 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x6666, ++ 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x6666, 0xb890, 0x66C2, ++ 0xb88e, 0xC26F, 0xb890, 0x751D, 0xb890, 0x1D1F, 0xb890, 0x2022, ++ 0xb890, 0x2325, 0xb890, 0x2627, 0xb890, 0x2829, 0xb890, 0x2929, ++ 0xb890, 0x2A2A, 0xb890, 0x2B66, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static const u16 phy_mcu_ram_code_8125cp_1_1[] = { ++ 0xa436, 0x8023, 0xa438, 0x2300, 0xa436, 0xB82E, 0xa438, 0x0001, ++ 0xb820, 0x0090, 0xa436, 0xA016, 0xa438, 0x0000, 0xa436, 0xA012, ++ 0xa438, 0x07f8, 0xa436, 0xA014, 0xa438, 0xcc01, 0xa438, 0x2166, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, 0xa438, 0x0000, ++ 0xa438, 0x0000, 0xa438, 0x0000, 0xa436, 0xA152, 0xa438, 0x021c, ++ 0xa436, 0xA154, 0xa438, 0x2170, 0xa436, 0xA156, 0xa438, 0x3fff, ++ 0xa436, 0xA158, 0xa438, 0x3fff, 0xa436, 0xA15A, 0xa438, 0x3fff, ++ 0xa436, 0xA15C, 0xa438, 0x3fff, 0xa436, 0xA15E, 0xa438, 0x3fff, ++ 0xa436, 0xA160, 0xa438, 0x3fff, 0xa436, 0xA150, 0xa438, 0x0003, ++ 0xa436, 0xA016, 0xa438, 0x0020, 0xa436, 0xA012, 0xa438, 0x0000, ++ 0xa436, 0xA014, 0xa438, 0x1800, 0xa438, 0x8010, 0xa438, 0x1800, ++ 0xa438, 0x801b, 0xa438, 0x1800, 0xa438, 0x802b, 0xa438, 0x1800, ++ 0xa438, 0x8031, 0xa438, 0x1800, 0xa438, 0x8037, 0xa438, 0x1800, ++ 0xa438, 0x8037, 0xa438, 0x1800, 0xa438, 0x8037, 0xa438, 0x1800, ++ 0xa438, 0x8037, 0xa438, 0x800a, 0xa438, 0x8530, 0xa438, 0x0c03, ++ 0xa438, 0x1502, 0xa438, 0x8d10, 0xa438, 0x9503, 0xa438, 0xd700, ++ 0xa438, 0x6050, 0xa438, 0xaa20, 0xa438, 0x1800, 0xa438, 0x0d53, ++ 0xa438, 0xd707, 0xa438, 0x40f6, 0xa438, 0x8901, 0xa438, 0xd704, ++ 0xa438, 0x6091, 0xa438, 0x8306, 0xa438, 0x8b02, 0xa438, 0x8290, ++ 0xa438, 0x1000, 0xa438, 0x0e4d, 0xa438, 0x1000, 0xa438, 0x1277, ++ 0xa438, 0xd704, 0xa438, 0x7e77, 0xa438, 0x1800, 0xa438, 0x0dc5, ++ 0xa438, 0xd700, 0xa438, 0x4063, 0xa438, 0x1800, 0xa438, 0x0d15, ++ 0xa438, 0x1800, 0xa438, 0x0d18, 0xa438, 0xd700, 0xa438, 0x6063, ++ 0xa438, 0x1800, 0xa438, 0x0ca6, 0xa438, 0x1800, 0xa438, 0x0ca7, ++ 0xa436, 0xA10E, 0xa438, 0xffff, 0xa436, 0xA10C, 0xa438, 0xffff, ++ 0xa436, 0xA10A, 0xa438, 0xffff, 0xa436, 0xA108, 0xa438, 0xffff, ++ 0xa436, 0xA106, 0xa438, 0x0ca2, 0xa436, 0xA104, 0xa438, 0x0d13, ++ 0xa436, 0xA102, 0xa438, 0x0dbf, 0xa436, 0xA100, 0xa438, 0x0d52, ++ 0xa436, 0xA110, 0xa438, 0x000f, 0xa436, 0xb87c, 0xa438, 0x85bd, ++ 0xa436, 0xb87e, 0xa438, 0xaf85, 0xa438, 0xd5af, 
0xa438, 0x85fb, ++ 0xa438, 0xaf85, 0xa438, 0xfbaf, 0xa438, 0x85fb, 0xa438, 0xaf85, ++ 0xa438, 0xfbaf, 0xa438, 0x85fb, 0xa438, 0xaf85, 0xa438, 0xfbaf, ++ 0xa438, 0x85fb, 0xa438, 0xac28, 0xa438, 0x0bd4, 0xa438, 0x0294, ++ 0xa438, 0xbf85, 0xa438, 0xf802, 0xa438, 0x61c2, 0xa438, 0xae09, ++ 0xa438, 0xd414, 0xa438, 0x50bf, 0xa438, 0x85f8, 0xa438, 0x0261, ++ 0xa438, 0xc2bf, 0xa438, 0x60de, 0xa438, 0x0261, 0xa438, 0xe1bf, ++ 0xa438, 0x80cf, 0xa438, 0xaf24, 0xa438, 0xe8f0, 0xa438, 0xac52, ++ 0xa436, 0xb85e, 0xa438, 0x24e5, 0xa436, 0xb860, 0xa438, 0xffff, ++ 0xa436, 0xb862, 0xa438, 0xffff, 0xa436, 0xb864, 0xa438, 0xffff, ++ 0xa436, 0xb886, 0xa438, 0xffff, 0xa436, 0xb888, 0xa438, 0xffff, ++ 0xa436, 0xb88a, 0xa438, 0xffff, 0xa436, 0xb88c, 0xa438, 0xffff, ++ 0xa436, 0xb838, 0xa438, 0x0001, 0xb820, 0x0010, 0xB82E, 0x0000, ++ 0xa436, 0x8023, 0xa438, 0x0000, 0xB820, 0x0000, 0xFFFF, 0xFFFF ++}; ++ ++static void ++rtl8125_real_set_phy_mcu_8125b_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125b_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125b_1)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125b_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125b_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125b_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125b_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125b_2)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125b_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125b_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_1_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_1)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_1_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_2)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_1_3(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_3, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_3)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125d_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_1_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_1_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_1_3(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125d_1_efuse(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_1_efuse, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_1_efuse)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_2_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_2_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_2_1)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125d_2_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125d_2_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125d_2_2)); ++} ++ 
++static void ++rtl8125_set_phy_mcu_8125d_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_2_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125d_2_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125bp_1_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125bp_1_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125bp_1_1)); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125bp_1_2(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125bp_1_2, ++ ARRAY_SIZE(phy_mcu_ram_code_8125bp_1_2)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125bp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125bp_1_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125bp_1_2(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_real_set_phy_mcu_8125cp_1_1(struct net_device *dev) ++{ ++ rtl8125_set_phy_mcu_ram_code(dev, ++ phy_mcu_ram_code_8125cp_1_1, ++ ARRAY_SIZE(phy_mcu_ram_code_8125cp_1_1)); ++} ++ ++static void ++rtl8125_set_phy_mcu_8125cp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_real_set_phy_mcu_8125cp_1_1(dev); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++} ++ ++static void ++rtl8125_init_hw_phy_mcu(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u8 require_disable_phy_disable_mode = FALSE; ++ ++ if (tp->NotWrRamCodeToMicroP == TRUE) ++ return; ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ return; ++ ++ if (HW_SUPPORT_CHECK_PHY_DISABLE_MODE(tp) && rtl8125_is_in_phy_disable_mode(dev)) ++ require_disable_phy_disable_mode = TRUE; ++ ++ if (require_disable_phy_disable_mode) ++ rtl8125_disable_phy_disable_mode(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_set_phy_mcu_8125a_1(dev); ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_set_phy_mcu_8125a_2(dev); ++ break; ++ case CFG_METHOD_4: ++ rtl8125_set_phy_mcu_8125b_1(dev); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_set_phy_mcu_8125b_2(dev); ++ break; ++ case CFG_METHOD_8: ++ rtl8125_set_phy_mcu_8125bp_1(dev); ++ break; ++ case CFG_METHOD_9: ++ /* nothing to do */ ++ break; ++ case CFG_METHOD_10: ++ rtl8125_set_phy_mcu_8125d_1(dev); ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ rtl8125_set_phy_mcu_8125d_2(dev); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_set_phy_mcu_8125cp_1(dev); ++ break; ++ } ++ ++ if (require_disable_phy_disable_mode) ++ rtl8125_enable_phy_disable_mode(dev); ++ ++ rtl8125_write_hw_phy_mcu_code_ver(dev); ++ ++ rtl8125_mdio_write(tp,0x1F, 0x0000); ++ ++ tp->HwHasWrRamCodeToMicroP = TRUE; ++} ++#endif ++ ++static void ++rtl8125_enable_phy_aldps(struct rtl8125_private *tp) ++{ ++ //enable aldps ++ //GPHY OCP 0xA430 bit[2] = 0x1 (en_aldps) ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA430, BIT_2); ++} ++ ++static void ++rtl8125_tgphy_irq_mask_and_ack(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA4D2, 0x0000); ++ (void)rtl8125_mdio_direct_read_phy_ocp(tp, 
0xA4D4); ++ break; ++ default: ++ break; ++ } ++} ++ ++static void ++rtl8125_hw_phy_config_8125a_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD40, ++ 0x03FF, ++ 0x84); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD4E, BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD16, ++ 0x03FF, ++ 0x0006); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD32, ++ 0x003F, ++ 0x0006); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_12); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC8A, ++ BIT_15|BIT_14|BIT_13|BIT_12, ++ BIT_14|BIT_13|BIT_12); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD18, BIT_10); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD1A, 0x3FF); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD1C, 0x3FF); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80EA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xC400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80EB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0x0700, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80F8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1C00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80F1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x3000); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80FE); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xA500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8102); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8105); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x3300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8100); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x7000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8104); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xF000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8106); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x6500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DC); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xED00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DF); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80E1); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF06, ++ 0x003F, ++ 0x38); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x819F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xD0B6); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBC34, 0x5555); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF0A, ++ BIT_11|BIT_10|BIT_9, ++ BIT_11|BIT_9); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5C0, BIT_10); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ //enable aldps ++ //GPHY OCP 0xA430 bit[2] = 0x1 (en_aldps) ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125a_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xAD4E, BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD16, ++ 0x03FF, ++ 0x03FF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD32, ++ 0x003F, ++ 0x0006); ++ 
rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_12); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC08, BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xACC0, ++ BIT_1|BIT_0, ++ BIT_1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD40, ++ BIT_7|BIT_6|BIT_5, ++ BIT_6); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD40, ++ BIT_2|BIT_1|BIT_0, ++ BIT_2); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC14, BIT_7); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAC80, BIT_9|BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC5E, ++ BIT_2|BIT_1|BIT_0, ++ BIT_1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAD4C, 0x00A8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC5C, 0x01FF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC8A, ++ BIT_7|BIT_6|BIT_5|BIT_4, ++ BIT_5|BIT_4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8157); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8159); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0700); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80A2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0153); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x809C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0153); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81B3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0043); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00A7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00D6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00EC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00F6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00FB); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00FD); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00FF); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00BB); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0058); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0029); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0009); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0004); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 
0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8257); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x020F); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80EA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7843); ++ ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB892, 0xFF00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC091); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E12); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC092); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1214); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC094); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1516); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC096); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x171B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC098); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1B1C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC09A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1F1F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC09C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2021); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC09E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2224); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC0A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2424); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC0A2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2424); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC0A4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x2424); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC018); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0AF2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC01A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0D4A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC01C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0F26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC01E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x118D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC020); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x14F3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC022); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x175A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC024); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x19C0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC026); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1C26); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC089); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6050); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC08A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x5F6E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC08C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E6E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC08E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E6E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC090); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x6E12); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xD068, BIT_13); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81A2); ++ 
rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB54C, ++ 0xFF00, ++ 0xDB00); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA454, BIT_0); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA5D4, BIT_5); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAD4E, BIT_4); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA86A, BIT_0); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ if (tp->RequirePhyMdiSwapPatch) { ++ u16 adccal_offset_p0; ++ u16 adccal_offset_p1; ++ u16 adccal_offset_p2; ++ u16 adccal_offset_p3; ++ u16 rg_lpf_cap_xg_p0; ++ u16 rg_lpf_cap_xg_p1; ++ u16 rg_lpf_cap_xg_p2; ++ u16 rg_lpf_cap_xg_p3; ++ u16 rg_lpf_cap_p0; ++ u16 rg_lpf_cap_p1; ++ u16 rg_lpf_cap_p2; ++ u16 rg_lpf_cap_p3; ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0007, ++ 0x0001); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0000); ++ adccal_offset_p0 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p0 &= 0x07FF; ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0008); ++ adccal_offset_p1 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p1 &= 0x07FF; ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0010); ++ adccal_offset_p2 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p2 &= 0x07FF; ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0018); ++ adccal_offset_p3 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xD06A); ++ adccal_offset_p3 &= 0x07FF; ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p3); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0008); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p2); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0010); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD068, ++ 0x0018, ++ 0x0018); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xD06A, ++ 0x07FF, ++ adccal_offset_p0); ++ ++ ++ rg_lpf_cap_xg_p0 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5A); ++ rg_lpf_cap_xg_p0 &= 0x001F; ++ rg_lpf_cap_xg_p1 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5A); ++ rg_lpf_cap_xg_p1 &= 0x1F00; ++ rg_lpf_cap_xg_p2 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5C); ++ rg_lpf_cap_xg_p2 &= 0x001F; ++ rg_lpf_cap_xg_p3 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBD5C); ++ rg_lpf_cap_xg_p3 &= 0x1F00; ++ rg_lpf_cap_p0 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC18); ++ rg_lpf_cap_p0 &= 0x001F; ++ rg_lpf_cap_p1 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC18); ++ rg_lpf_cap_p1 &= 0x1F00; ++ rg_lpf_cap_p2 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC1A); ++ rg_lpf_cap_p2 &= 0x001F; ++ rg_lpf_cap_p3 = rtl8125_mdio_direct_read_phy_ocp(tp, 0xBC1A); ++ rg_lpf_cap_p3 &= 0x1F00; ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5A, ++ 0x001F, ++ rg_lpf_cap_xg_p3 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5A, ++ 0x1F00, ++ rg_lpf_cap_xg_p2 << 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5C, ++ 0x001F, ++ rg_lpf_cap_xg_p1 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBD5C, ++ 0x1F00, ++ rg_lpf_cap_xg_p0 << 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC18, ++ 0x001F, ++ rg_lpf_cap_p3 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC18, ++ 0x1F00, ++ rg_lpf_cap_p2 << 8); ++ 
rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC1A, ++ 0x001F, ++ rg_lpf_cap_p1 >> 8); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC1A, ++ 0x1F00, ++ rg_lpf_cap_p0 << 8); ++ } ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA424, BIT_3); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125b_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC08, (BIT_3 | BIT_2)); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FFF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0400); ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8560); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x19CC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8562); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x19CC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8564); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x19CC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8566); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x147D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8568); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x147D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x856A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x147D); ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FFE); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0907); ++ } ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xACDA, ++ 0xFF00, ++ 0xFF00); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xACDE, ++ 0xF000, ++ 0xF000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80D6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x2801); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80F2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x2801); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80F4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x6077); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB506, 0x01E7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC8C, 0x0FFC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC46, 0xB7B4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC50, 0x0FBC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC3C, 0x9240); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC4E, 0x0DB4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xACC6, 0x0707); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xACC8, 0xA0D3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAD08, 0x0007); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8013); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0700); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FB9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x2801); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FBA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FBC); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x1900); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FBE); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xE100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0800); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xE500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC4); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0F00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FC8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FCa); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FCc); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFD00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FCe); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFF00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFB00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD2); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFF00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FD8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xF600); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x813D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x390E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x814F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x790E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80B0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0F31); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBF4C, BIT_1); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBCCA, (BIT_9 | BIT_8)); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8141); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x320E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8153); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x720E); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA432, BIT_6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8529); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x050E); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x816C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xC4A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8170); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xC4A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8174); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x04A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8178); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x04A0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x817C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0719); ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0404); ++ } ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBF4A, 0x001B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8033); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8037); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x803B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0xFC32); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x803F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8043); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8047); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x7C13); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8145); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x370E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8157); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x770E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8169); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x0D0A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x817B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x1D0A); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8217); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x821A); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5000); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0403); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DC); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0384); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2007); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x6C00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xF009); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BD); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x9F00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80C7); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xf083); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DD); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x03f0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DF); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CB); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x2007); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CE); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x6C00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80C9); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8009); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80D1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x8000); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x200A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xF0AD); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x809F); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x6073); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x000B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A9); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xC000); ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xB892, 0xFF00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC23E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0000); 
++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC240); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0103); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC242); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0507); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC244); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x090B); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC246); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x0C0E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC248); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1012); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB88E, 0xC24A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB890, 0x1416); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB896, BIT_0); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA86A, BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA6F0, BIT_0); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA0, 0xD70D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA2, 0x4100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA4, 0xE868); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA6, 0xDC59); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB54C, 0x3C18); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBFA4, BIT_5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x817D); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_12); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125b_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC46, ++ 0x00F0, ++ 0x0090); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD30, ++ 0x0003, ++ 0x0001); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80F5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x760E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8107); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87E, 0x360E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8551); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ BIT_15 | BIT_14 | BIT_13 | BIT_12 | BIT_11 | BIT_10 | BIT_9 | BIT_8, ++ BIT_11); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xbf00, ++ 0xE000, ++ 0xA000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xbf46, ++ 0x0F00, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8044); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x804A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8050); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8056); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x805C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8062); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8068); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x806E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x8074); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 0x2417); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa436, 0x807A); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xa438, 
0x2417); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA4CA, BIT_6); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF84, ++ BIT_15 | BIT_14 | BIT_13, ++ BIT_15 | BIT_13); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8170); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT_13 | BIT_10 | BIT_9 | BIT_8, ++ BIT_15 | BIT_14 | BIT_12 | BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8015); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB87E, BIT_8); ++ rtl8125_mdio_direct_read_phy_ocp(tp, 0xB906); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA424, BIT_3); ++ ++ /* ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA0, 0xD70D); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA2, 0x4100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA4, 0xE868); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA6, 0xDC59); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB54C, 0x3C18); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBFA4, BIT_5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x817D); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_12); ++ */ ++ ++ ++#ifdef ENABLE_LIB_SUPPORT ++ /* disable phy speed down */ ++ ClearEthPhyOcpBit(tp, 0xA442, BIT_3 | BIT_2); ++#endif /* ENABLE_LIB_SUPPORT */ ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125bp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA80C, ++ BIT_14, ++ BIT_15 | BIT_11 | BIT_10); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8010); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8088); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x808F); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8174); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT_13, ++ BIT_12 | BIT_11); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125bp_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8010); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_11); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8088); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x808F); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x9000); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8174); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ BIT_13, ++ BIT_12 | BIT_11); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125cp_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_tgphy_irq_mask_and_ack(tp); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xad0e, ++ 0x007F, ++ 0x000B); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xad78, BIT_4); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81B8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00B4); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81BA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00E4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81C5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0104); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81D0); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x054D); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125d_1(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBF96, BIT_15); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF94, ++ 0x0007, ++ 0x0005); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBF8E, ++ 0x3C00, ++ 0x2800); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0004); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_15 | BIT_14 | BIT_13); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_12 | BIT_11 | BIT_10); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0005); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC82, ++ 0x00E0, ++ 0x0040); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_4 | BIT_3 | BIT_2); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x8000); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBD70, BIT_8); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA466, BIT_1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x836a); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, 0xFF00); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x832C); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0500); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB106, ++ 0x0700, ++ 0x0100); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB206, ++ 0x0700, ++ 0x0200); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB306, ++ 0x0700, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80CB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0300); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBCF4, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBCF6, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBC12, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x844d); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0200); ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8feb); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0100); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8fe9); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0600); ++ } ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC7E, ++ 0x01FC, ++ 0x00B4); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8105); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7A00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8117); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3A00); ++ 
rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8103); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8115); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3400); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xAD40, BIT_5 | BIT_4); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD66, ++ 0x000F, ++ 0x0007); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD68, ++ 0xF000, ++ 0x8000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD68, ++ 0x0F00, ++ 0x0500); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD68, ++ 0x000F, ++ 0x0002); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAD6A, ++ 0xF000, ++ 0x7000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xAC50, 0x01E8); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x81FA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5400); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA864, ++ 0x00F0, ++ 0x00C0); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA42C, ++ 0x00FF, ++ 0x0002); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80E1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0F00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80DE); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xF000, ++ 0x0700); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA846, BIT_7); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BA); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8A04); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80BD); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xCA00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80B7); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xB300); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CE); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8A04); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80D1); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xCA00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80CB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0xBB00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x4909); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x80A8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x05B8); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8200); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x5800); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF1); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7078); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x5D78); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF5); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x7862); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FF7); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1400); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x814C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x8455); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x814E); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x84A6); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8163); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0600); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x816A); ++ 
rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x0500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8171); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1f00); ++ } ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC3A, ++ 0x000F, ++ 0x0006); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8064); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8067); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x806A); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x806D); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8070); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8073); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8076); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8079); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x807C); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x807F); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA438, BIT_10 | BIT_9 | BIT_8); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBFA0, ++ 0xFF70, ++ 0x5500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xBFA2, 0x9D00); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8165); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0x0700, ++ 0x0200); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8019); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8FE3); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0005); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0000); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x00ED); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0502); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0x0B00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, 0xD401); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x2900); ++ } ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x8018); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1700); ++ ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x815B); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA438, ++ 0xFF00, ++ 0x1700); ++ } ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA4E0, BIT_15); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D4, BIT_5); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA654, BIT_11); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA430, BIT_12 | BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_7); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config_8125d_2(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_11); ++ ++ ++ rtl8125_set_phy_mcu_patch_request(tp); ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ 
rtl8125_set_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x4000); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0004); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_15 | BIT_14 | BIT_13); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_12 | BIT_11 | BIT_10); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC80, ++ 0x001F, ++ 0x0005); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC82, ++ 0x00E0, ++ 0x0040); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC82, BIT_4 | BIT_3 | BIT_2); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBCD8, ++ 0xC000, ++ 0x8000); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBCD8, BIT_15 | BIT_14); ++ ++ rtl8125_clear_phy_mcu_patch_request(tp); ++ ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xAC7E, ++ 0x01FC, ++ 0x00B4); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8105); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7A00); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8117); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3A00); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8103); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x7400); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8115); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x3400); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FEB); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0500); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x8FEA); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0x0700); ++ ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xB87C, 0x80D6); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xB87E, ++ 0xFF00, ++ 0xEF00); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5D4, BIT_5); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA654, BIT_11); ++ ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA448, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xA586, BIT_10); ++ ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA430, BIT_12 | BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA442, BIT_7); ++ ++ ++ if (aspm && HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ rtl8125_enable_phy_aldps(tp); ++} ++ ++static void ++rtl8125_hw_phy_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ if (tp->resume_not_chg_speed) ++ return; ++ ++ tp->phy_reset_enable(dev); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++#ifndef ENABLE_USE_FIRMWARE_FILE ++ if (!tp->rtl_fw) { ++ rtl8125_set_hw_phy_before_init_phy_mcu(dev); ++ ++ rtl8125_init_hw_phy_mcu(dev); ++ } ++#endif ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ rtl8125_hw_phy_config_8125a_1(dev); ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ rtl8125_hw_phy_config_8125a_2(dev); ++ break; ++ case CFG_METHOD_4: ++ rtl8125_hw_phy_config_8125b_1(dev); ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ rtl8125_hw_phy_config_8125b_2(dev); ++ break; ++ case CFG_METHOD_8: ++ rtl8125_hw_phy_config_8125bp_1(dev); ++ break; ++ case CFG_METHOD_9: ++ rtl8125_hw_phy_config_8125bp_2(dev); ++ break; ++ case CFG_METHOD_10: ++ rtl8125_hw_phy_config_8125d_1(dev); ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ rtl8125_hw_phy_config_8125d_2(dev); ++ break; ++ case CFG_METHOD_12: ++ rtl8125_hw_phy_config_8125cp_1(dev); ++ break; ++ } ++ ++ //legacy force mode(Chap 22) ++ 
rtl8125_clear_eth_phy_ocp_bit(tp, 0xA5B4, BIT_15); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ rtl8125_hw_fiber_phy_config(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ /*ocp phy power saving*/ ++ /* ++ if (aspm) { ++ if (tp->mcfg == CFG_METHOD_2 || tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ rtl8125_enable_ocp_phy_power_saving(dev); ++ } ++ */ ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ if (HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) { ++ if (tp->eee.eee_enabled) ++ rtl8125_enable_eee(tp); ++ else ++ rtl8125_disable_eee(tp); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++static void ++rtl8125_up(struct net_device *dev) ++{ ++ rtl8125_hw_init(dev); ++ rtl8125_hw_reset(dev); ++ rtl8125_powerup_pll(dev); ++ rtl8125_hw_ephy_config(dev); ++ rtl8125_hw_phy_config(dev); ++ rtl8125_hw_config(dev); ++} ++ ++/* ++static inline void rtl8125_delete_esd_timer(struct net_device *dev, struct timer_list *timer) ++{ ++ del_timer_sync(timer); ++} ++ ++static inline void rtl8125_request_esd_timer(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->esd_timer; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ setup_timer(timer, rtl8125_esd_timer, (unsigned long)dev); ++#else ++ timer_setup(timer, rtl8125_esd_timer, 0); ++#endif ++ mod_timer(timer, jiffies + RTL8125_ESD_TIMEOUT); ++} ++*/ ++ ++/* ++static inline void rtl8125_delete_link_timer(struct net_device *dev, struct timer_list *timer) ++{ ++ del_timer_sync(timer); ++} ++ ++static inline void rtl8125_request_link_timer(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->link_timer; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ setup_timer(timer, rtl8125_link_timer, (unsigned long)dev); ++#else ++ timer_setup(timer, rtl8125_link_timer, 0); ++#endif ++ mod_timer(timer, jiffies + RTL8125_LINK_TIMEOUT); ++} ++*/ ++ ++#ifdef CONFIG_NET_POLL_CONTROLLER ++/* ++ * Polling 'interrupt' - used by things like netconsole to send skbs ++ * without having to re-enable interrupts. It's not called while ++ * the interrupt routine is executing. 
++ */ ++static void ++rtl8125_netpoll(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ for (i = 0; i < tp->irq_nvecs; i++) { ++ struct r8125_irq *irq = &tp->irq_tbl[i]; ++ struct r8125_napi *r8125napi = &tp->r8125napi[i]; ++ ++ disable_irq(irq->vector); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,12,0) ++ irq->handler(irq->vector, r8125napi); ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++ irq->handler(irq->vector, r8125napi, NULL); ++#else ++ irq->handler(irq->vector, r8125napi); ++#endif ++ ++ enable_irq(irq->vector); ++ } ++} ++#endif //CONFIG_NET_POLL_CONTROLLER ++ ++static void ++rtl8125_setup_interrupt_mask(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ if (tp->HwCurrIsrVer == 7) { ++ tp->intr_mask = ISRIMR_V7_LINKCHG | ISRIMR_V7_TOK_Q0; ++ if (tp->num_tx_rings > 1) ++ tp->intr_mask |= ISRIMR_V7_TOK_Q1; ++ for (i = 0; i < tp->num_rx_rings; i++) ++ tp->intr_mask |= ISRIMR_V7_ROK_Q0 << i; ++ } else if (tp->HwCurrIsrVer == 5) { ++ tp->intr_mask = ISRIMR_V5_LINKCHG | ISRIMR_V5_TOK_Q0; ++ if (tp->num_tx_rings > 1) ++ tp->intr_mask |= ISRIMR_V5_TOK_Q1; ++ for (i = 0; i < tp->num_rx_rings; i++) ++ tp->intr_mask |= ISRIMR_V5_ROK_Q0 << i; ++ } else if (tp->HwCurrIsrVer == 4) { ++ tp->intr_mask = ISRIMR_V4_LINKCHG; ++ for (i = 0; i < max(tp->num_tx_rings, tp->num_rx_rings); i++) ++ tp->intr_mask |= ISRIMR_V4_ROK_Q0 << i; ++ ++ if (tp->DASH) ++ tp->intr_l2_mask |= ISRIMR_V4_L2_IPC2; ++ ++ if (tp->intr_l2_mask > 0) ++ tp->intr_mask |= ISRIMR_V4_LAYER2_INTR_STS; ++ } else if (tp->HwCurrIsrVer == 3) { ++ tp->intr_mask = ISRIMR_V2_LINKCHG; ++ for (i = 0; i < max(tp->num_tx_rings, tp->num_rx_rings); i++) ++ tp->intr_mask |= ISRIMR_V2_ROK_Q0 << i; ++ } else if (tp->HwCurrIsrVer == 2) { ++ tp->intr_mask = ISRIMR_V2_LINKCHG | ISRIMR_TOK_Q0; ++ if (tp->num_tx_rings > 1) ++ tp->intr_mask |= ISRIMR_TOK_Q1; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) ++ tp->intr_mask |= ISRIMR_V2_ROK_Q0 << i; ++ } else { ++ tp->intr_mask = LinkChg | RxDescUnavail | TxOK | RxOK | SWInt; ++ tp->timer_intr_mask = LinkChg | PCSTimeout; ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (tp->DASH) { ++ if (HW_DASH_SUPPORT_IPC2(tp)) { ++ tp->timer_intr_mask |= ISRIMR_DASH_INTR_EN; ++ tp->intr_mask |= ISRIMR_DASH_INTR_EN; ++ } ++ } ++#endif ++ } ++} ++ ++static void ++rtl8125_setup_mqs_reg(struct rtl8125_private *tp) ++{ ++ u16 hw_clo_ptr0_reg, sw_tail_ptr0_reg; ++ u16 reg_len; ++ int i; ++ ++ //tx ++ tp->tx_ring[0].tdsar_reg = TxDescStartAddrLow; ++ for (i = 1; i < tp->HwSuppNumTxQueues; i++) { ++ tp->tx_ring[i].tdsar_reg = (u16)(TNPDS_Q1_LOW_8125 + (i - 1) * 8); ++ } ++ ++ switch (tp->HwSuppTxNoCloseVer) { ++ case 4: ++ case 5: ++ hw_clo_ptr0_reg = HW_CLO_PTR0_8126; ++ sw_tail_ptr0_reg = SW_TAIL_PTR0_8126; ++ reg_len = 4; ++ break; ++ case 6: ++ hw_clo_ptr0_reg = HW_CLO_PTR0_8125BP; ++ sw_tail_ptr0_reg = SW_TAIL_PTR0_8125BP; ++ reg_len = 8; ++ break; ++ default: ++ hw_clo_ptr0_reg = HW_CLO_PTR0_8125; ++ sw_tail_ptr0_reg = SW_TAIL_PTR0_8125; ++ reg_len = 4; ++ break; ++ } ++ ++ for (i = 0; i < tp->HwSuppNumTxQueues; i++) { ++ tp->tx_ring[i].hw_clo_ptr_reg = (u16)(hw_clo_ptr0_reg + i * reg_len); ++ tp->tx_ring[i].sw_tail_ptr_reg = (u16)(sw_tail_ptr0_reg + i * reg_len); ++ } ++ ++ //rx ++ tp->rx_ring[0].rdsar_reg = RxDescAddrLow; ++ for (i = 1; i < tp->HwSuppNumRxQueues; i++) ++ tp->rx_ring[i].rdsar_reg = (u16)(RDSAR_Q1_LOW_8125 + (i - 1) * 8); ++ ++ tp->isr_reg[0] = ISR0_8125; ++ for (i = 1; i < tp->hw_supp_irq_nvecs; i++) ++ tp->isr_reg[i] = (u16)(ISR1_8125 + (i - 1) * 4); ++ ++ 
tp->imr_reg[0] = IMR0_8125; ++ for (i = 1; i < tp->hw_supp_irq_nvecs; i++) ++ tp->imr_reg[i] = (u16)(IMR1_8125 + (i - 1) * 4); ++} ++ ++static void ++rtl8125_backup_led_select(struct rtl8125_private *tp) ++{ ++ tp->BackupLedSel[1] = RTL_R16(tp, LEDSEL_1_8125); ++ tp->BackupLedSel[2] = RTL_R16(tp, LEDSEL_2_8125); ++ tp->BackupLedSel[3] = RTL_R16(tp, LEDSEL_3_8125); ++ tp->BackupLedSel[0] = RTL_R16(tp, CustomLED); ++} ++ ++static void ++rtl8125_restore_led_select(struct rtl8125_private *tp) ++{ ++ RTL_W16(tp, LEDSEL_1_8125, tp->BackupLedSel[1]); ++ RTL_W16(tp, LEDSEL_2_8125, tp->BackupLedSel[2]); ++ RTL_W16(tp, LEDSEL_3_8125, tp->BackupLedSel[3]); ++ RTL_W16(tp, CustomLED, tp->BackupLedSel[0]); ++} ++ ++static bool ++_rtl8125_backup_phy_fuse_dout_v4(struct rtl8125_private *tp) ++{ ++ u16 i; ++ ++ for (i = 0; i < R8125_PHY_FUSE_DOUT_NUM; i++) { ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA460, ++ 0x001F, ++ i); ++ tp->BackupPhyFuseDout[i] = rtl8125_mdio_direct_read_phy_ocp(tp, 0xA462); ++ } ++ ++ if (tp->HwSuppEsdVer == 4) { ++ tp->BackupPhyFuseDout[3] |= 0xF000; ++ tp->BackupPhyFuseDout[7] |= 0x03FF; ++ tp->BackupPhyFuseDout[4] = USHRT_MAX; ++ tp->BackupPhyFuseDout[5] = USHRT_MAX; ++ tp->BackupPhyFuseDout[6] = USHRT_MAX; ++ } else if (tp->HwSuppEsdVer == 5) { ++ tp->BackupPhyFuseDout[30] = USHRT_MAX; ++ tp->BackupPhyFuseDout[31] = USHRT_MAX; ++ } ++ ++ return TRUE; ++} ++ ++static bool ++rtl8125_backup_phy_fuse_dout(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppEsdVer == 4 || tp->HwSuppEsdVer == 5) ++ return _rtl8125_backup_phy_fuse_dout_v4(tp); ++ else ++ return FALSE; ++} ++ ++static void ++_rtl8125_restore_phy_fuse_dout_v4(struct rtl8125_private *tp) ++{ ++ u16 i; ++ ++ for (i = 0; i < R8125_PHY_FUSE_DOUT_NUM; i++) { ++ if (tp->BackupPhyFuseDout[i] == USHRT_MAX) ++ continue; ++ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xA460, ++ 0x001F, ++ i); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA462, tp->BackupPhyFuseDout[i]); ++ } ++} ++ ++static void ++rtl8125_restore_phy_fuse_dout(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppEsdVer == 4 || tp->HwSuppEsdVer == 5) ++ _rtl8125_restore_phy_fuse_dout_v4(tp); ++ else ++ return; ++} ++ ++static void ++rtl8125_init_software_variable(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct pci_dev *pdev = tp->pci_dev; ++ ++#ifdef ENABLE_LIB_SUPPORT ++ tp->ring_lib_enabled = 1; ++#endif ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: { ++ u8 tmp = (u8)rtl8125_mac_ocp_read(tp, 0xD006); ++ if (tmp == 0x02 || tmp == 0x04) ++ tp->HwSuppDashVer = 2; ++ } ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->HwSuppDashVer = 4; ++ break; ++ default: ++ tp->HwSuppDashVer = 0; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ tp->HwSuppOcpChannelVer = 2; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ tp->HwSuppOcpChannelVer = 2; ++ break; ++ } ++ tp->AllowAccessDashOcp = rtl8125_is_allow_access_dash_ocp(tp); ++ ++ tp->HwPkgDet = rtl8125_mac_ocp_read(tp, 0xDC00); ++ tp->HwPkgDet = (tp->HwPkgDet >> 3) & 0x07; ++ ++ tp->HwSuppNowIsOobVer = 1; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ tp->HwPcieSNOffset = 0x16C; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ 
tp->HwPcieSNOffset = 0x168; ++ break; ++ } ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ rtl8125_get_realwow_hw_version(dev); ++#endif //ENABLE_REALWOW_SUPPORT ++ ++ tp->DASH = rtl8125_check_dash(tp); ++ ++ if (tp->DASH) { ++ eee_enable = 0; ++ ++ tp->SizeOfSendToFwBuffer = SEND_TO_FW_BUF_SIZE; ++ tp->SizeOfRecvFromFwBuffer = RECV_FROM_FW_BUF_SIZE; ++ ++ tp->DashFirmwareVersion = rtl8125_get_dash_fw_ver(tp); ++ } ++ ++ if (aspm) { ++ tp->org_pci_offset_99 = rtl8125_csi_fun0_read_byte(tp, 0x99); ++ tp->org_pci_offset_99 &= ~(BIT_5|BIT_6); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x264); ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x214); ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x210); ++ break; ++ case CFG_METHOD_12: ++ tp->org_pci_offset_180 = rtl8125_csi_fun0_read_byte(tp, 0x184); ++ break; ++ } ++ } ++ ++ pci_read_config_byte(pdev, 0x80, &tp->org_pci_offset_80); ++ pci_read_config_byte(pdev, 0x81, &tp->org_pci_offset_81); ++ ++ tp->use_timer_interrupt = TRUE; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ tp->HwSuppMaxPhyLinkSpeed = 2500; ++ break; ++ default: ++ tp->HwSuppMaxPhyLinkSpeed = 1000; ++ break; ++ } ++ ++ if (timer_count == 0 || tp->mcfg == CFG_METHOD_DEFAULT) ++ tp->use_timer_interrupt = FALSE; ++ ++ tp->ShortPacketSwChecksum = TRUE; ++ tp->UseSwPaddingShortPkt = TRUE; ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ rtl8125_check_fiber_mode_support(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->HwSuppMagicPktVer = WAKEUP_MAGIC_PACKET_V3; ++ break; ++ default: ++ tp->HwSuppMagicPktVer = WAKEUP_MAGIC_PACKET_NOT_SUPPORT; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->HwSuppEsdVer = 4; ++ break; ++ case CFG_METHOD_10: ++ tp->HwSuppEsdVer = 5; ++ break; ++ default: ++ tp->HwSuppEsdVer = 1; ++ break; ++ } ++ ++ if (rtl8125_backup_phy_fuse_dout(tp)) ++ tp->TestPhyOcpReg = TRUE; ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ tp->TestPhyOcpReg = FALSE; ++#endif ++ ++ tp->HwSuppLinkChgWakeUpVer = 3; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->HwSuppD0SpeedUpVer = 1; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppD0SpeedUpVer = 2; ++ break; ++ } ++ ++ tp->HwSuppCheckPhyDisableModeVer = 3; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ tp->HwSuppTxNoCloseVer = 3; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppTxNoCloseVer = 6; ++ break; ++ } ++ ++ switch (tp->HwSuppTxNoCloseVer) { ++ case 5: ++ case 
6: ++ tp->MaxTxDescPtrMask = MAX_TX_NO_CLOSE_DESC_PTR_MASK_V4; ++ break; ++ case 4: ++ tp->MaxTxDescPtrMask = MAX_TX_NO_CLOSE_DESC_PTR_MASK_V3; ++ break; ++ case 3: ++ tp->MaxTxDescPtrMask = MAX_TX_NO_CLOSE_DESC_PTR_MASK_V2; ++ break; ++ default: ++ tx_no_close_enable = 0; ++ break; ++ } ++ ++ if (tp->HwSuppTxNoCloseVer > 0 && tx_no_close_enable == 1) ++ tp->EnableTxNoClose = TRUE; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->RequireLSOPatch = TRUE; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_2; ++ break; ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_3; ++ break; ++ case CFG_METHOD_4: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_4; ++ break; ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_5; ++ break; ++ case CFG_METHOD_8: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_8; ++ break; ++ case CFG_METHOD_9: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_9; ++ break; ++ case CFG_METHOD_10: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_10; ++ break; ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_11; ++ break; ++ case CFG_METHOD_12: ++ tp->sw_ram_code_ver = NIC_RAMCODE_VERSION_CFG_METHOD_12; ++ break; ++ } ++ ++ if (tp->HwIcVerUnknown) { ++ tp->NotWrRamCodeToMicroP = TRUE; ++ tp->NotWrMcuPatchCode = TRUE; ++ } ++ ++ rtl8125_check_hw_phy_mcu_code_ver(dev); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ if ((rtl8125_mac_ocp_read(tp, 0xD442) & BIT_5) && ++ (rtl8125_mdio_direct_read_phy_ocp(tp, 0xD068) & BIT_1)) ++ tp->RequirePhyMdiSwapPatch = TRUE; ++ break; ++ } ++ ++ tp->HwSuppMacMcuVer = 2; ++ ++ tp->MacMcuPageSize = RTL8125_MAC_MCU_PAGE_SIZE; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppNumTxQueues = 2; ++ tp->HwSuppNumRxQueues = 4; ++ break; ++ default: ++ tp->HwSuppNumTxQueues = 1; ++ tp->HwSuppNumRxQueues = 1; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ /* mac ptp */ ++ tp->HwSuppPtpVer = 1; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ /* phy ptp */ ++ tp->HwSuppPtpVer = 3; ++ break; ++ } ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer > 0) ++ tp->EnablePtp = 1; ++#endif ++ ++ //init interrupt ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->HwSuppIsrVer = 2; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->HwSuppIsrVer = 4; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->HwSuppIsrVer = 5; ++ break; ++ case CFG_METHOD_12: ++ tp->HwSuppIsrVer = 7; ++ break; ++ default: ++ tp->HwSuppIsrVer = 1; ++ break; ++ } ++ ++ tp->HwCurrIsrVer = tp->HwSuppIsrVer; ++ if (tp->HwCurrIsrVer > 1) { ++ if (!(tp->features & RTL_FEATURE_MSIX) || ++ tp->irq_nvecs < tp->min_irq_nvecs) ++ tp->HwCurrIsrVer = 1; ++ } ++ ++ tp->num_tx_rings = 1; ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++#ifndef ENABLE_LIB_SUPPORT ++ tp->num_tx_rings = tp->HwSuppNumTxQueues; ++#endif ++#endif ++ if (tp->HwCurrIsrVer < 2 || ++ (tp->HwCurrIsrVer == 2 && tp->irq_nvecs < 19)) ++ 
tp->num_tx_rings = 1; ++ ++ //RSS ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppRssVer = 5; ++ tp->HwSuppIndirTblEntries = 128; ++ break; ++ } ++ ++ tp->num_rx_rings = 1; ++#ifdef ENABLE_RSS_SUPPORT ++#ifdef ENABLE_LIB_SUPPORT ++ if (tp->HwSuppRssVer > 0) ++ tp->EnableRss = 1; ++#else ++ if (tp->HwSuppRssVer > 0 && tp->HwCurrIsrVer > 1) { ++ u8 rss_queue_num = netif_get_num_default_rss_queues(); ++ tp->num_rx_rings = (tp->HwSuppNumRxQueues > rss_queue_num)? ++ rss_queue_num : tp->HwSuppNumRxQueues; ++ ++ if (!(tp->num_rx_rings >= 2 && tp->irq_nvecs >= tp->num_rx_rings)) ++ tp->num_rx_rings = 1; ++ ++ if (tp->num_rx_rings >= 2) ++ tp->EnableRss = 1; ++ } ++#endif ++#endif ++ ++ //interrupt mask ++ rtl8125_setup_interrupt_mask(tp); ++ ++ rtl8125_setup_mqs_reg(tp); ++ ++ rtl8125_set_ring_size(tp, NUM_RX_DESC, NUM_TX_DESC); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ tp->HwSuppIntMitiVer = 3; ++ break; ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->HwSuppIntMitiVer = 4; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppIntMitiVer = 6; ++ break; ++ } ++ ++ tp->HwSuppTcamVer = 1; ++ tp->TcamNotValidReg = TCAM_NOTVALID_ADDR; ++ tp->TcamValidReg = TCAM_VALID_ADDR; ++ tp->TcamMaAddrcOffset = TCAM_MAC_ADDR; ++ tp->TcamVlanTagOffset = TCAM_VLAN_TAG; ++ ++ tp->HwSuppExtendTallyCounterVer = 1; ++ ++ timer_count_v2 = (timer_count / 0x100); ++ /* timer unit is double */ ++ switch (tp->mcfg) { ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ timer_count_v2 /= 2; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->RequiredPfmPatch = TRUE; ++ break; ++ } ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_6: ++ case CFG_METHOD_7: ++ tp->HwSuppRxDescType = RX_DESC_RING_TYPE_3; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ tp->HwSuppRxDescType = RX_DESC_RING_TYPE_4; ++ break; ++ default: ++ tp->HwSuppRxDescType = RX_DESC_RING_TYPE_1; ++ break; ++ } ++ ++ tp->InitRxDescType = RX_DESC_RING_TYPE_1; ++ tp->RxDescLength = RX_DESC_LEN_TYPE_1; ++ switch (tp->HwSuppRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ if (tp->EnableRss || tp->EnablePtp) { ++ tp->InitRxDescType = RX_DESC_RING_TYPE_3; ++ tp->RxDescLength = RX_DESC_LEN_TYPE_3; ++ } ++ break; ++ case RX_DESC_RING_TYPE_4: ++ if (tp->EnableRss) { ++ tp->InitRxDescType = RX_DESC_RING_TYPE_4; ++ tp->RxDescLength = RX_DESC_LEN_TYPE_4; ++ } ++ break; ++ } ++ ++ tp->rtl8125_rx_config = rtl_chip_info[tp->chipset].RCR_Cfg; ++ if (tp->InitRxDescType == RX_DESC_RING_TYPE_3) ++ tp->rtl8125_rx_config |= EnableRxDescV3; ++ else if (tp->InitRxDescType == RX_DESC_RING_TYPE_4) ++ tp->rtl8125_rx_config &= ~EnableRxDescV4_1; ++ ++ rtl8125_backup_led_select(tp); ++ ++ tp->wol_opts = rtl8125_get_hw_wol(tp); ++ tp->wol_enabled = (tp->wol_opts) ? 
WOL_ENABLED : WOL_DISABLED; ++ ++ rtl8125_set_link_option(tp, autoneg_mode, speed_mode, duplex_mode, ++ rtl8125_fc_full); ++ ++ tp->max_jumbo_frame_size = rtl_chip_info[tp->chipset].jumbo_frame_sz; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ /* MTU range: 60 - hw-specific max */ ++ dev->min_mtu = ETH_MIN_MTU; ++ dev->max_mtu = tp->max_jumbo_frame_size; ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ ++ if (tp->mcfg != CFG_METHOD_DEFAULT) { ++ struct ethtool_keee *eee = &tp->eee; ++ ++ eee->eee_enabled = eee_enable; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) ++ eee->supported = SUPPORTED_100baseT_Full | ++ SUPPORTED_1000baseT_Full; ++ eee->advertised = mmd_eee_adv_to_ethtool_adv_t(MDIO_EEE_1000T | MDIO_EEE_100TX); ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ /* nothing to do */ ++ break; ++ default: ++ if (HW_SUPP_PHY_LINK_SPEED_2500M(tp)) { ++ eee->supported |= SUPPORTED_2500baseX_Full; ++ eee->advertised |= SUPPORTED_2500baseX_Full; ++ } ++ break; ++ } ++#else ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, eee->supported); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, eee->supported); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_100baseT_Full_BIT, eee->advertised); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_1000baseT_Full_BIT, eee->advertised); ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ /* nothing to do */ ++ break; ++ default: ++ if (HW_SUPP_PHY_LINK_SPEED_2500M(tp)) { ++ linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, eee->supported); ++ linkmode_set_bit(ETHTOOL_LINK_MODE_2500baseT_Full_BIT, eee->advertised); ++ } ++ break; ++ } ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,9,0) */ ++ eee->tx_lpi_enabled = eee_enable; ++ eee->tx_lpi_timer = dev->mtu + ETH_HLEN + 0x20; ++ } ++ ++ tp->ptp_master_mode = enable_ptp_master_mode; ++ ++#ifdef ENABLE_RSS_SUPPORT ++ if (tp->EnableRss) ++ rtl8125_init_rss(tp); ++#endif ++} ++ ++static void ++rtl8125_release_board(struct pci_dev *pdev, ++ struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ void __iomem *ioaddr = tp->mmio_addr; ++ ++ rtl8125_rar_set(tp, tp->org_mac_addr); ++ tp->wol_enabled = WOL_DISABLED; ++ ++ if (!tp->DASH) ++ rtl8125_phy_power_down(dev); ++ ++ iounmap(ioaddr); ++ pci_release_regions(pdev); ++ pci_clear_mwi(pdev); ++ pci_disable_device(pdev); ++ free_netdev(dev); ++} ++ ++static void ++rtl8125_hw_address_set(struct net_device *dev, u8 mac_addr[MAC_ADDR_LEN]) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++ eth_hw_addr_set(dev, mac_addr); ++#else ++ memcpy(dev->dev_addr, mac_addr, MAC_ADDR_LEN); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,17,0) ++} ++ ++static int ++rtl8125_get_mac_address(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ u8 mac_addr[MAC_ADDR_LEN]; ++ ++ for (i = 0; i < MAC_ADDR_LEN; i++) ++ mac_addr[i] = RTL_R8(tp, MAC0 + i); ++ ++ *(u32*)&mac_addr[0] = RTL_R32(tp, BACKUP_ADDR0_8125); ++ *(u16*)&mac_addr[4] = RTL_R16(tp, BACKUP_ADDR1_8125); ++ ++ if (!is_valid_ether_addr(mac_addr)) { ++ netif_err(tp, probe, dev, "Invalid ether addr %pM\n", ++ mac_addr); ++ eth_random_addr(mac_addr); ++ dev->addr_assign_type = NET_ADDR_RANDOM; ++ netif_info(tp, probe, dev, "Random ether addr %pM\n", ++ mac_addr); ++ tp->random_mac = 1; ++ } ++ ++ rtl8125_hw_address_set(dev, mac_addr); ++ rtl8125_rar_set(tp, mac_addr); ++ ++ /* keep the original MAC address */ ++ memcpy(tp->org_mac_addr, dev->dev_addr, MAC_ADDR_LEN); ++#if LINUX_VERSION_CODE > 
KERNEL_VERSION(2,6,13) ++ memcpy(dev->perm_addr, dev->dev_addr, MAC_ADDR_LEN); ++#endif ++ return 0; ++} ++ ++/** ++ * rtl8125_set_mac_address - Change the Ethernet Address of the NIC ++ * @dev: network interface device structure ++ * @p: pointer to an address structure ++ * ++ * Return 0 on success, negative on failure ++ **/ ++static int ++rtl8125_set_mac_address(struct net_device *dev, ++ void *p) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct sockaddr *addr = p; ++ ++ if (!is_valid_ether_addr(addr->sa_data)) ++ return -EADDRNOTAVAIL; ++ ++ rtl8125_hw_address_set(dev, addr->sa_data); ++ ++ rtl8125_rar_set(tp, dev->dev_addr); ++ ++ return 0; ++} ++ ++/****************************************************************************** ++ * rtl8125_rar_set - Puts an ethernet address into a receive address register. ++ * ++ * tp - The private data structure for driver ++ * addr - Address to put into receive address register ++ *****************************************************************************/ ++void ++rtl8125_rar_set(struct rtl8125_private *tp, ++ const u8 *addr) ++{ ++ uint32_t rar_low = 0; ++ uint32_t rar_high = 0; ++ ++ rar_low = ((uint32_t) addr[0] | ++ ((uint32_t) addr[1] << 8) | ++ ((uint32_t) addr[2] << 16) | ++ ((uint32_t) addr[3] << 24)); ++ ++ rar_high = ((uint32_t) addr[4] | ++ ((uint32_t) addr[5] << 8)); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ RTL_W32(tp, MAC0, rar_low); ++ RTL_W32(tp, MAC4, rar_high); ++ ++ rtl8125_disable_cfg9346_write(tp); ++} ++ ++#ifdef ETHTOOL_OPS_COMPAT ++static int ethtool_get_settings(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_cmd cmd = { ETHTOOL_GSET }; ++ int err; ++ ++ if (!ethtool_ops->get_settings) ++ return -EOPNOTSUPP; ++ ++ err = ethtool_ops->get_settings(dev, &cmd); ++ if (err < 0) ++ return err; ++ ++ if (copy_to_user(useraddr, &cmd, sizeof(cmd))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_settings(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_cmd cmd; ++ ++ if (!ethtool_ops->set_settings) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&cmd, useraddr, sizeof(cmd))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_settings(dev, &cmd); ++} ++ ++static int ethtool_get_drvinfo(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_drvinfo info; ++ struct ethtool_ops *ops = ethtool_ops; ++ ++ if (!ops->get_drvinfo) ++ return -EOPNOTSUPP; ++ ++ memset(&info, 0, sizeof(info)); ++ info.cmd = ETHTOOL_GDRVINFO; ++ ops->get_drvinfo(dev, &info); ++ ++ if (ops->self_test_count) ++ info.testinfo_len = ops->self_test_count(dev); ++ if (ops->get_stats_count) ++ info.n_stats = ops->get_stats_count(dev); ++ if (ops->get_regs_len) ++ info.regdump_len = ops->get_regs_len(dev); ++ if (ops->get_eeprom_len) ++ info.eedump_len = ops->get_eeprom_len(dev); ++ ++ if (copy_to_user(useraddr, &info, sizeof(info))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_get_regs(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_regs regs; ++ struct ethtool_ops *ops = ethtool_ops; ++ void *regbuf; ++ int reglen, ret; ++ ++ if (!ops->get_regs || !ops->get_regs_len) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(®s, useraddr, sizeof(regs))) ++ return -EFAULT; ++ ++ reglen = ops->get_regs_len(dev); ++ if (regs.len > reglen) ++ regs.len = reglen; ++ ++ regbuf = kmalloc(reglen, GFP_USER); ++ if (!regbuf) ++ return -ENOMEM; ++ ++ ops->get_regs(dev, ®s, regbuf); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, ®s, sizeof(regs))) ++ goto out; ++ useraddr += offsetof(struct 
ethtool_regs, data); ++ if (copy_to_user(useraddr, regbuf, reglen)) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(regbuf); ++ return ret; ++} ++ ++static int ethtool_get_wol(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_wolinfo wol = { ETHTOOL_GWOL }; ++ ++ if (!ethtool_ops->get_wol) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_wol(dev, &wol); ++ ++ if (copy_to_user(useraddr, &wol, sizeof(wol))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_wol(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_wolinfo wol; ++ ++ if (!ethtool_ops->set_wol) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&wol, useraddr, sizeof(wol))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_wol(dev, &wol); ++} ++ ++static int ethtool_get_msglevel(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GMSGLVL }; ++ ++ if (!ethtool_ops->get_msglevel) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_msglevel(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_msglevel(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_msglevel) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ ethtool_ops->set_msglevel(dev, edata.data); ++ return 0; ++} ++ ++static int ethtool_nway_reset(struct net_device *dev) ++{ ++ if (!ethtool_ops->nway_reset) ++ return -EOPNOTSUPP; ++ ++ return ethtool_ops->nway_reset(dev); ++} ++ ++static int ethtool_get_link(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GLINK }; ++ ++ if (!ethtool_ops->get_link) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_link(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_get_eeprom(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_eeprom eeprom; ++ struct ethtool_ops *ops = ethtool_ops; ++ u8 *data; ++ int ret; ++ ++ if (!ops->get_eeprom || !ops->get_eeprom_len) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) ++ return -EFAULT; ++ ++ /* Check for wrap and zero */ ++ if (eeprom.offset + eeprom.len <= eeprom.offset) ++ return -EINVAL; ++ ++ /* Check for exceeding total eeprom len */ ++ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) ++ return -EINVAL; ++ ++ data = kmalloc(eeprom.len, GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ret = -EFAULT; ++ if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) ++ goto out; ++ ++ ret = ops->get_eeprom(dev, &eeprom, data); ++ if (ret) ++ goto out; ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &eeprom, sizeof(eeprom))) ++ goto out; ++ if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_set_eeprom(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_eeprom eeprom; ++ struct ethtool_ops *ops = ethtool_ops; ++ u8 *data; ++ int ret; ++ ++ if (!ops->set_eeprom || !ops->get_eeprom_len) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&eeprom, useraddr, sizeof(eeprom))) ++ return -EFAULT; ++ ++ /* Check for wrap and zero */ ++ if (eeprom.offset + eeprom.len <= eeprom.offset) ++ return -EINVAL; ++ ++ /* Check for exceeding total eeprom len */ ++ if (eeprom.offset + eeprom.len > ops->get_eeprom_len(dev)) ++ return -EINVAL; ++ ++ data = kmalloc(eeprom.len, GFP_USER); ++ 
if (!data) ++ return -ENOMEM; ++ ++ ret = -EFAULT; ++ if (copy_from_user(data, useraddr + sizeof(eeprom), eeprom.len)) ++ goto out; ++ ++ ret = ops->set_eeprom(dev, &eeprom, data); ++ if (ret) ++ goto out; ++ ++ if (copy_to_user(useraddr + sizeof(eeprom), data, eeprom.len)) ++ ret = -EFAULT; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_get_coalesce(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_coalesce coalesce = { ETHTOOL_GCOALESCE }; ++ ++ if (!ethtool_ops->get_coalesce) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_coalesce(dev, &coalesce); ++ ++ if (copy_to_user(useraddr, &coalesce, sizeof(coalesce))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_coalesce(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_coalesce coalesce; ++ ++ if (!ethtool_ops->get_coalesce) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&coalesce, useraddr, sizeof(coalesce))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_coalesce(dev, &coalesce); ++} ++ ++static int ethtool_get_ringparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_ringparam ringparam = { ETHTOOL_GRINGPARAM }; ++ ++ if (!ethtool_ops->get_ringparam) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_ringparam(dev, &ringparam); ++ ++ if (copy_to_user(useraddr, &ringparam, sizeof(ringparam))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_ringparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_ringparam ringparam; ++ ++ if (!ethtool_ops->get_ringparam) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&ringparam, useraddr, sizeof(ringparam))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_ringparam(dev, &ringparam); ++} ++ ++static int ethtool_get_pauseparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_pauseparam pauseparam = { ETHTOOL_GPAUSEPARAM }; ++ ++ if (!ethtool_ops->get_pauseparam) ++ return -EOPNOTSUPP; ++ ++ ethtool_ops->get_pauseparam(dev, &pauseparam); ++ ++ if (copy_to_user(useraddr, &pauseparam, sizeof(pauseparam))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_pauseparam(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_pauseparam pauseparam; ++ ++ if (!ethtool_ops->get_pauseparam) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&pauseparam, useraddr, sizeof(pauseparam))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_pauseparam(dev, &pauseparam); ++} ++ ++static int ethtool_get_rx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GRXCSUM }; ++ ++ if (!ethtool_ops->get_rx_csum) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_rx_csum(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_rx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_rx_csum) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ ethtool_ops->set_rx_csum(dev, edata.data); ++ return 0; ++} ++ ++static int ethtool_get_tx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GTXCSUM }; ++ ++ if (!ethtool_ops->get_tx_csum) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_tx_csum(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_tx_csum(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_tx_csum) ++ 
return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_tx_csum(dev, edata.data); ++} ++ ++static int ethtool_get_sg(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GSG }; ++ ++ if (!ethtool_ops->get_sg) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_sg(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_sg(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_sg) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_sg(dev, edata.data); ++} ++ ++static int ethtool_get_tso(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata = { ETHTOOL_GTSO }; ++ ++ if (!ethtool_ops->get_tso) ++ return -EOPNOTSUPP; ++ ++ edata.data = ethtool_ops->get_tso(dev); ++ ++ if (copy_to_user(useraddr, &edata, sizeof(edata))) ++ return -EFAULT; ++ return 0; ++} ++ ++static int ethtool_set_tso(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_value edata; ++ ++ if (!ethtool_ops->set_tso) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&edata, useraddr, sizeof(edata))) ++ return -EFAULT; ++ ++ return ethtool_ops->set_tso(dev, edata.data); ++} ++ ++static int ethtool_self_test(struct net_device *dev, char *useraddr) ++{ ++ struct ethtool_test test; ++ struct ethtool_ops *ops = ethtool_ops; ++ u64 *data; ++ int ret; ++ ++ if (!ops->self_test || !ops->self_test_count) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&test, useraddr, sizeof(test))) ++ return -EFAULT; ++ ++ test.len = ops->self_test_count(dev); ++ data = kmalloc(test.len * sizeof(u64), GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ops->self_test(dev, &test, data); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &test, sizeof(test))) ++ goto out; ++ useraddr += sizeof(test); ++ if (copy_to_user(useraddr, data, test.len * sizeof(u64))) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_get_strings(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_gstrings gstrings; ++ struct ethtool_ops *ops = ethtool_ops; ++ u8 *data; ++ int ret; ++ ++ if (!ops->get_strings) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&gstrings, useraddr, sizeof(gstrings))) ++ return -EFAULT; ++ ++ switch (gstrings.string_set) { ++ case ETH_SS_TEST: ++ if (!ops->self_test_count) ++ return -EOPNOTSUPP; ++ gstrings.len = ops->self_test_count(dev); ++ break; ++ case ETH_SS_STATS: ++ if (!ops->get_stats_count) ++ return -EOPNOTSUPP; ++ gstrings.len = ops->get_stats_count(dev); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ data = kmalloc(gstrings.len * ETH_GSTRING_LEN, GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ops->get_strings(dev, gstrings.string_set, data); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &gstrings, sizeof(gstrings))) ++ goto out; ++ useraddr += sizeof(gstrings); ++ if (copy_to_user(useraddr, data, gstrings.len * ETH_GSTRING_LEN)) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_phys_id(struct net_device *dev, void *useraddr) ++{ ++ struct ethtool_value id; ++ ++ if (!ethtool_ops->phys_id) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&id, useraddr, sizeof(id))) ++ return -EFAULT; ++ ++ return ethtool_ops->phys_id(dev, id.data); ++} ++ ++static int ethtool_get_stats(struct net_device *dev, void 
*useraddr) ++{ ++ struct ethtool_stats stats; ++ struct ethtool_ops *ops = ethtool_ops; ++ u64 *data; ++ int ret; ++ ++ if (!ops->get_ethtool_stats || !ops->get_stats_count) ++ return -EOPNOTSUPP; ++ ++ if (copy_from_user(&stats, useraddr, sizeof(stats))) ++ return -EFAULT; ++ ++ stats.n_stats = ops->get_stats_count(dev); ++ data = kmalloc(stats.n_stats * sizeof(u64), GFP_USER); ++ if (!data) ++ return -ENOMEM; ++ ++ ops->get_ethtool_stats(dev, &stats, data); ++ ++ ret = -EFAULT; ++ if (copy_to_user(useraddr, &stats, sizeof(stats))) ++ goto out; ++ useraddr += sizeof(stats); ++ if (copy_to_user(useraddr, data, stats.n_stats * sizeof(u64))) ++ goto out; ++ ret = 0; ++ ++out: ++ kfree(data); ++ return ret; ++} ++ ++static int ethtool_ioctl(struct ifreq *ifr) ++{ ++ struct net_device *dev = __dev_get_by_name(ifr->ifr_name); ++ void *useraddr = (void *) ifr->ifr_data; ++ u32 ethcmd; ++ ++ /* ++ * XXX: This can be pushed down into the ethtool_* handlers that ++ * need it. Keep existing behaviour for the moment. ++ */ ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ ++ if (!dev || !netif_device_present(dev)) ++ return -ENODEV; ++ ++ if (copy_from_user(&ethcmd, useraddr, sizeof (ethcmd))) ++ return -EFAULT; ++ ++ switch (ethcmd) { ++ case ETHTOOL_GSET: ++ return ethtool_get_settings(dev, useraddr); ++ case ETHTOOL_SSET: ++ return ethtool_set_settings(dev, useraddr); ++ case ETHTOOL_GDRVINFO: ++ return ethtool_get_drvinfo(dev, useraddr); ++ case ETHTOOL_GREGS: ++ return ethtool_get_regs(dev, useraddr); ++ case ETHTOOL_GWOL: ++ return ethtool_get_wol(dev, useraddr); ++ case ETHTOOL_SWOL: ++ return ethtool_set_wol(dev, useraddr); ++ case ETHTOOL_GMSGLVL: ++ return ethtool_get_msglevel(dev, useraddr); ++ case ETHTOOL_SMSGLVL: ++ return ethtool_set_msglevel(dev, useraddr); ++ case ETHTOOL_NWAY_RST: ++ return ethtool_nway_reset(dev); ++ case ETHTOOL_GLINK: ++ return ethtool_get_link(dev, useraddr); ++ case ETHTOOL_GEEPROM: ++ return ethtool_get_eeprom(dev, useraddr); ++ case ETHTOOL_SEEPROM: ++ return ethtool_set_eeprom(dev, useraddr); ++ case ETHTOOL_GCOALESCE: ++ return ethtool_get_coalesce(dev, useraddr); ++ case ETHTOOL_SCOALESCE: ++ return ethtool_set_coalesce(dev, useraddr); ++ case ETHTOOL_GRINGPARAM: ++ return ethtool_get_ringparam(dev, useraddr); ++ case ETHTOOL_SRINGPARAM: ++ return ethtool_set_ringparam(dev, useraddr); ++ case ETHTOOL_GPAUSEPARAM: ++ return ethtool_get_pauseparam(dev, useraddr); ++ case ETHTOOL_SPAUSEPARAM: ++ return ethtool_set_pauseparam(dev, useraddr); ++ case ETHTOOL_GRXCSUM: ++ return ethtool_get_rx_csum(dev, useraddr); ++ case ETHTOOL_SRXCSUM: ++ return ethtool_set_rx_csum(dev, useraddr); ++ case ETHTOOL_GTXCSUM: ++ return ethtool_get_tx_csum(dev, useraddr); ++ case ETHTOOL_STXCSUM: ++ return ethtool_set_tx_csum(dev, useraddr); ++ case ETHTOOL_GSG: ++ return ethtool_get_sg(dev, useraddr); ++ case ETHTOOL_SSG: ++ return ethtool_set_sg(dev, useraddr); ++ case ETHTOOL_GTSO: ++ return ethtool_get_tso(dev, useraddr); ++ case ETHTOOL_STSO: ++ return ethtool_set_tso(dev, useraddr); ++ case ETHTOOL_TEST: ++ return ethtool_self_test(dev, useraddr); ++ case ETHTOOL_GSTRINGS: ++ return ethtool_get_strings(dev, useraddr); ++ case ETHTOOL_PHYS_ID: ++ return ethtool_phys_id(dev, useraddr); ++ case ETHTOOL_GSTATS: ++ return ethtool_get_stats(dev, useraddr); ++ default: ++ return -EOPNOTSUPP; ++ } ++ ++ return -EOPNOTSUPP; ++} ++#endif //ETHTOOL_OPS_COMPAT ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,15,0) ++static int rtl8125_siocdevprivate(struct net_device *dev, struct ifreq
*ifr, ++ void __user *data, int cmd) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = 0; ++ ++ switch (cmd) { ++#ifdef ENABLE_DASH_SUPPORT ++ case SIOCDEVPRIVATE_RTLDASH: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_dash_ioctl(dev, ifr); ++ break; ++#endif ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ case SIOCDEVPRIVATE_RTLREALWOW: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ ++ ret = rtl8125_realwow_ioctl(dev, ifr); ++ break; ++#endif ++ ++ case SIOCRTLTOOL: ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_tool_ioctl(tp, ifr); ++ break; ++ ++ default: ++ ret = -EOPNOTSUPP; ++ } ++ ++ return ret; ++} ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,15,0) ++ ++static int ++rtl8125_do_ioctl(struct net_device *dev, ++ struct ifreq *ifr, ++ int cmd) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct mii_ioctl_data *data = if_mii(ifr); ++ int ret = 0; ++ ++ switch (cmd) { ++ case SIOCGMIIPHY: ++ data->phy_id = 32; /* Internal PHY */ ++ break; ++ ++ case SIOCGMIIREG: ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ data->val_out = rtl8125_mdio_read(tp, data->reg_num); ++ break; ++ ++ case SIOCSMIIREG: ++ if (!capable(CAP_NET_ADMIN)) ++ return -EPERM; ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, data->reg_num, data->val_in); ++ break; ++ ++#ifdef ETHTOOL_OPS_COMPAT ++ case SIOCETHTOOL: ++ ret = ethtool_ioctl(ifr); ++ break; ++#endif ++ ++#ifdef ENABLE_PTP_SUPPORT ++ case SIOCSHWTSTAMP: ++ case SIOCGHWTSTAMP: ++ if (tp->EnablePtp) ++ ret = rtl8125_ptp_ioctl(dev, ifr, cmd); ++ else ++ ret = -EOPNOTSUPP; ++ break; ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++#ifdef ENABLE_DASH_SUPPORT ++ case SIOCDEVPRIVATE_RTLDASH: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_dash_ioctl(dev, ifr); ++ break; ++#endif ++ ++#ifdef ENABLE_REALWOW_SUPPORT ++ case SIOCDEVPRIVATE_RTLREALWOW: ++ if (!netif_running(dev)) { ++ ret = -ENODEV; ++ break; ++ } ++ ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_realwow_ioctl(dev, ifr); ++ break; ++#endif ++ ++ case SIOCRTLTOOL: ++ if (!capable(CAP_NET_ADMIN)) { ++ ret = -EPERM; ++ break; ++ } ++ ++ ret = rtl8125_tool_ioctl(tp, ifr); ++ break; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++ ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ return ret; ++} ++ ++static void ++rtl8125_phy_power_up(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++ if (rtl8125_is_in_phy_disable_mode(dev)) ++ return; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_ANENABLE); ++ ++ //wait ups resume (phy state 3) ++ rtl8125_wait_phy_ups_resume(dev, 3); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++static void ++rtl8125_phy_power_down(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long flags; ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ return; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ rtl8125_mdio_write(tp, MII_BMCR, BMCR_ANENABLE | BMCR_PDOWN); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++} ++ ++static int __devinit 
++rtl8125_init_board(struct pci_dev *pdev, ++ struct net_device **dev_out, ++ void __iomem **ioaddr_out) ++{ ++ void __iomem *ioaddr; ++ struct net_device *dev; ++ struct rtl8125_private *tp; ++ int rc = -ENOMEM, i, pm_cap; ++ ++ assert(ioaddr_out != NULL); ++ ++ /* dev zeroed in alloc_etherdev */ ++ dev = alloc_etherdev_mq(sizeof (*tp), R8125_MAX_QUEUES); ++ if (dev == NULL) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_drv(&debug)) ++ dev_err(&pdev->dev, "unable to alloc new ethernet\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out; ++ } ++ ++ SET_MODULE_OWNER(dev); ++ SET_NETDEV_DEV(dev, &pdev->dev); ++ tp = netdev_priv(dev); ++ tp->dev = dev; ++ tp->pci_dev = pdev; ++ tp->msg_enable = netif_msg_init(debug.msg_enable, R8125_MSG_DEFAULT); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,26) ++ if (!aspm) ++ pci_disable_link_state(pdev, PCIE_LINK_STATE_L0S | PCIE_LINK_STATE_L1 | ++ PCIE_LINK_STATE_CLKPM); ++#endif ++ ++ /* enable device (incl. PCI PM wakeup and hotplug setup) */ ++ rc = pci_enable_device(pdev); ++ if (rc < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "enable failure\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out_free_dev; ++ } ++ ++ if (pci_set_mwi(pdev) < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_drv(&debug)) ++ dev_info(&pdev->dev, "Mem-Wr-Inval unavailable.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ } ++ ++ /* save power state before pci_enable_device overwrites it */ ++ pm_cap = pci_find_capability(pdev, PCI_CAP_ID_PM); ++ if (pm_cap) { ++ u16 pwr_command; ++ ++ pci_read_config_word(pdev, pm_cap + PCI_PM_CTRL, &pwr_command); ++ } else { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) { ++ dev_err(&pdev->dev, "PowerManagement capability not found.\n"); ++ } ++#else ++ printk("PowerManagement capability not found.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++ } ++ ++ /* make sure PCI base addr 1 is MMIO */ ++ if (!(pci_resource_flags(pdev, 2) & IORESOURCE_MEM)) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "region #1 not an MMIO resource, aborting\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ rc = -ENODEV; ++ goto err_out_mwi; ++ } ++ /* check for weird/broken PCI region reporting */ ++ if (pci_resource_len(pdev, 2) < R8125_REGS_SIZE) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "Invalid PCI region size(s), aborting\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ rc = -ENODEV; ++ goto err_out_mwi; ++ } ++ ++ rc = pci_request_regions(pdev, MODULENAME); ++ if (rc < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "could not request regions.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out_mwi; ++ } ++ ++ if ((sizeof(dma_addr_t) > 4) && ++ use_dac && ++ !dma_set_mask(&pdev->dev, DMA_BIT_MASK(64)) && ++ !dma_set_coherent_mask(&pdev->dev, DMA_BIT_MASK(64))) { ++ dev->features |= NETIF_F_HIGHDMA; ++ } else { ++ rc = dma_set_mask(&pdev->dev, DMA_BIT_MASK(32)); ++ if (rc < 0) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "DMA configuration failed.\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ goto err_out_free_res; ++ } ++ } ++ ++ /* ioremap MMIO region */ 
++ ioaddr = ioremap(pci_resource_start(pdev, 2), pci_resource_len(pdev, 2)); ++ if (ioaddr == NULL) { ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_err(&pdev->dev, "cannot remap MMIO, aborting\n"); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ rc = -EIO; ++ goto err_out_free_res; ++ } ++ ++ tp->mmio_addr = ioaddr; ++ ++ /* Identify chip attached to board */ ++ rtl8125_get_mac_version(tp); ++ ++ rtl8125_print_mac_version(tp); ++ ++ for (i = ARRAY_SIZE(rtl_chip_info) - 1; i >= 0; i--) { ++ if (tp->mcfg == rtl_chip_info[i].mcfg) ++ break; ++ } ++ ++ if (i < 0) { ++ /* Unknown chip: assume array element #0, original RTL-8125 */ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (netif_msg_probe(tp)) ++ dev_printk(KERN_DEBUG, &pdev->dev, "unknown chip version, assuming %s\n", rtl_chip_info[0].name); ++#else ++ printk("Realtek unknown chip version, assuming %s\n", rtl_chip_info[0].name); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,0) ++ i++; ++ } ++ ++ tp->chipset = i; ++ ++ *ioaddr_out = ioaddr; ++ *dev_out = dev; ++out: ++ return rc; ++ ++err_out_free_res: ++ pci_release_regions(pdev); ++err_out_mwi: ++ pci_clear_mwi(pdev); ++ pci_disable_device(pdev); ++err_out_free_dev: ++ free_netdev(dev); ++err_out: ++ *ioaddr_out = NULL; ++ *dev_out = NULL; ++ goto out; ++} ++ ++static bool ++rtl8125_test_phy_ocp_v4(struct rtl8125_private *tp) ++{ ++ bool restore = FALSE; ++ bool uc2_response; ++ u8 phy_fatal_err; ++ u16 val; ++ ++ if (FALSE == HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ goto exit; ++ ++ uc2_response = !!(rtl8125_mdio_direct_read_phy_ocp(tp, 0xB87A) & BIT_0); ++ phy_fatal_err = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB98E); ++ ++ if (!uc2_response && (phy_fatal_err == 0)) ++ goto exit; ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC418, BIT_0); ++ mdelay(24); ++ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ if ((val & 0x03) != 0x00) { ++ u32 wait_cnt = 0; ++ ++ while ((val & 0x03) != 0x00 && wait_cnt < 5) { ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC06, 0x7F00); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC06, 0x7F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ ++ mdelay(100); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ wait_cnt++; ++ } ++ } ++ ++ rtl8125_restore_phy_fuse_dout(tp); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_INI, 5000000); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA468, BIT_0); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_LAN_ON, 500000); ++ ++ if (phy_fatal_err) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, phy_fatal_err); ++ } ++ if (uc2_response) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801B); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA438, BIT_8); ++ } ++ ++ rtl8125_restore_led_select(tp); ++ ++ tp->HwHasWrRamCodeToMicroP = FALSE; ++ ++ restore = TRUE; ++ ++exit: ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xB87A, BIT_0); ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ return restore; ++} ++ ++static bool ++rtl8125_test_phy_ocp_v5(struct rtl8125_private *tp) ++{ ++ bool restore = FALSE; ++ u8 phy_fatal_err; ++ u16 val; ++ ++ if (FALSE == 
HW_HAS_WRITE_PHY_MCU_RAM_CODE(tp)) ++ goto exit; ++ ++ phy_fatal_err = rtl8125_mdio_direct_read_phy_ocp(tp, 0xB98C); ++ ++ if (phy_fatal_err == 0) ++ goto exit; ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC418, BIT_0); ++ mdelay(24); ++ ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ if (val & 0x0F) { ++ u32 wait_cnt = 0; ++ ++ while (val & 0x0F && wait_cnt < 5) { ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xBC06, 0x4F00); ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ 0xBC06, ++ 0x7F00, ++ 0x4F00); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xC402, BIT_10); ++ ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC06, 0x7F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC4C, 0x1F00); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC04, 0x03FC); ++ rtl8125_clear_eth_phy_ocp_bit(tp, 0xBC02, 0x000C); ++ ++ mdelay(100); ++ val = rtl8125_mdio_direct_read_phy_ocp(tp, 0xC404); ++ wait_cnt++; ++ } ++ } ++ ++ rtl8125_restore_phy_fuse_dout(tp); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_INI, 5000000); ++ ++ if (tp->mcfg == CFG_METHOD_10) ++ rtl8125_set_phy_mcu_8125d_1_efuse(tp->dev); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA468, BIT_0); ++ ++ rtl8125_clear_phy_ups_reg(tp->dev); ++ ++ rtl8125_wait_phy_state_ready(tp, HW_PHY_STATUS_LAN_ON, 500000); ++ ++ if (phy_fatal_err) { ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA436, 0x801C); ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xA438, phy_fatal_err); ++ } ++ ++ rtl8125_restore_led_select(tp); ++ ++ tp->HwHasWrRamCodeToMicroP = FALSE; ++ ++ restore = TRUE; ++ ++exit: ++ rtl8125_mdio_write(tp, 0x1F, 0x0000); ++ ++ return restore; ++} ++ ++static bool ++rtl8125_test_phy_ocp(struct rtl8125_private *tp) ++{ ++ unsigned long flags; ++ bool reset = false; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (tp->TestPhyOcpReg == FALSE) ++ goto unlock; ++ ++ switch (tp->HwSuppEsdVer) { ++ case 4: ++ reset = rtl8125_test_phy_ocp_v4(tp); ++ break; ++ case 5: ++ reset = rtl8125_test_phy_ocp_v5(tp); ++ break; ++ default: ++ goto unlock; ++ } ++ ++unlock: ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return reset; ++} ++ ++static void ++rtl8125_esd_checker(struct rtl8125_private *tp) ++{ ++ struct net_device *dev = tp->dev; ++ struct pci_dev *pdev = tp->pci_dev; ++ u8 cmd; ++ u16 io_base_l; ++ u16 mem_base_l; ++ u16 mem_base_h; ++ u8 ilr; ++ u16 resv_0x1c_h; ++ u16 resv_0x1c_l; ++ u16 resv_0x20_l; ++ u16 resv_0x20_h; ++ u16 resv_0x24_l; ++ u16 resv_0x24_h; ++ u16 resv_0x2c_h; ++ u16 resv_0x2c_l; ++ u32 pci_sn_l; ++ u32 pci_sn_h; ++ ++ if (unlikely(tp->rtk_enable_diag)) ++ goto exit; ++ ++ tp->esd_flag = 0; ++ ++ pci_read_config_byte(pdev, PCI_COMMAND, &cmd); ++ if (cmd != tp->pci_cfg_space.cmd) { ++ printk(KERN_ERR "%s: cmd = 0x%02x, should be 0x%02x \n.", dev->name, cmd, tp->pci_cfg_space.cmd); ++ pci_write_config_byte(pdev, PCI_COMMAND, tp->pci_cfg_space.cmd); ++ tp->esd_flag |= BIT_0; ++ ++ pci_read_config_byte(pdev, PCI_COMMAND, &cmd); ++ if (cmd == 0xff) { ++ printk(KERN_ERR "%s: pci link is down \n.", dev->name); ++ goto exit; ++ } ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_0, &io_base_l); ++ if (io_base_l != tp->pci_cfg_space.io_base_l) { ++ printk(KERN_ERR "%s: io_base_l = 0x%04x, should be 0x%04x \n.", dev->name, io_base_l, tp->pci_cfg_space.io_base_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_0, tp->pci_cfg_space.io_base_l); ++ tp->esd_flag |= BIT_1; 
++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_2, &mem_base_l); ++ if (mem_base_l != tp->pci_cfg_space.mem_base_l) { ++ printk(KERN_ERR "%s: mem_base_l = 0x%04x, should be 0x%04x \n.", dev->name, mem_base_l, tp->pci_cfg_space.mem_base_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_2, tp->pci_cfg_space.mem_base_l); ++ tp->esd_flag |= BIT_2; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_2 + 2, &mem_base_h); ++ if (mem_base_h!= tp->pci_cfg_space.mem_base_h) { ++ printk(KERN_ERR "%s: mem_base_h = 0x%04x, should be 0x%04x \n.", dev->name, mem_base_h, tp->pci_cfg_space.mem_base_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_2 + 2, tp->pci_cfg_space.mem_base_h); ++ tp->esd_flag |= BIT_3; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3, &resv_0x1c_l); ++ if (resv_0x1c_l != tp->pci_cfg_space.resv_0x1c_l) { ++ printk(KERN_ERR "%s: resv_0x1c_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x1c_l, tp->pci_cfg_space.resv_0x1c_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_3, tp->pci_cfg_space.resv_0x1c_l); ++ tp->esd_flag |= BIT_4; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3 + 2, &resv_0x1c_h); ++ if (resv_0x1c_h != tp->pci_cfg_space.resv_0x1c_h) { ++ printk(KERN_ERR "%s: resv_0x1c_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x1c_h, tp->pci_cfg_space.resv_0x1c_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_3 + 2, tp->pci_cfg_space.resv_0x1c_h); ++ tp->esd_flag |= BIT_5; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4, &resv_0x20_l); ++ if (resv_0x20_l != tp->pci_cfg_space.resv_0x20_l) { ++ printk(KERN_ERR "%s: resv_0x20_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x20_l, tp->pci_cfg_space.resv_0x20_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_4, tp->pci_cfg_space.resv_0x20_l); ++ tp->esd_flag |= BIT_6; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4 + 2, &resv_0x20_h); ++ if (resv_0x20_h != tp->pci_cfg_space.resv_0x20_h) { ++ printk(KERN_ERR "%s: resv_0x20_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x20_h, tp->pci_cfg_space.resv_0x20_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_4 + 2, tp->pci_cfg_space.resv_0x20_h); ++ tp->esd_flag |= BIT_7; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5, &resv_0x24_l); ++ if (resv_0x24_l != tp->pci_cfg_space.resv_0x24_l) { ++ printk(KERN_ERR "%s: resv_0x24_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x24_l, tp->pci_cfg_space.resv_0x24_l); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_5, tp->pci_cfg_space.resv_0x24_l); ++ tp->esd_flag |= BIT_8; ++ } ++ ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5 + 2, &resv_0x24_h); ++ if (resv_0x24_h != tp->pci_cfg_space.resv_0x24_h) { ++ printk(KERN_ERR "%s: resv_0x24_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x24_h, tp->pci_cfg_space.resv_0x24_h); ++ pci_write_config_word(pdev, PCI_BASE_ADDRESS_5 + 2, tp->pci_cfg_space.resv_0x24_h); ++ tp->esd_flag |= BIT_9; ++ } ++ ++ pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, &ilr); ++ if (ilr != tp->pci_cfg_space.ilr) { ++ printk(KERN_ERR "%s: ilr = 0x%02x, should be 0x%02x \n.", dev->name, ilr, tp->pci_cfg_space.ilr); ++ pci_write_config_byte(pdev, PCI_INTERRUPT_LINE, tp->pci_cfg_space.ilr); ++ tp->esd_flag |= BIT_10; ++ } ++ ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &resv_0x2c_l); ++ if (resv_0x2c_l != tp->pci_cfg_space.resv_0x2c_l) { ++ printk(KERN_ERR "%s: resv_0x2c_l = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x2c_l, tp->pci_cfg_space.resv_0x2c_l); ++ pci_write_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, 
tp->pci_cfg_space.resv_0x2c_l); ++ tp->esd_flag |= BIT_11; ++ } ++ ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID + 2, &resv_0x2c_h); ++ if (resv_0x2c_h != tp->pci_cfg_space.resv_0x2c_h) { ++ printk(KERN_ERR "%s: resv_0x2c_h = 0x%04x, should be 0x%04x \n.", dev->name, resv_0x2c_h, tp->pci_cfg_space.resv_0x2c_h); ++ pci_write_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID + 2, tp->pci_cfg_space.resv_0x2c_h); ++ tp->esd_flag |= BIT_12; ++ } ++ ++ if (tp->HwPcieSNOffset > 0) { ++ pci_sn_l = rtl8125_csi_read(tp, tp->HwPcieSNOffset); ++ if (pci_sn_l != tp->pci_cfg_space.pci_sn_l) { ++ printk(KERN_ERR "%s: pci_sn_l = 0x%08x, should be 0x%08x \n.", dev->name, pci_sn_l, tp->pci_cfg_space.pci_sn_l); ++ rtl8125_csi_write(tp, tp->HwPcieSNOffset, tp->pci_cfg_space.pci_sn_l); ++ tp->esd_flag |= BIT_13; ++ } ++ ++ pci_sn_h = rtl8125_csi_read(tp, tp->HwPcieSNOffset + 4); ++ if (pci_sn_h != tp->pci_cfg_space.pci_sn_h) { ++ printk(KERN_ERR "%s: pci_sn_h = 0x%08x, should be 0x%08x \n.", dev->name, pci_sn_h, tp->pci_cfg_space.pci_sn_h); ++ rtl8125_csi_write(tp, tp->HwPcieSNOffset + 4, tp->pci_cfg_space.pci_sn_h); ++ tp->esd_flag |= BIT_14; ++ } ++ } ++ ++ if (tp->TestPhyOcpReg && rtl8125_test_phy_ocp(tp)) ++ tp->esd_flag |= BIT_15; ++ ++ if (tp->esd_flag != 0) { ++ printk(KERN_ERR "%s: esd_flag = 0x%04x\n.\n", dev->name, tp->esd_flag); ++ netif_carrier_off(dev); ++ netif_tx_disable(dev); ++ rtl8125_hw_reset(dev); ++ rtl8125_tx_clear(tp); ++ rtl8125_rx_clear(tp); ++ rtl8125_init_ring(dev); ++ rtl8125_up(dev); ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ tp->esd_flag = 0; ++ } ++exit: ++ return; ++} ++/* ++static void ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++rtl8125_esd_timer(unsigned long __opaque) ++#else ++rtl8125_esd_timer(struct timer_list *t) ++#endif ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ struct net_device *dev = (struct net_device *)__opaque; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->esd_timer; ++#else ++ struct rtl8125_private *tp = from_timer(tp, t, esd_timer); ++ //struct net_device *dev = tp->dev; ++ struct timer_list *timer = t; ++#endif ++ rtl8125_esd_checker(tp); ++ ++ mod_timer(timer, jiffies + timeout); ++} ++*/ ++ ++/* ++static void ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++rtl8125_link_timer(unsigned long __opaque) ++#else ++rtl8125_link_timer(struct timer_list *t) ++#endif ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,14,0) ++ struct net_device *dev = (struct net_device *)__opaque; ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct timer_list *timer = &tp->link_timer; ++#else ++ struct rtl8125_private *tp = from_timer(tp, t, link_timer); ++ struct net_device *dev = tp->dev; ++ struct timer_list *timer = t; ++#endif ++ rtl8125_check_link_status(dev); ++ ++ mod_timer(timer, jiffies + RTL8125_LINK_TIMEOUT); ++} ++*/ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) ++static int pci_enable_msix_range(struct pci_dev *dev, struct msix_entry *entries, ++ int minvec, int maxvec) ++{ ++ int nvec = maxvec; ++ int rc; ++ ++ if (maxvec < minvec) ++ return -ERANGE; ++ ++ do { ++ rc = pci_enable_msix(dev, entries, nvec); ++ if (rc < 0) { ++ return rc; ++ } else if (rc > 0) { ++ if (rc < minvec) ++ return -ENOSPC; ++ nvec = rc; ++ } ++ } while (rc); ++ ++ return nvec; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(3,14,0) */ ++ ++static int rtl8125_enable_msix(struct rtl8125_private *tp) ++{ ++ int i, nvecs = 0; ++ struct 
msix_entry msix_ent[R8125_MAX_MSIX_VEC]; ++ //struct net_device *dev = tp->dev; ++ //const int len = sizeof(tp->irq_tbl[0].name); ++ ++ for (i = 0; i < R8125_MAX_MSIX_VEC; i++) { ++ msix_ent[i].entry = i; ++ msix_ent[i].vector = 0; ++ } ++ ++ nvecs = pci_enable_msix_range(tp->pci_dev, msix_ent, ++ tp->min_irq_nvecs, tp->max_irq_nvecs); ++ if (nvecs < 0) ++ goto out; ++ ++ for (i = 0; i < nvecs; i++) { ++ struct r8125_irq *irq = &tp->irq_tbl[i]; ++ irq->vector = msix_ent[i].vector; ++ //snprintf(irq->name, len, "%s-%d", dev->name, i); ++ //irq->handler = rtl8125_interrupt_msix; ++ } ++ ++out: ++ return nvecs; ++} ++ ++/* Cfg9346_Unlock assumed. */ ++static int rtl8125_try_msi(struct rtl8125_private *tp) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ unsigned int hw_supp_irq_nvecs; ++ unsigned msi = 0; ++ int nvecs = 1; ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ hw_supp_irq_nvecs = R8125_MAX_MSIX_VEC_8125A; ++ break; ++ case CFG_METHOD_4 ... CFG_METHOD_7: ++ hw_supp_irq_nvecs = R8125_MAX_MSIX_VEC_8125B; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ hw_supp_irq_nvecs = R8125_MAX_MSIX_VEC_8125D; ++ break; ++ default: ++ hw_supp_irq_nvecs = 1; ++ break; ++ } ++ tp->hw_supp_irq_nvecs = clamp_val(hw_supp_irq_nvecs, 1, ++ R8125_MAX_MSIX_VEC); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125B; ++ break; ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125BP; ++ break; ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_13: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125D; ++ break; ++ case CFG_METHOD_12: ++ tp->max_irq_nvecs = tp->hw_supp_irq_nvecs; ++ tp->min_irq_nvecs = R8125_MIN_MSIX_VEC_8125CP; ++ break; ++ default: ++ tp->max_irq_nvecs = 1; ++ tp->min_irq_nvecs = 1; ++ break; ++ } ++#ifdef DISABLE_MULTI_MSIX_VECTOR ++ tp->max_irq_nvecs = 1; ++#endif ++ ++#if defined(RTL_USE_NEW_INTR_API) ++ if ((nvecs = pci_alloc_irq_vectors(pdev, tp->min_irq_nvecs, tp->max_irq_nvecs, PCI_IRQ_MSIX)) > 0) ++ msi |= RTL_FEATURE_MSIX; ++ else if ((nvecs = pci_alloc_irq_vectors(pdev, 1, 1, PCI_IRQ_ALL_TYPES)) > 0 && ++ pci_dev_msi_enabled(pdev)) ++ msi |= RTL_FEATURE_MSI; ++#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ if ((nvecs = rtl8125_enable_msix(tp)) > 0) ++ msi |= RTL_FEATURE_MSIX; ++ else if (!pci_enable_msi(pdev)) ++ msi |= RTL_FEATURE_MSI; ++#endif ++ if (!(msi & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX))) ++ dev_info(&pdev->dev, "no MSI/MSI-X. 
Back to INTx.\n"); ++ ++ if (!(msi & RTL_FEATURE_MSIX) || nvecs < 1) ++ nvecs = 1; ++ ++ tp->irq_nvecs = nvecs; ++ ++ tp->features |= msi; ++ ++ return nvecs; ++} ++ ++static void rtl8125_disable_msi(struct pci_dev *pdev, struct rtl8125_private *tp) ++{ ++#if defined(RTL_USE_NEW_INTR_API) ++ if (tp->features & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX)) ++ pci_free_irq_vectors(pdev); ++#elif LINUX_VERSION_CODE > KERNEL_VERSION(2,6,13) ++ if (tp->features & (RTL_FEATURE_MSIX)) ++ pci_disable_msix(pdev); ++ else if (tp->features & (RTL_FEATURE_MSI)) ++ pci_disable_msi(pdev); ++#endif ++ tp->features &= ~(RTL_FEATURE_MSI | RTL_FEATURE_MSIX); ++} ++ ++static int rtl8125_get_irq(struct pci_dev *pdev) ++{ ++#if defined(RTL_USE_NEW_INTR_API) ++ return pci_irq_vector(pdev, 0); ++#else ++ return pdev->irq; ++#endif ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++static void ++rtl8125_get_stats64(struct net_device *dev, struct rtnl_link_stats64 *stats) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct rtl8125_counters *counters = tp->tally_vaddr; ++ dma_addr_t paddr = tp->tally_paddr; ++ ++ if (!counters) ++ return; ++ ++ netdev_stats_to_stats64(stats, &dev->stats); ++ dev_fetch_sw_netstats(stats, dev->tstats); ++ ++ /* ++ * Fetch additional counter values missing in stats collected by driver ++ * from tally counters. ++ */ ++ rtl8125_dump_tally_counter(tp, paddr); ++ ++ stats->tx_errors = le64_to_cpu(counters->tx_errors); ++ stats->collisions = le32_to_cpu(counters->tx_multi_collision); ++ stats->tx_aborted_errors = le16_to_cpu(counters->tx_aborted); ++ stats->rx_missed_errors = le16_to_cpu(counters->rx_missed); ++} ++#else ++/** ++ * rtl8125_get_stats - Get rtl8125 read/write statistics ++ * @dev: The Ethernet Device to get statistics for ++ * ++ * Get TX/RX statistics for rtl8125 ++ */ ++static struct ++net_device_stats *rtl8125_get_stats(struct net_device *dev) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ struct rtl8125_private *tp = netdev_priv(dev); ++#endif ++ return &RTLDEV->stats; ++} ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,36) ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++static const struct net_device_ops rtl8125_netdev_ops = { ++ .ndo_open = rtl8125_open, ++ .ndo_stop = rtl8125_close, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ .ndo_get_stats64 = rtl8125_get_stats64, ++#else ++ .ndo_get_stats = rtl8125_get_stats, ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ .ndo_start_xmit = rtl8125_start_xmit, ++ .ndo_tx_timeout = rtl8125_tx_timeout, ++ .ndo_change_mtu = rtl8125_change_mtu, ++ .ndo_set_mac_address = rtl8125_set_mac_address, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++ .ndo_do_ioctl = rtl8125_do_ioctl, ++#else ++ .ndo_siocdevprivate = rtl8125_siocdevprivate, ++ .ndo_eth_ioctl = rtl8125_do_ioctl, ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(5,15,0) ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,1,0) ++ .ndo_set_multicast_list = rtl8125_set_rx_mode, ++#else ++ .ndo_set_rx_mode = rtl8125_set_rx_mode, ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++#ifdef CONFIG_R8125_VLAN ++ .ndo_vlan_rx_register = rtl8125_vlan_rx_register, ++#endif ++#else ++ .ndo_fix_features = rtl8125_fix_features, ++ .ndo_set_features = rtl8125_set_features, ++#endif ++#ifdef CONFIG_NET_POLL_CONTROLLER ++ .ndo_poll_controller = rtl8125_netpoll, ++#endif ++}; ++#endif ++ ++ ++#ifdef CONFIG_R8125_NAPI ++ ++static int rtl8125_poll(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct 
r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) ++ rtl8125_tx_interrupt(&tp->tx_ring[i], budget); ++ ++ for (i = 0; i < tp->num_rx_rings; i++) ++ work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[i], budget); ++ ++ work_done = min(work_done, work_to_do); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#ifdef ENABLE_DASH_SUPPORT ++ if (rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_switch_to_timer_interrupt(tp); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++static int rtl8125_poll_msix_ring(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ const int message_id = r8125napi->index; ++ ++ if (message_id < tp->num_tx_rings) ++ rtl8125_tx_interrupt_with_vector(tp, message_id, budget); ++ ++ if (message_id < tp->num_rx_rings) ++ work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#ifdef ENABLE_DASH_SUPPORT ++ if (message_id == 31) ++ if (rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++static int rtl8125_poll_msix_tx(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ const int message_id = r8125napi->index; ++ ++ //suppress unused variable ++ (void)(dev); ++ ++ rtl8125_tx_interrupt_with_vector(tp, message_id, budget); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. 
Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++static int rtl8125_poll_msix_other(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ const int message_id = r8125napi->index; ++ ++ //suppress unused variable ++ (void)(dev); ++ (void)(work_to_do); ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_to_do); ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_to_do); ++#endif ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ ++ return 1; ++} ++ ++static int rtl8125_poll_msix_rx(napi_ptr napi, napi_budget budget) ++{ ++ struct r8125_napi *r8125napi = RTL_GET_PRIV(napi, struct r8125_napi); ++ struct rtl8125_private *tp = r8125napi->priv; ++ RTL_GET_NETDEV(tp) ++ unsigned int work_to_do = RTL_NAPI_QUOTA(budget, dev); ++ unsigned int work_done = 0; ++ const int message_id = r8125napi->index; ++ ++ if (message_id < tp->num_rx_rings) ++ work_done += rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget); ++ ++ RTL_NAPI_QUOTA_UPDATE(dev, work_done, budget); ++ ++ if (work_done < work_to_do) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4,10,0) ++ if (RTL_NETIF_RX_COMPLETE(dev, napi, work_done) == FALSE) ++ return RTL_NAPI_RETURN_VALUE; ++#else ++ RTL_NETIF_RX_COMPLETE(dev, napi, work_done); ++#endif ++ /* ++ * 20040426: the barrier is not strictly required but the ++ * behavior of the irq handler could be less predictable ++ * without it. Btw, the lack of flush for the posted pci ++ * write is safe - FR ++ */ ++ smp_wmb(); ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ } ++ ++ return RTL_NAPI_RETURN_VALUE; ++} ++ ++void rtl8125_enable_napi(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ int i; ++ ++ for (i = 0; i < tp->irq_nvecs; i++) ++ RTL_NAPI_ENABLE(tp->dev, &tp->r8125napi[i].napi); ++#endif ++} ++ ++static void rtl8125_disable_napi(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ int i; ++ ++ for (i = 0; i < tp->irq_nvecs; i++) ++ RTL_NAPI_DISABLE(tp->dev, &tp->r8125napi[i].napi); ++#endif ++} ++ ++static void rtl8125_del_napi(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ int i; ++ ++ for (i = 0; i < tp->irq_nvecs; i++) ++ RTL_NAPI_DEL((&tp->r8125napi[i])); ++#endif ++} ++#endif //CONFIG_R8125_NAPI ++ ++static void rtl8125_init_napi(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i=0; i<tp->irq_nvecs; i++) { ++ struct r8125_napi *r8125napi = &tp->r8125napi[i]; ++#ifdef CONFIG_R8125_NAPI ++ int (*poll)(struct napi_struct *, int); ++ ++ poll = rtl8125_poll; ++ if (tp->features & RTL_FEATURE_MSIX) { ++ switch (tp->HwCurrIsrVer) { ++ case 7: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_rx; ++ else if (i == 27 || i == 28) ++ poll = rtl8125_poll_msix_tx; ++ else ++ poll = rtl8125_poll_msix_other; ++ break; ++ case 5: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_rx; ++ else if (i == 16 || i == 17) ++ poll = rtl8125_poll_msix_tx; ++ else ++ poll = rtl8125_poll_msix_other; ++ break; ++ case 2: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_rx; ++ else if (i == 16 || i == 18) ++ poll = rtl8125_poll_msix_tx; ++ else ++ poll =
rtl8125_poll_msix_other; ++ break; ++ case 3: ++ case 4: ++ if (i < R8125_MAX_RX_QUEUES_VEC_V3) ++ poll = rtl8125_poll_msix_ring; ++ else ++ poll = rtl8125_poll_msix_other; ++ break; ++ } ++ } ++ ++ RTL_NAPI_CONFIG(tp->dev, r8125napi, poll, R8125_NAPI_WEIGHT); ++#endif ++ ++ r8125napi->priv = tp; ++ r8125napi->index = i; ++ } ++} ++ ++static int ++rtl8125_set_real_num_queue(struct rtl8125_private *tp) ++{ ++ int retval = 0; ++ ++ retval = netif_set_real_num_tx_queues(tp->dev, tp->num_tx_rings); ++ if (retval < 0) ++ goto exit; ++ ++ retval = netif_set_real_num_rx_queues(tp->dev, tp->num_rx_rings); ++ if (retval < 0) ++ goto exit; ++ ++exit: ++ return retval; ++} ++ ++static int __devinit ++rtl8125_init_one(struct pci_dev *pdev, ++ const struct pci_device_id *ent) ++{ ++ struct net_device *dev = NULL; ++ struct rtl8125_private *tp; ++ void __iomem *ioaddr = NULL; ++ static int board_idx = -1; ++ ++ int rc; ++ ++ assert(pdev != NULL); ++ assert(ent != NULL); ++ ++ board_idx++; ++ ++ if (netif_msg_drv(&debug)) ++ printk(KERN_INFO "%s Ethernet controller driver %s loaded\n", ++ MODULENAME, RTL8125_VERSION); ++ ++ rc = rtl8125_init_board(pdev, &dev, &ioaddr); ++ if (rc) ++ goto out; ++ ++ tp = netdev_priv(dev); ++ assert(ioaddr != NULL); ++ ++ spin_lock_init(&tp->phy_lock); ++ ++ tp->set_speed = rtl8125_set_speed_xmii; ++ tp->get_settings = rtl8125_gset_xmii; ++ tp->phy_reset_enable = rtl8125_xmii_reset_enable; ++ tp->phy_reset_pending = rtl8125_xmii_reset_pending; ++ tp->link_ok = rtl8125_xmii_link_ok; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ dev->tstats = devm_netdev_alloc_pcpu_stats(&pdev->dev, ++ struct pcpu_sw_netstats); ++ if (!dev->tstats) ++ goto err_out_1; ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,11,0) ++ ++ rc = rtl8125_try_msi(tp); ++ if (rc < 0) { ++ dev_err(&pdev->dev, "Can't allocate interrupt\n"); ++ goto err_out_1; ++ } ++ ++ rtl8125_init_software_variable(dev); ++ ++ RTL_NET_DEVICE_OPS(rtl8125_netdev_ops); ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,4,22) ++ SET_ETHTOOL_OPS(dev, &rtl8125_ethtool_ops); ++#endif ++ ++ dev->watchdog_timeo = RTL8125_TX_TIMEOUT; ++ dev->irq = rtl8125_get_irq(pdev); ++ dev->base_addr = (unsigned long) ioaddr; ++ ++ rtl8125_init_napi(tp); ++ ++#ifdef CONFIG_R8125_VLAN ++ if (tp->mcfg != CFG_METHOD_DEFAULT) { ++ dev->features |= NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ dev->vlan_rx_kill_vid = rtl8125_vlan_rx_kill_vid; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ } ++#endif ++ ++ /* There has been a number of reports that using SG/TSO results in ++ * tx timeouts. However for a lot of people SG/TSO works fine. ++ * Therefore disable both features by default, but allow users to ++ * enable them. Use at own risk! 
++ */ ++ tp->cp_cmd |= RTL_R16(tp, CPlusCmd); ++ if (tp->mcfg != CFG_METHOD_DEFAULT) { ++ dev->features |= NETIF_F_IP_CSUM; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ tp->cp_cmd |= RxChkSum; ++#else ++ dev->features |= NETIF_F_RXCSUM; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ /* nothing to do */ ++ break; ++ default: ++ dev->features |= NETIF_F_SG | NETIF_F_TSO; ++ break; ++ }; ++ dev->hw_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | ++ NETIF_F_RXCSUM | NETIF_F_HW_VLAN_TX | NETIF_F_HW_VLAN_RX; ++ dev->vlan_features = NETIF_F_SG | NETIF_F_IP_CSUM | NETIF_F_TSO | ++ NETIF_F_HIGHDMA; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) ++ dev->priv_flags |= IFF_LIVE_ADDR_CHANGE; ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,15,0) ++ dev->hw_features |= NETIF_F_RXALL; ++ dev->hw_features |= NETIF_F_RXFCS; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++ dev->hw_features |= NETIF_F_IPV6_CSUM | NETIF_F_TSO6; ++ dev->features |= NETIF_F_IPV6_CSUM; ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ /* nothing to do */ ++ break; ++ default: ++ dev->features |= NETIF_F_TSO6; ++ break; ++ }; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,19,0) ++ netif_set_tso_max_size(dev, LSO_64K); ++ netif_set_tso_max_segs(dev, NIC_MAX_PHYS_BUF_COUNT_LSO2); ++#else //LINUX_VERSION_CODE >= KERNEL_VERSION(5,19,0) ++ netif_set_gso_max_size(dev, LSO_64K); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0) ++ dev->gso_max_segs = NIC_MAX_PHYS_BUF_COUNT_LSO2; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) ++ dev->gso_min_segs = NIC_MIN_PHYS_BUF_COUNT; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,7,0) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,18,0) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(5,19,0) ++ ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ ++#ifdef ENABLE_RSS_SUPPORT ++ if (tp->EnableRss) { ++ dev->hw_features |= NETIF_F_RXHASH; ++ dev->features |= NETIF_F_RXHASH; ++ } ++#endif ++ } ++ ++ netdev_sw_irq_coalesce_default_on(dev); ++ ++#ifdef ENABLE_LIB_SUPPORT ++ BLOCKING_INIT_NOTIFIER_HEAD(&tp->lib_nh); ++#endif ++ rtl8125_init_all_schedule_work(tp); ++ ++ rc = rtl8125_set_real_num_queue(tp); ++ if (rc < 0) ++ goto err_out; ++ ++ rtl8125_exit_oob(dev); ++ ++ rtl8125_powerup_pll(dev); ++ ++ rtl8125_hw_init(dev); ++ ++ rtl8125_hw_reset(dev); ++ ++ /* Get production from EEPROM */ ++ rtl8125_eeprom_type(tp); ++ ++ if (tp->eeprom_type == EEPROM_TYPE_93C46 || tp->eeprom_type == EEPROM_TYPE_93C56) ++ rtl8125_set_eeprom_sel_low(tp); ++ ++ rtl8125_get_mac_address(dev); ++ ++ tp->fw_name = rtl_chip_fw_infos[tp->mcfg].fw_name; ++ ++ tp->tally_vaddr = dma_alloc_coherent(&pdev->dev, sizeof(*tp->tally_vaddr), ++ &tp->tally_paddr, GFP_KERNEL); ++ if (!tp->tally_vaddr) { ++ rc = -ENOMEM; ++ goto err_out; ++ } ++ ++ rtl8125_tally_counter_clear(tp); ++ ++ pci_set_drvdata(pdev, dev); ++ ++ rc = register_netdev(dev); ++ if (rc) ++ goto err_out; ++ ++ printk(KERN_INFO "%s: This product is covered by one or more of the following patents: US6,570,884, US6,115,776, and US6,327,625.\n", MODULENAME); ++ ++ rtl8125_disable_rxdvgate(dev); ++ ++ device_set_wakeup_enable(&pdev->dev, tp->wol_enabled); ++ ++ netif_carrier_off(dev); ++ ++#ifdef ENABLE_R8125_SYSFS ++ rtl8125_sysfs_init(dev); ++#endif /* ENABLE_R8125_SYSFS */ ++ ++ printk("%s", GPL_CLAIM); ++ ++out: ++ return rc; ++ ++err_out: ++ if (tp->tally_vaddr != NULL) { ++ 
dma_free_coherent(&pdev->dev, sizeof(*tp->tally_vaddr), tp->tally_vaddr, ++ tp->tally_paddr); ++ ++ tp->tally_vaddr = NULL; ++ } ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_del_napi(tp); ++#endif ++ rtl8125_disable_msi(pdev, tp); ++ ++err_out_1: ++ rtl8125_release_board(pdev, dev); ++ ++ goto out; ++} ++ ++static void __devexit ++rtl8125_remove_one(struct pci_dev *pdev) ++{ ++ struct net_device *dev = pci_get_drvdata(pdev); ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ assert(dev != NULL); ++ assert(tp != NULL); ++ ++ set_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ rtl8125_cancel_all_schedule_work(tp); ++ ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_stop(tp); ++ ++ rtl8125_disable_pci_offset_180(tp); ++ ++#ifdef ENABLE_R8125_SYSFS ++ rtl8125_sysfs_remove(dev); ++#endif //ENABLE_R8125_SYSFS ++ ++ unregister_netdev(dev); ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_del_napi(tp); ++#endif ++ rtl8125_disable_msi(pdev, tp); ++#ifdef ENABLE_R8125_PROCFS ++ rtl8125_proc_remove(dev); ++#endif ++ if (tp->tally_vaddr != NULL) { ++ dma_free_coherent(&pdev->dev, sizeof(*tp->tally_vaddr), tp->tally_vaddr, tp->tally_paddr); ++ tp->tally_vaddr = NULL; ++ } ++ ++ rtl8125_release_board(pdev, dev); ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ rtl8125_release_firmware(tp); ++#endif ++ ++ pci_set_drvdata(pdev, NULL); ++} ++ ++#ifdef ENABLE_PAGE_REUSE ++static inline unsigned int rtl8125_rx_page_order(unsigned rx_buf_sz, unsigned page_size) ++{ ++ unsigned truesize = SKB_DATA_ALIGN(sizeof(struct skb_shared_info)) + ++ SKB_DATA_ALIGN(rx_buf_sz + R8125_RX_ALIGN); ++ ++ return get_order(truesize * 2); ++} ++#endif //ENABLE_PAGE_REUSE ++ ++static void ++rtl8125_set_rxbufsize(struct rtl8125_private *tp, ++ struct net_device *dev) ++{ ++ unsigned int mtu = dev->mtu; ++ ++ tp->rms = (mtu > ETH_DATA_LEN) ? 
++ mtu + ETH_HLEN + RT_VALN_HLEN + ETH_FCS_LEN: ++ RX_BUF_SIZE; ++ tp->rx_buf_sz = tp->rms; ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ tp->rx_buf_sz = SKB_DATA_ALIGN(RX_BUF_SIZE); ++#endif //ENABLE_RX_PACKET_FRAGMENT ++#ifdef ENABLE_PAGE_REUSE ++ tp->rx_buf_page_order = rtl8125_rx_page_order(tp->rx_buf_sz, PAGE_SIZE); ++ tp->rx_buf_page_size = rtl8125_rx_page_size(tp->rx_buf_page_order); ++#endif //ENABLE_PAGE_REUSE ++} ++ ++static void ++rtl8125_set_rms(struct rtl8125_private *tp, u16 rms) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ rms |= AcceppVlanPhys; ++ break; ++ default: ++ rms &= ~AcceppVlanPhys; ++ break; ++ } ++ RTL_W16(tp, RxMaxSize, rms); ++} ++ ++static void rtl8125_free_irq(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i=0; i<tp->irq_nvecs; i++) { ++ struct r8125_irq *irq = &tp->irq_tbl[i]; ++ struct r8125_napi *r8125napi = &tp->r8125napi[i]; ++ ++ if (irq->requested) { ++ irq->requested = 0; ++#if defined(RTL_USE_NEW_INTR_API) ++ pci_free_irq(tp->pci_dev, i, r8125napi); ++#else ++ free_irq(irq->vector, r8125napi); ++#endif ++ } ++ } ++} ++ ++static int rtl8125_alloc_irq(struct rtl8125_private *tp) ++{ ++ struct net_device *dev = tp->dev; ++ int rc = 0; ++ struct r8125_irq *irq; ++ struct r8125_napi *r8125napi; ++ int i = 0; ++ const int len = sizeof(tp->irq_tbl[0].name); ++ ++#if defined(RTL_USE_NEW_INTR_API) ++ for (i=0; i<tp->irq_nvecs; i++) { ++ irq = &tp->irq_tbl[i]; ++ if (tp->features & RTL_FEATURE_MSIX && ++ tp->HwCurrIsrVer > 1) ++ irq->handler = rtl8125_interrupt_msix; ++ else ++ irq->handler = rtl8125_interrupt; ++ ++ r8125napi = &tp->r8125napi[i]; ++ snprintf(irq->name, len, "%s-%d", dev->name, i); ++ rc = pci_request_irq(tp->pci_dev, i, irq->handler, NULL, r8125napi, ++ irq->name); ++ if (rc) ++ break; ++ ++ irq->vector = pci_irq_vector(tp->pci_dev, i); ++ irq->requested = 1; ++ } ++#else ++ unsigned long irq_flags = 0; ++#ifdef ENABLE_LIB_SUPPORT ++ irq_flags |= IRQF_NO_SUSPEND; ++#endif ++ if (tp->features & RTL_FEATURE_MSIX && ++ tp->HwCurrIsrVer > 1) { ++ for (i=0; i<tp->irq_nvecs; i++) { ++ irq = &tp->irq_tbl[i]; ++ irq->handler = rtl8125_interrupt_msix; ++ r8125napi = &tp->r8125napi[i]; ++ snprintf(irq->name, len, "%s-%d", dev->name, i); ++ rc = request_irq(irq->vector, irq->handler, irq_flags, irq->name, r8125napi); ++ ++ if (rc) ++ break; ++ ++ irq->requested = 1; ++ } ++ } else { ++ irq = &tp->irq_tbl[0]; ++ irq->handler = rtl8125_interrupt; ++ r8125napi = &tp->r8125napi[0]; ++ snprintf(irq->name, len, "%s-0", dev->name); ++ if (!(tp->features & RTL_FEATURE_MSIX)) ++ irq->vector = dev->irq; ++ irq_flags |= (tp->features & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX)) ?
0 : SA_SHIRQ; ++ rc = request_irq(irq->vector, irq->handler, irq_flags, irq->name, r8125napi); ++ ++ if (rc == 0) ++ irq->requested = 1; ++ } ++#endif ++ if (rc) ++ rtl8125_free_irq(tp); ++ ++ return rc; ++} ++ ++static int rtl8125_alloc_tx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_tx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ ring = &tp->tx_ring[i]; ++ ring->TxDescAllocSize = (ring->num_tx_desc + 1) * sizeof(struct TxDesc); ++ ring->TxDescArray = dma_alloc_coherent(&pdev->dev, ++ ring->TxDescAllocSize, ++ &ring->TxPhyAddr, ++ GFP_KERNEL); ++ ++ if (!ring->TxDescArray) ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static int rtl8125_alloc_rx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_rx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ ring = &tp->rx_ring[i]; ++ ring->RxDescAllocSize = (ring->num_rx_desc + 1) * tp->RxDescLength; ++ ring->RxDescArray = dma_alloc_coherent(&pdev->dev, ++ ring->RxDescAllocSize, ++ &ring->RxPhyAddr, ++ GFP_KERNEL); ++ ++ if (!ring->RxDescArray) ++ return -1; ++ } ++ ++ return 0; ++} ++ ++static void rtl8125_free_tx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_tx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ ring = &tp->tx_ring[i]; ++ if (ring->TxDescArray) { ++ dma_free_coherent(&pdev->dev, ++ ring->TxDescAllocSize, ++ ring->TxDescArray, ++ ring->TxPhyAddr); ++ ring->TxDescArray = NULL; ++ } ++ } ++} ++ ++static void rtl8125_free_rx_desc(struct rtl8125_private *tp) ++{ ++ struct rtl8125_rx_ring *ring; ++ struct pci_dev *pdev = tp->pci_dev; ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ ring = &tp->rx_ring[i]; ++ if (ring->RxDescArray) { ++ dma_free_coherent(&pdev->dev, ++ ring->RxDescAllocSize, ++ ring->RxDescArray, ++ ring->RxPhyAddr); ++ ring->RxDescArray = NULL; ++ } ++ } ++} ++ ++static void rtl8125_free_alloc_resources(struct rtl8125_private *tp) ++{ ++ rtl8125_free_rx_desc(tp); ++ ++ rtl8125_free_tx_desc(tp); ++} ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++static void rtl8125_request_firmware(struct rtl8125_private *tp) ++{ ++ struct rtl8125_fw *rtl_fw; ++ ++ /* firmware loaded already or no firmware available */ ++ if (tp->rtl_fw || !tp->fw_name) ++ return; ++ ++ rtl_fw = kzalloc(sizeof(*rtl_fw), GFP_KERNEL); ++ if (!rtl_fw) ++ return; ++ ++ rtl_fw->phy_write = rtl8125_mdio_write; ++ rtl_fw->phy_read = rtl8125_mdio_read; ++ rtl_fw->mac_mcu_write = mac_mcu_write; ++ rtl_fw->mac_mcu_read = mac_mcu_read; ++ rtl_fw->fw_name = tp->fw_name; ++ rtl_fw->dev = tp_to_dev(tp); ++ ++ if (rtl8125_fw_request_firmware(rtl_fw)) ++ kfree(rtl_fw); ++ else ++ tp->rtl_fw = rtl_fw; ++} ++#endif ++ ++int rtl8125_open(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int retval; ++ ++ retval = -ENOMEM; ++ ++#ifdef ENABLE_R8125_PROCFS ++ rtl8125_proc_init(dev); ++#endif ++ rtl8125_set_rxbufsize(tp, dev); ++ /* ++ * Rx and Tx descriptors needs 256 bytes alignment. ++ * pci_alloc_consistent provides more. 
++ */ ++ if (rtl8125_alloc_tx_desc(tp) < 0 || rtl8125_alloc_rx_desc(tp) < 0) ++ goto err_free_all_allocated_mem; ++ ++ retval = rtl8125_init_ring(dev); ++ if (retval < 0) ++ goto err_free_all_allocated_mem; ++ ++ retval = rtl8125_alloc_irq(tp); ++ if (retval < 0) ++ goto err_free_all_allocated_mem; ++ ++ if (netif_msg_probe(tp)) { ++ printk(KERN_INFO "%s: 0x%lx, " ++ "%2.2x:%2.2x:%2.2x:%2.2x:%2.2x:%2.2x, " ++ "IRQ %d\n", ++ dev->name, ++ dev->base_addr, ++ dev->dev_addr[0], dev->dev_addr[1], ++ dev->dev_addr[2], dev->dev_addr[3], ++ dev->dev_addr[4], dev->dev_addr[5], dev->irq); ++ } ++ ++#ifdef ENABLE_USE_FIRMWARE_FILE ++ rtl8125_request_firmware(tp); ++#endif ++ pci_set_master(tp->pci_dev); ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif ++ ++ rtl8125_exit_oob(dev); ++ ++ rtl8125_up(dev); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->EnablePtp) ++ rtl8125_ptp_init(tp); ++#endif ++ clear_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ if (tp->resume_not_chg_speed) ++ _rtl8125_check_link_status(dev, R8125_LINK_STATE_UNKNOWN); ++ else ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ ++ if (tp->esd_flag == 0) { ++ //rtl8125_request_esd_timer(dev); ++ ++ rtl8125_schedule_esd_work(tp); ++ } ++ ++ //rtl8125_request_link_timer(dev); ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ rtl8125_schedule_link_work(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++out: ++ ++ return retval; ++ ++err_free_all_allocated_mem: ++ rtl8125_free_alloc_resources(tp); ++ ++ goto out; ++} ++ ++static void ++_rtl8125_set_l1_l0s_entry_latency(struct rtl8125_private *tp, u8 setting) ++{ ++ u32 csi_tmp; ++ u32 temp; ++ ++ temp = setting & 0x3f; ++ temp <<= 24; ++ /*set PCI configuration space offset 0x70F to setting*/ ++ /*When the register offset of PCI configuration space larger than 0xff, use CSI to access it.*/ ++ ++ csi_tmp = rtl8125_csi_read(tp, 0x70c) & 0xc0ffffff; ++ rtl8125_csi_write(tp, 0x70c, csi_tmp | temp); ++} ++ ++static void ++rtl8125_set_l1_l0s_entry_latency(struct rtl8125_private *tp) ++{ ++ _rtl8125_set_l1_l0s_entry_latency(tp, 0x27); ++} ++ ++static void ++_rtl8125_set_mrrs(struct rtl8125_private *tp, u8 setting) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ u8 device_control; ++ ++ pci_read_config_byte(pdev, 0x79, &device_control); ++ device_control &= ~0x70; ++ device_control |= setting; ++ pci_write_config_byte(pdev, 0x79, device_control); ++} ++ ++static void ++rtl8125_set_mrrs(struct rtl8125_private *tp) ++{ ++ if (hwoptimize & HW_PATCH_SOC_LAN) ++ return; ++ ++ _rtl8125_set_mrrs(tp, 0x40); ++} ++ ++void ++rtl8125_hw_set_rx_packet_filter(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ u32 mc_filter[2]; /* Multicast hash filter */ ++ int rx_mode; ++ u32 tmp = 0; ++ ++ if (dev->flags & IFF_PROMISC) { ++ /* Unconditionally log net taps. */ ++ if (netif_msg_link(tp)) ++ printk(KERN_NOTICE "%s: Promiscuous mode enabled.\n", ++ dev->name); ++ ++ rx_mode = ++ AcceptBroadcast | AcceptMulticast | AcceptMyPhys | ++ AcceptAllPhys; ++ mc_filter[1] = mc_filter[0] = 0xffffffff; ++ } else if (dev->flags & IFF_ALLMULTI) { ++ /* accept all multicasts. 
*/ ++ rx_mode = AcceptBroadcast | AcceptMulticast | AcceptMyPhys; ++ mc_filter[1] = mc_filter[0] = 0xffffffff; ++ } else { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,35) ++ struct dev_mc_list *mclist; ++ unsigned int i; ++ ++ rx_mode = AcceptBroadcast | AcceptMyPhys; ++ mc_filter[1] = mc_filter[0] = 0; ++ for (i = 0, mclist = dev->mc_list; mclist && i < dev->mc_count; ++ i++, mclist = mclist->next) { ++ int bit_nr = ether_crc(ETH_ALEN, mclist->dmi_addr) >> 26; ++ mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31); ++ rx_mode |= AcceptMulticast; ++ } ++#else ++ struct netdev_hw_addr *ha; ++ ++ rx_mode = AcceptBroadcast | AcceptMyPhys; ++ mc_filter[1] = mc_filter[0] = 0; ++ netdev_for_each_mc_addr(ha, dev) { ++ int bit_nr = ether_crc(ETH_ALEN, ha->addr) >> 26; ++ mc_filter[bit_nr >> 5] |= 1 << (bit_nr & 31); ++ rx_mode |= AcceptMulticast; ++ } ++#endif ++ } ++ ++ if (dev->features & NETIF_F_RXALL) ++ rx_mode |= (AcceptErr | AcceptRunt); ++ ++ tmp = mc_filter[0]; ++ mc_filter[0] = swab32(mc_filter[1]); ++ mc_filter[1] = swab32(tmp); ++ ++ tmp = tp->rtl8125_rx_config | rx_mode | (RTL_R32(tp, RxConfig) & rtl_chip_info[tp->chipset].RxConfigMask); ++ ++ RTL_W32(tp, RxConfig, tmp); ++ RTL_W32(tp, MAR0 + 0, mc_filter[0]); ++ RTL_W32(tp, MAR0 + 4, mc_filter[1]); ++} ++ ++static void ++rtl8125_set_rx_mode(struct net_device *dev) ++{ ++ rtl8125_hw_set_rx_packet_filter(dev); ++} ++ ++void ++rtl8125_set_rx_q_num(struct rtl8125_private *tp, ++ unsigned int num_rx_queues) ++{ ++ u16 q_ctrl; ++ u16 rx_q_num; ++ ++ rx_q_num = (u16)ilog2(num_rx_queues); ++ rx_q_num &= (BIT_0 | BIT_1 | BIT_2); ++ rx_q_num <<= 2; ++ q_ctrl = RTL_R16(tp, Q_NUM_CTRL_8125); ++ q_ctrl &= ~(BIT_2 | BIT_3 | BIT_4); ++ q_ctrl |= rx_q_num; ++ RTL_W16(tp, Q_NUM_CTRL_8125, q_ctrl); ++} ++ ++void ++rtl8125_set_tx_q_num(struct rtl8125_private *tp, ++ unsigned int num_tx_queues) ++{ ++ u16 mac_ocp_data; ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE63E); ++ mac_ocp_data &= ~(BIT_11 | BIT_10); ++ mac_ocp_data |= ((ilog2(num_tx_queues) & 0x03) << 10); ++ rtl8125_mac_ocp_write(tp, 0xE63E, mac_ocp_data); ++} ++ ++void ++rtl8125_enable_mcu(struct rtl8125_private *tp, bool enable) ++{ ++ if (FALSE == HW_SUPPORT_MAC_MCU(tp)) ++ return; ++ ++ if (enable) ++ rtl8125_set_mac_ocp_bit(tp, 0xC0B4, BIT_0); ++ else ++ rtl8125_clear_mac_ocp_bit(tp, 0xC0B4, BIT_0); ++} ++ ++static void ++rtl8125_clear_tcam_entries(struct rtl8125_private *tp) ++{ ++ if (FALSE == HW_SUPPORT_TCAM(tp)) ++ return; ++ ++ rtl8125_set_mac_ocp_bit(tp, 0xEB54, BIT_0); ++ udelay(1); ++ rtl8125_clear_mac_ocp_bit(tp, 0xEB54, BIT_0); ++} ++ ++static void ++rtl8125_enable_tcam(struct rtl8125_private *tp) ++{ ++ if (tp->HwSuppTcamVer != 1) ++ return; ++ ++ RTL_W16(tp, 0x382, 0x221B); ++} ++ ++static u8 ++rtl8125_get_l1off_cap_bits(struct rtl8125_private *tp) ++{ ++ u8 l1offCapBits = 0; ++ ++ l1offCapBits = (BIT_0 | BIT_1); ++ switch (tp->mcfg) { ++ case CFG_METHOD_4: ++ case CFG_METHOD_5: ++ case CFG_METHOD_7: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_10: ++ case CFG_METHOD_11: ++ case CFG_METHOD_12: ++ case CFG_METHOD_13: ++ l1offCapBits |= (BIT_2 | BIT_3); ++ break; ++ default: ++ break; ++ } ++ ++ return l1offCapBits; ++} ++ ++void ++rtl8125_hw_config(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ struct pci_dev *pdev = tp->pci_dev; ++ u16 mac_ocp_data; ++ ++ rtl8125_disable_rx_packet_filter(tp); ++ ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_enable_cfg9346_write(tp); ++ ++ rtl8125_enable_force_clkreq(tp, 0); ++ 
rtl8125_enable_aspm_clkreq_lock(tp, 0); ++ ++ rtl8125_set_eee_lpi_timer(tp); ++ ++ //keep magic packet only ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xC0B6); ++ mac_ocp_data &= BIT_0; ++ rtl8125_mac_ocp_write(tp, 0xC0B6, mac_ocp_data); ++ ++ rtl8125_tally_counter_addr_fill(tp); ++ ++ rtl8125_enable_extend_tally_couter(tp); ++ ++ rtl8125_desc_addr_fill(tp); ++ ++ /* Set DMA burst size and Interframe Gap Time */ ++ RTL_W32(tp, TxConfig, (TX_DMA_BURST_unlimited << TxDMAShift) | ++ (InterFrameGap << TxInterFrameGapShift)); ++ ++ if (tp->EnableTxNoClose) ++ RTL_W32(tp, TxConfig, (RTL_R32(tp, TxConfig) | BIT_6)); ++ ++ if (enable_double_vlan) ++ rtl8125_enable_double_vlan(tp); ++ else ++ rtl8125_disable_double_vlan(tp); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ rtl8125_enable_tcam(tp); ++ break; ++ } ++ ++ rtl8125_set_l1_l0s_entry_latency(tp); ++ ++ rtl8125_set_mrrs(tp); ++ ++#ifdef ENABLE_RSS_SUPPORT ++ rtl8125_config_rss(tp); ++#else ++ RTL_W32(tp, RSS_CTRL_8125, 0x00); ++#endif ++ rtl8125_set_rx_q_num(tp, rtl8125_tot_rx_rings(tp)); ++ ++ RTL_W8(tp, Config1, RTL_R8(tp, Config1) & ~0x10); ++ ++ rtl8125_mac_ocp_write(tp, 0xC140, 0xFFFF); ++ rtl8125_mac_ocp_write(tp, 0xC142, 0xFFFF); ++ ++ //new tx desc format ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEB58); ++ mac_ocp_data |= (BIT_0); ++ rtl8125_mac_ocp_write(tp, 0xEB58, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE614); ++ mac_ocp_data &= ~(BIT_10 | BIT_9 | BIT_8); ++ if (tp->mcfg == CFG_METHOD_4 || tp->mcfg == CFG_METHOD_5 || ++ tp->mcfg == CFG_METHOD_7) ++ mac_ocp_data |= ((2 & 0x07) << 8); ++ else ++ mac_ocp_data |= ((3 & 0x07) << 8); ++ rtl8125_mac_ocp_write(tp, 0xE614, mac_ocp_data); ++ ++ rtl8125_set_tx_q_num(tp, rtl8125_tot_tx_rings(tp)); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE63E); ++ mac_ocp_data &= ~(BIT_5 | BIT_4); ++ mac_ocp_data |= (0x02 << 4); ++ rtl8125_mac_ocp_write(tp, 0xE63E, mac_ocp_data); ++ ++ rtl8125_enable_mcu(tp, 0); ++ rtl8125_enable_mcu(tp, 1); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xC0B4); ++ mac_ocp_data |= (BIT_3 | BIT_2); ++ rtl8125_mac_ocp_write(tp, 0xC0B4, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEB6A); ++ mac_ocp_data &= ~(BIT_7 | BIT_6 | BIT_5 | BIT_4 | BIT_3 | BIT_2 | BIT_1 | BIT_0); ++ mac_ocp_data |= (BIT_5 | BIT_4 | BIT_1 | BIT_0); ++ rtl8125_mac_ocp_write(tp, 0xEB6A, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEB50); ++ mac_ocp_data &= ~(BIT_9 | BIT_8 | BIT_7 | BIT_6 | BIT_5); ++ mac_ocp_data |= (BIT_6); ++ rtl8125_mac_ocp_write(tp, 0xEB50, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE056); ++ mac_ocp_data &= ~(BIT_7 | BIT_6 | BIT_5 | BIT_4); ++ //mac_ocp_data |= (BIT_4 | BIT_5); ++ rtl8125_mac_ocp_write(tp, 0xE056, mac_ocp_data); ++ ++ RTL_W8(tp, TDFNR, 0x10); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xE040); ++ mac_ocp_data &= ~(BIT_12); ++ rtl8125_mac_ocp_write(tp, 0xE040, mac_ocp_data); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEA1C); ++ mac_ocp_data &= ~(BIT_1 | BIT_0); ++ mac_ocp_data |= (BIT_0); ++ rtl8125_mac_ocp_write(tp, 0xEA1C, mac_ocp_data); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ rtl8125_oob_mutex_lock(tp); ++ break; ++ } ++ ++ if (tp->mcfg == CFG_METHOD_10 || tp->mcfg == CFG_METHOD_11 || ++ tp->mcfg == CFG_METHOD_13) ++ rtl8125_mac_ocp_write(tp, 0xE0C0, 0x4403); ++ else ++ rtl8125_mac_ocp_write(tp, 0xE0C0, 0x4000); ++ ++ 
rtl8125_set_mac_ocp_bit(tp, 0xE052, (BIT_6 | BIT_5)); ++ rtl8125_clear_mac_ocp_bit(tp, 0xE052, BIT_3 | BIT_7); ++ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2: ++ case CFG_METHOD_3: ++ case CFG_METHOD_6: ++ case CFG_METHOD_8: ++ case CFG_METHOD_9: ++ case CFG_METHOD_12: ++ rtl8125_oob_mutex_unlock(tp); ++ break; ++ } ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xD430); ++ mac_ocp_data &= ~(BIT_11 | BIT_10 | BIT_9 | BIT_8 | BIT_7 | BIT_6 | BIT_5 | BIT_4 | BIT_3 | BIT_2 | BIT_1 | BIT_0); ++ mac_ocp_data |= 0x45F; ++ rtl8125_mac_ocp_write(tp, 0xD430, mac_ocp_data); ++ ++ //rtl8125_mac_ocp_write(tp, 0xE0C0, 0x4F87); ++ if (!tp->DASH) ++ RTL_W8(tp, 0xD0, RTL_R8(tp, 0xD0) | BIT_6 | BIT_7); ++ else ++ RTL_W8(tp, 0xD0, RTL_R8(tp, 0xD0) & ~(BIT_6 | BIT_7)); ++ ++ if (tp->mcfg == CFG_METHOD_2 || tp->mcfg == CFG_METHOD_3 || ++ tp->mcfg == CFG_METHOD_6) ++ RTL_W8(tp, MCUCmd_reg, RTL_R8(tp, MCUCmd_reg) | BIT_0); ++ ++ if (tp->mcfg != CFG_METHOD_10 && tp->mcfg != CFG_METHOD_11 && ++ tp->mcfg != CFG_METHOD_13) ++ rtl8125_disable_eee_plus(tp); ++ ++ mac_ocp_data = rtl8125_mac_ocp_read(tp, 0xEA1C); ++ mac_ocp_data &= ~(BIT_2); ++ rtl8125_mac_ocp_write(tp, 0xEA1C, mac_ocp_data); ++ ++ rtl8125_clear_tcam_entries(tp); ++ ++ RTL_W16(tp, 0x1880, RTL_R16(tp, 0x1880) & ~(BIT_4 | BIT_5)); ++ ++ if (tp->HwSuppRxDescType == RX_DESC_RING_TYPE_4) { ++ if (tp->InitRxDescType == RX_DESC_RING_TYPE_4) ++ RTL_W8(tp, 0xd8, RTL_R8(tp, 0xd8) | ++ EnableRxDescV4_0); ++ else ++ RTL_W8(tp, 0xd8, RTL_R8(tp, 0xd8) & ++ ~EnableRxDescV4_0); ++ } ++ ++ if (tp->mcfg == CFG_METHOD_12) { ++ rtl8125_clear_mac_ocp_bit(tp, 0xE00C, BIT_12); ++ ++ rtl8125_clear_mac_ocp_bit(tp, 0xC0C2, BIT_6); ++ } ++ ++ /* csum offload command for RTL8125 */ ++ tp->tx_tcp_csum_cmd = TxTCPCS_C; ++ tp->tx_udp_csum_cmd = TxUDPCS_C; ++ tp->tx_ip_csum_cmd = TxIPCS_C; ++ tp->tx_ipv6_csum_cmd = TxIPV6F_C; ++ ++ /* config interrupt type for RTL8125B */ ++ if (tp->HwSuppIsrVer > 1) ++ rtl8125_hw_set_interrupt_type(tp, tp->HwCurrIsrVer); ++ ++ //other hw parameters ++ rtl8125_hw_clear_timer_int(dev); ++ ++ rtl8125_hw_clear_int_miti(dev); ++ ++ if (tp->use_timer_interrupt && ++ (tp->HwCurrIsrVer > 1) && ++ (tp->HwSuppIntMitiVer > 3) && ++ (tp->features & RTL_FEATURE_MSIX)) { ++ int i; ++ for (i = 0; i < tp->irq_nvecs; i++) ++ rtl8125_hw_set_timer_int(tp, i, timer_count_v2); ++ } ++ ++ rtl8125_enable_exit_l1_mask(tp); ++ ++ rtl8125_mac_ocp_write(tp, 0xE098, 0xC302); ++ ++ if (aspm && (tp->org_pci_offset_99 & (BIT_2 | BIT_5 | BIT_6))) ++ rtl8125_init_pci_offset_99(tp); ++ else ++ rtl8125_disable_pci_offset_99(tp); ++ ++ if (aspm && (tp->org_pci_offset_180 & rtl8125_get_l1off_cap_bits(tp))) ++ rtl8125_init_pci_offset_180(tp); ++ else ++ rtl8125_disable_pci_offset_180(tp); ++ ++ if (tp->RequiredPfmPatch) ++ rtl8125_set_pfm_patch(tp, 0); ++ ++ tp->cp_cmd &= ~(EnableBist | Macdbgo_oe | Force_halfdup | ++ Force_rxflow_en | Force_txflow_en | Cxpl_dbg_sel | ++ ASF | Macdbgo_sel); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,0,0) ++ RTL_W16(tp, CPlusCmd, tp->cp_cmd); ++#else ++ rtl8125_hw_set_features(dev, dev->features); ++#endif ++ rtl8125_set_rms(tp, tp->rms); ++ ++ rtl8125_disable_rxdvgate(dev); ++ ++ if (!tp->pci_cfg_is_read) { ++ pci_read_config_byte(pdev, PCI_COMMAND, &tp->pci_cfg_space.cmd); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_0, &tp->pci_cfg_space.io_base_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_0 + 2, &tp->pci_cfg_space.io_base_h); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_2, &tp->pci_cfg_space.mem_base_l); ++ pci_read_config_word(pdev, 
PCI_BASE_ADDRESS_2 + 2, &tp->pci_cfg_space.mem_base_h); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3, &tp->pci_cfg_space.resv_0x1c_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_3 + 2, &tp->pci_cfg_space.resv_0x1c_h); ++ pci_read_config_byte(pdev, PCI_INTERRUPT_LINE, &tp->pci_cfg_space.ilr); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4, &tp->pci_cfg_space.resv_0x20_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_4 + 2, &tp->pci_cfg_space.resv_0x20_h); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5, &tp->pci_cfg_space.resv_0x24_l); ++ pci_read_config_word(pdev, PCI_BASE_ADDRESS_5 + 2, &tp->pci_cfg_space.resv_0x24_h); ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID, &tp->pci_cfg_space.resv_0x2c_l); ++ pci_read_config_word(pdev, PCI_SUBSYSTEM_VENDOR_ID + 2, &tp->pci_cfg_space.resv_0x2c_h); ++ if (tp->HwPcieSNOffset > 0) { ++ tp->pci_cfg_space.pci_sn_l = rtl8125_csi_read(tp, tp->HwPcieSNOffset); ++ tp->pci_cfg_space.pci_sn_h = rtl8125_csi_read(tp, tp->HwPcieSNOffset + 4); ++ } ++ ++ tp->pci_cfg_is_read = 1; ++ } ++ ++ /* Set Rx packet filter */ ++ rtl8125_hw_set_rx_packet_filter(dev); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_check_and_enable_dash_interrupt(tp); ++#endif ++ ++ rtl8125_enable_aspm_clkreq_lock(tp, aspm ? 1 : 0); ++ ++ rtl8125_disable_cfg9346_write(tp); ++ ++ udelay(10); ++} ++ ++void ++rtl8125_hw_start(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++#ifdef ENABLE_LIB_SUPPORT ++ rtl8125_init_lib_ring(tp); ++#endif ++ ++ RTL_W8(tp, ChipCmd, CmdTxEnb | CmdRxEnb); ++ ++ rtl8125_enable_hw_interrupt(tp); ++ ++ rtl8125_lib_reset_complete(tp); ++} ++ ++static int ++rtl8125_change_mtu(struct net_device *dev, ++ int new_mtu) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = 0; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++ if (new_mtu < ETH_MIN_MTU) ++ return -EINVAL; ++ else if (new_mtu > tp->max_jumbo_frame_size) ++ new_mtu = tp->max_jumbo_frame_size; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,10,0) ++ ++ dev->mtu = new_mtu; ++ ++ tp->eee.tx_lpi_timer = dev->mtu + ETH_HLEN + 0x20; ++ ++ if (!netif_running(dev)) ++ goto out; ++ ++ rtl8125_down(dev); ++ ++ rtl8125_set_rxbufsize(tp, dev); ++ ++ ret = rtl8125_init_ring(dev); ++ ++ if (ret < 0) ++ goto err_out; ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif//CONFIG_R8125_NAPI ++ ++ if (tp->link_ok(dev)) ++ rtl8125_link_on_patch(dev); ++ else ++ rtl8125_link_down_patch(dev); ++ ++ //mod_timer(&tp->esd_timer, jiffies + RTL8125_ESD_TIMEOUT); ++ //mod_timer(&tp->link_timer, jiffies + RTL8125_LINK_TIMEOUT); ++out: ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,0,0) ++ netdev_update_features(dev); ++#endif ++ ++err_out: ++ return ret; ++} ++ ++static inline void ++rtl8125_set_desc_dma_addr(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ dma_addr_t mapping) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ ((struct RxDescV3 *)desc)->addr = cpu_to_le64(mapping); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ ((struct RxDescV4 *)desc)->addr = cpu_to_le64(mapping); ++ break; ++ default: ++ desc->addr = cpu_to_le64(mapping); ++ break; ++ } ++} ++ ++static inline void ++rtl8125_mark_to_asic_v1(struct RxDesc *desc, ++ u32 rx_buf_sz) ++{ ++ u32 eor = le32_to_cpu(desc->opts1) & RingEnd; ++ ++ WRITE_ONCE(desc->opts1, cpu_to_le32(DescOwn | eor | rx_buf_sz)); ++} ++ ++static inline void ++rtl8125_mark_to_asic_v3(struct RxDescV3 *descv3, ++ u32 rx_buf_sz) ++{ ++ u32 eor = le32_to_cpu(descv3->RxDescNormalDDWord4.opts1) & RingEnd; ++ 
++ WRITE_ONCE(descv3->RxDescNormalDDWord4.opts1, cpu_to_le32(DescOwn | eor | rx_buf_sz)); ++} ++ ++static inline void ++rtl8125_mark_to_asic_v4(struct RxDescV4 *descv4, ++ u32 rx_buf_sz) ++{ ++ u32 eor = le32_to_cpu(descv4->RxDescNormalDDWord2.opts1) & RingEnd; ++ ++ WRITE_ONCE(descv4->RxDescNormalDDWord2.opts1, cpu_to_le32(DescOwn | eor | rx_buf_sz)); ++} ++ ++void ++rtl8125_mark_to_asic(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ u32 rx_buf_sz) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_mark_to_asic_v3((struct RxDescV3 *)desc, rx_buf_sz); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_mark_to_asic_v4((struct RxDescV4 *)desc, rx_buf_sz); ++ break; ++ default: ++ rtl8125_mark_to_asic_v1(desc, rx_buf_sz); ++ break; ++ } ++} ++ ++static inline void ++rtl8125_map_to_asic(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct RxDesc *desc, ++ dma_addr_t mapping, ++ u32 rx_buf_sz, ++ const u32 cur_rx) ++{ ++ ring->RxDescPhyAddr[cur_rx] = mapping; ++ rtl8125_set_desc_dma_addr(tp, desc, mapping); ++ wmb(); ++ rtl8125_mark_to_asic(tp, desc, rx_buf_sz); ++} ++ ++#ifdef ENABLE_PAGE_REUSE ++ ++static int ++rtl8125_alloc_rx_page(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring, ++ struct rtl8125_rx_buffer *rxb) ++{ ++ struct page *page; ++ dma_addr_t dma; ++ unsigned int order = tp->rx_buf_page_order; ++ ++ //get free page ++ page = dev_alloc_pages(order); ++ ++ if (unlikely(!page)) ++ return -ENOMEM; ++ ++ dma = dma_map_page_attrs(&tp->pci_dev->dev, page, 0, ++ tp->rx_buf_page_size, ++ DMA_FROM_DEVICE, ++ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); ++ ++ if (unlikely(dma_mapping_error(&tp->pci_dev->dev, dma))) { ++ __free_pages(page, order); ++ return -ENOMEM; ++ } ++ ++ rxb->page = page; ++ rxb->data = page_address(page); ++ rxb->page_offset = ring->rx_offset; ++ rxb->dma = dma; ++ ++ //after page alloc, page refcount already = 1 ++ ++ return 0; ++} ++ ++static void ++rtl8125_free_rx_page(struct rtl8125_private *tp, struct rtl8125_rx_buffer *rxb) ++{ ++ if (!rxb->page) ++ return; ++ ++ dma_unmap_page_attrs(&tp->pci_dev->dev, rxb->dma, ++ tp->rx_buf_page_size, ++ DMA_FROM_DEVICE, ++ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); ++ __free_pages(rxb->page, tp->rx_buf_page_order); ++ rxb->page = NULL; ++} ++ ++static void ++_rtl8125_rx_clear(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring) ++{ ++ int i; ++ struct rtl8125_rx_buffer *rxb; ++ ++ for (i = 0; i < ring->num_rx_desc; i++) { ++ rxb = &ring->rx_buffer[i]; ++ if (rxb->skb) { ++ dev_kfree_skb(rxb->skb); ++ rxb->skb = NULL; ++ } ++ rtl8125_free_rx_page(tp, rxb); ++ } ++} ++ ++static u32 ++rtl8125_rx_fill(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct net_device *dev, ++ u32 start, ++ u32 end, ++ u8 in_intr) ++{ ++ u32 cur; ++ struct rtl8125_rx_buffer *rxb; ++ ++ for (cur = start; end - cur > 0; cur++) { ++ int ret, i = cur % ring->num_rx_desc; ++ ++ rxb = &ring->rx_buffer[i]; ++ if (rxb->page) ++ continue; ++ ++ ret = rtl8125_alloc_rx_page(tp, ring, rxb); ++ if (ret) ++ break; ++ ++ dma_sync_single_range_for_device(tp_to_dev(tp), ++ rxb->dma, ++ rxb->page_offset, ++ tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ ++ rtl8125_map_to_asic(tp, ring, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, i), ++ rxb->dma + rxb->page_offset, ++ tp->rx_buf_sz, i); ++ } ++ return cur - start; ++} ++ ++#else //ENABLE_PAGE_REUSE ++ ++static void ++rtl8125_free_rx_skb(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct sk_buff **sk_buff, ++ 
struct RxDesc *desc, ++ const u32 cur_rx) ++{ ++ struct pci_dev *pdev = tp->pci_dev; ++ ++ dma_unmap_single(&pdev->dev, ring->RxDescPhyAddr[cur_rx], tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ dev_kfree_skb(*sk_buff); ++ *sk_buff = NULL; ++ rtl8125_make_unusable_by_asic(tp, desc); ++} ++ ++static int ++rtl8125_alloc_rx_skb(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct sk_buff **sk_buff, ++ struct RxDesc *desc, ++ int rx_buf_sz, ++ const u32 cur_rx, ++ u8 in_intr) ++{ ++ struct sk_buff *skb; ++ dma_addr_t mapping; ++ int ret = 0; ++ ++ if (in_intr) ++ skb = RTL_ALLOC_SKB_INTR(&tp->r8125napi[ring->index].napi, rx_buf_sz + R8125_RX_ALIGN); ++ else ++ skb = dev_alloc_skb(rx_buf_sz + R8125_RX_ALIGN); ++ ++ if (unlikely(!skb)) ++ goto err_out; ++ ++ if (!in_intr || !R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++ ++ mapping = dma_map_single(tp_to_dev(tp), skb->data, rx_buf_sz, ++ DMA_FROM_DEVICE); ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, tp->dev, "Failed to map RX DMA!\n"); ++ goto err_out; ++ } ++ ++ *sk_buff = skb; ++ rtl8125_map_to_asic(tp, ring, desc, mapping, rx_buf_sz, cur_rx); ++out: ++ return ret; ++ ++err_out: ++ if (skb) ++ dev_kfree_skb(skb); ++ ret = -ENOMEM; ++ rtl8125_make_unusable_by_asic(tp, desc); ++ goto out; ++} ++ ++static void ++_rtl8125_rx_clear(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring) ++{ ++ int i; ++ ++ for (i = 0; i < ring->num_rx_desc; i++) { ++ if (ring->Rx_skbuff[i]) { ++ rtl8125_free_rx_skb(tp, ++ ring, ++ ring->Rx_skbuff + i, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, i), ++ i); ++ ring->Rx_skbuff[i] = NULL; ++ } ++ } ++} ++ ++static u32 ++rtl8125_rx_fill(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct net_device *dev, ++ u32 start, ++ u32 end, ++ u8 in_intr) ++{ ++ u32 cur; ++ ++ for (cur = start; end - cur > 0; cur++) { ++ int ret, i = cur % ring->num_rx_desc; ++ ++ if (ring->Rx_skbuff[i]) ++ continue; ++ ++ ret = rtl8125_alloc_rx_skb(tp, ++ ring, ++ ring->Rx_skbuff + i, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, i), ++ tp->rx_buf_sz, ++ i, ++ in_intr); ++ if (ret < 0) ++ break; ++ } ++ return cur - start; ++} ++ ++#endif //ENABLE_PAGE_REUSE ++ ++void ++rtl8125_rx_clear(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ ++ _rtl8125_rx_clear(tp, ring); ++ } ++} ++ ++static void ++rtl8125_mark_as_last_descriptor_v1(struct RxDesc *desc) ++{ ++ desc->opts1 |= cpu_to_le32(RingEnd); ++} ++ ++static void ++rtl8125_mark_as_last_descriptor_v3(struct RxDescV3 *descv3) ++{ ++ descv3->RxDescNormalDDWord4.opts1 |= cpu_to_le32(RingEnd); ++} ++ ++static void ++rtl8125_mark_as_last_descriptor_v4(struct RxDescV4 *descv4) ++{ ++ descv4->RxDescNormalDDWord2.opts1 |= cpu_to_le32(RingEnd); ++} ++ ++void ++rtl8125_mark_as_last_descriptor(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_mark_as_last_descriptor_v3((struct RxDescV3 *)desc); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_mark_as_last_descriptor_v4((struct RxDescV4 *)desc); ++ break; ++ default: ++ rtl8125_mark_as_last_descriptor_v1(desc); ++ break; ++ } ++} ++ ++static void ++rtl8125_desc_addr_fill(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ RTL_W32(tp, ring->tdsar_reg, ((u64)ring->TxPhyAddr & 
DMA_BIT_MASK(32))); ++ RTL_W32(tp, ring->tdsar_reg + 4, ((u64)ring->TxPhyAddr >> 32)); ++ } ++ ++ if (rtl8125_num_lib_rx_rings(tp) == 0) { ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ RTL_W32(tp, ring->rdsar_reg, ((u64)ring->RxPhyAddr & DMA_BIT_MASK(32))); ++ RTL_W32(tp, ring->rdsar_reg + 4, ((u64)ring->RxPhyAddr >> 32)); ++ } ++ } ++} ++ ++static void ++rtl8125_tx_desc_init(struct rtl8125_private *tp) ++{ ++ int i = 0; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ memset(ring->TxDescArray, 0x0, ring->TxDescAllocSize); ++ ++ ring->TxDescArray[ring->num_tx_desc - 1].opts1 = cpu_to_le32(RingEnd); ++ } ++} ++ ++static void ++rtl8125_rx_desc_init(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++ memset(ring->RxDescArray, 0x0, ring->RxDescAllocSize); ++ } ++} ++ ++int ++rtl8125_init_ring(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ ++ rtl8125_init_ring_indexes(tp); ++ ++ rtl8125_tx_desc_init(tp); ++ rtl8125_rx_desc_init(tp); ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ memset(ring->tx_skb, 0x0, sizeof(ring->tx_skb)); ++ } ++ ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring = &tp->rx_ring[i]; ++#ifdef ENABLE_PAGE_REUSE ++ ring->rx_offset = R8125_RX_ALIGN; ++#else ++ memset(ring->Rx_skbuff, 0x0, sizeof(ring->Rx_skbuff)); ++#endif //ENABLE_PAGE_REUSE ++ if (rtl8125_rx_fill(tp, ring, dev, 0, ring->num_rx_desc, 0) != ring->num_rx_desc) ++ goto err_out; ++ ++ rtl8125_mark_as_last_descriptor(tp, rtl8125_get_rxdesc(tp, ring->RxDescArray, ring->num_rx_desc - 1)); ++ } ++ ++ return 0; ++ ++err_out: ++ rtl8125_rx_clear(tp); ++ return -ENOMEM; ++} ++ ++static void ++rtl8125_unmap_tx_skb(struct pci_dev *pdev, ++ struct ring_info *tx_skb, ++ struct TxDesc *desc) ++{ ++ unsigned int len = tx_skb->len; ++ ++ dma_unmap_single(&pdev->dev, le64_to_cpu(desc->addr), len, DMA_TO_DEVICE); ++ ++ desc->opts1 = cpu_to_le32(RTK_MAGIC_DEBUG_VALUE); ++ desc->opts2 = 0x00; ++ desc->addr = RTL8125_MAGIC_NUMBER; ++ tx_skb->len = 0; ++} ++ ++static void ++rtl8125_tx_clear_range(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring, ++ u32 start, ++ unsigned int n) ++{ ++ unsigned int i; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,22) ++ struct net_device *dev = tp->dev; ++#endif ++ ++ for (i = 0; i < n; i++) { ++ unsigned int entry = (start + i) % ring->num_tx_desc; ++ struct ring_info *tx_skb = ring->tx_skb + entry; ++ unsigned int len = tx_skb->len; ++ ++ if (len) { ++ struct sk_buff *skb = tx_skb->skb; ++ ++ rtl8125_unmap_tx_skb(tp->pci_dev, tx_skb, ++ ring->TxDescArray + entry); ++ if (skb) { ++ RTLDEV->stats.tx_dropped++; ++ dev_kfree_skb_any(skb); ++ tx_skb->skb = NULL; ++ } ++ } ++ } ++} ++ ++void ++rtl8125_tx_clear(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) { ++ struct rtl8125_tx_ring *ring = &tp->tx_ring[i]; ++ rtl8125_tx_clear_range(tp, ring, ring->dirty_tx, ring->num_tx_desc); ++ ring->cur_tx = ring->dirty_tx = 0; ++ } ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_schedule_reset_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_RESET_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->reset_task, 4); ++#endif //LINUX_VERSION_CODE > 
KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_esd_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_ESD_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->esd_task, RTL8125_ESD_TIMEOUT); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_linkchg_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->linkchg_task, 4); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_link_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_LINK_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->link_task, RTL8125_LINK_TIMEOUT); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++static void rtl8125_schedule_dash_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ set_bit(R8125_FLAG_TASK_DASH_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->dash_task, RTL8125_DASH_TIMEOUT); ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++} ++ ++#define rtl8125_cancel_schedule_reset_work(a) ++#define rtl8125_cancel_schedule_esd_work(a) ++#define rtl8125_cancel_schedule_linkchg_work(a) ++#define rtl8125_cancel_schedule_link_work(a) ++#define rtl8125_cancel_schedule_dash_work(a) ++ ++#else ++static void rtl8125_schedule_reset_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_RESET_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->reset_task, 4); ++} ++ ++static void rtl8125_cancel_schedule_reset_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->reset_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->reset_task); ++} ++ ++static void rtl8125_schedule_esd_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_ESD_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->esd_task, RTL8125_ESD_TIMEOUT); ++} ++ ++static void rtl8125_cancel_schedule_esd_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->esd_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->esd_task); ++} ++ ++static void rtl8125_schedule_linkchg_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->linkchg_task, 4); ++} ++ ++static void rtl8125_cancel_schedule_linkchg_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->linkchg_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->linkchg_task); ++} ++ ++static void rtl8125_schedule_link_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_LINK_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->link_task, RTL8125_LINK_TIMEOUT); ++} ++ ++static void rtl8125_cancel_schedule_link_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->link_task.work; ++ ++ if (!work->func) ++ return; ++ ++ cancel_delayed_work_sync(&tp->link_task); ++} ++ ++void rtl8125_schedule_dash_work(struct rtl8125_private *tp) ++{ ++ set_bit(R8125_FLAG_TASK_DASH_CHECK_PENDING, tp->task_flags); ++ schedule_delayed_work(&tp->dash_task, RTL8125_DASH_TIMEOUT); ++} ++ ++static void rtl8125_cancel_schedule_dash_work(struct rtl8125_private *tp) ++{ ++ struct work_struct *work = &tp->dash_task.work; ++ ++ if (!work->func) ++ 
return; ++ ++ cancel_delayed_work_sync(&tp->dash_task); ++} ++#endif ++ ++static void rtl8125_init_all_schedule_work(struct rtl8125_private *tp) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++ INIT_WORK(&tp->reset_task, rtl8125_reset_task, dev); ++ INIT_WORK(&tp->esd_task, rtl8125_esd_task, dev); ++ INIT_WORK(&tp->linkchg_task, rtl8125_linkchg_task, dev); ++ INIT_WORK(&tp->link_task, rtl8125_link_task, dev); ++ INIT_WORK(&tp->dash_task, rtl8125_dash_task, dev); ++#else ++ INIT_DELAYED_WORK(&tp->reset_task, rtl8125_reset_task); ++ INIT_DELAYED_WORK(&tp->esd_task, rtl8125_esd_task); ++ INIT_DELAYED_WORK(&tp->linkchg_task, rtl8125_linkchg_task); ++ INIT_DELAYED_WORK(&tp->link_task, rtl8125_link_task); ++ INIT_DELAYED_WORK(&tp->dash_task, rtl8125_dash_task); ++#endif ++} ++ ++static void rtl8125_cancel_all_schedule_work(struct rtl8125_private *tp) ++{ ++ rtl8125_cancel_schedule_reset_work(tp); ++ rtl8125_cancel_schedule_esd_work(tp); ++ rtl8125_cancel_schedule_linkchg_work(tp); ++ rtl8125_cancel_schedule_link_work(tp); ++ rtl8125_cancel_schedule_dash_work(tp); ++} ++ ++static void ++rtl8125_wait_for_irq_complete(struct rtl8125_private *tp) ++{ ++ if (tp->features & RTL_FEATURE_MSIX) { ++ int i; ++ for (i = 0; i < tp->irq_nvecs; i++) ++ synchronize_irq(tp->irq_tbl[i].vector); ++ } else { ++ synchronize_irq(tp->dev->irq); ++ } ++} ++ ++void ++_rtl8125_wait_for_quiescence(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ /* Wait for any pending NAPI task to complete */ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_disable_napi(tp); ++#endif//CONFIG_R8125_NAPI ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,5,67) ++ /* Give a racing hard_start_xmit a few cycles to complete. */ ++ synchronize_net(); ++#endif ++ ++ rtl8125_irq_mask_and_ack(tp); ++ ++ rtl8125_wait_for_irq_complete(tp); ++} ++ ++static void ++rtl8125_wait_for_quiescence(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ //suppress unused variable ++ (void)(tp); ++ ++ _rtl8125_wait_for_quiescence(dev); ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif//CONFIG_R8125_NAPI ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_reset_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_reset_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, reset_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ int i; ++ ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_RESET_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++ netdev_err(dev, "Device reseting!\n"); ++ ++ netif_carrier_off(dev); ++ netif_tx_disable(dev); ++ _rtl8125_wait_for_quiescence(dev); ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_tx_clear(tp); ++ ++ rtl8125_init_ring_indexes(tp); ++ ++ rtl8125_tx_desc_init(tp); ++ for (i = 0; i < tp->num_rx_rings; i++) { ++ struct rtl8125_rx_ring *ring; ++ u32 entry; ++ ++ ring = &tp->rx_ring[i]; ++ for (entry = 0; entry < ring->num_rx_desc; entry++) { ++ struct RxDesc *desc; ++ ++ desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry); ++ rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz); ++ } ++ } ++ ++#ifdef ENABLE_PTP_SUPPORT ++ rtl8125_ptp_reset(tp); ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++ rtl8125_enable_napi(tp); ++#endif //CONFIG_R8125_NAPI ++ ++ if (tp->resume_not_chg_speed) { ++ 
_rtl8125_check_link_status(dev, R8125_LINK_STATE_UNKNOWN); ++ ++ tp->resume_not_chg_speed = 0; ++ } else { ++ rtl8125_enable_hw_linkchg_interrupt(tp); ++ ++ rtl8125_set_speed(dev, tp->autoneg, tp->speed, tp->duplex, tp->advertising); ++ } ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_esd_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_esd_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, esd_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_ESD_CHECK_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++ rtl8125_esd_checker(tp); ++ ++ rtl8125_schedule_esd_work(tp); ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_linkchg_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ //struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_linkchg_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, linkchg_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_LINKCHG_CHECK_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++ rtl8125_check_link_status(dev); ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_link_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ //struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_link_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, link_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_LINK_CHECK_PENDING, ++ tp->task_flags)) ++ goto out_unlock; ++ ++ if (netif_carrier_ok(dev) != tp->link_ok(dev)) ++ rtl8125_schedule_linkchg_work(tp); ++ ++ rtl8125_schedule_link_work(tp); ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,20) ++static void rtl8125_dash_task(void *_data) ++{ ++ struct net_device *dev = _data; ++ //struct rtl8125_private *tp = netdev_priv(dev); ++#else ++static void rtl8125_dash_task(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = ++ container_of(work, struct rtl8125_private, dash_task.work); ++ struct net_device *dev = tp->dev; ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev) || ++ test_bit(R8125_FLAG_DOWN, tp->task_flags) || ++ !test_and_clear_bit(R8125_FLAG_TASK_DASH_CHECK_PENDING, tp->task_flags)) ++ goto out_unlock; ++ ++#ifdef ENABLE_DASH_SUPPORT ++ rtl8125_handle_dash_interrupt(dev); ++#endif ++ ++out_unlock: ++ rtnl_unlock(); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static void ++rtl8125_tx_timeout(struct net_device *dev, unsigned int txqueue) ++#else ++static void ++rtl8125_tx_timeout(struct net_device *dev) ++#endif ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ netdev_err(dev, "Transmit timeout reset Device!\n"); ++ ++ /* Let's wait a bit while any (async) irq lands on */ ++ rtl8125_schedule_reset_work(tp); ++} 
++ ++static u32 ++rtl8125_get_txd_opts1(struct rtl8125_tx_ring *ring, ++ u32 opts1, ++ u32 len, ++ unsigned int entry) ++{ ++ u32 status = opts1 | len; ++ ++ if (entry == ring->num_tx_desc - 1) ++ status |= RingEnd; ++ ++ return status; ++} ++ ++static int ++rtl8125_xmit_frags(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring, ++ struct sk_buff *skb, ++ const u32 *opts) ++{ ++ struct skb_shared_info *info = skb_shinfo(skb); ++ unsigned int cur_frag, entry; ++ struct TxDesc *txd = NULL; ++ const unsigned char nr_frags = info->nr_frags; ++ unsigned long PktLenCnt = 0; ++ bool LsoPatchEnabled = FALSE; ++ ++ entry = ring->cur_tx; ++ for (cur_frag = 0; cur_frag < nr_frags; cur_frag++) { ++ skb_frag_t *frag = info->frags + cur_frag; ++ dma_addr_t mapping; ++ u32 status, len; ++ void *addr; ++ ++ entry = (entry + 1) % ring->num_tx_desc; ++ ++ txd = ring->TxDescArray + entry; ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,2,0) ++ len = frag->size; ++ addr = ((void *) page_address(frag->page)) + frag->page_offset; ++#else ++ len = skb_frag_size(frag); ++ addr = skb_frag_address(frag); ++#endif ++ if (tp->RequireLSOPatch && ++ (cur_frag == nr_frags - 1) && ++ (opts[0] & (GiantSendv4|GiantSendv6)) && ++ PktLenCnt < ETH_FRAME_LEN && ++ len > 1) { ++ len -= 1; ++ mapping = dma_map_single(tp_to_dev(tp), addr, len, DMA_TO_DEVICE); ++ ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, tp->dev, ++ "Failed to map TX fragments DMA!\n"); ++ goto err_out; ++ } ++ ++ /* anti gcc 2.95.3 bugware (sic) */ ++ status = rtl8125_get_txd_opts1(ring, opts[0], len, entry); ++ ++ txd->addr = cpu_to_le64(mapping); ++ ++ ring->tx_skb[entry].len = len; ++ ++ txd->opts2 = cpu_to_le32(opts[1]); ++ wmb(); ++ txd->opts1 = cpu_to_le32(status); ++ ++ //second txd ++ addr += len; ++ len = 1; ++ entry = (entry + 1) % ring->num_tx_desc; ++ txd = ring->TxDescArray + entry; ++ cur_frag += 1; ++ ++ LsoPatchEnabled = TRUE; ++ } ++ ++ mapping = dma_map_single(tp_to_dev(tp), addr, len, DMA_TO_DEVICE); ++ ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, tp->dev, ++ "Failed to map TX fragments DMA!\n"); ++ goto err_out; ++ } ++ ++ /* anti gcc 2.95.3 bugware (sic) */ ++ status = rtl8125_get_txd_opts1(ring, opts[0], len, entry); ++ if (cur_frag == (nr_frags - 1) || LsoPatchEnabled == TRUE) ++ status |= LastFrag; ++ ++ txd->addr = cpu_to_le64(mapping); ++ ++ ring->tx_skb[entry].len = len; ++ ++ txd->opts2 = cpu_to_le32(opts[1]); ++ wmb(); ++ txd->opts1 = cpu_to_le32(status); ++ ++ PktLenCnt += len; ++ } ++ ++ return cur_frag; ++ ++err_out: ++ rtl8125_tx_clear_range(tp, ring, ring->cur_tx + 1, cur_frag); ++ return -EIO; ++} ++ ++static inline ++__be16 get_protocol(struct sk_buff *skb) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,37) ++ return vlan_get_protocol(skb); ++#else ++ __be16 protocol; ++ ++ if (skb->protocol == htons(ETH_P_8021Q)) ++ protocol = vlan_eth_hdr(skb)->h_vlan_encapsulated_proto; ++ else ++ protocol = skb->protocol; ++ ++ return protocol; ++#endif ++} ++ ++static inline ++u8 rtl8125_get_l4_protocol(struct sk_buff *skb) ++{ ++ int no = skb_network_offset(skb); ++ struct ipv6hdr *i6h, _i6h; ++ struct iphdr *ih, _ih; ++ u8 ip_protocol = IPPROTO_RAW; ++ ++ switch (get_protocol(skb)) { ++ case __constant_htons(ETH_P_IP): ++ ih = skb_header_pointer(skb, no, sizeof(_ih), &_ih); ++ if (ih) ++ ip_protocol = ih->protocol; ++ break; ++ case __constant_htons(ETH_P_IPV6): ++ i6h = 
skb_header_pointer(skb, no, sizeof(_i6h), &_i6h); ++ if (i6h) ++ ip_protocol = i6h->nexthdr; ++ break; ++ } ++ ++ return ip_protocol; ++} ++ ++static bool rtl8125_skb_pad_with_len(struct sk_buff *skb, unsigned int len) ++{ ++ if (skb_padto(skb, len)) ++ return false; ++ skb_put(skb, len - skb->len); ++ return true; ++} ++ ++static bool rtl8125_skb_pad(struct sk_buff *skb) ++{ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,19,0) ++ return rtl8125_skb_pad_with_len(skb, ETH_ZLEN); ++#else ++ return !eth_skb_pad(skb); ++#endif ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++/* msdn_giant_send_check() ++ * According to the document of microsoft, the TCP Pseudo Header excludes the ++ * packet length for IPv6 TCP large packets. ++ */ ++static int msdn_giant_send_check(struct sk_buff *skb) ++{ ++ const struct ipv6hdr *ipv6h; ++ struct tcphdr *th; ++ int ret; ++ ++ ret = skb_cow_head(skb, 0); ++ if (ret) ++ return ret; ++ ++ ipv6h = ipv6_hdr(skb); ++ th = tcp_hdr(skb); ++ ++ th->check = 0; ++ th->check = ~tcp_v6_check(0, &ipv6h->saddr, &ipv6h->daddr, 0); ++ ++ return ret; ++} ++#endif ++ ++static bool rtl8125_require_pad_ptp_pkt(struct rtl8125_private *tp) ++{ ++ switch (tp->mcfg) { ++ case CFG_METHOD_2 ... CFG_METHOD_7: ++ return true; ++ default: ++ return false; ++ } ++} ++ ++#define MIN_PATCH_LEN (47) ++static u32 ++rtl8125_get_patch_pad_len(struct rtl8125_private *tp, ++ struct sk_buff *skb) ++{ ++ u32 pad_len = 0; ++ int trans_data_len; ++ u32 hdr_len; ++ u32 pkt_len = skb->len; ++ u8 ip_protocol; ++ bool has_trans = skb_transport_header_was_set(skb); ++ ++ if (!rtl8125_require_pad_ptp_pkt(tp)) ++ goto no_padding; ++ ++ if (!(has_trans && (pkt_len < 175))) //128 + MIN_PATCH_LEN ++ goto no_padding; ++ ++ ip_protocol = rtl8125_get_l4_protocol(skb); ++ if (!(ip_protocol == IPPROTO_TCP || ip_protocol == IPPROTO_UDP)) ++ goto no_padding; ++ ++ trans_data_len = pkt_len - ++ (skb->transport_header - ++ skb_headroom(skb)); ++ if (ip_protocol == IPPROTO_UDP) { ++ if (trans_data_len > 3 && trans_data_len < MIN_PATCH_LEN) { ++ u16 dest_port = 0; ++ ++ skb_copy_bits(skb, skb->transport_header - skb_headroom(skb) + 2, &dest_port, 2); ++ dest_port = ntohs(dest_port); ++ ++ if (dest_port == 0x13f || ++ dest_port == 0x140) { ++ pad_len = MIN_PATCH_LEN - trans_data_len; ++ goto out; ++ } ++ } ++ } ++ ++ hdr_len = 0; ++ if (ip_protocol == IPPROTO_TCP) ++ hdr_len = 20; ++ else if (ip_protocol == IPPROTO_UDP) ++ hdr_len = 8; ++ if (trans_data_len < hdr_len) ++ pad_len = hdr_len - trans_data_len; ++ ++out: ++ if ((pkt_len + pad_len) < ETH_ZLEN) ++ pad_len = ETH_ZLEN - pkt_len; ++ ++ return pad_len; ++ ++no_padding: ++ ++ return 0; ++} ++ ++static bool ++rtl8125_tso_csum(struct sk_buff *skb, ++ struct net_device *dev, ++ u32 *opts, ++ unsigned int *bytecount, ++ unsigned short *gso_segs) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned long large_send = 0; ++ u32 csum_cmd = 0; ++ u8 sw_calc_csum = false; ++ u8 check_patch_required = true; ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ if (dev->features & (NETIF_F_TSO | NETIF_F_TSO6)) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) ++ u32 mss = skb_shinfo(skb)->tso_size; ++#else ++ u32 mss = skb_shinfo(skb)->gso_size; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,18) ++ ++ /* TCP Segmentation Offload (or TCP Large Send) */ ++ if (mss) { ++ union { ++ struct iphdr *v4; ++ struct ipv6hdr *v6; ++ unsigned char *hdr; ++ } ip; ++ union { ++ struct tcphdr *tcp; ++ struct udphdr *udp; ++ unsigned char *hdr; ++ } l4; ++ u32 
l4_offset, hdr_len; ++ ++ ip.hdr = skb_network_header(skb); ++ l4.hdr = skb_checksum_start(skb); ++ ++ l4_offset = skb_transport_offset(skb); ++ assert((l4_offset%2) == 0); ++ switch (get_protocol(skb)) { ++ case __constant_htons(ETH_P_IP): ++ if (l4_offset <= GTTCPHO_MAX) { ++ opts[0] |= GiantSendv4; ++ opts[0] |= l4_offset << GTTCPHO_SHIFT; ++ opts[1] |= min(mss, MSS_MAX) << 18; ++ large_send = 1; ++ } ++ break; ++ case __constant_htons(ETH_P_IPV6): ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,14,0) ++ if (msdn_giant_send_check(skb)) ++ return false; ++#endif ++ if (l4_offset <= GTTCPHO_MAX) { ++ opts[0] |= GiantSendv6; ++ opts[0] |= l4_offset << GTTCPHO_SHIFT; ++ opts[1] |= min(mss, MSS_MAX) << 18; ++ large_send = 1; ++ } ++ break; ++ default: ++ if (unlikely(net_ratelimit())) ++ dprintk("tso proto=%x!\n", skb->protocol); ++ break; ++ } ++ ++ if (large_send == 0) ++ return false; ++ ++ ++ /* compute length of segmentation header */ ++ hdr_len = (l4.tcp->doff * 4) + l4_offset; ++ /* update gso size and bytecount with header size */ ++ *gso_segs = skb_shinfo(skb)->gso_segs; ++ *bytecount += (*gso_segs - 1) * hdr_len; ++ ++ return true; ++ } ++ } ++#endif //LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++ if (skb->ip_summed == CHECKSUM_PARTIAL) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,22) ++ const struct iphdr *ip = skb->nh.iph; ++ ++ if (dev->features & NETIF_F_IP_CSUM) { ++ if (ip->protocol == IPPROTO_TCP) ++ csum_cmd = tp->tx_ip_csum_cmd | tp->tx_tcp_csum_cmd; ++ else if (ip->protocol == IPPROTO_UDP) ++ csum_cmd = tp->tx_ip_csum_cmd | tp->tx_udp_csum_cmd; ++ else if (ip->protocol == IPPROTO_IP) ++ csum_cmd = tp->tx_ip_csum_cmd; ++ } ++#else ++ u8 ip_protocol = IPPROTO_RAW; ++ ++ switch (get_protocol(skb)) { ++ case __constant_htons(ETH_P_IP): ++ if (dev->features & NETIF_F_IP_CSUM) { ++ ip_protocol = ip_hdr(skb)->protocol; ++ csum_cmd = tp->tx_ip_csum_cmd; ++ } ++ break; ++ case __constant_htons(ETH_P_IPV6): ++ if (dev->features & NETIF_F_IPV6_CSUM) { ++ if (skb_transport_offset(skb) > 0 && skb_transport_offset(skb) <= TCPHO_MAX) { ++ ip_protocol = ipv6_hdr(skb)->nexthdr; ++ csum_cmd = tp->tx_ipv6_csum_cmd; ++ csum_cmd |= skb_transport_offset(skb) << TCPHO_SHIFT; ++ } ++ } ++ break; ++ default: ++ if (unlikely(net_ratelimit())) ++ dprintk("checksum_partial proto=%x!\n", skb->protocol); ++ break; ++ } ++ ++ if (ip_protocol == IPPROTO_TCP) ++ csum_cmd |= tp->tx_tcp_csum_cmd; ++ else if (ip_protocol == IPPROTO_UDP) ++ csum_cmd |= tp->tx_udp_csum_cmd; ++#endif ++ if (csum_cmd == 0) { ++ sw_calc_csum = true; ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ WARN_ON(1); /* we need a WARN() */ ++#endif ++ } ++ ++ if (ip_protocol == IPPROTO_TCP) ++ check_patch_required = false; ++ } ++ ++ if (check_patch_required) { ++ u32 pad_len = rtl8125_get_patch_pad_len(tp, skb); ++ ++ if (pad_len > 0) { ++ if (!rtl8125_skb_pad_with_len(skb, skb->len + pad_len)) ++ return false; ++ ++ if (csum_cmd != 0) ++ sw_calc_csum = true; ++ } ++ } ++ ++ if (skb->len < ETH_ZLEN) { ++ if (tp->UseSwPaddingShortPkt || ++ (tp->ShortPacketSwChecksum && csum_cmd != 0)) { ++ if (!rtl8125_skb_pad(skb)) ++ return false; ++ ++ if (csum_cmd != 0) ++ sw_calc_csum = true; ++ } ++ } ++ ++ if (sw_calc_csum) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,7) ++ skb_checksum_help(&skb, 0); ++#elif LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) && LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,10) ++ skb_checksum_help(skb, 0); ++#else ++ skb_checksum_help(skb); ++#endif ++ } else ++ 
opts[1] |= csum_cmd; ++ ++ return true; ++} ++ ++static bool rtl8125_tx_slots_avail(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring) ++{ ++ unsigned int slots_avail = READ_ONCE(ring->dirty_tx) + ring->num_tx_desc ++ - READ_ONCE(ring->cur_tx); ++ ++ /* A skbuff with nr_frags needs nr_frags+1 entries in the tx queue */ ++ return slots_avail > MAX_SKB_FRAGS; ++} ++ ++static inline u32 ++rtl8125_fast_mod_mask(const u32 input, const u32 mask) ++{ ++ return input > mask ? input & mask : input; ++} ++ ++static void rtl8125_doorbell(struct rtl8125_private *tp, ++ struct rtl8125_tx_ring *ring) ++{ ++ if (tp->EnableTxNoClose) { ++ if (tp->HwSuppTxNoCloseVer > 3) ++ RTL_W32(tp, ring->sw_tail_ptr_reg, ring->cur_tx); ++ else ++ RTL_W16(tp, ring->sw_tail_ptr_reg, ring->cur_tx); ++ } else ++ RTL_W16(tp, TPPOLL_8125, BIT(ring->index)); /* set polling bit */ ++} ++ ++static netdev_tx_t ++rtl8125_start_xmit(struct sk_buff *skb, ++ struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ unsigned int bytecount; ++ unsigned short gso_segs; ++ struct ring_info *last; ++ unsigned int last_entry; ++ unsigned int entry; ++ struct TxDesc *txd; ++ dma_addr_t mapping; ++ u32 len; ++ u32 opts[2]; ++ netdev_tx_t ret = NETDEV_TX_OK; ++ int frags; ++ u8 EnableTxNoClose = tp->EnableTxNoClose; ++ const u16 queue_mapping = skb_get_queue_mapping(skb); ++ struct rtl8125_tx_ring *ring; ++ bool stop_queue; ++ ++ assert(queue_mapping < tp->num_tx_rings); ++ ++ ring = &tp->tx_ring[queue_mapping]; ++ ++ if (unlikely(!rtl8125_tx_slots_avail(tp, ring))) { ++ if (netif_msg_drv(tp)) { ++ printk(KERN_ERR ++ "%s: BUG! Tx Ring[%d] full when queue awake!\n", ++ dev->name, ++ queue_mapping); ++ } ++ goto err_stop; ++ } ++ ++ entry = ring->cur_tx % ring->num_tx_desc; ++ txd = ring->TxDescArray + entry; ++ ++ if (!EnableTxNoClose) { ++ if (unlikely(le32_to_cpu(txd->opts1) & DescOwn)) { ++ if (netif_msg_drv(tp)) { ++ printk(KERN_ERR ++ "%s: BUG! 
Tx Desc is own by hardware!\n", ++ dev->name); ++ } ++ goto err_stop; ++ } ++ } ++ ++ bytecount = skb->len; ++ gso_segs = 1; ++ ++ opts[0] = DescOwn; ++ opts[1] = rtl8125_tx_vlan_tag(tp, skb); ++ ++ if (unlikely(!rtl8125_tso_csum(skb, dev, opts, &bytecount, &gso_segs))) ++ goto err_dma_0; ++ ++ frags = rtl8125_xmit_frags(tp, ring, skb, opts); ++ if (unlikely(frags < 0)) ++ goto err_dma_0; ++ if (frags) { ++ len = skb_headlen(skb); ++ opts[0] |= FirstFrag; ++ } else { ++ len = skb->len; ++ opts[0] |= FirstFrag | LastFrag; ++ } ++ ++ opts[0] = rtl8125_get_txd_opts1(ring, opts[0], len, entry); ++ mapping = dma_map_single(tp_to_dev(tp), skb->data, len, DMA_TO_DEVICE); ++ if (unlikely(dma_mapping_error(tp_to_dev(tp), mapping))) { ++ if (unlikely(net_ratelimit())) ++ netif_err(tp, drv, dev, "Failed to map TX DMA!\n"); ++ goto err_dma_1; ++ } ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (unlikely(skb_shinfo(skb)->tx_flags & SKBTX_HW_TSTAMP)) { ++ if (!test_and_set_bit_lock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state)) { ++ if (tp->hwtstamp_config.tx_type == HWTSTAMP_TX_ON && ++ !tp->ptp_tx_skb) { ++ skb_shinfo(skb)->tx_flags |= SKBTX_IN_PROGRESS; ++ ++ tp->ptp_tx_skb = skb_get(skb); ++ tp->ptp_tx_start = jiffies; ++ schedule_work(&tp->ptp_tx_work); ++ } else ++ tp->tx_hwtstamp_skipped++; ++ } ++ } ++#endif ++ /* set first fragment's length */ ++ ring->tx_skb[entry].len = len; ++ ++ /* set skb to last fragment */ ++ last_entry = (entry + frags) % ring->num_tx_desc; ++ last = &ring->tx_skb[last_entry]; ++ last->skb = skb; ++ last->gso_segs = gso_segs; ++ last->bytecount = bytecount; ++ ++ txd->addr = cpu_to_le64(mapping); ++ txd->opts2 = cpu_to_le32(opts[1]); ++ wmb(); ++ txd->opts1 = cpu_to_le32(opts[0]); ++ ++ netdev_tx_sent_queue(txring_txq(ring), bytecount); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ++ dev->trans_start = jiffies; ++#else ++ skb_tx_timestamp(skb); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(3,5,0) ++ ++ /* rtl_tx needs to see descriptor changes before updated tp->cur_tx */ ++ smp_wmb(); ++ ++ WRITE_ONCE(ring->cur_tx, ring->cur_tx + frags + 1); ++ ++ stop_queue = !rtl8125_tx_slots_avail(tp, ring); ++ if (unlikely(stop_queue)) { ++ /* Avoid wrongly optimistic queue wake-up: rtl_tx thread must ++ * not miss a ring update when it notices a stopped queue. ++ */ ++ smp_wmb(); ++ netif_stop_subqueue(dev, queue_mapping); ++ } ++ ++ if (netif_xmit_stopped(txring_txq(ring)) || !netdev_xmit_more()) ++ rtl8125_doorbell(tp, ring); ++ ++ if (unlikely(stop_queue)) { ++ /* Sync with rtl_tx: ++ * - publish queue status and cur_tx ring index (write barrier) ++ * - refresh dirty_tx ring index (read barrier). ++ * May the current thread have a pessimistic view of the ring ++ * status and forget to wake up queue, a racing rtl_tx thread ++ * can't. 
++ */ ++ smp_mb(); ++ if (rtl8125_tx_slots_avail(tp, ring)) ++ netif_start_subqueue(dev, queue_mapping); ++ } ++out: ++ return ret; ++err_dma_1: ++ rtl8125_tx_clear_range(tp, ring, ring->cur_tx + 1, frags); ++err_dma_0: ++ RTLDEV->stats.tx_dropped++; ++ dev_kfree_skb_any(skb); ++ ret = NETDEV_TX_OK; ++ goto out; ++err_stop: ++ netif_stop_subqueue(dev, queue_mapping); ++ ret = NETDEV_TX_BUSY; ++ RTLDEV->stats.tx_dropped++; ++ goto out; ++} ++ ++/* recycle tx no close desc*/ ++static int ++rtl8125_tx_interrupt_noclose(struct rtl8125_tx_ring *ring, int budget) ++{ ++ unsigned int total_bytes = 0, total_packets = 0; ++ struct rtl8125_private *tp = ring->priv; ++ struct net_device *dev = tp->dev; ++ unsigned int dirty_tx, tx_left; ++ unsigned int tx_desc_closed; ++ unsigned int count = 0; ++ ++ dirty_tx = ring->dirty_tx; ++ ring->NextHwDesCloPtr = rtl8125_get_hw_clo_ptr(ring); ++ tx_desc_closed = rtl8125_fast_mod_mask(ring->NextHwDesCloPtr - ++ ring->BeginHwDesCloPtr, ++ tp->MaxTxDescPtrMask); ++ tx_left = min((READ_ONCE(ring->cur_tx) - dirty_tx), tx_desc_closed); ++ ring->BeginHwDesCloPtr += tx_left; ++ ++ while (tx_left > 0) { ++ unsigned int entry = dirty_tx % ring->num_tx_desc; ++ struct ring_info *tx_skb = ring->tx_skb + entry; ++ ++ rtl8125_unmap_tx_skb(tp->pci_dev, ++ tx_skb, ++ ring->TxDescArray + entry); ++ ++ if (tx_skb->skb != NULL) { ++ /* update the statistics for this packet */ ++ total_bytes += tx_skb->bytecount; ++ total_packets += tx_skb->gso_segs; ++ ++ RTL_NAPI_CONSUME_SKB_ANY(tx_skb->skb, budget); ++ tx_skb->skb = NULL; ++ } ++ dirty_tx++; ++ tx_left--; ++ } ++ ++ if (total_packets) { ++ netdev_tx_completed_queue(txring_txq(ring), ++ total_packets, total_bytes); ++ ++ RTLDEV->stats.tx_bytes += total_bytes; ++ RTLDEV->stats.tx_packets+= total_packets; ++ } ++ ++ if (ring->dirty_tx != dirty_tx) { ++ count = dirty_tx - ring->dirty_tx; ++ WRITE_ONCE(ring->dirty_tx, dirty_tx); ++ smp_wmb(); ++ if (__netif_subqueue_stopped(dev, ring->index) && ++ rtl8125_tx_slots_avail(tp, ring) && netif_carrier_ok(dev)) { ++ netif_start_subqueue(dev, ring->index); ++ } ++ } ++ ++ return count; ++} ++ ++/* recycle tx close desc*/ ++static int ++rtl8125_tx_interrupt_close(struct rtl8125_tx_ring *ring, int budget) ++{ ++ unsigned int total_bytes = 0, total_packets = 0; ++ struct rtl8125_private *tp = ring->priv; ++ struct net_device *dev = tp->dev; ++ unsigned int dirty_tx, tx_left; ++ unsigned int count = 0; ++ ++ dirty_tx = ring->dirty_tx; ++ tx_left = READ_ONCE(ring->cur_tx) - dirty_tx; ++ ++ while (tx_left > 0) { ++ unsigned int entry = dirty_tx % ring->num_tx_desc; ++ struct ring_info *tx_skb = ring->tx_skb + entry; ++ ++ if (le32_to_cpu(READ_ONCE(ring->TxDescArray[entry].opts1)) & DescOwn) ++ break; ++ ++ rtl8125_unmap_tx_skb(tp->pci_dev, ++ tx_skb, ++ ring->TxDescArray + entry); ++ ++ if (tx_skb->skb != NULL) { ++ /* update the statistics for this packet */ ++ total_bytes += tx_skb->bytecount; ++ total_packets += tx_skb->gso_segs; ++ ++ RTL_NAPI_CONSUME_SKB_ANY(tx_skb->skb, budget); ++ tx_skb->skb = NULL; ++ } ++ dirty_tx++; ++ tx_left--; ++ } ++ ++ if (total_packets) { ++ netdev_tx_completed_queue(txring_txq(ring), ++ total_packets, total_bytes); ++ ++ RTLDEV->stats.tx_bytes += total_bytes; ++ RTLDEV->stats.tx_packets+= total_packets; ++ } ++ ++ if (ring->dirty_tx != dirty_tx) { ++ count = dirty_tx - ring->dirty_tx; ++ WRITE_ONCE(ring->dirty_tx, dirty_tx); ++ smp_wmb(); ++ if (__netif_subqueue_stopped(dev, ring->index) && ++ rtl8125_tx_slots_avail(tp, ring) && netif_carrier_ok(dev)) { ++ 
netif_start_subqueue(dev, ring->index); ++ } ++ ++ if (READ_ONCE(ring->cur_tx) != dirty_tx) ++ rtl8125_doorbell(tp, ring); ++ } ++ ++ return count; ++} ++ ++static int ++rtl8125_tx_interrupt(struct rtl8125_tx_ring *ring, int budget) ++{ ++ struct rtl8125_private *tp = ring->priv; ++ ++ if (tp->EnableTxNoClose) ++ return rtl8125_tx_interrupt_noclose(ring, budget); ++ else ++ return rtl8125_tx_interrupt_close(ring, budget); ++} ++ ++static int ++rtl8125_tx_interrupt_with_vector(struct rtl8125_private *tp, ++ const int message_id, ++ int budget) ++{ ++ int count = 0; ++ ++ switch (tp->HwCurrIsrVer) { ++ case 3: ++ case 4: ++ if (message_id < tp->num_tx_rings) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[message_id], budget); ++ break; ++ case 5: ++ if (message_id == 16) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[0], budget); ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++ else if (message_id == 17 && tp->num_tx_rings > 1) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[1], budget); ++#endif ++ break; ++ case 7: ++ if (message_id == 27) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[0], budget); ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++ else if (message_id == 28 && tp->num_tx_rings > 1) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[1], budget); ++#endif ++ break; ++ default: ++ if (message_id == 16) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[0], budget); ++#ifdef ENABLE_MULTIPLE_TX_QUEUE ++ else if (message_id == 18 && tp->num_tx_rings > 1) ++ count += rtl8125_tx_interrupt(&tp->tx_ring[1], budget); ++#endif ++ break; ++ } ++ ++ return count; ++} ++ ++static inline int ++rtl8125_fragmented_frame(struct rtl8125_private *tp, u32 status) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return (status & (FirstFrag_V3 | LastFrag_V3)) != (FirstFrag_V3 | LastFrag_V3); ++ case RX_DESC_RING_TYPE_4: ++ return (status & (FirstFrag_V4 | LastFrag_V4)) != (FirstFrag_V4 | LastFrag_V4); ++ default: ++ return (status & (FirstFrag | LastFrag)) != (FirstFrag | LastFrag); ++ } ++} ++ ++static inline int ++rtl8125_is_non_eop(struct rtl8125_private *tp, u32 status) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return !(status & LastFrag_V3); ++ case RX_DESC_RING_TYPE_4: ++ return !(status & LastFrag_V4); ++ default: ++ return !(status & LastFrag); ++ } ++} ++ ++static inline int ++rtl8125_rx_desc_type(u32 status) ++{ ++ return ((status >> 26) & 0x0F); ++} ++ ++static inline void ++rtl8125_rx_v1_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDesc *desc) ++{ ++ u32 opts1 = le32_to_cpu(desc->opts1); ++ ++ if (((opts1 & RxTCPT) && !(opts1 & RxTCPF)) || ++ ((opts1 & RxUDPT) && !(opts1 & RxUDPF))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++} ++ ++static inline void ++rtl8125_rx_v3_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDescV3 *descv3) ++{ ++ u32 opts2 = le32_to_cpu(descv3->RxDescNormalDDWord4.opts2); ++ ++ /* rx csum offload for RTL8125 */ ++ if (((opts2 & RxTCPT_v3) && !(opts2 & RxTCPF_v3)) || ++ ((opts2 & RxUDPT_v3) && !(opts2 & RxUDPF_v3))) ++ skb->ip_summed = CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++} ++ ++static inline void ++rtl8125_rx_v4_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDescV4 *descv4) ++{ ++ u32 opts1 = le32_to_cpu(descv4->RxDescNormalDDWord2.opts1); ++ ++ /* rx csum offload for RTL8125 */ ++ if (((opts1 & RxTCPT_v4) && !(opts1 & RxTCPF_v4)) || ++ ((opts1 & RxUDPT_v4) && !(opts1 & RxUDPF_v4))) ++ skb->ip_summed = 
CHECKSUM_UNNECESSARY; ++ else ++ skb_checksum_none_assert(skb); ++} ++ ++static inline void ++rtl8125_rx_csum(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_rx_v3_csum(tp, skb, (struct RxDescV3 *)desc); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_rx_v4_csum(tp, skb, (struct RxDescV4 *)desc); ++ break; ++ default: ++ rtl8125_rx_v1_csum(tp, skb, desc); ++ break; ++ } ++} ++ ++/* ++static inline int ++rtl8125_try_rx_copy(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ struct sk_buff **sk_buff, ++ int pkt_size, ++ struct RxDesc *desc, ++ int rx_buf_sz) ++{ ++ int ret = -1; ++ ++ struct sk_buff *skb; ++ ++ skb = RTL_ALLOC_SKB_INTR(&tp->r8125napi[ring->index].napi, pkt_size + R8125_RX_ALIGN); ++ if (skb) { ++ u8 *data; ++ ++ data = sk_buff[0]->data; ++ if (!R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++ prefetch(data - R8125_RX_ALIGN); ++#endif ++ eth_copy_and_sum(skb, data, pkt_size, 0); ++ *sk_buff = skb; ++ rtl8125_mark_to_asic(tp, desc, rx_buf_sz); ++ ret = 0; ++ } ++ ++ return ret; ++} ++*/ ++ ++static inline void ++rtl8125_rx_skb(struct rtl8125_private *tp, ++ struct sk_buff *skb, ++ u32 ring_index) ++{ ++#ifdef CONFIG_R8125_NAPI ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++ netif_receive_skb(skb); ++#else ++ napi_gro_receive(&tp->r8125napi[ring_index].napi, skb); ++#endif ++#else ++ netif_rx(skb); ++#endif ++} ++ ++static int ++rtl8125_check_rx_desc_error(struct net_device *dev, ++ struct rtl8125_private *tp, ++ u32 status) ++{ ++ int ret = 0; ++ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ if (unlikely(status & RxRES_V3)) { ++ if (status & (RxRWT_V3 | RxRUNT_V3)) ++ RTLDEV->stats.rx_length_errors++; ++ if (status & RxCRC_V3) ++ RTLDEV->stats.rx_crc_errors++; ++ ++ ret = -1; ++ } ++ break; ++ case RX_DESC_RING_TYPE_4: ++ if (unlikely(status & RxRES_V4)) { ++ if (status & RxRUNT_V4) ++ RTLDEV->stats.rx_length_errors++; ++ if (status & RxCRC_V4) ++ RTLDEV->stats.rx_crc_errors++; ++ ++ ret = -1; ++ } ++ break; ++ default: ++ if (unlikely(status & RxRES)) { ++ if (status & (RxRWT | RxRUNT)) ++ RTLDEV->stats.rx_length_errors++; ++ if (status & RxCRC) ++ RTLDEV->stats.rx_crc_errors++; ++ ++ ret = -1; ++ } ++ break; ++ } ++ ++ return ret; ++} ++ ++#ifdef ENABLE_PAGE_REUSE ++ ++static inline bool ++rtl8125_reuse_rx_ok(struct page *page) ++{ ++ /* avoid re-using remote pages */ ++ if (!dev_page_is_reusable(page)) { ++ //printk(KERN_INFO "r8125 page pfmemalloc, can't reuse!\n"); ++ return false; ++ } ++ /* if we are only owner of page we can reuse it */ ++ if (unlikely(page_ref_count(page) != 1)) { ++ //printk(KERN_INFO "r8125 page refcnt %d, can't reuse!\n", page_ref_count(page)); ++ return false; ++ } ++ ++ return true; ++} ++ ++static void ++rtl8125_reuse_rx_buffer(struct rtl8125_private *tp, struct rtl8125_rx_ring *ring, u32 cur_rx, struct rtl8125_rx_buffer *rxb) ++{ ++ struct page *page = rxb->page; ++ ++ u32 dirty_rx = ring->dirty_rx; ++ u32 entry = dirty_rx % ring->num_rx_desc; ++ struct rtl8125_rx_buffer *nrxb = &ring->rx_buffer[entry]; ++ ++ u32 noffset; ++ ++ //the page gonna be shared by us and kernel, keep page ref = 2 ++ page_ref_inc(page); ++ ++ //flip the buffer in page to use next ++ noffset = rxb->page_offset ^ (tp->rx_buf_page_size / 2); //one page, two buffer, ping-pong ++ ++ nrxb->dma = rxb->dma; ++ nrxb->page_offset = noffset; ++ nrxb->data = rxb->data; ++ 
++ if (cur_rx != dirty_rx) { ++ //move the buffer to other slot ++ nrxb->page = page; ++ rxb->page = NULL; ++ } ++} ++ ++static void rtl8125_put_rx_buffer(struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ u32 cur_rx, ++ struct rtl8125_rx_buffer *rxb) ++{ ++ struct rtl8125_rx_buffer *nrxb; ++ struct page *page = rxb->page; ++ u32 entry; ++ ++ entry = ring->dirty_rx % ring->num_rx_desc; ++ nrxb = &ring->rx_buffer[entry]; ++ if (likely(rtl8125_reuse_rx_ok(page))) { ++ /* hand second half of page back to the ring */ ++ rtl8125_reuse_rx_buffer(tp, ring, cur_rx, rxb); ++ } else { ++ tp->page_reuse_fail_cnt++; ++ ++ dma_unmap_page_attrs(&tp->pci_dev->dev, rxb->dma, ++ tp->rx_buf_page_size, ++ DMA_FROM_DEVICE, ++ (DMA_ATTR_SKIP_CPU_SYNC | DMA_ATTR_WEAK_ORDERING)); ++ //the page ref is kept 1, uniquely owned by kernel now ++ rxb->page = NULL; ++ ++ return; ++ } ++ ++ dma_sync_single_range_for_device(tp_to_dev(tp), ++ nrxb->dma, ++ nrxb->page_offset, ++ tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ ++ rtl8125_map_to_asic(tp, ring, ++ rtl8125_get_rxdesc(tp, ring->RxDescArray, entry), ++ nrxb->dma + nrxb->page_offset, ++ tp->rx_buf_sz, entry); ++ ++ ring->dirty_rx++; ++} ++ ++#endif //ENABLE_PAGE_REUSE ++ ++static int ++rtl8125_rx_interrupt(struct net_device *dev, ++ struct rtl8125_private *tp, ++ struct rtl8125_rx_ring *ring, ++ napi_budget budget) ++{ ++ unsigned int cur_rx, rx_left; ++ unsigned int delta, count = 0; ++ unsigned int entry; ++ struct RxDesc *desc; ++ struct sk_buff *skb; ++ u32 status; ++ u32 rx_quota; ++ u32 ring_index = ring->index; ++#ifdef ENABLE_PAGE_REUSE ++ struct rtl8125_rx_buffer *rxb; ++#else //ENABLE_PAGE_REUSE ++ u64 rx_buf_phy_addr; ++#endif //ENABLE_PAGE_REUSE ++ unsigned int total_rx_multicast_packets = 0; ++ unsigned int total_rx_bytes = 0, total_rx_packets = 0; ++ ++ assert(dev != NULL); ++ assert(tp != NULL); ++ ++ if (ring->RxDescArray == NULL) ++ goto rx_out; ++ ++ rx_quota = RTL_RX_QUOTA(budget); ++ cur_rx = ring->cur_rx; ++ rx_left = ring->num_rx_desc + ring->dirty_rx - cur_rx; ++ rx_left = rtl8125_rx_quota(rx_left, (u32)rx_quota); ++ ++ for (; rx_left > 0; rx_left--, cur_rx++) { ++#ifdef ENABLE_PTP_SUPPORT ++ u8 desc_type = RXDESC_TYPE_NORMAL; ++ struct RxDescV3 ptp_desc; ++#endif //ENABLE_PTP_SUPPORT ++#ifndef ENABLE_PAGE_REUSE ++ const void *rx_buf; ++#endif //!ENABLE_PAGE_REUSE ++ u32 pkt_size; ++ ++ entry = cur_rx % ring->num_rx_desc; ++ desc = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry); ++ status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc)); ++ if (status & DescOwn) { ++ RTL_R8(tp, tp->imr_reg[0]); ++ status = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc)); ++ if (status & DescOwn) ++ break; ++ } ++ ++ rmb(); ++ ++ if (unlikely(rtl8125_check_rx_desc_error(dev, tp, status) < 0)) { ++ if (netif_msg_rx_err(tp)) { ++ printk(KERN_INFO ++ "%s: Rx ERROR. 
status = %08x\n", ++ dev->name, status); ++ } ++ ++ RTLDEV->stats.rx_errors++; ++ ++ if (!(dev->features & NETIF_F_RXALL)) ++ goto release_descriptor; ++ } ++ pkt_size = status & 0x00003fff; ++ if (likely(!(dev->features & NETIF_F_RXFCS))) { ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ if (rtl8125_is_non_eop(tp, status) && ++ pkt_size == tp->rx_buf_sz) { ++ struct RxDesc *desc_next; ++ unsigned int entry_next; ++ int pkt_size_next; ++ u32 status_next; ++ ++ entry_next = (cur_rx + 1) % ring->num_rx_desc; ++ desc_next = rtl8125_get_rxdesc(tp, ring->RxDescArray, entry_next); ++ status_next = le32_to_cpu(rtl8125_rx_desc_opts1(tp, desc_next)); ++ if (!(status_next & DescOwn)) { ++ pkt_size_next = status_next & 0x00003fff; ++ if (pkt_size_next < ETH_FCS_LEN) ++ pkt_size -= (ETH_FCS_LEN - pkt_size_next); ++ } ++ } ++#endif //ENABLE_RX_PACKET_FRAGMENT ++ if (!rtl8125_is_non_eop(tp, status)) { ++ if (pkt_size < ETH_FCS_LEN) { ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ pkt_size = 0; ++#else ++ goto drop_packet; ++#endif //ENABLE_RX_PACKET_FRAGMENT ++ } else ++ pkt_size -= ETH_FCS_LEN; ++ } ++ } ++ ++ if (unlikely(pkt_size > tp->rx_buf_sz)) ++ goto drop_packet; ++ ++#if !defined(ENABLE_RX_PACKET_FRAGMENT) || !defined(ENABLE_PAGE_REUSE) ++ /* ++ * The driver does not support incoming fragmented ++ * frames. They are seen as a symptom of over-mtu ++ * sized frames. ++ */ ++ if (unlikely(rtl8125_fragmented_frame(tp, status))) ++ goto drop_packet; ++#endif //!ENABLE_RX_PACKET_FRAGMENT || !ENABLE_PAGE_REUSE ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer == 1) { ++ desc_type = rtl8125_rx_desc_type(status); ++ if (desc_type == RXDESC_TYPE_NEXT && rx_left > 0) { ++ u32 status_next; ++ struct RxDescV3 *desc_next; ++ unsigned int entry_next; ++ ++ cur_rx++; ++ rx_left--; ++ entry_next = cur_rx % ring->num_rx_desc; ++ desc_next = (struct RxDescV3 *)rtl8125_get_rxdesc(tp, ring->RxDescArray, entry_next); ++ status_next = le32_to_cpu(desc_next->RxDescNormalDDWord4.opts1); ++ if (unlikely(status_next & DescOwn)) { ++ udelay(1); ++ status_next = le32_to_cpu(desc_next->RxDescNormalDDWord4.opts1); ++ if (unlikely(status_next & DescOwn)) { ++ if (netif_msg_rx_err(tp)) { ++ printk(KERN_ERR ++ "%s: Rx Next Desc ERROR. 
status = %08x\n", ++ dev->name, status_next); ++ } ++ rtl8125_set_desc_dma_addr(tp, (struct RxDesc *)desc_next, ++ ring->RxDescPhyAddr[entry_next]); ++ wmb(); ++ rtl8125_mark_to_asic(tp, (struct RxDesc *)desc_next, tp->rx_buf_sz); ++ goto drop_packet; ++ } ++ } ++ ++ rmb(); ++ ++ desc_type = rtl8125_rx_desc_type(status_next); ++ if (desc_type == RXDESC_TYPE_PTP) { ++ ptp_desc = *desc_next; ++ rmb(); ++ rtl8125_set_desc_dma_addr(tp, (struct RxDesc *)desc_next, ++ ring->RxDescPhyAddr[entry_next]); ++ wmb(); ++ rtl8125_mark_to_asic(tp, (struct RxDesc *)desc_next, tp->rx_buf_sz); ++ } else { ++ WARN_ON(1); ++ rtl8125_set_desc_dma_addr(tp, (struct RxDesc *)desc_next, ++ ring->RxDescPhyAddr[entry_next]); ++ wmb(); ++ rtl8125_mark_to_asic(tp, (struct RxDesc *)desc_next, tp->rx_buf_sz); ++ goto drop_packet; ++ } ++ } else ++ WARN_ON(desc_type != RXDESC_TYPE_NORMAL); ++ } ++#endif ++#ifdef ENABLE_PAGE_REUSE ++ rxb = &ring->rx_buffer[entry]; ++ skb = rxb->skb; ++ rxb->skb = NULL; ++ if (!skb) { ++ skb = RTL_BUILD_SKB_INTR(rxb->data + rxb->page_offset - ring->rx_offset, tp->rx_buf_page_size / 2); ++ if (!skb) { ++ //netdev_err(tp->dev, "Failed to allocate RX skb!\n"); ++ goto drop_packet; ++ } ++ ++ skb->dev = dev; ++ if (!R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++ skb_put(skb, pkt_size); ++#ifdef ENABLE_RSS_SUPPORT ++ rtl8125_rx_hash(tp, desc, skb); ++#endif ++ rtl8125_rx_csum(tp, skb, desc); ++ } else ++ skb_add_rx_frag(skb, skb_shinfo(skb)->nr_frags, rxb->page, ++ rxb->page_offset, pkt_size, tp->rx_buf_page_size / 2); ++ ++ //recycle desc ++ rtl8125_put_rx_buffer(tp, ring, cur_rx, rxb); ++ ++ dma_sync_single_range_for_cpu(tp_to_dev(tp), ++ rxb->dma, ++ rxb->page_offset, ++ tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++#else //ENABLE_PAGE_REUSE ++ skb = RTL_ALLOC_SKB_INTR(&tp->r8125napi[ring->index].napi, pkt_size + R8125_RX_ALIGN); ++ if (!skb) { ++ //netdev_err(tp->dev, "Failed to allocate RX skb!\n"); ++ goto drop_packet; ++ } ++ ++ skb->dev = dev; ++ if (!R8125_USE_NAPI_ALLOC_SKB) ++ skb_reserve(skb, R8125_RX_ALIGN); ++ skb_put(skb, pkt_size); ++ ++ rx_buf_phy_addr = ring->RxDescPhyAddr[entry]; ++ dma_sync_single_for_cpu(tp_to_dev(tp), ++ rx_buf_phy_addr, tp->rx_buf_sz, ++ DMA_FROM_DEVICE); ++ rx_buf = ring->Rx_skbuff[entry]->data; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,4,37) ++ prefetch(rx_buf - R8125_RX_ALIGN); ++#endif ++ eth_copy_and_sum(skb, rx_buf, pkt_size, 0); ++ ++ dma_sync_single_for_device(tp_to_dev(tp), rx_buf_phy_addr, ++ tp->rx_buf_sz, DMA_FROM_DEVICE); ++#endif //ENABLE_PAGE_REUSE ++ ++#ifdef ENABLE_PTP_SUPPORT ++ if (tp->HwSuppPtpVer == 1 && desc_type == RXDESC_TYPE_PTP) ++ rtl8125_rx_mac_ptp_pktstamp(tp, skb, &ptp_desc); ++ else if (tp->HwSuppPtpVer == 3 && (tp->flags & RTL_FLAG_RX_HWTSTAMP_ENABLED)) ++ rtl8125_rx_phy_ptp_timestamp(tp, skb); ++#endif // ENABLE_PTP_SUPPORT ++ ++#ifdef ENABLE_RX_PACKET_FRAGMENT ++ if (rtl8125_is_non_eop(tp, status)) { ++ unsigned int entry_next; ++ entry_next = (entry + 1) % ring->num_rx_desc; ++ rxb = &ring->rx_buffer[entry_next]; ++ rxb->skb = skb; ++ continue; ++ } ++#endif //ENABLE_RX_PACKET_FRAGMENT ++ ++#ifndef ENABLE_PAGE_REUSE ++#ifdef ENABLE_RSS_SUPPORT ++ rtl8125_rx_hash(tp, desc, skb); ++#endif ++ rtl8125_rx_csum(tp, skb, desc); ++#endif /* !ENABLE_PAGE_REUSE */ ++ ++ skb->protocol = eth_type_trans(skb, dev); ++ ++ total_rx_bytes += skb->len; ++ ++ if (skb->pkt_type == PACKET_MULTICAST) ++ total_rx_multicast_packets++; ++ ++ if (rtl8125_rx_vlan_skb(tp, desc, skb) < 0) ++ rtl8125_rx_skb(tp, skb, ring_index); ++ 
++#if LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) ++ dev->last_rx = jiffies; ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(4,11,0) ++ total_rx_packets++; ++ ++#ifdef ENABLE_PAGE_REUSE ++ rxb->skb = NULL; ++ continue; ++#endif ++ ++release_descriptor: ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_set_desc_dma_addr(tp, desc, ++ ring->RxDescPhyAddr[entry]); ++ wmb(); ++ break; ++ } ++ rtl8125_mark_to_asic(tp, desc, tp->rx_buf_sz); ++ continue; ++drop_packet: ++ RTLDEV->stats.rx_dropped++; ++ RTLDEV->stats.rx_length_errors++; ++ goto release_descriptor; ++ } ++ ++ count = cur_rx - ring->cur_rx; ++ ring->cur_rx = cur_rx; ++ ++ delta = rtl8125_rx_fill(tp, ring, dev, ring->dirty_rx, ring->cur_rx, 1); ++ if (!delta && count && netif_msg_intr(tp)) ++ printk(KERN_INFO "%s: no Rx buffer allocated\n", dev->name); ++ ring->dirty_rx += delta; ++ ++ RTLDEV->stats.rx_bytes += total_rx_bytes; ++ RTLDEV->stats.rx_packets += total_rx_packets; ++ RTLDEV->stats.multicast += total_rx_multicast_packets; ++ ++ /* ++ * FIXME: until there is periodic timer to try and refill the ring, ++ * a temporary shortage may definitely kill the Rx process. ++ * - disable the asic to try and avoid an overflow and kick it again ++ * after refill ? ++ * - how do others driver handle this condition (Uh oh...). ++ */ ++ if ((ring->dirty_rx + ring->num_rx_desc == ring->cur_rx) && netif_msg_intr(tp)) ++ printk(KERN_EMERG "%s: Rx buffers exhausted\n", dev->name); ++ ++rx_out: ++ return total_rx_packets; ++} ++ ++static bool ++rtl8125_linkchg_interrupt(struct rtl8125_private *tp, u32 status) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 2: ++ case 3: ++ return status & ISRIMR_V2_LINKCHG; ++ case 4: ++ return status & ISRIMR_V4_LINKCHG; ++ case 5: ++ return status & ISRIMR_V5_LINKCHG; ++ case 7: ++ return status & ISRIMR_V7_LINKCHG; ++ default: ++ return status & LinkChg; ++ } ++} ++ ++static u32 ++rtl8125_get_linkchg_message_id(struct rtl8125_private *tp) ++{ ++ switch (tp->HwCurrIsrVer) { ++ case 4: ++ case 7: ++ return 29; ++ case 5: ++ return 18; ++ default: ++ return 21; ++ } ++} ++ ++/* ++ *The interrupt handler does all of the Rx thread work and cleans up after ++ *the Tx thread. 
++ */ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance, struct pt_regs *regs) ++#else ++static irqreturn_t rtl8125_interrupt(int irq, void *dev_instance) ++#endif ++{ ++ struct r8125_napi *r8125napi = dev_instance; ++ struct rtl8125_private *tp = r8125napi->priv; ++ struct net_device *dev = tp->dev; ++ u32 status; ++ int handled = 0; ++ ++ do { ++ status = RTL_R32(tp, tp->isr_reg[0]); ++ ++ if (!(tp->features & (RTL_FEATURE_MSI | RTL_FEATURE_MSIX))) { ++ /* hotplug/major error/no more work/shared irq */ ++ if (!status) ++ break; ++ ++ if (status == 0xFFFFFFFF) ++ break; ++ ++ if (!(status & (tp->intr_mask | tp->timer_intr_mask))) ++ break; ++ } ++ ++ handled = 1; ++ ++#if defined(RTL_USE_NEW_INTR_API) ++ if (!tp->irq_tbl[0].requested) ++ break; ++#endif ++ rtl8125_disable_hw_interrupt(tp); ++ ++ RTL_W32(tp, tp->isr_reg[0], status&~RxFIFOOver); ++ ++ if (rtl8125_linkchg_interrupt(tp, status)) ++ rtl8125_schedule_linkchg_work(tp); ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if ((status & ISRIMR_V4_LAYER2_INTR_STS) && ++ rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++ if (status & tp->intr_mask || tp->keep_intr_cnt-- > 0) { ++ if (status & tp->intr_mask) ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ ++ if (likely(RTL_NETIF_RX_SCHEDULE_PREP(dev, &tp->r8125napi[0].napi))) ++ __RTL_NETIF_RX_SCHEDULE(dev, &tp->r8125napi[0].napi); ++ else if (netif_msg_intr(tp)) ++ printk(KERN_INFO "%s: interrupt %04x in poll\n", ++ dev->name, status); ++ } else { ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ rtl8125_switch_to_hw_interrupt(tp); ++ } ++#else ++ if (status & tp->intr_mask || tp->keep_intr_cnt-- > 0) { ++ u32 budget = ~(u32)0; ++ int i; ++ ++ if (status & tp->intr_mask) ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) ++ rtl8125_tx_interrupt(&tp->tx_ring[i], ~(u32)0); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[0], &budget); ++#else ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[0], budget); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if ((status & ISRIMR_V4_LAYER2_INTR_STS) && ++ rtl8125_check_dash_interrupt(tp)) ++ rtl8125_schedule_dash_work(tp); ++#endif ++ ++ rtl8125_switch_to_timer_interrupt(tp); ++ } else { ++ tp->keep_intr_cnt = RTK_KEEP_INTERRUPT_COUNT; ++ rtl8125_switch_to_hw_interrupt(tp); ++ } ++#endif ++ } while (false); ++ ++ return IRQ_RETVAL(handled); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,19) ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance, struct pt_regs *regs) ++#else ++static irqreturn_t rtl8125_interrupt_msix(int irq, void *dev_instance) ++#endif ++{ ++ struct r8125_napi *r8125napi = dev_instance; ++ struct rtl8125_private *tp = r8125napi->priv; ++ struct net_device *dev = tp->dev; ++ int message_id = r8125napi->index; ++#ifndef CONFIG_R8125_NAPI ++ u32 budget = ~(u32)0; ++#endif ++ ++ do { ++#if defined(RTL_USE_NEW_INTR_API) ++ if (!tp->irq_tbl[message_id].requested) ++ break; ++#endif ++ //link change ++ if (message_id == rtl8125_get_linkchg_message_id(tp)) { ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ rtl8125_clear_hw_isr_v2(tp, message_id); ++ rtl8125_schedule_linkchg_work(tp); ++ break; ++ } ++ ++#ifdef ENABLE_DASH_SUPPORT ++ if (message_id == 31) { ++ if (rtl8125_check_dash_interrupt(tp)) ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ 
rtl8125_clear_hw_isr_v2(tp, message_id); ++ rtl8125_schedule_dash_work(tp); ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++ break; ++ } ++#endif ++ ++#ifdef CONFIG_R8125_NAPI ++ if (likely(RTL_NETIF_RX_SCHEDULE_PREP(dev, &r8125napi->napi))) { ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ __RTL_NETIF_RX_SCHEDULE(dev, &r8125napi->napi); ++ } else if (netif_msg_intr(tp)) ++ printk(KERN_INFO "%s: interrupt message id %d in poll_msix\n", ++ dev->name, message_id); ++ rtl8125_clear_hw_isr_v2(tp, message_id); ++#else ++ rtl8125_disable_hw_interrupt_v2(tp, message_id); ++ ++ rtl8125_clear_hw_isr_v2(tp, message_id); ++ ++ rtl8125_tx_interrupt_with_vector(tp, message_id, ~(u32)0); ++ ++ if (message_id < tp->num_rx_rings) { ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], &budget); ++#else ++ rtl8125_rx_interrupt(dev, tp, &tp->rx_ring[message_id], budget); ++#endif //LINUX_VERSION_CODE < KERNEL_VERSION(2,6,24) ++ } ++ ++ rtl8125_enable_hw_interrupt_v2(tp, message_id); ++#endif ++ ++ } while (false); ++ ++ return IRQ_HANDLED; ++} ++ ++static void rtl8125_down(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ //rtl8125_delete_esd_timer(dev, &tp->esd_timer); ++ ++ //rtl8125_delete_link_timer(dev, &tp->link_timer); ++ ++ netif_carrier_off(dev); ++ ++ netif_tx_disable(dev); ++ ++ _rtl8125_wait_for_quiescence(dev); ++ ++ rtl8125_hw_reset(dev); ++ ++ rtl8125_tx_clear(tp); ++ ++ rtl8125_rx_clear(tp); ++} ++ ++static int rtl8125_resource_freed(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < tp->num_tx_rings; i++) ++ if (tp->tx_ring[i].TxDescArray) ++ return 0; ++ ++ for (i = 0; i < tp->num_rx_rings; i++) ++ if (tp->rx_ring[i].RxDescArray) ++ return 0; ++ ++ return 1; ++} ++ ++int rtl8125_close(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!rtl8125_resource_freed(tp)) { ++ set_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ rtl8125_down(dev); ++ ++ pci_clear_master(tp->pci_dev); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ rtl8125_ptp_stop(tp); ++#endif ++ rtl8125_hw_d3_para(dev); ++ ++ rtl8125_powerdown_pll(dev, 0); ++ ++ rtl8125_free_irq(tp); ++ ++ rtl8125_free_alloc_resources(tp); ++ } else { ++ rtl8125_hw_d3_para(dev); ++ ++ rtl8125_powerdown_pll(dev, 0); ++ } ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11) ++static void rtl8125_shutdown(struct pci_dev *pdev) ++{ ++ struct net_device *dev = pci_get_drvdata(pdev); ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ rtnl_lock(); ++ ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_stop(tp); ++ ++ rtl8125_disable_pci_offset_180(tp); ++ ++ if (s5_keep_curr_mac == 0 && tp->random_mac == 0) ++ rtl8125_rar_set(tp, tp->org_mac_addr); ++ ++ if (s5wol == 0) ++ tp->wol_enabled = WOL_DISABLED; ++ ++ rtl8125_close(dev); ++ rtl8125_disable_msi(pdev, tp); ++ ++ rtnl_unlock(); ++ ++ if (system_state == SYSTEM_POWER_OFF) { ++ pci_clear_master(tp->pci_dev); ++ pci_wake_from_d3(pdev, tp->wol_enabled); ++ pci_set_power_state(pdev, PCI_D3hot); ++ } ++} ++#endif ++ ++#ifdef CONFIG_PM ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,11) ++static int ++rtl8125_suspend(struct pci_dev *pdev, u32 state) ++#elif LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++static int ++rtl8125_suspend(struct device *device) ++#else ++static int ++rtl8125_suspend(struct pci_dev *pdev, pm_message_t state) ++#endif ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++ struct pci_dev *pdev = to_pci_dev(device); ++ 
struct net_device *dev = pci_get_drvdata(pdev); ++#else ++ struct net_device *dev = pci_get_drvdata(pdev); ++#endif ++ struct rtl8125_private *tp = netdev_priv(dev); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ u32 pci_pm_state = pci_choose_state(pdev, state); ++#endif ++ rtnl_lock(); ++ ++ if (!netif_running(dev)) ++ goto out; ++ ++ set_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ netif_carrier_off(dev); ++ ++ netif_tx_disable(dev); ++ ++ netif_device_detach(dev); ++ ++#ifdef ENABLE_PTP_SUPPORT ++ rtl8125_ptp_suspend(tp); ++#endif ++ rtl8125_hw_reset(dev); ++ ++ pci_clear_master(pdev); ++ ++ rtl8125_hw_d3_para(dev); ++ ++ rtl8125_powerdown_pll(dev, 1); ++ ++out: ++ if (HW_DASH_SUPPORT_DASH(tp)) ++ rtl8125_driver_stop(tp); ++ ++ rtnl_unlock(); ++ ++ pci_disable_device(pdev); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ pci_save_state(pdev, &pci_pm_state); ++#else ++ pci_save_state(pdev); ++#endif ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++ pci_enable_wake(pdev, pci_choose_state(pdev, state), tp->wol_enabled); ++#endif ++ ++ pci_prepare_to_sleep(pdev); ++ ++ return 0; ++} ++ ++static int ++rtl8125_hw_d3_not_power_off(struct net_device *dev) ++{ ++ return rtl8125_check_hw_phy_mcu_code_ver(dev); ++} ++ ++static int rtl8125_wait_phy_nway_complete_sleep(struct rtl8125_private *tp) ++{ ++ int i, val; ++ ++ for (i = 0; i < 30; i++) { ++ val = rtl8125_mdio_read(tp, MII_BMSR) & BMSR_ANEGCOMPLETE; ++ if (val) ++ return 0; ++ ++ mdelay(100); ++ } ++ ++ return -1; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++static int ++rtl8125_resume(struct pci_dev *pdev) ++#else ++static int ++rtl8125_resume(struct device *device) ++#endif ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++ struct pci_dev *pdev = to_pci_dev(device); ++ struct net_device *dev = pci_get_drvdata(pdev); ++#else ++ struct net_device *dev = pci_get_drvdata(pdev); ++#endif ++ struct rtl8125_private *tp = netdev_priv(dev); ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ u32 pci_pm_state = PCI_D0; ++#endif ++ unsigned long flags; ++ u32 err; ++ ++ rtnl_lock(); ++ ++ err = pci_enable_device(pdev); ++ if (err) { ++ dev_err(&pdev->dev, "Cannot enable PCI device from suspend\n"); ++ goto out_unlock; ++ } ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,10) ++ pci_restore_state(pdev, &pci_pm_state); ++#else ++ pci_restore_state(pdev); ++#endif ++ pci_enable_wake(pdev, PCI_D0, 0); ++ ++ /* restore last modified mac address */ ++ rtl8125_rar_set(tp, dev->dev_addr); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_check_hw_phy_mcu_code_ver(dev); ++ ++ tp->resume_not_chg_speed = 0; ++ if (tp->check_keep_link_speed && ++ //tp->link_ok(dev) && ++ rtl8125_hw_d3_not_power_off(dev) && ++ rtl8125_wait_phy_nway_complete_sleep(tp) == 0) ++ tp->resume_not_chg_speed = 1; ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ if (!netif_running(dev)) ++ goto out_unlock; ++ ++ pci_set_master(pdev); ++ ++ rtl8125_exit_oob(dev); ++ ++ rtl8125_up(dev); ++ ++ clear_bit(R8125_FLAG_DOWN, tp->task_flags); ++ ++ rtl8125_schedule_reset_work(tp); ++ ++ rtl8125_schedule_esd_work(tp); ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ if (HW_FIBER_MODE_ENABLED(tp)) ++ rtl8125_schedule_link_work(tp); ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ //mod_timer(&tp->esd_timer, jiffies + RTL8125_ESD_TIMEOUT); ++ //mod_timer(&tp->link_timer, jiffies + RTL8125_LINK_TIMEOUT); ++out_unlock: ++ netif_device_attach(dev); ++ ++ rtnl_unlock(); ++ ++ return err; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,29) ++ ++static struct dev_pm_ops 
rtl8125_pm_ops = { ++ .suspend = rtl8125_suspend, ++ .resume = rtl8125_resume, ++ .freeze = rtl8125_suspend, ++ .thaw = rtl8125_resume, ++ .poweroff = rtl8125_suspend, ++ .restore = rtl8125_resume, ++}; ++ ++#define RTL8125_PM_OPS (&rtl8125_pm_ops) ++ ++#endif ++ ++#else /* !CONFIG_PM */ ++ ++#define RTL8125_PM_OPS NULL ++ ++#endif /* CONFIG_PM */ ++ ++static struct pci_driver rtl8125_pci_driver = { ++ .name = MODULENAME, ++ .id_table = rtl8125_pci_tbl, ++ .probe = rtl8125_init_one, ++ .remove = __devexit_p(rtl8125_remove_one), ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,11) ++ .shutdown = rtl8125_shutdown, ++#endif ++#ifdef CONFIG_PM ++#if LINUX_VERSION_CODE < KERNEL_VERSION(2,6,29) ++ .suspend = rtl8125_suspend, ++ .resume = rtl8125_resume, ++#else ++ .driver.pm = RTL8125_PM_OPS, ++#endif ++#endif ++}; ++ ++static int __init ++rtl8125_init_module(void) ++{ ++ int ret = 0; ++#ifdef ENABLE_R8125_PROCFS ++ rtl8125_proc_module_init(); ++#endif ++ ++#if LINUX_VERSION_CODE > KERNEL_VERSION(2,6,0) ++ ++ ret = pci_register_driver(&rtl8125_pci_driver); ++#else ++ ret = pci_module_init(&rtl8125_pci_driver); ++#endif ++ ++ return ret; ++} ++ ++static void __exit ++rtl8125_cleanup_module(void) ++{ ++ pci_unregister_driver(&rtl8125_pci_driver); ++ ++#ifdef ENABLE_R8125_PROCFS ++ if (rtl8125_proc) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ remove_proc_subtree(MODULENAME, init_net.proc_net); ++#else ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++ remove_proc_entry(MODULENAME, init_net.proc_net); ++#else ++ remove_proc_entry(MODULENAME, proc_net); ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(2,6,32) ++#endif //LINUX_VERSION_CODE >= KERNEL_VERSION(3,10,0) ++ rtl8125_proc = NULL; ++ } ++#endif ++} ++ ++module_init(rtl8125_init_module); ++module_exit(rtl8125_cleanup_module); +diff --git a/drivers/net/ethernet/realtek/r8125_ptp.c b/drivers/net/ethernet/realtek/r8125_ptp.c +new file mode 100755 +index 000000000000..457fa6d395d6 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_ptp.c +@@ -0,0 +1,1472 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "r8125.h" ++#include "r8125_ptp.h" ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,6,0) ++static inline struct timespec timespec64_to_timespec(const struct timespec64 ts64) ++{ ++ return *(const struct timespec *)&ts64; ++} ++ ++static inline struct timespec64 timespec_to_timespec64(const struct timespec ts) ++{ ++ return *(const struct timespec64 *)&ts; ++} ++#endif ++ ++static int _rtl8125_mac_phc_gettime(struct rtl8125_private *tp, struct timespec64 *ts64) ++{ ++ //get local time ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_LATCHED_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ ts64->tv_nsec = (RTL_R32(tp, PTP_SOFT_CONFIG_Time_NS_8125) & 0x3fffffff); ++ ++ /* seconds */ ++ //0x680C[47:0] ++ ts64->tv_sec = RTL_R16(tp, PTP_SOFT_CONFIG_Time_S_8125 + 4); ++ ts64->tv_sec <<= 32; ++ ts64->tv_sec |= RTL_R32(tp, PTP_SOFT_CONFIG_Time_S_8125); ++ ++ return 0; ++} ++ ++static void rtl8125_wait_phy_clkadj_ready(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & CLKADJ_MODE_SET)) ++ break; ++} ++ ++static void rtl8125_phy_set_clkadj_mode(struct rtl8125_private *tp, u16 cmd) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ CLKADJ_MODE_SET | cmd); ++ ++ rtl8125_wait_phy_clkadj_ready(tp); ++} ++ ++static int _rtl8125_phy_phc_gettime(struct rtl8125_private *tp, struct timespec64 *ts64) ++{ ++ unsigned long flags; ++ int i; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ //Direct Read ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DIRECT_READ)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ /* nanoseconds */ ++ //Ns[29:16] E414[13:0] ++ ts64->tv_nsec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_NS_HI_8126) & 0x3fff; ++ ts64->tv_nsec <<= 16; ++ //Ns[15:0] E412[15:0] ++ ts64->tv_nsec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_NS_LO_8126); ++ ++ ++ /* seconds */ ++ //S[47:32] E41A[15:0] ++ ts64->tv_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_HI_8126); ++ ts64->tv_sec <<= 16; ++ //S[31:16] E418[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_MI_8126); ++ ts64->tv_sec <<= 16; ++ //S[15:0] E416[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_LO_8126); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++static int _rtl8125_mac_phc_settime(struct rtl8125_private *tp, const struct timespec64 *ts64) ++{ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_NS_8125, (ts64->tv_nsec & 0x3fffffff)); ++ ++ /* seconds */ ++ //0x680C[47:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_S_8125, ts64->tv_sec); ++ RTL_W16(tp, PTP_SOFT_CONFIG_Time_S_8125 + 4, (ts64->tv_sec >> 32)); ++ ++ //set local time ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_SET_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ return 0; ++} ++ ++static int _rtl8125_phy_phc_settime(struct rtl8125_private *tp, const struct timespec64 *ts64) ++{ ++ unsigned long flags; ++ int i; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* nanoseconds 
*/ ++ //Ns[15:0] E412[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_LO_8126, ts64->tv_nsec); ++ //Ns[29:16] E414[13:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_HI_8126, (ts64->tv_nsec & 0x3fff0000) >> 16); ++ ++ /* seconds */ ++ //S[15:0] E416[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_LO_8126, ts64->tv_sec); ++ //S[31:16] E418[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_MI_8126, (ts64->tv_sec >> 16)); ++ //S[47:32] E41A[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_HI_8126, (ts64->tv_sec >> 32)); ++ ++ //Direct Write ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DIRECT_WRITE)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++static int _rtl8125_mac_phc_adjtime(struct rtl8125_private *tp, s64 delta) ++{ ++ struct timespec64 d; ++ bool negative = false; ++ u64 tohw; ++ u32 nsec; ++ u64 sec; ++ ++ if (delta < 0) { ++ negative = true; ++ tohw = -delta; ++ } else { ++ tohw = delta; ++ } ++ ++ d = ns_to_timespec64(tohw); ++ ++ nsec = d.tv_nsec; ++ sec = d.tv_sec; ++ ++ if (negative) { ++ nsec = -nsec; ++ sec = -sec; ++ } ++ ++ nsec &= 0x3fffffff; ++ sec &= 0x0000ffffffffffff; ++ ++ if (negative) { ++ nsec |= PTP_SOFT_CONFIG_TIME_NS_NEGATIVE; ++ sec |= PTP_SOFT_CONFIG_TIME_S_NEGATIVE; ++ } ++ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_NS_8125, nsec); ++ ++ /* seconds */ ++ //0x680C[47:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_S_8125, sec); ++ RTL_W16(tp, PTP_SOFT_CONFIG_Time_S_8125 + 4, (sec >> 32)); ++ ++ //adjust local time ++ //RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_DRIFT_LOCAL_TIME | PTP_EXEC_CMD)); ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_SET_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ return 0; ++} ++ ++static int rtl8125_mac_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc adjust time\n"); ++ ++ rtnl_lock(); ++ ret = _rtl8125_mac_phc_adjtime(tp, delta); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++static int _rtl8125_phy_phc_adjtime(struct rtl8125_private *tp, s64 delta) ++{ ++ unsigned long flags; ++ struct timespec64 d; ++ bool negative = false; ++ int i; ++ u64 tohw; ++ u32 nsec; ++ u64 sec; ++ ++ if (delta < 0) { ++ negative = true; ++ tohw = -delta; ++ } else { ++ tohw = delta; ++ } ++ ++ d = ns_to_timespec64(tohw); ++ ++ nsec = d.tv_nsec; ++ sec = d.tv_sec; ++ ++ nsec &= 0x3fffffff; ++ sec &= 0x0000ffffffffffff; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* nanoseconds */ ++ //Ns[15:0] E412[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_LO_8126, nsec); ++ //Ns[29:16] E414[13:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_HI_8126, (nsec >> 16)); ++ ++ /* seconds */ ++ //S[15:0] E416[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_LO_8126, sec); ++ //S[31:16] E418[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_MI_8126, (sec >> 16)); ++ //S[47:32] E41A[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_S_HI_8126, (sec >> 32)); ++ ++ if (negative) ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DECREMENT_STEP)); ++ else ++ 
rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | INCREMENT_STEP)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++static int rtl8125_phy_phc_adjtime(struct ptp_clock_info *ptp, s64 delta) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc adjust time\n"); ++ ++ ret = _rtl8125_phy_phc_adjtime(tp, delta); ++ ++ return ret; ++} ++ ++/* ++ * 1ppm means every 125MHz plus 125Hz. It also means every 8ns minus 8ns*10^(-6) ++ * 1ns=2^30 sub_ns ++ * 8ns*10^(-6) = 8 * 2^30 sub_ns * 10^(-6) = 2^33 sub_ns * 10^(-6) = 8590 = 0x218E sub_ns ++ * ++ * 1ppb means every 125MHz plus 0.125Hz. It also means every 8ns minus 8ns*10^(-9) ++ * 1ns=2^30 sub_ns ++ * 8ns*10^(-9) = 8 * 2^30 sub_ns * 10^(-9) = 2^33 sub_ns * 10^(-9) = 8.59 sub_ns = 9 sub_ns ++ */ ++static int _rtl8125_mac_phc_adjfreq(struct ptp_clock_info *ptp, s32 ppb) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ bool negative = false; ++ u32 sub_ns; ++ ++ if (ppb < 0) { ++ negative = true; ++ ppb = -ppb; ++ } ++ ++ sub_ns = ppb * 9; ++ if (negative) { ++ sub_ns = -sub_ns; ++ sub_ns &= 0x3fffffff; ++ sub_ns |= PTP_ADJUST_TIME_NS_NEGATIVE; ++ } else ++ sub_ns &= 0x3fffffff; ++ ++ /* nanoseconds */ ++ //0x6808[29:0] ++ RTL_W32(tp, PTP_SOFT_CONFIG_Time_NS_8125, sub_ns); ++ ++ //adjust local time ++ RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_DRIFT_LOCAL_TIME | PTP_EXEC_CMD)); ++ //RTL_W16(tp, PTP_TIME_CORRECT_CMD_8125, (PTP_CMD_SET_LOCAL_TIME | PTP_EXEC_CMD)); ++ ++ return 0; ++} ++ ++/* ++ * delta = delta * 10^6 ppm = delta * 10^9 ppb (in this equation ppm and ppb are not variable) ++ * ++ * in adjfreq ppb is a variable ++ * ppb = delta * 10^9 ++ * delta = ppb / 10^9 ++ * rate_value = |delta| * 2^32 = |ppb| / 10^9 * 2^32 = (|ppb| << 32) / 10^9 ++ */ ++static int _rtl8125_phy_phc_adjfreq(struct ptp_clock_info *ptp, s32 ppb) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ unsigned long flags; ++ u32 rate_value; ++ ++ if (ppb < 0) { ++ rate_value = ((u64)-ppb << 32) / 1000000000; ++ rate_value = ~rate_value + 1; ++ } else ++ rate_value = ((u64)ppb << 32) / 1000000000; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* nanoseconds */ ++ //Ns[15:0] E412[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_LO_8126, rate_value); ++ //Ns[22:16] E414[13:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CFG_NS_HI_8126, (rate_value & 0x003f0000) >> 16); ++ ++ rtl8125_phy_set_clkadj_mode(tp, RATE_WRITE); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++static int rtl8125_mac_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) ++{ ++ s32 ppb = scaled_ppm_to_ppb(scaled_ppm); ++ ++ if (ppb > ptp->max_adj || ppb < -ptp->max_adj) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ _rtl8125_mac_phc_adjfreq(ptp, ppb); ++ rtnl_unlock(); ++ ++ return 0; ++} ++#else ++static int rtl8125_mac_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) ++{ ++ //struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ ++ //netif_info(tp, drv, tp->dev, "phc adjust freq\n"); ++ ++ if (delta > 
ptp->max_adj || delta < -ptp->max_adj) ++ return -EINVAL; ++ ++ rtnl_lock(); ++ _rtl8125_mac_phc_adjfreq(ptp, delta); ++ rtnl_unlock(); ++ ++ return 0; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++static int rtl8125_mac_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64, ++ struct ptp_system_timestamp *sts) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ rtnl_lock(); ++ ptp_read_system_prets(sts); ++ ret = _rtl8125_mac_phc_gettime(tp, ts64); ++ ptp_read_system_postts(sts); ++ rtnl_unlock(); ++ ++ return ret; ++} ++#else ++static int rtl8125_mac_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ rtnl_lock(); ++ ret = _rtl8125_mac_phc_gettime(tp, ts64); ++ rtnl_unlock(); ++ ++ return ret; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++static int rtl8125_phy_ptp_adjfine(struct ptp_clock_info *ptp, long scaled_ppm) ++{ ++ s32 ppb = scaled_ppm_to_ppb(scaled_ppm); ++ ++ if (ppb > ptp->max_adj || ppb < -ptp->max_adj) ++ return -EINVAL; ++ ++ _rtl8125_phy_phc_adjfreq(ptp, ppb); ++ ++ return 0; ++} ++ ++#else ++static int rtl8125_phy_phc_adjfreq(struct ptp_clock_info *ptp, s32 delta) ++{ ++ //netif_info(tp, drv, tp->dev, "phc adjust freq\n"); ++ ++ if (delta > ptp->max_adj || delta < -ptp->max_adj) ++ return -EINVAL; ++ ++ _rtl8125_phy_phc_adjfreq(ptp, delta); ++ ++ return 0; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++static int rtl8125_phy_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64, ++ struct ptp_system_timestamp *sts) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ ptp_read_system_prets(sts); ++ ret = _rtl8125_phy_phc_gettime(tp, ts64); ++ ptp_read_system_postts(sts); ++ ++ return ret; ++} ++#else ++static int rtl8125_phy_phc_gettime(struct ptp_clock_info *ptp, struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc get ts\n"); ++ ++ ret = _rtl8125_phy_phc_gettime(tp, ts64); ++ ++ return ret; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ ++static int rtl8125_mac_phc_settime(struct ptp_clock_info *ptp, ++ const struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc set ts\n"); ++ ++ rtnl_lock(); ++ ret = _rtl8125_mac_phc_settime(tp, ts64); ++ rtnl_unlock(); ++ ++ return ret; ++} ++ ++static int rtl8125_phy_phc_settime(struct ptp_clock_info *ptp, ++ const struct timespec64 *ts64) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "phc set ts\n"); ++ ++ ret = _rtl8125_phy_phc_settime(tp, ts64); ++ ++ return ret; ++} ++ ++static int rtl8125_mac_phc_enable(struct ptp_clock_info *ptp, ++ struct ptp_clock_request *rq, int on) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct 
rtl8125_private, ptp_clock_info); ++ u16 ptp_ctrl; ++ ++ //netif_info(tp, drv, tp->dev, "phc enable type %x on %d\n", rq->type, on); ++ ++ switch (rq->type) { ++ case PTP_CLK_REQ_PPS: ++ rtnl_lock(); ++ ptp_ctrl = RTL_R16(tp, PTP_CTRL_8125); ++ ptp_ctrl &= ~BIT_15; ++ if (on) ++ ptp_ctrl |= BIT_14; ++ else ++ ptp_ctrl &= ~BIT_14; ++ RTL_W16(tp, PTP_CTRL_8125, ptp_ctrl); ++ rtnl_unlock(); ++ return 0; ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static int rtl8125_phy_phc_enable(struct ptp_clock_info *ptp, ++ struct ptp_clock_request *rq, int on) ++{ ++ struct rtl8125_private *tp = container_of(ptp, struct rtl8125_private, ptp_clock_info); ++ unsigned long flags; ++ u16 phy_ocp_data; ++ ++ switch (rq->type) { ++ case PTP_CLK_REQ_PPS: ++ rtnl_lock(); ++ if (on) { ++ tp->pps_enable = 1; ++ rtl8125_mac_ocp_write(tp, 0xDC00, rtl8125_mac_ocp_read(tp, 0xDC00) & ~BIT_6); ++ rtl8125_mac_ocp_write(tp, 0xDC60, rtl8125_mac_ocp_read(tp, 0xDC60) | BIT_6); ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ /* Set periodic pulse 1pps */ ++ /* E432[8:0] = 0x017d */ ++ phy_ocp_data = rtl8125_mdio_direct_read_phy_ocp(tp, 0xE432); ++ phy_ocp_data &= 0xFE00; ++ phy_ocp_data |= 0x017d; ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE432, phy_ocp_data); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE434, 0x7840); ++ ++ /* E436[8:0] = 0xbe */ ++ phy_ocp_data = rtl8125_mdio_direct_read_phy_ocp(tp, 0xE436); ++ phy_ocp_data &= 0xFE00; ++ phy_ocp_data |= 0xbe; ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE436, phy_ocp_data); ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, 0xE438, 0xbc20); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ /* start hrtimer */ ++ hrtimer_start(&tp->pps_timer, 1000000000, HRTIMER_MODE_REL); ++ } else ++ tp->pps_enable = 0; ++ rtnl_unlock(); ++ return 0; ++ default: ++ return -EOPNOTSUPP; ++ } ++} ++ ++static void rtl8125_phy_ptp_enable_config(struct rtl8125_private *tp) ++{ ++ u16 ptp_ctrl; ++ ++ if (tp->syncE_en) ++ rtl8125_set_eth_phy_ocp_bit(tp, PTP_SYNCE_CTL, BIT_0); ++ else ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_SYNCE_CTL, BIT_0); ++ ++ ptp_ctrl = BIT_0 | BIT_1 | BIT_2 | BIT_3 | BIT_4 | BIT_5 | BIT_6 | BIT_7 | BIT_12; ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_CTL, ptp_ctrl); ++ ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA640, BIT_15); ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct ethtool_ts_info *info) ++#else ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct kernel_ethtool_ts_info *info) ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) */ ++{ ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ ++ /* we always support timestamping disabled */ ++ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE); ++ ++ if (tp->HwSuppPtpVer == 0) ++ return ethtool_op_get_ts_info(netdev, info); ++ ++ info->so_timestamping = SOF_TIMESTAMPING_TX_SOFTWARE | ++ SOF_TIMESTAMPING_RX_SOFTWARE | ++ SOF_TIMESTAMPING_SOFTWARE | ++ SOF_TIMESTAMPING_TX_HARDWARE | ++ SOF_TIMESTAMPING_RX_HARDWARE | ++ SOF_TIMESTAMPING_RAW_HARDWARE; ++ ++ if (tp->ptp_clock) ++ info->phc_index = ptp_clock_index(tp->ptp_clock); ++ else ++ info->phc_index = -1; ++ ++ info->tx_types = BIT(HWTSTAMP_TX_OFF) | BIT(HWTSTAMP_TX_ON); ++ ++ info->rx_filters = BIT(HWTSTAMP_FILTER_NONE) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_EVENT) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_L4_EVENT) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_SYNC) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_L4_SYNC) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_DELAY_REQ) | ++ BIT(HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ); ++ ++ return 0; ++} ++ 
++static const struct ptp_clock_info rtl8125_mac_ptp_clock_info = { ++ .owner = THIS_MODULE, ++ .n_alarm = 0, ++ .n_ext_ts = 0, ++ .n_per_out = 0, ++ .n_pins = 0, ++ .pps = 1, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++ .adjfine = rtl8125_mac_ptp_adjfine, ++#else ++ .adjfreq = rtl8125_mac_phc_adjfreq, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ .adjtime = rtl8125_mac_phc_adjtime, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++ .gettimex64 = rtl8125_mac_phc_gettime, ++#else ++ .gettime64 = rtl8125_mac_phc_gettime, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ .settime64 = rtl8125_mac_phc_settime, ++ .enable = rtl8125_mac_phc_enable, ++}; ++ ++static const struct ptp_clock_info rtl8125_phy_ptp_clock_info = { ++ .owner = THIS_MODULE, ++ .n_alarm = 0, ++ .n_ext_ts = 0, ++ .n_per_out = 0, ++ .n_pins = 0, ++ .pps = 1, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) ++ .adjfine = rtl8125_phy_ptp_adjfine, ++#else ++ .adjfreq = rtl8125_phy_phc_adjfreq, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,2,0) */ ++ .adjtime = rtl8125_phy_phc_adjtime, ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) ++ .gettimex64 = rtl8125_phy_phc_gettime, ++#else ++ .gettime64 = rtl8125_phy_phc_gettime, ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(5,0,0) */ ++ ++ .settime64 = rtl8125_phy_phc_settime, ++ .enable = rtl8125_phy_phc_enable, ++}; ++ ++static void rtl8125_mac_ptp_egresstime(struct rtl8125_private *tp, struct timespec64 *ts64, u32 regnum) ++{ ++ /* nanoseconds */ ++ //[29:0] ++ ts64->tv_nsec = rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_NS_8125 + regnum * 16 + 2); ++ ts64->tv_nsec <<= 16; ++ ts64->tv_nsec |= rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_NS_8125 + regnum * 16); ++ ts64->tv_nsec &= 0x3fffffff; ++ ++ /* seconds */ ++ //[47:0] ++ ts64->tv_sec = rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_S_8125 + regnum * 16 + 4); ++ ts64->tv_sec <<= 16; ++ ts64->tv_sec |= rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_S_8125 + regnum * 16 + 2); ++ ts64->tv_sec <<= 16; ++ ts64->tv_sec |= rtl8125_mac_ocp_read(tp, PTP_EGRESS_TIME_BASE_S_8125 + regnum * 16); ++ ts64->tv_sec &= 0x0000ffffffffffff; ++} ++ ++static u16 rtl8125_phy_ptp_get_tx_msgtype(struct rtl8125_private *tp) ++{ ++ u16 tx_ts_ready = 0; ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ tx_ts_ready = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_STA) & 0xF000; ++ if (tx_ts_ready) ++ break; ++ } ++ ++ switch (tx_ts_ready) { ++ case TX_TS_PDLYRSP_RDY: ++ return PTP_MSGTYPE_PDELAY_RESP; ++ case TX_TS_PDLYREQ_RDY: ++ return PTP_MSGTYPE_PDELAY_REQ; ++ case TX_TS_DLYREQ_RDY: ++ return PTP_MSGTYPE_DELAY_REQ; ++ case TX_TS_SYNC_RDY: ++ default: ++ return PTP_MSGTYPE_SYNC; ++ } ++} ++ ++/* ++static u16 rtl8125_phy_ptp_get_rx_msgtype(struct rtl8125_private *tp) ++{ ++ u16 rx_ts_ready = 0; ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ rx_ts_ready = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_STA) & 0x0F00; ++ if (rx_ts_ready) ++ break; ++ } ++ ++ switch (rx_ts_ready) { ++ case RX_TS_PDLYRSP_RDY: ++ return PTP_MSGTYPE_PDELAY_RESP; ++ case RX_TS_PDLYREQ_RDY: ++ return PTP_MSGTYPE_PDELAY_REQ; ++ case RX_TS_DLYREQ_RDY: ++ return PTP_MSGTYPE_DELAY_REQ; ++ case RX_TS_SYNC_RDY: ++ default: ++ return PTP_MSGTYPE_SYNC; ++ } ++} ++*/ ++ ++static void rtl8125_wait_phy_trx_ts_ready(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_STA) & TRX_TS_RD)) ++ break; ++} ++ 
++static void rtl8125_set_phy_trx_ts_cmd(struct rtl8125_private *tp, u16 cmd) ++{ ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_TRX_TS_STA, ++ TRXTS_SEL | BIT_3 | BIT_2, ++ TRX_TS_RD | cmd); ++ ++ rtl8125_wait_phy_trx_ts_ready(tp); ++} ++ ++static void rtl8125_phy_ptp_egresstime(struct rtl8125_private *tp, struct timespec64 *ts64) ++{ ++ u16 msgtype; ++ ++ msgtype = rtl8125_phy_ptp_get_tx_msgtype(tp); ++ ++ msgtype <<= 2; ++ ++ rtl8125_set_phy_trx_ts_cmd(tp, (msgtype | BIT_4)); ++ ++ /* nanoseconds */ ++ //Ns[29:16] E448[13:0] ++ ts64->tv_nsec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_HI) & 0x3fff; ++ ts64->tv_nsec <<= 16; ++ //Ns[15:0] E446[15:0] ++ ts64->tv_nsec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_LO); ++ ++ /* seconds */ ++ //S[47:32] E44E[15:0] ++ ts64->tv_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_HI); ++ ts64->tv_sec <<= 16; ++ //S[31:16] E44C[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_MI); ++ ts64->tv_sec <<= 16; ++ //S[15:0] E44A[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_LO); ++} ++static void rtl8125_phy_ptp_ingresstime(struct rtl8125_private *tp, struct timespec64 *ts64, u8 type) ++{ ++ u16 msgtype; ++ ++ switch (type) { ++ case PTP_MSGTYPE_PDELAY_RESP: ++ case PTP_MSGTYPE_PDELAY_REQ: ++ case PTP_MSGTYPE_DELAY_REQ: ++ case PTP_MSGTYPE_SYNC: ++ msgtype = type << 2; ++ break; ++ default: ++ return; ++ } ++ ++ rtl8125_set_phy_trx_ts_cmd(tp, (TRXTS_SEL | msgtype | BIT_4)); ++ ++ /* nanoseconds */ ++ //Ns[29:16] E448[13:0] ++ ts64->tv_nsec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_HI) & 0x3fff; ++ ts64->tv_nsec <<= 16; ++ //Ns[15:0] E446[15:0] ++ ts64->tv_nsec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_NS_LO); ++ ++ /* seconds */ ++ //S[47:32] E44E[15:0] ++ ts64->tv_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_HI); ++ ts64->tv_sec <<= 16; ++ //S[31:16] E44C[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_MI); ++ ts64->tv_sec <<= 16; ++ //S[15:0] E44A[15:0] ++ ts64->tv_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_TRX_TS_S_LO); ++} ++ ++static void rtl8125_mac_ptp_tx_hwtstamp(struct rtl8125_private *tp) ++{ ++ struct sk_buff *skb = tp->ptp_tx_skb; ++ struct skb_shared_hwtstamps shhwtstamps = {0}; ++ struct timespec64 ts64; ++ u32 regnum; ++ ++ RTL_W8(tp, PTP_ISR_8125, PTP_ISR_TOK | PTP_ISR_TER); ++ ++ //IO 0x2302 bit 10~11 WR_PTR ++ regnum = RTL_R16(tp, 0x2032) & 0x0C00; ++ regnum >>= 10; ++ regnum = (regnum + 3) % 4; ++ ++ rtnl_lock(); ++ rtl8125_mac_ptp_egresstime(tp, &ts64, regnum); ++ rtnl_unlock(); ++ ++ /* Upper 32 bits contain s, lower 32 bits contain ns. */ ++ shhwtstamps.hwtstamp = ktime_set(ts64.tv_sec, ++ ts64.tv_nsec); ++ ++ /* Clear the lock early before calling skb_tstamp_tx so that ++ * applications are not woken up before the lock bit is clear. We use ++ * a copy of the skb pointer to ensure other threads can't change it ++ * while we're notifying the stack. 
++ */ ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ ++ /* Notify the stack and free the skb after we've unlocked */ ++ skb_tstamp_tx(skb, &shhwtstamps); ++ dev_kfree_skb_any(skb); ++} ++ ++static void rtl8125_phy_ptp_tx_hwtstamp(struct rtl8125_private *tp) ++{ ++ struct sk_buff *skb = tp->ptp_tx_skb; ++ struct skb_shared_hwtstamps shhwtstamps = { 0 }; ++ struct timespec64 ts64; ++ ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_INSR, TX_TX_INTR); ++ ++ rtl8125_phy_ptp_egresstime(tp, &ts64); ++ ++ /* Upper 32 bits contain s, lower 32 bits contain ns. */ ++ shhwtstamps.hwtstamp = ktime_set(ts64.tv_sec, ++ ts64.tv_nsec); ++ ++ /* Clear the lock early before calling skb_tstamp_tx so that ++ * applications are not woken up before the lock bit is clear. We use ++ * a copy of the skb pointer to ensure other threads can't change it ++ * while we're notifying the stack. ++ */ ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ ++ /* Notify the stack and free the skb after we've unlocked */ ++ skb_tstamp_tx(skb, &shhwtstamps); ++ dev_kfree_skb_any(skb); ++} ++ ++#define RTL8125_PTP_TX_TIMEOUT (HZ * 15) ++static void rtl8125_mac_ptp_tx_work(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = container_of(work, struct rtl8125_private, ++ ptp_tx_work); ++ ++ if (!tp->ptp_tx_skb) ++ return; ++ ++ if (time_is_before_jiffies(tp->ptp_tx_start + ++ RTL8125_PTP_TX_TIMEOUT)) { ++ dev_kfree_skb_any(tp->ptp_tx_skb); ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ tp->tx_hwtstamp_timeouts++; ++ /* Clear the tx valid bit in TSYNCTXCTL register to enable ++ * interrupt ++ */ ++ RTL_W8(tp, PTP_ISR_8125, PTP_ISR_TOK | PTP_ISR_TER); ++ return; ++ } ++ ++ if (RTL_R8(tp, PTP_ISR_8125) & (PTP_ISR_TOK)) ++ rtl8125_mac_ptp_tx_hwtstamp(tp); ++ else ++ /* reschedule to check later */ ++ schedule_work(&tp->ptp_tx_work); ++} ++ ++static void rtl8125_phy_ptp_tx_work(struct work_struct *work) ++{ ++ struct rtl8125_private *tp = container_of(work, struct rtl8125_private, ++ ptp_tx_work); ++ unsigned long flags; ++ bool tx_intr; ++ ++ if (!tp->ptp_tx_skb) ++ return; ++ ++ if (time_is_before_jiffies(tp->ptp_tx_start + ++ RTL8125_PTP_TX_TIMEOUT)) { ++ dev_kfree_skb_any(tp->ptp_tx_skb); ++ tp->ptp_tx_skb = NULL; ++ clear_bit_unlock(__RTL8125_PTP_TX_IN_PROGRESS, &tp->state); ++ tp->tx_hwtstamp_timeouts++; ++ /* Clear the tx valid bit in TSYNCTXCTL register to enable ++ * interrupt ++ */ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_INSR, TX_TX_INTR); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ return; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ if (rtl8125_mdio_direct_read_phy_ocp(tp, PTP_INSR) & TX_TX_INTR) { ++ tx_intr = true; ++ rtl8125_phy_ptp_tx_hwtstamp(tp); ++ } else { ++ tx_intr = false; ++ } ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ if (!tx_intr) { ++ /* reschedule to check later */ ++ schedule_work(&tp->ptp_tx_work); ++ } ++} ++ ++static int rtl8125_mac_hwtstamp_enable(struct rtl8125_private *tp, bool enable) ++{ ++ RTL_W16(tp, PTP_CTRL_8125, 0); ++ if (enable) { ++ u16 ptp_ctrl; ++ struct timespec64 ts64; ++ ++ //clear ptp isr ++ RTL_W8(tp, PTP_ISR_8125, 0xff); ++ //ptp source 0:gphy 1:mac ++ rtl8125_mac_ocp_write(tp, 0xDC00, rtl8125_mac_ocp_read(tp, 0xDC00) | BIT_6); ++ //enable ptp ++ ptp_ctrl = (BIT_0 | BIT_3 | BIT_4 | BIT_6 | BIT_10 | BIT_12); ++ if (tp->ptp_master_mode) ++ ptp_ctrl |= BIT_1; ++ RTL_W16(tp, PTP_CTRL_8125, 
ptp_ctrl); ++ ++ //set system time ++ /* ++ if (ktime_to_timespec64_cond(ktime_get_real(), &ts64)) ++ _rtl8125_mac_phc_settime(tp, timespec64_to_timespec(ts64)); ++ */ ++ ktime_get_real_ts64(&ts64); ++ _rtl8125_mac_phc_settime(tp, &ts64); ++ } ++ ++ return 0; ++} ++ ++static int rtl8125_phy_hwtstamp_enable(struct rtl8125_private *tp, bool enable) ++{ ++ unsigned long flags; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ if (enable) { ++ //trx timestamp interrupt enable ++ rtl8125_set_eth_phy_ocp_bit(tp, PTP_INER, BIT_2 | BIT_3); ++ ++ //set isr clear mode ++ rtl8125_set_eth_phy_ocp_bit(tp, PTP_GEN_CFG, BIT_0); ++ ++ //clear ptp isr ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_INSR, 0xFFFF); ++ ++ //enable ptp ++ rtl8125_phy_ptp_enable_config(tp); ++ ++ //rtl8125_set_phy_local_time(tp); ++ } else { ++ /* trx timestamp interrupt disable */ ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_INER, BIT_2 | BIT_3); ++ ++ /* disable ptp */ ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_SYNCE_CTL, BIT_0); ++ rtl8125_clear_eth_phy_ocp_bit(tp, PTP_CTL, BIT_0); ++ rtl8125_set_eth_phy_ocp_bit(tp, 0xA640, BIT_15); ++ } ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ return 0; ++} ++ ++void rtl8125_set_phy_local_time(struct rtl8125_private *tp) ++{ ++ struct timespec64 ts64; ++ //set system time ++ ktime_get_real_ts64(&ts64); ++ _rtl8125_phy_phc_settime(tp, &ts64); ++} ++ ++static long rtl8125_ptp_create_clock(struct rtl8125_private *tp) ++{ ++ struct net_device *netdev = tp->dev; ++ long err; ++ ++ if (!IS_ERR_OR_NULL(tp->ptp_clock)) ++ return 0; ++ ++ if (tp->HwSuppPtpVer == 0) { ++ tp->ptp_clock = NULL; ++ return -EOPNOTSUPP; ++ } ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ tp->ptp_clock_info = rtl8125_mac_ptp_clock_info; ++ tp->ptp_clock_info.max_adj = 119304647; ++ break; ++ case 3: ++ tp->ptp_clock_info = rtl8125_phy_ptp_clock_info; ++ tp->ptp_clock_info.max_adj = 488281;//0x1FFFFF * 10^9 / 2^32 ++ break; ++ default: ++ break; ++ } ++ ++ snprintf(tp->ptp_clock_info.name, sizeof(tp->ptp_clock_info.name), ++ "%pm", tp->dev->dev_addr); ++ tp->ptp_clock = ptp_clock_register(&tp->ptp_clock_info, &tp->pci_dev->dev); ++ if (IS_ERR(tp->ptp_clock)) { ++ err = PTR_ERR(tp->ptp_clock); ++ tp->ptp_clock = NULL; ++ netif_err(tp, drv, tp->dev, "ptp_clock_register failed\n"); ++ return err; ++ } else ++ netif_info(tp, drv, tp->dev, "registered PHC device on %s\n", netdev->name); ++ ++ return 0; ++} ++ ++void rtl8125_ptp_reset(struct rtl8125_private *tp) ++{ ++ if (!tp->ptp_clock) ++ return; ++ ++ netif_info(tp, drv, tp->dev, "reset PHC clock\n"); ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ rtl8125_mac_hwtstamp_enable(tp, false); ++ break; ++ case 3: ++ rtl8125_phy_hwtstamp_enable(tp, false); ++ break; ++ default: ++ break; ++ } ++} ++ ++static enum hrtimer_restart ++rtl8125_phy_hrtimer_for_pps(struct hrtimer *timer) { ++ struct rtl8125_private *tp = container_of(timer, struct rtl8125_private, pps_timer); ++ s64 pps_sec; ++ u16 tai_cfg; ++ int i; ++ ++ if (tp->pps_enable) ++ { ++ switch (tp->HwSuppPtpVer) { ++ case 3: ++ tai_cfg = BIT_8 | BIT_5 | BIT_1 | BIT_0; ++ break; ++ default: ++ break; ++ } ++ ++ //Direct Read ++ rtl8125_clear_and_set_eth_phy_ocp_bit(tp, ++ PTP_CLK_CFG_8126, ++ BIT_3 | BIT_2 | BIT_1, ++ (PTP_CLKADJ_MODE_SET | DIRECT_READ)); ++ ++ for (i = 0; i < R8125_CHANNEL_WAIT_COUNT; i++) { ++ udelay(R8125_CHANNEL_WAIT_TIME); ++ ++ if (!(rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CLK_CFG_8126) & PTP_CLKADJ_MODE_SET)) ++ break; ++ } ++ ++ pps_sec = rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_HI_8126); 
++ pps_sec <<= 16; ++ pps_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_MI_8126); ++ pps_sec <<= 16; ++ pps_sec |= rtl8125_mdio_direct_read_phy_ocp(tp, PTP_CFG_S_LO_8126); ++ pps_sec++; ++ ++ //E42A[15:0] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_TAI_TS_S_LO, pps_sec & 0xffff); ++ //E42C[31:16] ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_TAI_TS_S_HI, (pps_sec & 0xffff0000) >> 16); ++ //Periodic Tai start ++ rtl8125_mdio_direct_write_phy_ocp(tp, PTP_TAI_CFG, tai_cfg); ++ ++ hrtimer_forward_now(&tp->pps_timer, 1000000000); //rekick ++ return HRTIMER_RESTART; ++ } else ++ return HRTIMER_NORESTART; ++} ++ ++void rtl8125_ptp_init(struct rtl8125_private *tp) ++{ ++ /* obtain a PTP device, or re-use an existing device */ ++ if (rtl8125_ptp_create_clock(tp)) ++ return; ++ ++ /* we have a clock so we can initialize work now */ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ INIT_WORK(&tp->ptp_tx_work, rtl8125_mac_ptp_tx_work); ++ break; ++ case 3: ++ INIT_WORK(&tp->ptp_tx_work, rtl8125_phy_ptp_tx_work); ++ break; ++ default: ++ break; ++ } ++ ++ /* init a hrtimer for pps */ ++ switch (tp->HwSuppPtpVer) { ++ case 3: ++ tp->pps_enable = 0; ++ hrtimer_init(&tp->pps_timer, CLOCK_MONOTONIC, HRTIMER_MODE_REL); ++ tp->pps_timer.function = rtl8125_phy_hrtimer_for_pps; ++ break; ++ default: ++ break; ++ } ++ ++ /* reset the PTP related hardware bits */ ++ rtl8125_ptp_reset(tp); ++ ++ return; ++} ++ ++void rtl8125_ptp_suspend(struct rtl8125_private *tp) ++{ ++ if (!tp->ptp_clock) ++ return; ++ ++ netif_info(tp, drv, tp->dev, "suspend PHC clock\n"); ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ rtl8125_mac_hwtstamp_enable(tp, false); ++ break; ++ case 3: ++ rtl8125_phy_hwtstamp_enable(tp, false); ++ break; ++ default: ++ break; ++ } ++ ++ /* ensure that we cancel any pending PTP Tx work item in progress */ ++ cancel_work_sync(&tp->ptp_tx_work); ++ ++ switch (tp->HwSuppPtpVer) { ++ case 3: ++ hrtimer_cancel(&tp->pps_timer); ++ break; ++ default: ++ break; ++ } ++} ++ ++void rtl8125_ptp_stop(struct rtl8125_private *tp) ++{ ++ struct net_device *netdev = tp->dev; ++ ++ netif_info(tp, drv, tp->dev, "stop PHC clock\n"); ++ ++ /* first, suspend PTP activity */ ++ rtl8125_ptp_suspend(tp); ++ ++ /* disable the PTP clock device */ ++ if (tp->ptp_clock) { ++ ptp_clock_unregister(tp->ptp_clock); ++ tp->ptp_clock = NULL; ++ netif_info(tp, drv, tp->dev, "removed PHC on %s\n", ++ netdev->name); ++ } ++} ++ ++static int rtl8125_set_tstamp(struct net_device *netdev, struct ifreq *ifr) ++{ ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ struct hwtstamp_config config; ++ bool hwtstamp = 0; ++ ++ //netif_info(tp, drv, tp->dev, "ptp set ts\n"); ++ ++ if (copy_from_user(&config, ifr->ifr_data, sizeof(config))) ++ return -EFAULT; ++ ++ if (config.flags) ++ return -EINVAL; ++ ++ switch (config.tx_type) { ++ case HWTSTAMP_TX_ON: ++ hwtstamp = 1; ++ break; ++ case HWTSTAMP_TX_OFF: ++ break; ++ case HWTSTAMP_TX_ONESTEP_SYNC: ++ default: ++ return -ERANGE; ++ } ++ ++ switch (config.rx_filter) { ++ case HWTSTAMP_FILTER_PTP_V2_EVENT: ++ case HWTSTAMP_FILTER_PTP_V2_L2_EVENT: ++ case HWTSTAMP_FILTER_PTP_V2_L4_EVENT: ++ case HWTSTAMP_FILTER_PTP_V2_SYNC: ++ case HWTSTAMP_FILTER_PTP_V2_L2_SYNC: ++ case HWTSTAMP_FILTER_PTP_V2_L4_SYNC: ++ case HWTSTAMP_FILTER_PTP_V2_DELAY_REQ: ++ case HWTSTAMP_FILTER_PTP_V2_L2_DELAY_REQ: ++ case HWTSTAMP_FILTER_PTP_V2_L4_DELAY_REQ: ++ config.rx_filter = HWTSTAMP_FILTER_PTP_V2_EVENT; ++ hwtstamp = 1; ++ tp->flags |= RTL_FLAG_RX_HWTSTAMP_ENABLED; ++ break; ++ case HWTSTAMP_FILTER_NONE: ++ 
tp->flags &= ~RTL_FLAG_RX_HWTSTAMP_ENABLED; ++ break; ++ default: ++ tp->flags &= ~RTL_FLAG_RX_HWTSTAMP_ENABLED; ++ return -ERANGE; ++ } ++ ++ if (tp->hwtstamp_config.tx_type != config.tx_type || ++ tp->hwtstamp_config.rx_filter != config.rx_filter) { ++ tp->hwtstamp_config = config; ++ ++ switch (tp->HwSuppPtpVer) { ++ case 1: ++ rtl8125_mac_hwtstamp_enable(tp, hwtstamp); ++ break; ++ case 3: ++ rtl8125_phy_hwtstamp_enable(tp, hwtstamp); ++ break; ++ default: ++ break; ++ } ++ } ++ ++ return copy_to_user(ifr->ifr_data, &config, ++ sizeof(config)) ? -EFAULT : 0; ++} ++ ++static int rtl8125_get_tstamp(struct net_device *netdev, struct ifreq *ifr) ++{ ++ struct rtl8125_private *tp = netdev_priv(netdev); ++ ++ //netif_info(tp, drv, tp->dev, "ptp get ts\n"); ++ ++ return copy_to_user(ifr->ifr_data, &tp->hwtstamp_config, ++ sizeof(tp->hwtstamp_config)) ? -EFAULT : 0; ++} ++ ++int rtl8125_ptp_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd) ++{ ++ int ret; ++ ++ //netif_info(tp, drv, tp->dev, "ptp ioctl\n"); ++ ++ switch (cmd) { ++#ifdef ENABLE_PTP_SUPPORT ++ case SIOCSHWTSTAMP: ++ ret = rtl8125_set_tstamp(netdev, ifr); ++ break; ++ case SIOCGHWTSTAMP: ++ ret = rtl8125_get_tstamp(netdev, ifr); ++ break; ++#endif ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ return ret; ++} ++ ++void rtl8125_rx_mac_ptp_pktstamp(struct rtl8125_private *tp, struct sk_buff *skb, ++ struct RxDescV3 *descv3) ++{ ++ time64_t tv_sec; ++ long tv_nsec; ++ ++ tv_sec = le32_to_cpu(descv3->RxDescTimeStamp.TimeStampHigh) + ++ ((u64)le32_to_cpu(descv3->RxDescPTPDDWord4.TimeStampHHigh) << 32); ++ tv_nsec = le32_to_cpu(descv3->RxDescTimeStamp.TimeStampLow); ++ ++ skb_hwtstamps(skb)->hwtstamp = ktime_set(tv_sec, tv_nsec); ++} ++ ++static void rtl8125_rx_phy_ptp_pktstamp(struct rtl8125_private *tp, struct sk_buff *skb, u8 type) ++{ ++ struct timespec64 ts64; ++ unsigned long flags; ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ ++ rtl8125_phy_ptp_ingresstime(tp, &ts64, type); ++ ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ skb_hwtstamps(skb)->hwtstamp = ktime_set(ts64.tv_sec, ts64.tv_nsec); ++ ++ return; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) ++static struct ptp_header *ptp_parse_header(struct sk_buff *skb, unsigned int type) ++{ ++ u8 *ptr = skb_mac_header(skb); ++ ++ if (type & PTP_CLASS_VLAN) ++ //ptr += VLAN_HLEN; ++ ptr += 4; ++ ++ switch (type & PTP_CLASS_PMASK) { ++ case PTP_CLASS_IPV4: ++ ptr += IPV4_HLEN(ptr) + UDP_HLEN; ++ break; ++ case PTP_CLASS_IPV6: ++ ptr += IP6_HLEN + UDP_HLEN; ++ break; ++ case PTP_CLASS_L2: ++ break; ++ default: ++ return NULL; ++ } ++ ++ ptr += ETH_HLEN; ++ ++ /* Ensure that the entire header is present in this packet. 
*/ ++ if (ptr + sizeof(struct ptp_header) > skb->data + skb->len) ++ return NULL; ++ ++ return (struct ptp_header *)ptr; ++} ++ ++static inline u8 ptp_get_msgtype(const struct ptp_header *hdr, ++ unsigned int type) ++{ ++ u8 msgtype; ++ ++ if (unlikely(type & PTP_CLASS_V1)) { ++ /* msg type is located at the control field for ptp v1 */ ++ msgtype = hdr->control; ++ } else { ++ msgtype = hdr->tsmt & 0x0f; ++ } ++ ++ return msgtype; ++} ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) */ ++ ++void rtl8125_rx_phy_ptp_timestamp(struct rtl8125_private *tp, struct sk_buff *skb) ++{ ++ unsigned int ptp_class; ++ struct ptp_header *hdr; ++ u8 msgtype; ++ ++ ptp_class = ptp_classify_raw(skb); ++ if (ptp_class == PTP_CLASS_NONE) ++ return; ++ ++ skb_reset_mac_header(skb); ++ hdr = ptp_parse_header(skb, ptp_class); ++ if (unlikely(!hdr)) ++ return; ++ ++ msgtype = ptp_get_msgtype(hdr, ptp_class); ++ rtl8125_rx_phy_ptp_pktstamp(tp, skb, msgtype); ++ ++ return; ++} +diff --git a/drivers/net/ethernet/realtek/r8125_ptp.h b/drivers/net/ethernet/realtek/r8125_ptp.h +new file mode 100755 +index 000000000000..3cd8b677fd60 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_ptp.h +@@ -0,0 +1,159 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#ifndef _LINUX_rtl8125_PTP_H ++#define _LINUX_rtl8125_PTP_H ++ ++#include ++#include ++#include ++#include ++#include ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0) ++#define PTP_MSGTYPE_SYNC 0x0 ++#define PTP_MSGTYPE_DELAY_REQ 0x1 ++#define PTP_MSGTYPE_PDELAY_REQ 0x2 ++#define PTP_MSGTYPE_PDELAY_RESP 0x3 ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,11,0) */ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) ++struct clock_identity { ++ u8 id[8]; ++} __packed; ++ ++struct port_identity { ++ struct clock_identity clock_identity; ++ __be16 port_number; ++} __packed; ++ ++struct ptp_header { ++ u8 tsmt; /* transportSpecific | messageType */ ++ u8 ver; /* reserved | versionPTP */ ++ __be16 message_length; ++ u8 domain_number; ++ u8 reserved1; ++ u8 flag_field[2]; ++ __be64 correction; ++ __be32 reserved2; ++ struct port_identity source_port_identity; ++ __be16 sequence_id; ++ u8 control; ++ u8 log_message_interval; ++} __packed; ++ ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(5,10,0) */ ++ ++struct rtl8125_ptp_info { ++ s64 time_sec; ++ u32 time_ns; ++ u16 ts_info; ++}; ++ ++#ifndef _STRUCT_TIMESPEC ++#define _STRUCT_TIMESPEC ++struct timespec { ++ __kernel_old_time_t tv_sec; /* seconds */ ++ long tv_nsec; /* nanoseconds */ ++}; ++#endif ++ ++enum PTP_CMD_TYPE { ++ PTP_CMD_SET_LOCAL_TIME = 0, ++ PTP_CMD_DRIFT_LOCAL_TIME, ++ PTP_CMD_LATCHED_LOCAL_TIME, ++}; ++ ++#define PTP_CLKADJ_MODE_SET BIT_0 ++ ++enum PTP_CLKADJ_MOD_TYPE { ++ NO_FUNCTION = 0, ++ CLKADJ_MODE_SET = 1, ++ RESERVED = 2, ++ DIRECT_READ = 4, ++ DIRECT_WRITE = 6, ++ INCREMENT_STEP = 8, ++ DECREMENT_STEP = 10, ++ RATE_READ = 12, ++ RATE_WRITE = 14, ++}; ++ ++enum PTP_INSR_TYPE { ++ EVENT_CAP_INTR = (1 << 0), ++ TRIG_GEN_INTR = (1 << 1), ++ RX_TS_INTR = (1 << 2), ++ TX_TX_INTR = (1 << 3), ++}; ++ ++enum PTP_TRX_TS_STA_REG { ++ TRX_TS_RD = (1 << 0), ++ TRXTS_SEL = (1 << 1), ++ RX_TS_PDLYRSP_RDY = (1 << 8), ++ RX_TS_PDLYREQ_RDY = (1 << 9), ++ RX_TS_DLYREQ_RDY = (1 << 10), ++ RX_TS_SYNC_RDY = (1 << 11), ++ TX_TS_PDLYRSP_RDY = (1 << 12), ++ TX_TS_PDLYREQ_RDY = (1 << 13), ++ TX_TS_DLYREQ_RDY = (1 << 14), ++ TX_TS_SYNC_RDY = (1 << 15), ++}; ++ ++#define RTL_FLAG_RX_HWTSTAMP_ENABLED BIT_0 ++ ++struct rtl8125_private; ++struct RxDescV3; ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct ethtool_ts_info *info); ++#else ++int rtl8125_get_ts_info(struct net_device *netdev, ++ struct kernel_ethtool_ts_info *info); ++#endif /* LINUX_VERSION_CODE < KERNEL_VERSION(6,11,0) */ ++ ++void rtl8125_ptp_reset(struct rtl8125_private *tp); ++void rtl8125_ptp_init(struct rtl8125_private *tp); ++void rtl8125_ptp_suspend(struct rtl8125_private *tp); ++void rtl8125_ptp_stop(struct rtl8125_private *tp); ++ ++int rtl8125_ptp_ioctl(struct net_device *netdev, struct ifreq *ifr, int cmd); ++ ++void rtl8125_rx_mac_ptp_pktstamp(struct rtl8125_private *tp, struct sk_buff *skb, ++ struct RxDescV3 *descv3); ++ ++void rtl8125_set_phy_local_time(struct rtl8125_private *tp); ++ ++void rtl8125_rx_phy_ptp_timestamp(struct rtl8125_private *tp, struct sk_buff *skb); ++ ++#endif /* _LINUX_rtl8125_PTP_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_realwow.h b/drivers/net/ethernet/realtek/r8125_realwow.h +new file mode 100755 +index 000000000000..4b2315ebbb62 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_realwow.h +@@ -0,0 +1,118 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* 
++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_R8125_REALWOW_H ++#define _LINUX_R8125_REALWOW_H ++ ++#define SIOCDEVPRIVATE_RTLREALWOW SIOCDEVPRIVATE+3 ++ ++#define MAX_RealWoW_KCP_SIZE (100) ++#define MAX_RealWoW_Payload (64) ++ ++#define KA_TX_PACKET_SIZE (100) ++#define KA_WAKEUP_PATTERN_SIZE (120) ++ ++//HwSuppKeepAliveOffloadVer ++#define HW_SUPPORT_KCP_OFFLOAD(_M) ((_M)->HwSuppKCPOffloadVer > 0) ++ ++enum rtl_realwow_cmd { ++ ++ RTL_REALWOW_SET_KCP_DISABLE=0, ++ RTL_REALWOW_SET_KCP_INFO, ++ RTL_REALWOW_SET_KCP_CONTENT, ++ ++ RTL_REALWOW_SET_KCP_ACKPKTINFO, ++ RTL_REALWOW_SET_KCP_WPINFO, ++ RTL_REALWOW_SET_KCPDHCP_TIMEOUT, ++ ++ RTLT_REALWOW_COMMAND_INVALID ++}; ++ ++struct rtl_realwow_ioctl_struct { ++ __u32 cmd; ++ __u32 offset; ++ __u32 len; ++ union { ++ __u32 data; ++ void *data_buffer; ++ }; ++}; ++ ++typedef struct _MP_KCPInfo { ++ u8 DIPv4[4]; ++ u8 MacID[6]; ++ u16 UdpPort[2]; ++ u8 PKTLEN[2]; ++ ++ u16 ackLostCnt; ++ u8 KCP_WakePattern[MAX_RealWoW_Payload]; ++ u8 KCP_AckPacket[MAX_RealWoW_Payload]; ++ u32 KCP_interval; ++ u8 KCP_WakePattern_Len; ++ u8 KCP_AckPacket_Len; ++ u8 KCP_TxPacket[2][KA_TX_PACKET_SIZE]; ++} MP_KCP_INFO, *PMP_KCP_INFO; ++ ++typedef struct _KCPInfo { ++ u32 nId; // = id ++ u8 DIPv4[4]; ++ u8 MacID[6]; ++ u16 UdpPort; ++ u16 PKTLEN; ++} KCPInfo, *PKCPInfo; ++ ++typedef struct _KCPContent { ++ u32 id; // = id ++ u32 mSec; // = msec ++ u32 size; // =size ++ u8 bPacket[MAX_RealWoW_KCP_SIZE]; // put packet here ++} KCPContent, *PKCPContent; ++ ++typedef struct _RealWoWAckPktInfo { ++ u16 ackLostCnt; ++ u16 patterntSize; ++ u8 pattern[MAX_RealWoW_Payload]; ++} RealWoWAckPktInfo,*PRealWoWAckPktInfo; ++ ++typedef struct _RealWoWWPInfo { ++ u16 patterntSize; ++ u8 pattern[MAX_RealWoW_Payload]; ++} RealWoWWPInfo,*PRealWoWWPInfo; ++ ++int rtl8125_realwow_ioctl(struct net_device *dev, struct ifreq *ifr); ++void rtl8125_realwow_hw_init(struct net_device *dev); ++void rtl8125_get_realwow_hw_version(struct net_device *dev); ++void rtl8125_set_realwow_d3_para(struct net_device *dev); ++ ++#endif /* _LINUX_R8125_REALWOW_H */ +diff --git a/drivers/net/ethernet/realtek/r8125_rss.c 
b/drivers/net/ethernet/realtek/r8125_rss.c +new file mode 100755 +index 000000000000..bcdcab01a6ab +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_rss.c +@@ -0,0 +1,583 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include "r8125.h" ++ ++enum rtl8125_rss_register_content { ++ /* RSS */ ++ RSS_CTRL_TCP_IPV4_SUPP = (1 << 0), ++ RSS_CTRL_IPV4_SUPP = (1 << 1), ++ RSS_CTRL_TCP_IPV6_SUPP = (1 << 2), ++ RSS_CTRL_IPV6_SUPP = (1 << 3), ++ RSS_CTRL_IPV6_EXT_SUPP = (1 << 4), ++ RSS_CTRL_TCP_IPV6_EXT_SUPP = (1 << 5), ++ RSS_HALF_SUPP = (1 << 7), ++ RSS_CTRL_UDP_IPV4_SUPP = (1 << 11), ++ RSS_CTRL_UDP_IPV6_SUPP = (1 << 12), ++ RSS_CTRL_UDP_IPV6_EXT_SUPP = (1 << 13), ++ RSS_QUAD_CPU_EN = (1 << 16), ++ RSS_HQ_Q_SUP_R = (1 << 31), ++}; ++ ++static int rtl8125_get_rss_hash_opts(struct rtl8125_private *tp, ++ struct ethtool_rxnfc *cmd) ++{ ++ cmd->data = 0; ++ ++ /* Report default options for RSS */ ++ switch (cmd->flow_type) { ++ case TCP_V4_FLOW: ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case UDP_V4_FLOW: ++ if (tp->rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV4) ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case IPV4_FLOW: ++ cmd->data |= RXH_IP_SRC | RXH_IP_DST; ++ break; ++ case TCP_V6_FLOW: ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case UDP_V6_FLOW: ++ if (tp->rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++ cmd->data |= RXH_L4_B_0_1 | RXH_L4_B_2_3; ++ fallthrough; ++ case IPV6_FLOW: ++ cmd->data |= RXH_IP_SRC | RXH_IP_DST; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ return 0; ++} ++ ++int rtl8125_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = -EOPNOTSUPP; ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return ret; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_GRXRINGS: ++ cmd->data = rtl8125_tot_rx_rings(tp); ++ ret = 0; ++ break; ++ case ETHTOOL_GRXFH: ++ ret = rtl8125_get_rss_hash_opts(tp, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++u32 rtl8125_rss_indir_tbl_entries(struct rtl8125_private 
*tp) ++{ ++ return tp->HwSuppIndirTblEntries; ++} ++ ++#define RSS_MASK_BITS_OFFSET (8) ++#define RSS_CPU_NUM_OFFSET (16) ++#define RTL8125_UDP_RSS_FLAGS (RTL_8125_RSS_FLAG_HASH_UDP_IPV4 | \ ++ RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++static int _rtl8125_set_rss_hash_opt(struct rtl8125_private *tp) ++{ ++ u32 rss_flags = tp->rss_flags; ++ u32 hash_mask_len; ++ u32 rss_ctrl; ++ ++ rss_ctrl = ilog2(rtl8125_tot_rx_rings(tp)); ++ rss_ctrl &= (BIT_0 | BIT_1 | BIT_2); ++ rss_ctrl <<= RSS_CPU_NUM_OFFSET; ++ ++ /* Perform hash on these packet types */ ++ rss_ctrl |= RSS_CTRL_TCP_IPV4_SUPP ++ | RSS_CTRL_IPV4_SUPP ++ | RSS_CTRL_IPV6_SUPP ++ | RSS_CTRL_IPV6_EXT_SUPP ++ | RSS_CTRL_TCP_IPV6_SUPP ++ | RSS_CTRL_TCP_IPV6_EXT_SUPP; ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV4) ++ rss_ctrl |= RSS_CTRL_UDP_IPV4_SUPP; ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++ rss_ctrl |= RSS_CTRL_UDP_IPV6_SUPP | ++ RSS_CTRL_UDP_IPV6_EXT_SUPP; ++ ++ hash_mask_len = ilog2(rtl8125_rss_indir_tbl_entries(tp)); ++ hash_mask_len &= (BIT_0 | BIT_1 | BIT_2); ++ rss_ctrl |= hash_mask_len << RSS_MASK_BITS_OFFSET; ++ ++ RTL_W32(tp, RSS_CTRL_8125, rss_ctrl); ++ ++ return 0; ++} ++ ++static int rtl8125_set_rss_hash_opt(struct rtl8125_private *tp, ++ struct ethtool_rxnfc *nfc) ++{ ++ u32 rss_flags = tp->rss_flags; ++ ++ /* ++ * RSS does not support anything other than hashing ++ * to queues on src and dst IPs and ports ++ */ ++ if (nfc->data & ~(RXH_IP_SRC | RXH_IP_DST | ++ RXH_L4_B_0_1 | RXH_L4_B_2_3)) ++ return -EINVAL; ++ ++ switch (nfc->flow_type) { ++ case TCP_V4_FLOW: ++ case TCP_V6_FLOW: ++ if (!(nfc->data & RXH_IP_SRC) || ++ !(nfc->data & RXH_IP_DST) || ++ !(nfc->data & RXH_L4_B_0_1) || ++ !(nfc->data & RXH_L4_B_2_3)) ++ return -EINVAL; ++ break; ++ case UDP_V4_FLOW: ++ if (!(nfc->data & RXH_IP_SRC) || ++ !(nfc->data & RXH_IP_DST)) ++ return -EINVAL; ++ switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { ++ case 0: ++ rss_flags &= ~RTL_8125_RSS_FLAG_HASH_UDP_IPV4; ++ break; ++ case (RXH_L4_B_0_1 | RXH_L4_B_2_3): ++ rss_flags |= RTL_8125_RSS_FLAG_HASH_UDP_IPV4; ++ break; ++ default: ++ return -EINVAL; ++ } ++ break; ++ case UDP_V6_FLOW: ++ if (!(nfc->data & RXH_IP_SRC) || ++ !(nfc->data & RXH_IP_DST)) ++ return -EINVAL; ++ switch (nfc->data & (RXH_L4_B_0_1 | RXH_L4_B_2_3)) { ++ case 0: ++ rss_flags &= ~RTL_8125_RSS_FLAG_HASH_UDP_IPV6; ++ break; ++ case (RXH_L4_B_0_1 | RXH_L4_B_2_3): ++ rss_flags |= RTL_8125_RSS_FLAG_HASH_UDP_IPV6; ++ break; ++ default: ++ return -EINVAL; ++ } ++ break; ++ case SCTP_V4_FLOW: ++ case AH_ESP_V4_FLOW: ++ case AH_V4_FLOW: ++ case ESP_V4_FLOW: ++ case SCTP_V6_FLOW: ++ case AH_ESP_V6_FLOW: ++ case AH_V6_FLOW: ++ case ESP_V6_FLOW: ++ case IP_USER_FLOW: ++ case ETHER_FLOW: ++ /* RSS is not supported for these protocols */ ++ if (nfc->data) { ++ netif_err(tp, drv, tp->dev, "Command parameters not supported\n"); ++ return -EINVAL; ++ } ++ return 0; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ /* if we changed something we need to update flags */ ++ if (rss_flags != tp->rss_flags) { ++ u32 rss_ctrl = RTL_R32(tp, RSS_CTRL_8125); ++ ++ if ((rss_flags & RTL8125_UDP_RSS_FLAGS) && ++ !(tp->rss_flags & RTL8125_UDP_RSS_FLAGS)) ++ netdev_warn(tp->dev, ++ "enabling UDP RSS: fragmented packets may " ++ "arrive out of order to the stack above\n"); ++ ++ tp->rss_flags = rss_flags; ++ ++ /* Perform hash on these packet types */ ++ rss_ctrl |= RSS_CTRL_TCP_IPV4_SUPP ++ | RSS_CTRL_IPV4_SUPP ++ | RSS_CTRL_IPV6_SUPP ++ | RSS_CTRL_IPV6_EXT_SUPP ++ | RSS_CTRL_TCP_IPV6_SUPP ++ | 
RSS_CTRL_TCP_IPV6_EXT_SUPP; ++ ++ rss_ctrl &= ~(RSS_CTRL_UDP_IPV4_SUPP | ++ RSS_CTRL_UDP_IPV6_SUPP | ++ RSS_CTRL_UDP_IPV6_EXT_SUPP); ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV4) ++ rss_ctrl |= RSS_CTRL_UDP_IPV4_SUPP; ++ ++ if (rss_flags & RTL_8125_RSS_FLAG_HASH_UDP_IPV6) ++ rss_ctrl |= RSS_CTRL_UDP_IPV6_SUPP | ++ RSS_CTRL_UDP_IPV6_EXT_SUPP; ++ ++ RTL_W32(tp, RSS_CTRL_8125, rss_ctrl); ++ } ++ ++ return 0; ++} ++ ++int rtl8125_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int ret = -EOPNOTSUPP; ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return ret; ++ ++ switch (cmd->cmd) { ++ case ETHTOOL_SRXFH: ++ ret = rtl8125_set_rss_hash_opt(tp, cmd); ++ break; ++ default: ++ break; ++ } ++ ++ return ret; ++} ++ ++static u32 _rtl8125_get_rxfh_key_size(struct rtl8125_private *tp) ++{ ++ return sizeof(tp->rss_key); ++} ++ ++u32 rtl8125_get_rxfh_key_size(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return 0; ++ ++ return _rtl8125_get_rxfh_key_size(tp); ++} ++ ++u32 rtl8125_rss_indir_size(struct net_device *dev) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return 0; ++ ++ return rtl8125_rss_indir_tbl_entries(tp); ++} ++ ++static void rtl8125_get_reta(struct rtl8125_private *tp, u32 *indir) ++{ ++ int i, reta_size = rtl8125_rss_indir_tbl_entries(tp); ++ ++ for (i = 0; i < reta_size; i++) ++ indir[i] = tp->rss_indir_tbl[i]; ++} ++ ++static u32 rtl8125_rss_key_reg(struct rtl8125_private *tp) ++{ ++ return RSS_KEY_8125; ++} ++ ++static u32 rtl8125_rss_indir_tbl_reg(struct rtl8125_private *tp) ++{ ++ return RSS_INDIRECTION_TBL_8125_V2; ++} ++ ++static void rtl8125_store_reta(struct rtl8125_private *tp) ++{ ++ u16 indir_tbl_reg = rtl8125_rss_indir_tbl_reg(tp); ++ u32 i, reta_entries = rtl8125_rss_indir_tbl_entries(tp); ++ u32 reta = 0; ++ u8 *indir_tbl = tp->rss_indir_tbl; ++ ++ /* Write redirection table to HW */ ++ for (i = 0; i < reta_entries; i++) { ++ reta |= indir_tbl[i] << (i & 0x3) * 8; ++ if ((i & 3) == 3) { ++ RTL_W32(tp, indir_tbl_reg, reta); ++ ++ indir_tbl_reg += 4; ++ reta = 0; ++ } ++ } ++} ++ ++static void rtl8125_store_rss_key(struct rtl8125_private *tp) ++{ ++ const u16 rss_key_reg = rtl8125_rss_key_reg(tp); ++ u32 i, rss_key_size = _rtl8125_get_rxfh_key_size(tp); ++ u32 *rss_key = (u32*)tp->rss_key; ++ ++ /* Write redirection table to HW */ ++ for (i = 0; i < rss_key_size; i+=4) ++ RTL_W32(tp, rss_key_reg + i, *rss_key++); ++} ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) ++int rtl8125_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return -EOPNOTSUPP; ++ ++ rxfh->hfunc = ETH_RSS_HASH_TOP; ++ ++ if (rxfh->indir) ++ rtl8125_get_reta(tp, rxfh->indir); ++ ++ if (rxfh->key) ++ memcpy(rxfh->key, tp->rss_key, RTL8125_RSS_KEY_SIZE); ++ ++ return 0; ++} ++ ++int rtl8125_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, ++ struct netlink_ext_ack *extack) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ u32 reta_entries = rtl8125_rss_indir_tbl_entries(tp); ++ ++ /* We require at least one supported parameter to be changed and no ++ * change in any of the unsupported parameters ++ */ ++ if (rxfh->hfunc != ETH_RSS_HASH_NO_CHANGE && rxfh->hfunc != ETH_RSS_HASH_TOP) ++ return -EOPNOTSUPP; ++ ++ /* Fill out the redirection table 
*/ ++ if (rxfh->indir) { ++ int max_queues = tp->num_rx_rings; ++ ++ /* Verify user input. */ ++ for (i = 0; i < reta_entries; i++) ++ if (rxfh->indir[i] >= max_queues) ++ return -EINVAL; ++ ++ for (i = 0; i < reta_entries; i++) ++ tp->rss_indir_tbl[i] = rxfh->indir[i]; ++ } ++ ++ /* Fill out the rss hash key */ ++ if (rxfh->key) ++ memcpy(tp->rss_key, rxfh->key, RTL8125_RSS_KEY_SIZE); ++ ++ rtl8125_store_reta(tp); ++ ++ rtl8125_store_rss_key(tp); ++ ++ return 0; ++} ++#else ++int rtl8125_get_rxfh(struct net_device *dev, u32 *indir, u8 *key, ++ u8 *hfunc) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ ++ if (!(dev->features & NETIF_F_RXHASH)) ++ return -EOPNOTSUPP; ++ ++ if (hfunc) ++ *hfunc = ETH_RSS_HASH_TOP; ++ ++ if (indir) ++ rtl8125_get_reta(tp, indir); ++ ++ if (key) ++ memcpy(key, tp->rss_key, RTL8125_RSS_KEY_SIZE); ++ ++ return 0; ++} ++ ++int rtl8125_set_rxfh(struct net_device *dev, const u32 *indir, ++ const u8 *key, const u8 hfunc) ++{ ++ struct rtl8125_private *tp = netdev_priv(dev); ++ int i; ++ u32 reta_entries = rtl8125_rss_indir_tbl_entries(tp); ++ ++ /* We require at least one supported parameter to be changed and no ++ * change in any of the unsupported parameters ++ */ ++ if (hfunc != ETH_RSS_HASH_NO_CHANGE && hfunc != ETH_RSS_HASH_TOP) ++ return -EOPNOTSUPP; ++ ++ /* Fill out the redirection table */ ++ if (indir) { ++ int max_queues = tp->num_rx_rings; ++ ++ /* Verify user input. */ ++ for (i = 0; i < reta_entries; i++) ++ if (indir[i] >= max_queues) ++ return -EINVAL; ++ ++ for (i = 0; i < reta_entries; i++) ++ tp->rss_indir_tbl[i] = indir[i]; ++ } ++ ++ /* Fill out the rss hash key */ ++ if (key) ++ memcpy(tp->rss_key, key, RTL8125_RSS_KEY_SIZE); ++ ++ rtl8125_store_reta(tp); ++ ++ rtl8125_store_rss_key(tp); ++ ++ return 0; ++} ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) */ ++ ++static u32 rtl8125_get_rx_desc_hash(struct rtl8125_private *tp, ++ struct RxDesc *desc) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ return le32_to_cpu(((struct RxDescV3 *)desc)->RxDescNormalDDWord2.RSSResult); ++ case RX_DESC_RING_TYPE_4: ++ return le32_to_cpu(((struct RxDescV4 *)desc)->RxDescNormalDDWord1.RSSResult); ++ default: ++ return 0; ++ } ++} ++ ++#define RXS_8125B_RSS_UDP BIT(9) ++#define RXS_8125_RSS_IPV4 BIT(10) ++#define RXS_8125_RSS_IPV6 BIT(12) ++#define RXS_8125_RSS_TCP BIT(13) ++#define RTL8125_RXS_RSS_L3_TYPE_MASK (RXS_8125_RSS_IPV4 | RXS_8125_RSS_IPV6) ++#define RTL8125_RXS_RSS_L4_TYPE_MASK (RXS_8125_RSS_TCP | RXS_8125B_RSS_UDP) ++ ++#define RXS_8125B_RSS_UDP_V4 BIT(27) ++#define RXS_8125_RSS_IPV4_V4 BIT(28) ++#define RXS_8125_RSS_IPV6_V4 BIT(29) ++#define RXS_8125_RSS_TCP_V4 BIT(30) ++#define RTL8125_RXS_RSS_L3_TYPE_MASK_V4 (RXS_8125_RSS_IPV4_V4 | RXS_8125_RSS_IPV6_V4) ++#define RTL8125_RXS_RSS_L4_TYPE_MASK_V4 (RXS_8125_RSS_TCP_V4 | RXS_8125B_RSS_UDP_V4) ++static void rtl8125_rx_hash_v3(struct rtl8125_private *tp, ++ struct RxDescV3 *descv3, ++ struct sk_buff *skb) ++{ ++ u16 rss_header_info; ++ ++ if (!(tp->dev->features & NETIF_F_RXHASH)) ++ return; ++ ++ rss_header_info = le16_to_cpu(descv3->RxDescNormalDDWord2.HeaderInfo); ++ ++ if (!(rss_header_info & RTL8125_RXS_RSS_L3_TYPE_MASK)) ++ return; ++ ++ skb_set_hash(skb, rtl8125_get_rx_desc_hash(tp, (struct RxDesc *)descv3), ++ (RTL8125_RXS_RSS_L4_TYPE_MASK & rss_header_info) ? 
++ PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); ++} ++ ++static void rtl8125_rx_hash_v4(struct rtl8125_private *tp, ++ struct RxDescV4 *descv4, ++ struct sk_buff *skb) ++{ ++ u32 rss_header_info; ++ ++ if (!(tp->dev->features & NETIF_F_RXHASH)) ++ return; ++ ++ rss_header_info = le32_to_cpu(descv4->RxDescNormalDDWord1.RSSInfo); ++ ++ if (!(rss_header_info & RTL8125_RXS_RSS_L3_TYPE_MASK_V4)) ++ return; ++ ++ skb_set_hash(skb, rtl8125_get_rx_desc_hash(tp, (struct RxDesc *)descv4), ++ (RTL8125_RXS_RSS_L4_TYPE_MASK_V4 & rss_header_info) ? ++ PKT_HASH_TYPE_L4 : PKT_HASH_TYPE_L3); ++} ++ ++void rtl8125_rx_hash(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ struct sk_buff *skb) ++{ ++ switch (tp->InitRxDescType) { ++ case RX_DESC_RING_TYPE_3: ++ rtl8125_rx_hash_v3(tp, (struct RxDescV3 *)desc, skb); ++ break; ++ case RX_DESC_RING_TYPE_4: ++ rtl8125_rx_hash_v4(tp, (struct RxDescV4 *)desc, skb); ++ break; ++ default: ++ return; ++ } ++} ++ ++void rtl8125_disable_rss(struct rtl8125_private *tp) ++{ ++ RTL_W32(tp, RSS_CTRL_8125, 0x00); ++} ++ ++void _rtl8125_config_rss(struct rtl8125_private *tp) ++{ ++ _rtl8125_set_rss_hash_opt(tp); ++ ++ rtl8125_store_reta(tp); ++ ++ rtl8125_store_rss_key(tp); ++} ++ ++void rtl8125_config_rss(struct rtl8125_private *tp) ++{ ++ if (!tp->EnableRss) { ++ rtl8125_disable_rss(tp); ++ return; ++ } ++ ++ _rtl8125_config_rss(tp); ++} ++ ++void rtl8125_init_rss(struct rtl8125_private *tp) ++{ ++ int i; ++ ++ for (i = 0; i < rtl8125_rss_indir_tbl_entries(tp); i++) ++ tp->rss_indir_tbl[i] = ethtool_rxfh_indir_default(i, tp->num_rx_rings); ++ ++ netdev_rss_key_fill(tp->rss_key, RTL8125_RSS_KEY_SIZE); ++} +diff --git a/drivers/net/ethernet/realtek/r8125_rss.h b/drivers/net/ethernet/realtek/r8125_rss.h +new file mode 100755 +index 000000000000..d2ec5f06c3f1 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/r8125_rss.h +@@ -0,0 +1,76 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. 
++ ***********************************************************************************/ ++ ++#ifndef _LINUX_rtl8125_RSS_H ++#define _LINUX_rtl8125_RSS_H ++ ++#include ++#include ++ ++#define RTL8125_RSS_KEY_SIZE 40 /* size of RSS Hash Key in bytes */ ++#define RTL8125_MAX_INDIRECTION_TABLE_ENTRIES 128 ++ ++enum rtl8125_rss_flag { ++ RTL_8125_RSS_FLAG_HASH_UDP_IPV4 = (1 << 0), ++ RTL_8125_RSS_FLAG_HASH_UDP_IPV6 = (1 << 1), ++}; ++ ++struct rtl8125_private; ++struct RxDesc; ++ ++int rtl8125_get_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd, ++ u32 *rule_locs); ++int rtl8125_set_rxnfc(struct net_device *dev, struct ethtool_rxnfc *cmd); ++u32 rtl8125_get_rxfh_key_size(struct net_device *netdev); ++u32 rtl8125_rss_indir_size(struct net_device *netdev); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) ++int rtl8125_get_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh); ++int rtl8125_set_rxfh(struct net_device *dev, struct ethtool_rxfh_param *rxfh, ++ struct netlink_ext_ack *extack); ++#else ++int rtl8125_get_rxfh(struct net_device *netdev, u32 *indir, u8 *key, ++ u8 *hfunc); ++int rtl8125_set_rxfh(struct net_device *netdev, const u32 *indir, ++ const u8 *key, const u8 hfunc); ++#endif /* LINUX_VERSION_CODE >= KERNEL_VERSION(6,8,0) */ ++void rtl8125_rx_hash(struct rtl8125_private *tp, ++ struct RxDesc *desc, ++ struct sk_buff *skb); ++void _rtl8125_config_rss(struct rtl8125_private *tp); ++void rtl8125_config_rss(struct rtl8125_private *tp); ++void rtl8125_init_rss(struct rtl8125_private *tp); ++u32 rtl8125_rss_indir_tbl_entries(struct rtl8125_private *tp); ++void rtl8125_disable_rss(struct rtl8125_private *tp); ++ ++#endif /* _LINUX_rtl8125_RSS_H */ +diff --git a/drivers/net/ethernet/realtek/r8169_main.c b/drivers/net/ethernet/realtek/r8169_main.c +index 755083852eef..32648463d169 100644 +--- a/drivers/net/ethernet/realtek/r8169_main.c ++++ b/drivers/net/ethernet/realtek/r8169_main.c +@@ -215,7 +215,6 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { + { PCI_VDEVICE(REALTEK, 0x8129) }, + { PCI_VDEVICE(REALTEK, 0x8136), RTL_CFG_NO_GBIT }, + { PCI_VDEVICE(REALTEK, 0x8161) }, +- { PCI_VDEVICE(REALTEK, 0x8162) }, + { PCI_VDEVICE(REALTEK, 0x8167) }, + { PCI_VDEVICE(REALTEK, 0x8168) }, + { PCI_VDEVICE(NCUBE, 0x8168) }, +@@ -226,10 +225,13 @@ static const struct pci_device_id rtl8169_pci_tbl[] = { + { PCI_VDEVICE(USR, 0x0116) }, + { PCI_VENDOR_ID_LINKSYS, 0x1032, PCI_ANY_ID, 0x0024 }, + { 0x0001, 0x8168, PCI_ANY_ID, 0x2410 }, +- { PCI_VDEVICE(REALTEK, 0x8125) }, + { PCI_VDEVICE(REALTEK, 0x8126) }, + { PCI_VDEVICE(REALTEK, 0x8127) }, ++#if !defined(CONFIG_R8125) && !defined(CONFIG_R8125_MODULE) ++ { PCI_VDEVICE(REALTEK, 0x8125) }, ++ { PCI_VDEVICE(REALTEK, 0x8162) }, + { PCI_VDEVICE(REALTEK, 0x3000) }, ++#endif /* !CONFIG_R8125 */ + { PCI_VDEVICE(REALTEK, 0x5000) }, + { PCI_VDEVICE(REALTEK, 0x0e10) }, + {} +diff --git a/drivers/net/ethernet/realtek/rtl_eeprom.c b/drivers/net/ethernet/realtek/rtl_eeprom.c +new file mode 100755 +index 000000000000..f1c2a1d12e3c +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtl_eeprom.c +@@ -0,0 +1,284 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. 
++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++ ++#include "r8125.h" ++#include "rtl_eeprom.h" ++ ++//------------------------------------------------------------------- ++//rtl8125_eeprom_type(): ++// tell the eeprom type ++//return value: ++// 0: the eeprom type is 93C46 ++// 1: the eeprom type is 93C56 or 93C66 ++//------------------------------------------------------------------- ++void rtl8125_eeprom_type(struct rtl8125_private *tp) ++{ ++ u16 magic = 0; ++ ++ if (tp->mcfg == CFG_METHOD_DEFAULT) ++ goto out_no_eeprom; ++ ++ if(RTL_R8(tp, 0xD2)&0x04) { ++ //not support ++ //tp->eeprom_type = EEPROM_TWSI; ++ //tp->eeprom_len = 256; ++ goto out_no_eeprom; ++ } else if(RTL_R32(tp, RxConfig) & RxCfg_9356SEL) { ++ tp->eeprom_type = EEPROM_TYPE_93C56; ++ tp->eeprom_len = 256; ++ } else { ++ tp->eeprom_type = EEPROM_TYPE_93C46; ++ tp->eeprom_len = 128; ++ } ++ ++ magic = rtl8125_eeprom_read_sc(tp, 0); ++ ++out_no_eeprom: ++ if ((magic != 0x8129) && (magic != 0x8128)) { ++ tp->eeprom_type = EEPROM_TYPE_NONE; ++ tp->eeprom_len = 0; ++ } ++} ++ ++void rtl8125_eeprom_cleanup(struct rtl8125_private *tp) ++{ ++ u8 x; ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EEDI | Cfg9346_EECS); ++ ++ RTL_W8(tp, Cfg9346, x); ++ ++ rtl8125_raise_clock(tp, &x); ++ rtl8125_lower_clock(tp, &x); ++} ++ ++static int rtl8125_eeprom_cmd_done(struct rtl8125_private *tp) ++{ ++ u8 x; ++ int i; ++ ++ rtl8125_stand_by(tp); ++ ++ for (i = 0; i < 50000; i++) { ++ x = RTL_R8(tp, Cfg9346); ++ ++ if (x & Cfg9346_EEDO) { ++ udelay(RTL_CLOCK_RATE * 2 * 3); ++ return 0; ++ } ++ udelay(1); ++ } ++ ++ return -1; ++} ++ ++//------------------------------------------------------------------- ++//rtl8125_eeprom_read_sc(): ++// read one word from eeprom ++//------------------------------------------------------------------- ++u16 rtl8125_eeprom_read_sc(struct rtl8125_private *tp, u16 reg) ++{ ++ int addr_sz = 6; ++ u8 x; ++ u16 data; ++ ++ if(tp->eeprom_type == EEPROM_TYPE_NONE) ++ return -1; ++ ++ if (tp->eeprom_type==EEPROM_TYPE_93C46) ++ addr_sz = 6; ++ else if (tp->eeprom_type==EEPROM_TYPE_93C56) ++ addr_sz = 8; ++ ++ x = Cfg9346_EEM1 | Cfg9346_EECS; ++ RTL_W8(tp, Cfg9346, x); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_READ_OPCODE, 3); ++ rtl8125_shift_out_bits(tp, reg, addr_sz); ++ ++ data = rtl8125_shift_in_bits(tp); ++ ++ rtl8125_eeprom_cleanup(tp); ++ ++ RTL_W8(tp, 
Cfg9346, 0); ++ ++ return data; ++} ++ ++//------------------------------------------------------------------- ++//rtl8125_eeprom_write_sc(): ++// write one word to a specific address in the eeprom ++//------------------------------------------------------------------- ++void rtl8125_eeprom_write_sc(struct rtl8125_private *tp, u16 reg, u16 data) ++{ ++ u8 x; ++ int addr_sz = 6; ++ int w_dummy_addr = 4; ++ ++ if(tp->eeprom_type == EEPROM_TYPE_NONE) ++ return; ++ ++ if (tp->eeprom_type==EEPROM_TYPE_93C46) { ++ addr_sz = 6; ++ w_dummy_addr = 4; ++ } else if (tp->eeprom_type==EEPROM_TYPE_93C56) { ++ addr_sz = 8; ++ w_dummy_addr = 6; ++ } ++ ++ x = Cfg9346_EEM1 | Cfg9346_EECS; ++ RTL_W8(tp, Cfg9346, x); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_EWEN_OPCODE, 5); ++ rtl8125_shift_out_bits(tp, reg, w_dummy_addr); ++ rtl8125_stand_by(tp); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_ERASE_OPCODE, 3); ++ rtl8125_shift_out_bits(tp, reg, addr_sz); ++ if (rtl8125_eeprom_cmd_done(tp) < 0) ++ return; ++ rtl8125_stand_by(tp); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_WRITE_OPCODE, 3); ++ rtl8125_shift_out_bits(tp, reg, addr_sz); ++ rtl8125_shift_out_bits(tp, data, 16); ++ if (rtl8125_eeprom_cmd_done(tp) < 0) ++ return; ++ rtl8125_stand_by(tp); ++ ++ rtl8125_shift_out_bits(tp, RTL_EEPROM_EWDS_OPCODE, 5); ++ rtl8125_shift_out_bits(tp, reg, w_dummy_addr); ++ ++ rtl8125_eeprom_cleanup(tp); ++ RTL_W8(tp, Cfg9346, 0); ++} ++ ++void rtl8125_raise_clock(struct rtl8125_private *tp, u8 *x) ++{ ++ *x = *x | Cfg9346_EESK; ++ RTL_W8(tp, Cfg9346, *x); ++ udelay(RTL_CLOCK_RATE); ++} ++ ++void rtl8125_lower_clock(struct rtl8125_private *tp, u8 *x) ++{ ++ *x = *x & ~Cfg9346_EESK; ++ RTL_W8(tp, Cfg9346, *x); ++ udelay(RTL_CLOCK_RATE); ++} ++ ++void rtl8125_shift_out_bits(struct rtl8125_private *tp, int data, int count) ++{ ++ u8 x; ++ int mask; ++ ++ mask = 0x01 << (count - 1); ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EEDI | Cfg9346_EEDO); ++ ++ do { ++ if (data & mask) ++ x |= Cfg9346_EEDI; ++ else ++ x &= ~Cfg9346_EEDI; ++ ++ RTL_W8(tp, Cfg9346, x); ++ udelay(RTL_CLOCK_RATE); ++ rtl8125_raise_clock(tp, &x); ++ rtl8125_lower_clock(tp, &x); ++ mask = mask >> 1; ++ } while(mask); ++ ++ x &= ~Cfg9346_EEDI; ++ RTL_W8(tp, Cfg9346, x); ++} ++ ++u16 rtl8125_shift_in_bits(struct rtl8125_private *tp) ++{ ++ u8 x; ++ u16 d, i; ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EEDI | Cfg9346_EEDO); ++ ++ d = 0; ++ ++ for (i = 0; i < 16; i++) { ++ d = d << 1; ++ rtl8125_raise_clock(tp, &x); ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~Cfg9346_EEDI; ++ ++ if (x & Cfg9346_EEDO) ++ d |= 1; ++ ++ rtl8125_lower_clock(tp, &x); ++ } ++ ++ return d; ++} ++ ++void rtl8125_stand_by(struct rtl8125_private *tp) ++{ ++ u8 x; ++ ++ x = RTL_R8(tp, Cfg9346); ++ x &= ~(Cfg9346_EECS | Cfg9346_EESK); ++ RTL_W8(tp, Cfg9346, x); ++ udelay(RTL_CLOCK_RATE); ++ ++ x |= Cfg9346_EECS; ++ RTL_W8(tp, Cfg9346, x); ++} ++ ++void rtl8125_set_eeprom_sel_low(struct rtl8125_private *tp) ++{ ++ RTL_W8(tp, Cfg9346, Cfg9346_EEM1); ++ RTL_W8(tp, Cfg9346, Cfg9346_EEM1 | Cfg9346_EESK); ++ ++ udelay(20); ++ ++ RTL_W8(tp, Cfg9346, Cfg9346_EEM1); ++} +diff --git a/drivers/net/ethernet/realtek/rtl_eeprom.h b/drivers/net/ethernet/realtek/rtl_eeprom.h +new file mode 100755 +index 000000000000..7c154f2f4b48 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtl_eeprom.h +@@ -0,0 +1,53 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for 
Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++//EEPROM opcodes ++#define RTL_EEPROM_READ_OPCODE 06 ++#define RTL_EEPROM_WRITE_OPCODE 05 ++#define RTL_EEPROM_ERASE_OPCODE 07 ++#define RTL_EEPROM_EWEN_OPCODE 19 ++#define RTL_EEPROM_EWDS_OPCODE 16 ++ ++#define RTL_CLOCK_RATE 3 ++ ++void rtl8125_eeprom_type(struct rtl8125_private *tp); ++void rtl8125_eeprom_cleanup(struct rtl8125_private *tp); ++u16 rtl8125_eeprom_read_sc(struct rtl8125_private *tp, u16 reg); ++void rtl8125_eeprom_write_sc(struct rtl8125_private *tp, u16 reg, u16 data); ++void rtl8125_shift_out_bits(struct rtl8125_private *tp, int data, int count); ++u16 rtl8125_shift_in_bits(struct rtl8125_private *tp); ++void rtl8125_raise_clock(struct rtl8125_private *tp, u8 *x); ++void rtl8125_lower_clock(struct rtl8125_private *tp, u8 *x); ++void rtl8125_stand_by(struct rtl8125_private *tp); ++void rtl8125_set_eeprom_sel_low(struct rtl8125_private *tp); +diff --git a/drivers/net/ethernet/realtek/rtltool.c b/drivers/net/ethernet/realtek/rtltool.c +new file mode 100755 +index 000000000000..8dd79e1800a5 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtltool.c +@@ -0,0 +1,312 @@ ++// SPDX-License-Identifier: GPL-2.0-only ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 
2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include "r8125.h" ++#include "rtl_eeprom.h" ++#include "rtltool.h" ++ ++int rtl8125_tool_ioctl(struct rtl8125_private *tp, struct ifreq *ifr) ++{ ++ struct rtltool_cmd my_cmd; ++ unsigned long flags; ++ int ret; ++ ++ if (copy_from_user(&my_cmd, ifr->ifr_data, sizeof(my_cmd))) ++ return -EFAULT; ++ ++ ret = 0; ++ switch (my_cmd.cmd) { ++ case RTLTOOL_READ_MAC: ++ if ((my_cmd.offset + my_cmd.len) > pci_resource_len(tp->pci_dev, 2)) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (my_cmd.len==1) ++ my_cmd.data = readb(tp->mmio_addr+my_cmd.offset); ++ else if (my_cmd.len==2) ++ my_cmd.data = readw(tp->mmio_addr+(my_cmd.offset&~1)); ++ else if (my_cmd.len==4) ++ my_cmd.data = readl(tp->mmio_addr+(my_cmd.offset&~3)); ++ else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTLTOOL_WRITE_MAC: ++ if ((my_cmd.offset + my_cmd.len) > pci_resource_len(tp->pci_dev, 2)) { ++ ret = -EINVAL; ++ break; ++ } ++ ++ if (my_cmd.len==1) ++ writeb(my_cmd.data, tp->mmio_addr+my_cmd.offset); ++ else if (my_cmd.len==2) ++ writew(my_cmd.data, tp->mmio_addr+(my_cmd.offset&~1)); ++ else if (my_cmd.len==4) ++ writel(my_cmd.data, tp->mmio_addr+(my_cmd.offset&~3)); ++ else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_READ_PHY: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ my_cmd.data = rtl8125_mdio_prot_read(tp, my_cmd.offset); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_PHY: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_mdio_prot_write(tp, my_cmd.offset, my_cmd.data); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ break; ++ ++ case RTLTOOL_READ_EPHY: ++ my_cmd.data = rtl8125_ephy_read(tp, my_cmd.offset); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_EPHY: ++ rtl8125_ephy_write(tp, my_cmd.offset, my_cmd.data); ++ break; ++ ++ case RTLTOOL_READ_ERI: ++ my_cmd.data = 0; ++ if (my_cmd.len==1 || my_cmd.len==2 || my_cmd.len==4) { ++ my_cmd.data = rtl8125_eri_read(tp, my_cmd.offset, my_cmd.len, ERIAR_ExGMAC); ++ } else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_ERI: ++ if (my_cmd.len==1 || my_cmd.len==2 || my_cmd.len==4) { ++ rtl8125_eri_write(tp, my_cmd.offset, my_cmd.len, my_cmd.data, ERIAR_ExGMAC); ++ } else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ break; ++ ++ case RTLTOOL_READ_PCI: ++ my_cmd.data = 0; ++ if (my_cmd.len==1) ++ pci_read_config_byte(tp->pci_dev, my_cmd.offset, ++ (u8 *)&my_cmd.data); ++ else if (my_cmd.len==2) ++ pci_read_config_word(tp->pci_dev, my_cmd.offset, ++ (u16 *)&my_cmd.data); ++ else if (my_cmd.len==4) ++ pci_read_config_dword(tp->pci_dev, my_cmd.offset, ++ &my_cmd.data); ++ 
else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTLTOOL_WRITE_PCI: ++ if (my_cmd.len==1) ++ pci_write_config_byte(tp->pci_dev, my_cmd.offset, ++ my_cmd.data); ++ else if (my_cmd.len==2) ++ pci_write_config_word(tp->pci_dev, my_cmd.offset, ++ my_cmd.data); ++ else if (my_cmd.len==4) ++ pci_write_config_dword(tp->pci_dev, my_cmd.offset, ++ my_cmd.data); ++ else { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_READ_EEPROM: ++ my_cmd.data = rtl8125_eeprom_read_sc(tp, my_cmd.offset); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTLTOOL_WRITE_EEPROM: ++ rtl8125_eeprom_write_sc(tp, my_cmd.offset, my_cmd.data); ++ break; ++ ++ case RTL_READ_OOB_MAC: ++ rtl8125_oob_mutex_lock(tp); ++ my_cmd.data = rtl8125_ocp_read(tp, my_cmd.offset, 4); ++ rtl8125_oob_mutex_unlock(tp); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTL_WRITE_OOB_MAC: ++ if (my_cmd.len == 0 || my_cmd.len > 4) ++ return -EOPNOTSUPP; ++ ++ rtl8125_oob_mutex_lock(tp); ++ rtl8125_ocp_write(tp, my_cmd.offset, my_cmd.len, my_cmd.data); ++ rtl8125_oob_mutex_unlock(tp); ++ break; ++ ++ case RTL_ENABLE_PCI_DIAG: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ tp->rtk_enable_diag = 1; ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ dprintk("enable rtk diag\n"); ++ break; ++ ++ case RTL_DISABLE_PCI_DIAG: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ tp->rtk_enable_diag = 0; ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ ++ dprintk("disable rtk diag\n"); ++ break; ++ ++ case RTL_READ_MAC_OCP: ++ if (my_cmd.offset % 2) ++ return -EOPNOTSUPP; ++ ++ my_cmd.data = rtl8125_mac_ocp_read(tp, my_cmd.offset); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ break; ++ ++ case RTL_WRITE_MAC_OCP: ++ if ((my_cmd.offset % 2) || (my_cmd.len != 2)) ++ return -EOPNOTSUPP; ++ ++ rtl8125_mac_ocp_write(tp, my_cmd.offset, (u16)my_cmd.data); ++ break; ++ ++ case RTL_DIRECT_READ_PHY_OCP: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ my_cmd.data = rtl8125_mdio_prot_direct_read_phy_ocp(tp, my_cmd.offset); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTL_DIRECT_WRITE_PHY_OCP: ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_mdio_prot_direct_write_phy_ocp(tp, my_cmd.offset, my_cmd.data); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ break; ++ ++#ifdef ENABLE_FIBER_SUPPORT ++ case RTL_READ_FIBER_PHY: ++ if (!HW_FIBER_STATUS_CONNECTED(tp)) { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ my_cmd.data = rtl8125_fiber_mdio_read(tp, my_cmd.offset); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ if (copy_to_user(ifr->ifr_data, &my_cmd, sizeof(my_cmd))) { ++ ret = -EFAULT; ++ break; ++ } ++ ++ break; ++ ++ case RTL_WRITE_FIBER_PHY: ++ if (!HW_FIBER_STATUS_CONNECTED(tp)) { ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ r8125_spin_lock(&tp->phy_lock, flags); ++ rtl8125_fiber_mdio_write(tp, my_cmd.offset, my_cmd.data); ++ r8125_spin_unlock(&tp->phy_lock, flags); ++ break; ++#endif /* ENABLE_FIBER_SUPPORT */ ++ ++ default: ++ ret = -EOPNOTSUPP; ++ break; ++ } ++ ++ return ret; ++} +diff --git a/drivers/net/ethernet/realtek/rtltool.h b/drivers/net/ethernet/realtek/rtltool.h +new file mode 100755 +index 
000000000000..940be4fe7606 +--- /dev/null ++++ b/drivers/net/ethernet/realtek/rtltool.h +@@ -0,0 +1,89 @@ ++/* SPDX-License-Identifier: GPL-2.0-only */ ++/* ++################################################################################ ++# ++# r8125 is the Linux device driver released for Realtek 2.5 Gigabit Ethernet ++# controllers with PCI-Express interface. ++# ++# Copyright(c) 2025 Realtek Semiconductor Corp. All rights reserved. ++# ++# This program is free software; you can redistribute it and/or modify it ++# under the terms of the GNU General Public License as published by the Free ++# Software Foundation; either version 2 of the License, or (at your option) ++# any later version. ++# ++# This program is distributed in the hope that it will be useful, but WITHOUT ++# ANY WARRANTY; without even the implied warranty of MERCHANTABILITY or ++# FITNESS FOR A PARTICULAR PURPOSE. See the GNU General Public License for ++# more details. ++# ++# You should have received a copy of the GNU General Public License along with ++# this program; if not, see . ++# ++# Author: ++# Realtek NIC software team ++# No. 2, Innovation Road II, Hsinchu Science Park, Hsinchu 300, Taiwan ++# ++################################################################################ ++*/ ++ ++/************************************************************************************ ++ * This product is covered by one or more of the following patents: ++ * US6,570,884, US6,115,776, and US6,327,625. ++ ***********************************************************************************/ ++ ++#ifndef _LINUX_RTLTOOL_H ++#define _LINUX_RTLTOOL_H ++ ++#define SIOCRTLTOOL SIOCDEVPRIVATE+1 ++ ++enum rtl_cmd { ++ RTLTOOL_READ_MAC=0, ++ RTLTOOL_WRITE_MAC, ++ RTLTOOL_READ_PHY, ++ RTLTOOL_WRITE_PHY, ++ RTLTOOL_READ_EPHY, ++ RTLTOOL_WRITE_EPHY, ++ RTLTOOL_READ_ERI, ++ RTLTOOL_WRITE_ERI, ++ RTLTOOL_READ_PCI, ++ RTLTOOL_WRITE_PCI, ++ RTLTOOL_READ_EEPROM, ++ RTLTOOL_WRITE_EEPROM, ++ ++ RTL_READ_OOB_MAC, ++ RTL_WRITE_OOB_MAC, ++ ++ RTL_ENABLE_PCI_DIAG, ++ RTL_DISABLE_PCI_DIAG, ++ ++ RTL_READ_MAC_OCP, ++ RTL_WRITE_MAC_OCP, ++ ++ RTL_DIRECT_READ_PHY_OCP, ++ RTL_DIRECT_WRITE_PHY_OCP, ++ ++ RTL_READ_FIBER_PHY, ++ RTL_WRITE_FIBER_PHY, ++ ++ RTLTOOL_INVALID ++}; ++ ++struct rtltool_cmd { ++ __u32 cmd; ++ __u32 offset; ++ __u32 len; ++ __u32 data; ++}; ++ ++enum mode_access { ++ MODE_NONE=0, ++ MODE_READ, ++ MODE_WRITE ++}; ++ ++#ifdef __KERNEL__ ++int rtl8125_tool_ioctl(struct rtl8125_private *tp, struct ifreq *ifr); ++#endif ++ ++#endif /* _LINUX_RTLTOOL_H */ +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch b/sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch new file mode 100644 index 0000000..2a11388 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0007-vesa-dsc-bpp.patch @@ -0,0 +1,392 @@ +From 6f7b751921f791358d7c89c6a0ffe66914ae8d0d Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:09:41 +0100 +Subject: [PATCH 7/8] vesa-dsc-bpp + +Signed-off-by: Peter Jung +--- + .../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 16 +++ + .../drm/amd/display/dc/dml/dsc/qp_tables.h | 4 +- + .../drm/amd/display/dc/dml/dsc/rc_calc_fpu.c | 2 +- + drivers/gpu/drm/drm_displayid_internal.h | 11 ++ + drivers/gpu/drm/drm_edid.c | 102 +++++++++++------- + include/drm/drm_connector.h | 6 ++ + include/drm/drm_modes.h | 10 ++ + 7 files changed, 112 insertions(+), 39 deletions(-) + +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index 
bc9aca604aa0..47583196cfa8 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -6779,6 +6779,11 @@ static void fill_stream_properties_from_drm_display_mode( + + stream->output_color_space = get_output_color_space(timing_out, connector_state); + stream->content_type = get_output_content_type(connector_state); ++ ++ /* DisplayID Type VII pass-through timings. */ ++ if (mode_in->dsc_passthrough_timings_support && info->dp_dsc_bpp_x16 != 0) { ++ stream->timing.dsc_fixed_bits_per_pixel_x16 = info->dp_dsc_bpp_x16; ++ } + } + + static void fill_audio_info(struct audio_info *audio_info, +@@ -7237,6 +7242,7 @@ create_stream_for_sink(struct drm_connector *connector, + struct drm_display_mode mode; + struct drm_display_mode saved_mode; + struct drm_display_mode *freesync_mode = NULL; ++ struct drm_display_mode *dsc_passthru_mode = NULL; + bool native_mode_found = false; + bool recalculate_timing = false; + bool scale = dm_state->scaling != RMX_OFF; +@@ -7328,6 +7334,16 @@ create_stream_for_sink(struct drm_connector *connector, + } + } + ++ list_for_each_entry(dsc_passthru_mode, &connector->modes, head) { ++ if (dsc_passthru_mode->hdisplay == mode.hdisplay && ++ dsc_passthru_mode->vdisplay == mode.vdisplay && ++ drm_mode_vrefresh(dsc_passthru_mode) == mode_refresh) { ++ mode.dsc_passthrough_timings_support = ++ dsc_passthru_mode->dsc_passthrough_timings_support; ++ break; ++ } ++ } ++ + if (recalculate_timing) + drm_mode_set_crtcinfo(&saved_mode, 0); + +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h b/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h +index dcff0dd2b6a1..622abb69ea00 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h ++++ b/drivers/gpu/drm/amd/display/dc/dml/dsc/qp_tables.h +@@ -63,7 +63,7 @@ static const qp_table qp_table_444_8bpc_max = { + { 6.5, { 4, 6, 7, 8, 8, 8, 9, 10, 11, 11, 12, 12, 12, 13, 15} }, + { 7, { 4, 5, 7, 7, 8, 8, 8, 9, 10, 11, 11, 12, 12, 13, 14} }, + { 7.5, { 4, 5, 6, 7, 7, 8, 8, 9, 10, 10, 11, 11, 12, 13, 14} }, +- { 8, { 4, 4, 5, 6, 7, 7, 7, 8, 9, 10, 10, 11, 11, 12, 13} }, ++ { 8, { 4, 4, 5, 6, 7, 7, 7, 8, 9, 10, 11, 12, 13, 13, 15} }, + { 8.5, { 4, 4, 5, 6, 7, 7, 7, 8, 9, 10, 10, 11, 11, 12, 13} }, + { 9, { 3, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 10, 11, 11, 13} }, + { 9.5, { 3, 4, 5, 6, 7, 7, 7, 8, 9, 9, 10, 10, 11, 11, 13} }, +@@ -211,7 +211,7 @@ static const qp_table qp_table_444_8bpc_min = { + { 6.5, { 0, 1, 2, 3, 4, 4, 5, 5, 5, 5, 6, 6, 6, 9, 14} }, + { 7, { 0, 0, 2, 2, 4, 4, 4, 4, 4, 5, 5, 6, 6, 9, 13} }, + { 7.5, { 0, 0, 2, 2, 3, 4, 4, 4, 4, 4, 5, 5, 6, 9, 13} }, +- { 8, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 4, 5, 5, 5, 8, 12} }, ++ { 8, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7, 13} }, + { 8.5, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 4, 5, 5, 5, 8, 12} }, + { 9, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7, 12} }, + { 9.5, { 0, 0, 1, 1, 3, 3, 3, 3, 3, 3, 5, 5, 5, 7, 12} }, +diff --git a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c +index ef75eb7d5adc..8804419871d0 100644 +--- a/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c ++++ b/drivers/gpu/drm/amd/display/dc/dml/dsc/rc_calc_fpu.c +@@ -123,7 +123,7 @@ static void get_ofs_set(qp_set ofs, enum colour_mode mode, float bpp) + *p++ = (bpp <= 12) ? (-8) : ((bpp >= 15) ? (-6) : (-8 + dsc_roundf((bpp - 12) * (2 / 3.0)))); + *p++ = (bpp <= 12) ? (-10) : ((bpp >= 15) ? (-8) : (-10 + dsc_roundf((bpp - 12) * (2 / 3.0)))); + *p++ = -10; +- *p++ = (bpp <= 6) ? 
(-12) : ((bpp >= 8) ? (-10) : (-12 + dsc_roundf((bpp - 6) * (2 / 2.0)))); ++ *p++ = (bpp <= 6) ? (-12) : ((bpp >= 8) ? (-12) : (-12 + dsc_roundf((bpp - 6) * (2 / 2.0)))); + *p++ = -12; + *p++ = -12; + *p++ = -12; +diff --git a/drivers/gpu/drm/drm_displayid_internal.h b/drivers/gpu/drm/drm_displayid_internal.h +index 5b1b32f73516..8f1a2f33ca1a 100644 +--- a/drivers/gpu/drm/drm_displayid_internal.h ++++ b/drivers/gpu/drm/drm_displayid_internal.h +@@ -97,6 +97,7 @@ struct displayid_header { + u8 ext_count; + } __packed; + ++#define DISPLAYID_BLOCK_REV GENMASK(2, 0) + struct displayid_block { + u8 tag; + u8 rev; +@@ -125,6 +126,7 @@ struct displayid_detailed_timings_1 { + __le16 vsw; + } __packed; + ++#define DISPLAYID_BLOCK_PASSTHROUGH_TIMINGS_SUPPORT BIT(3) + struct displayid_detailed_timing_block { + struct displayid_block base; + struct displayid_detailed_timings_1 timings[]; +@@ -137,19 +139,28 @@ struct displayid_formula_timings_9 { + u8 vrefresh; + } __packed; + ++#define DISPLAYID_BLOCK_DESCRIPTOR_PAYLOAD_BYTES GENMASK(6, 4) + struct displayid_formula_timing_block { + struct displayid_block base; + struct displayid_formula_timings_9 timings[]; + } __packed; + ++#define DISPLAYID_VESA_DP_TYPE GENMASK(2, 0) + #define DISPLAYID_VESA_MSO_OVERLAP GENMASK(3, 0) + #define DISPLAYID_VESA_MSO_MODE GENMASK(6, 5) ++#define DISPLAYID_VESA_DSC_BPP_INT GENMASK(5, 0) ++#define DISPLAYID_VESA_DSC_BPP_FRACT GENMASK(3, 0) ++ ++#define DISPLAYID_VESA_DP_TYPE_EDP 0 ++#define DISPLAYID_VESA_DP_TYPE_DP 1 + + struct displayid_vesa_vendor_specific_block { + struct displayid_block base; + u8 oui[3]; + u8 data_structure_type; + u8 mso; ++ u8 dsc_bpp_int; ++ u8 dsc_bpp_fract; + } __packed; + + /* +diff --git a/drivers/gpu/drm/drm_edid.c b/drivers/gpu/drm/drm_edid.c +index 056eff8cbd1a..26d53a548a27 100644 +--- a/drivers/gpu/drm/drm_edid.c ++++ b/drivers/gpu/drm/drm_edid.c +@@ -45,6 +45,7 @@ + #include + #include + #include ++#include + #include + + #include "drm_crtc_internal.h" +@@ -6566,12 +6567,13 @@ static void drm_get_monitor_range(struct drm_connector *connector, + info->monitor_range.min_vfreq, info->monitor_range.max_vfreq); + } + +-static void drm_parse_vesa_mso_data(struct drm_connector *connector, +- const struct displayid_block *block) ++static void drm_parse_vesa_specific_block(struct drm_connector *connector, ++ const struct displayid_block *block) + { + struct displayid_vesa_vendor_specific_block *vesa = + (struct displayid_vesa_vendor_specific_block *)block; + struct drm_display_info *info = &connector->display_info; ++ int dp_type; + + if (block->num_bytes < 3) { + drm_dbg_kms(connector->dev, +@@ -6583,51 +6585,73 @@ static void drm_parse_vesa_mso_data(struct drm_connector *connector, + if (oui(vesa->oui[0], vesa->oui[1], vesa->oui[2]) != VESA_IEEE_OUI) + return; + +- if (sizeof(*vesa) != sizeof(*block) + block->num_bytes) { ++ if (block->num_bytes < 5) { + drm_dbg_kms(connector->dev, + "[CONNECTOR:%d:%s] Unexpected VESA vendor block size\n", + connector->base.id, connector->name); + return; + } + +- switch (FIELD_GET(DISPLAYID_VESA_MSO_MODE, vesa->mso)) { +- default: +- drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved MSO mode value\n", ++ dp_type = FIELD_GET(DISPLAYID_VESA_DP_TYPE, vesa->data_structure_type); ++ if (dp_type > 1) { ++ drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved dp type value\n", + connector->base.id, connector->name); +- fallthrough; +- case 0: +- info->mso_stream_count = 0; +- break; +- case 1: +- info->mso_stream_count = 2; /* 2 or 4 links */ +- break; +- 
case 2: +- info->mso_stream_count = 4; /* 4 links */ +- break; + } + +- if (!info->mso_stream_count) { ++ /* MSO is only supported for eDP */ ++ if (dp_type == DISPLAYID_VESA_DP_TYPE_EDP) { ++ switch (FIELD_GET(DISPLAYID_VESA_MSO_MODE, vesa->mso)) { ++ default: ++ drm_dbg_kms(connector->dev, "[CONNECTOR:%d:%s] Reserved MSO mode value\n", ++ connector->base.id, connector->name); ++ fallthrough; ++ case 0: ++ info->mso_stream_count = 0; ++ break; ++ case 1: ++ info->mso_stream_count = 2; /* 2 or 4 links */ ++ break; ++ case 2: ++ info->mso_stream_count = 4; /* 4 links */ ++ break; ++ } ++ } ++ ++ if (info->mso_stream_count) { ++ info->mso_pixel_overlap = FIELD_GET(DISPLAYID_VESA_MSO_OVERLAP, vesa->mso); ++ if (info->mso_pixel_overlap > 8) { ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] Reserved MSO pixel overlap value %u\n", ++ connector->base.id, connector->name, ++ info->mso_pixel_overlap); ++ info->mso_pixel_overlap = 8; ++ } ++ drm_dbg_kms(connector->dev, ++ "[CONNECTOR:%d:%s] MSO stream count %u, pixel overlap %u\n", ++ connector->base.id, connector->name, ++ info->mso_stream_count, info->mso_pixel_overlap); ++ } else { + info->mso_pixel_overlap = 0; ++ } ++ ++ if (block->num_bytes < 7) { ++ /* DSC bpp is optional */ + return; + } + +- info->mso_pixel_overlap = FIELD_GET(DISPLAYID_VESA_MSO_OVERLAP, vesa->mso); +- if (info->mso_pixel_overlap > 8) { ++ info->dp_dsc_bpp_x16 = FIELD_GET(DISPLAYID_VESA_DSC_BPP_INT, vesa->dsc_bpp_int) << 4 | ++ FIELD_GET(DISPLAYID_VESA_DSC_BPP_FRACT, vesa->dsc_bpp_fract); ++ ++ if (info->dp_dsc_bpp_x16 > 0) { + drm_dbg_kms(connector->dev, +- "[CONNECTOR:%d:%s] Reserved MSO pixel overlap value %u\n", ++ "[CONNECTOR:%d:%s] DSC bits per pixel " FXP_Q4_FMT "\n", + connector->base.id, connector->name, +- info->mso_pixel_overlap); +- info->mso_pixel_overlap = 8; ++ FXP_Q4_ARGS(info->dp_dsc_bpp_x16)); + } +- +- drm_dbg_kms(connector->dev, +- "[CONNECTOR:%d:%s] MSO stream count %u, pixel overlap %u\n", +- connector->base.id, connector->name, +- info->mso_stream_count, info->mso_pixel_overlap); + } + +-static void drm_update_mso(struct drm_connector *connector, +- const struct drm_edid *drm_edid) ++static void drm_update_vesa_specific_block(struct drm_connector *connector, ++ const struct drm_edid *drm_edid) + { + const struct displayid_block *block; + struct displayid_iter iter; +@@ -6635,7 +6659,7 @@ static void drm_update_mso(struct drm_connector *connector, + displayid_iter_edid_begin(drm_edid, &iter); + displayid_iter_for_each(block, &iter) { + if (block->tag == DATA_BLOCK_2_VENDOR_SPECIFIC) +- drm_parse_vesa_mso_data(connector, block); ++ drm_parse_vesa_specific_block(connector, block); + } + displayid_iter_end(&iter); + } +@@ -6672,6 +6696,7 @@ static void drm_reset_display_info(struct drm_connector *connector) + info->mso_stream_count = 0; + info->mso_pixel_overlap = 0; + info->max_dsc_bpp = 0; ++ info->dp_dsc_bpp_x16 = 0; + + kfree(info->vics); + info->vics = NULL; +@@ -6795,7 +6820,7 @@ static void update_display_info(struct drm_connector *connector, + if (edid->features & DRM_EDID_FEATURE_RGB_YCRCB422) + info->color_formats |= DRM_COLOR_FORMAT_YCBCR422; + +- drm_update_mso(connector, drm_edid); ++ drm_update_vesa_specific_block(connector, drm_edid); + + out: + if (drm_edid_has_internal_quirk(connector, EDID_QUIRK_NON_DESKTOP)) { +@@ -6825,8 +6850,8 @@ static void update_display_info(struct drm_connector *connector, + } + + static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *dev, +- const struct displayid_detailed_timings_1 
*timings, +- bool type_7) ++ const struct displayid_block *block, ++ const struct displayid_detailed_timings_1 *timings) + { + struct drm_display_mode *mode; + unsigned int pixel_clock = (timings->pixel_clock[0] | +@@ -6842,11 +6867,16 @@ static struct drm_display_mode *drm_mode_displayid_detailed(struct drm_device *d + unsigned int vsync_width = le16_to_cpu(timings->vsw) + 1; + bool hsync_positive = le16_to_cpu(timings->hsync) & (1 << 15); + bool vsync_positive = le16_to_cpu(timings->vsync) & (1 << 15); ++ bool type_7 = block->tag == DATA_BLOCK_2_TYPE_7_DETAILED_TIMING; + + mode = drm_mode_create(dev); + if (!mode) + return NULL; + ++ if (type_7 && FIELD_GET(DISPLAYID_BLOCK_REV, block->rev) >= 1) ++ mode->dsc_passthrough_timings_support = ++ block->rev & DISPLAYID_BLOCK_PASSTHROUGH_TIMINGS_SUPPORT; ++ + /* resolution is kHz for type VII, and 10 kHz for type I */ + mode->clock = type_7 ? pixel_clock : pixel_clock * 10; + mode->hdisplay = hactive; +@@ -6879,7 +6909,6 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, + int num_timings; + struct drm_display_mode *newmode; + int num_modes = 0; +- bool type_7 = block->tag == DATA_BLOCK_2_TYPE_7_DETAILED_TIMING; + /* blocks must be multiple of 20 bytes length */ + if (block->num_bytes % 20) + return 0; +@@ -6888,7 +6917,7 @@ static int add_displayid_detailed_1_modes(struct drm_connector *connector, + for (i = 0; i < num_timings; i++) { + struct displayid_detailed_timings_1 *timings = &det->timings[i]; + +- newmode = drm_mode_displayid_detailed(connector->dev, timings, type_7); ++ newmode = drm_mode_displayid_detailed(connector->dev, block, timings); + if (!newmode) + continue; + +@@ -6935,7 +6964,8 @@ static int add_displayid_formula_modes(struct drm_connector *connector, + struct drm_display_mode *newmode; + int num_modes = 0; + bool type_10 = block->tag == DATA_BLOCK_2_TYPE_10_FORMULA_TIMING; +- int timing_size = 6 + ((formula_block->base.rev & 0x70) >> 4); ++ int timing_size = 6 + ++ FIELD_GET(DISPLAYID_BLOCK_DESCRIPTOR_PAYLOAD_BYTES, formula_block->base.rev); + + /* extended blocks are not supported yet */ + if (timing_size != 6) +diff --git a/include/drm/drm_connector.h b/include/drm/drm_connector.h +index fa4abfe8971e..a0f86e48192e 100644 +--- a/include/drm/drm_connector.h ++++ b/include/drm/drm_connector.h +@@ -890,6 +890,12 @@ struct drm_display_info { + */ + u32 max_dsc_bpp; + ++ /** ++ * @dp_dsc_bpp: DP Display-Stream-Compression (DSC) timing's target ++ * DSC bits per pixel in 6.4 fixed point format. 0 means undefined. ++ */ ++ u16 dp_dsc_bpp_x16; ++ + /** + * @vics: Array of vics_len VICs. Internal to EDID parsing. + */ +diff --git a/include/drm/drm_modes.h b/include/drm/drm_modes.h +index b9bb92e4b029..312e5c03af9a 100644 +--- a/include/drm/drm_modes.h ++++ b/include/drm/drm_modes.h +@@ -417,6 +417,16 @@ struct drm_display_mode { + */ + enum hdmi_picture_aspect picture_aspect_ratio; + ++ /** ++ * @dsc_passthrough_timing_support: ++ * ++ * Indicates whether this mode timing descriptor is supported ++ * with specific target DSC bits per pixel only. ++ * ++ * VESA vendor-specific data block shall exist with the relevant ++ * DSC bits per pixel declaration when this flag is set to true. 
++ */ ++ bool dsc_passthrough_timings_support; + }; + + /** +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.19/0008-vmscape.patch b/sys-kernel/gentoo-sources-6.19/0008-vmscape.patch new file mode 100644 index 0000000..05282b3 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.19/0008-vmscape.patch @@ -0,0 +1,366 @@ +From 5692ec66ac9431cee8522a866cd4b80fdff4ca54 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 27 Feb 2026 09:09:52 +0100 +Subject: [PATCH 8/8] vmscape + +Signed-off-by: Peter Jung +--- + Documentation/admin-guide/hw-vuln/vmscape.rst | 8 +++ + .../admin-guide/kernel-parameters.txt | 4 +- + arch/x86/Kconfig | 1 + + arch/x86/entry/entry_64.S | 13 +++- + arch/x86/include/asm/cpufeatures.h | 2 +- + arch/x86/include/asm/entry-common.h | 9 ++- + arch/x86/include/asm/nospec-branch.h | 11 +++- + arch/x86/kernel/cpu/bugs.c | 65 +++++++++++++++---- + arch/x86/kvm/x86.c | 4 +- + arch/x86/net/bpf_jit_comp.c | 2 + + 10 files changed, 90 insertions(+), 29 deletions(-) + +diff --git a/Documentation/admin-guide/hw-vuln/vmscape.rst b/Documentation/admin-guide/hw-vuln/vmscape.rst +index d9b9a2b6c114..580f288ae8bf 100644 +--- a/Documentation/admin-guide/hw-vuln/vmscape.rst ++++ b/Documentation/admin-guide/hw-vuln/vmscape.rst +@@ -86,6 +86,10 @@ The possible values in this file are: + run a potentially malicious guest and issues an IBPB before the first + exit to userspace after VM-exit. + ++ * 'Mitigation: Clear BHB before exit to userspace': ++ ++ As above, conditional BHB clearing mitigation is enabled. ++ + * 'Mitigation: IBPB on VMEXIT': + + IBPB is issued on every VM-exit. This occurs when other mitigations like +@@ -108,3 +112,7 @@ The mitigation can be controlled via the ``vmscape=`` command line parameter: + + Force vulnerability detection and mitigation even on processors that are + not known to be affected. ++ ++ * ``vmscape=on``: ++ ++ Choose the mitigation based on the VMSCAPE variant the CPU is affected by. +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 28f14d664aa3..a3e9684f63c0 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -8281,9 +8281,11 @@ Kernel parameters + + off - disable the mitigation + ibpb - use Indirect Branch Prediction Barrier +- (IBPB) mitigation (default) ++ (IBPB) mitigation + force - force vulnerability detection even on + unaffected processors ++ on - (default) selects IBPB or BHB clear ++ mitigation based on CPU + + vsyscall= [X86-64,EARLY] + Controls the behavior of vsyscalls (i.e. calls to +diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig +index 80527299f859..e03e35a2a6ce 100644 +--- a/arch/x86/Kconfig ++++ b/arch/x86/Kconfig +@@ -2710,6 +2710,7 @@ config MITIGATION_TSA + config MITIGATION_VMSCAPE + bool "Mitigate VMSCAPE" + depends on KVM ++ depends on HAVE_STATIC_CALL + default y + help + Enable mitigation for VMSCAPE attacks. VMSCAPE is a hardware security +diff --git a/arch/x86/entry/entry_64.S b/arch/x86/entry/entry_64.S +index f9983a1907bf..6d93602dd309 100644 +--- a/arch/x86/entry/entry_64.S ++++ b/arch/x86/entry/entry_64.S +@@ -1529,12 +1529,19 @@ SYM_CODE_END(rewind_stack_and_make_dead) + * refactored in the future if needed. The .skips are for safety, to ensure + * that all RETs are in the second half of a cacheline to mitigate Indirect + * Target Selection, rather than taking the slowpath via its_return_thunk. 
++ * ++ * Note, callers should use a speculation barrier like LFENCE immediately after ++ * a call to this function to ensure BHB is cleared before indirect branches. + */ + SYM_FUNC_START(clear_bhb_loop) + ANNOTATE_NOENDBR + push %rbp + mov %rsp, %rbp +- movl $5, %ecx ++ ++ /* loop count differs based on BHI_CTRL, see Intel's BHI guidance */ ++ ALTERNATIVE "movl $5, %ecx; movl $5, %edx", \ ++ "movl $12, %ecx; movl $7, %edx", X86_FEATURE_BHI_CTRL ++ + ANNOTATE_INTRA_FUNCTION_CALL + call 1f + jmp 5f +@@ -1555,7 +1562,7 @@ SYM_FUNC_START(clear_bhb_loop) + * but some Clang versions (e.g. 18) don't like this. + */ + .skip 32 - 18, 0xcc +-2: movl $5, %eax ++2: movl %edx, %eax + 3: jmp 4f + nop + 4: sub $1, %eax +@@ -1563,7 +1570,7 @@ SYM_FUNC_START(clear_bhb_loop) + sub $1, %ecx + jnz 1b + .Lret2: RET +-5: lfence ++5: + pop %rbp + RET + SYM_FUNC_END(clear_bhb_loop) +diff --git a/arch/x86/include/asm/cpufeatures.h b/arch/x86/include/asm/cpufeatures.h +index c3b53beb1300..aa39430476d6 100644 +--- a/arch/x86/include/asm/cpufeatures.h ++++ b/arch/x86/include/asm/cpufeatures.h +@@ -501,7 +501,7 @@ + #define X86_FEATURE_TSA_SQ_NO (21*32+11) /* AMD CPU not vulnerable to TSA-SQ */ + #define X86_FEATURE_TSA_L1_NO (21*32+12) /* AMD CPU not vulnerable to TSA-L1 */ + #define X86_FEATURE_CLEAR_CPU_BUF_VM (21*32+13) /* Clear CPU buffers using VERW before VMRUN */ +-#define X86_FEATURE_IBPB_EXIT_TO_USER (21*32+14) /* Use IBPB on exit-to-userspace, see VMSCAPE bug */ ++/* Free */ + #define X86_FEATURE_ABMC (21*32+15) /* Assignable Bandwidth Monitoring Counters */ + #define X86_FEATURE_MSR_IMM (21*32+16) /* MSR immediate form instructions */ + #define X86_FEATURE_SGX_EUPDATESVN (21*32+17) /* Support for ENCLS[EUPDATESVN] instruction */ +diff --git a/arch/x86/include/asm/entry-common.h b/arch/x86/include/asm/entry-common.h +index ce3eb6d5fdf9..783e7cb50cae 100644 +--- a/arch/x86/include/asm/entry-common.h ++++ b/arch/x86/include/asm/entry-common.h +@@ -4,6 +4,7 @@ + + #include + #include ++#include + + #include + #include +@@ -94,11 +95,9 @@ static inline void arch_exit_to_user_mode_prepare(struct pt_regs *regs, + */ + choose_random_kstack_offset(rdtsc()); + +- /* Avoid unnecessary reads of 'x86_ibpb_exit_to_user' */ +- if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER) && +- this_cpu_read(x86_ibpb_exit_to_user)) { +- indirect_branch_prediction_barrier(); +- this_cpu_write(x86_ibpb_exit_to_user, false); ++ if (unlikely(this_cpu_read(x86_predictor_flush_exit_to_user))) { ++ static_call_cond(vmscape_predictor_flush)(); ++ this_cpu_write(x86_predictor_flush_exit_to_user, false); + } + } + #define arch_exit_to_user_mode_prepare arch_exit_to_user_mode_prepare +diff --git a/arch/x86/include/asm/nospec-branch.h b/arch/x86/include/asm/nospec-branch.h +index 4f4b5e8a1574..80efdb6645ba 100644 +--- a/arch/x86/include/asm/nospec-branch.h ++++ b/arch/x86/include/asm/nospec-branch.h +@@ -331,11 +331,11 @@ + + #ifdef CONFIG_X86_64 + .macro CLEAR_BRANCH_HISTORY +- ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_LOOP ++ ALTERNATIVE "", "call clear_bhb_loop; lfence", X86_FEATURE_CLEAR_BHB_LOOP + .endm + + .macro CLEAR_BRANCH_HISTORY_VMEXIT +- ALTERNATIVE "", "call clear_bhb_loop", X86_FEATURE_CLEAR_BHB_VMEXIT ++ ALTERNATIVE "", "call clear_bhb_loop; lfence", X86_FEATURE_CLEAR_BHB_VMEXIT + .endm + #else + #define CLEAR_BRANCH_HISTORY +@@ -390,6 +390,8 @@ extern void write_ibpb(void); + + #ifdef CONFIG_X86_64 + extern void clear_bhb_loop(void); ++#else ++static inline void clear_bhb_loop(void) {} + #endif + + 
extern void (*x86_return_thunk)(void); +@@ -533,7 +535,7 @@ void alternative_msr_write(unsigned int msr, u64 val, unsigned int feature) + : "memory"); + } + +-DECLARE_PER_CPU(bool, x86_ibpb_exit_to_user); ++DECLARE_PER_CPU(bool, x86_predictor_flush_exit_to_user); + + static inline void indirect_branch_prediction_barrier(void) + { +@@ -542,6 +544,9 @@ static inline void indirect_branch_prediction_barrier(void) + :: "rax", "rcx", "rdx", "memory"); + } + ++#include ++DECLARE_STATIC_CALL(vmscape_predictor_flush, write_ibpb); ++ + /* The Intel SPEC CTRL MSR base value cache */ + extern u64 x86_spec_ctrl_base; + DECLARE_PER_CPU(u64, x86_spec_ctrl_current); +diff --git a/arch/x86/kernel/cpu/bugs.c b/arch/x86/kernel/cpu/bugs.c +index d0a2847a4bb0..2818bfcb9f9f 100644 +--- a/arch/x86/kernel/cpu/bugs.c ++++ b/arch/x86/kernel/cpu/bugs.c +@@ -62,12 +62,11 @@ DEFINE_PER_CPU(u64, x86_spec_ctrl_current); + EXPORT_PER_CPU_SYMBOL_GPL(x86_spec_ctrl_current); + + /* +- * Set when the CPU has run a potentially malicious guest. An IBPB will +- * be needed to before running userspace. That IBPB will flush the branch +- * predictor content. ++ * Set when the CPU has run a potentially malicious guest. Indicates that a ++ * branch predictor flush is needed before running userspace. + */ +-DEFINE_PER_CPU(bool, x86_ibpb_exit_to_user); +-EXPORT_PER_CPU_SYMBOL_GPL(x86_ibpb_exit_to_user); ++DEFINE_PER_CPU(bool, x86_predictor_flush_exit_to_user); ++EXPORT_PER_CPU_SYMBOL_GPL(x86_predictor_flush_exit_to_user); + + u64 x86_pred_cmd __ro_after_init = PRED_CMD_IBPB; + +@@ -230,6 +229,9 @@ static void x86_amd_ssb_disable(void) + wrmsrq(MSR_AMD64_LS_CFG, msrval); + } + ++DEFINE_STATIC_CALL_NULL(vmscape_predictor_flush, write_ibpb); ++EXPORT_STATIC_CALL_GPL(vmscape_predictor_flush); ++ + #undef pr_fmt + #define pr_fmt(fmt) "MDS: " fmt + +@@ -3049,15 +3051,19 @@ static void __init srso_apply_mitigation(void) + enum vmscape_mitigations { + VMSCAPE_MITIGATION_NONE, + VMSCAPE_MITIGATION_AUTO, ++ VMSCAPE_MITIGATION_ON, + VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER, + VMSCAPE_MITIGATION_IBPB_ON_VMEXIT, ++ VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER, + }; + + static const char * const vmscape_strings[] = { +- [VMSCAPE_MITIGATION_NONE] = "Vulnerable", ++ [VMSCAPE_MITIGATION_NONE] = "Vulnerable", + /* [VMSCAPE_MITIGATION_AUTO] */ +- [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", +- [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", ++ /* [VMSCAPE_MITIGATION_ON] */ ++ [VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER] = "Mitigation: IBPB before exit to userspace", ++ [VMSCAPE_MITIGATION_IBPB_ON_VMEXIT] = "Mitigation: IBPB on VMEXIT", ++ [VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER] = "Mitigation: Clear BHB before exit to userspace", + }; + + static enum vmscape_mitigations vmscape_mitigation __ro_after_init = +@@ -3074,7 +3080,9 @@ static int __init vmscape_parse_cmdline(char *str) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + } else if (!strcmp(str, "force")) { + setup_force_cpu_bug(X86_BUG_VMSCAPE); +- vmscape_mitigation = VMSCAPE_MITIGATION_AUTO; ++ vmscape_mitigation = VMSCAPE_MITIGATION_ON; ++ } else if (!strcmp(str, "on")) { ++ vmscape_mitigation = VMSCAPE_MITIGATION_ON; + } else { + pr_err("Ignoring unknown vmscape=%s option.\n", str); + } +@@ -3085,17 +3093,42 @@ early_param("vmscape", vmscape_parse_cmdline); + + static void __init vmscape_select_mitigation(void) + { +- if (!boot_cpu_has_bug(X86_BUG_VMSCAPE) || +- !boot_cpu_has(X86_FEATURE_IBPB)) { ++ if 
(!boot_cpu_has_bug(X86_BUG_VMSCAPE)) { + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; + return; + } + +- if (vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) { +- if (should_mitigate_vuln(X86_BUG_VMSCAPE)) ++ if ((vmscape_mitigation == VMSCAPE_MITIGATION_AUTO) && ++ !should_mitigate_vuln(X86_BUG_VMSCAPE)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ ++ switch (vmscape_mitigation) { ++ case VMSCAPE_MITIGATION_NONE: ++ break; ++ ++ case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: ++ if (!boot_cpu_has(X86_FEATURE_IBPB)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ break; ++ ++ case VMSCAPE_MITIGATION_AUTO: ++ case VMSCAPE_MITIGATION_ON: ++ /* ++ * CPUs with BHI_CTRL(ADL and newer) can avoid the IBPB and use BHB ++ * clear sequence. These CPUs are only vulnerable to the BHI variant ++ * of the VMSCAPE attack and does not require an IBPB flush. In ++ * 32-bit mode BHB clear sequence is not supported. ++ */ ++ if (boot_cpu_has(X86_FEATURE_BHI_CTRL) && IS_ENABLED(CONFIG_X86_64)) ++ vmscape_mitigation = VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER; ++ else if (boot_cpu_has(X86_FEATURE_IBPB)) + vmscape_mitigation = VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER; + else + vmscape_mitigation = VMSCAPE_MITIGATION_NONE; ++ break; ++ ++ default: ++ break; + } + } + +@@ -3114,7 +3147,9 @@ static void __init vmscape_update_mitigation(void) + static void __init vmscape_apply_mitigation(void) + { + if (vmscape_mitigation == VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER) +- setup_force_cpu_cap(X86_FEATURE_IBPB_EXIT_TO_USER); ++ static_call_update(vmscape_predictor_flush, write_ibpb); ++ else if (vmscape_mitigation == VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER) ++ static_call_update(vmscape_predictor_flush, clear_bhb_loop); + } + + #undef pr_fmt +@@ -3203,9 +3238,11 @@ void cpu_bugs_smt_update(void) + switch (vmscape_mitigation) { + case VMSCAPE_MITIGATION_NONE: + case VMSCAPE_MITIGATION_AUTO: ++ case VMSCAPE_MITIGATION_ON: + break; + case VMSCAPE_MITIGATION_IBPB_ON_VMEXIT: + case VMSCAPE_MITIGATION_IBPB_EXIT_TO_USER: ++ case VMSCAPE_MITIGATION_BHB_CLEAR_EXIT_TO_USER: + /* + * Hypervisors can be attacked across-threads, warn for SMT when + * STIBP is not already enabled system-wide. +diff --git a/arch/x86/kvm/x86.c b/arch/x86/kvm/x86.c +index 72d37c8930ad..5b4d44a6b702 100644 +--- a/arch/x86/kvm/x86.c ++++ b/arch/x86/kvm/x86.c +@@ -11437,8 +11437,8 @@ static int vcpu_enter_guest(struct kvm_vcpu *vcpu) + * set for the CPU that actually ran the guest, and not the CPU that it + * may migrate to. 
+ */ +- if (cpu_feature_enabled(X86_FEATURE_IBPB_EXIT_TO_USER)) +- this_cpu_write(x86_ibpb_exit_to_user, true); ++ if (static_call_query(vmscape_predictor_flush)) ++ this_cpu_write(x86_predictor_flush_exit_to_user, true); + + /* + * Consume any pending interrupts, including the possible source of +diff --git a/arch/x86/net/bpf_jit_comp.c b/arch/x86/net/bpf_jit_comp.c +index b0bac2a66eff..c31508be0d72 100644 +--- a/arch/x86/net/bpf_jit_comp.c ++++ b/arch/x86/net/bpf_jit_comp.c +@@ -1620,6 +1620,8 @@ static int emit_spectre_bhb_barrier(u8 **pprog, u8 *ip, + + if (emit_call(&prog, func, ip)) + return -EINVAL; ++ /* Don't speculate past this until BHB is cleared */ ++ EMIT_LFENCE(); + EMIT1(0x59); /* pop rcx */ + EMIT1(0x58); /* pop rax */ + } +-- +2.53.0 + diff --git a/sys-kernel/gentoo-sources-6.10.3/0100-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-6.19/0101-glitched-additional-timer-tick-frequencies.patch similarity index 100% rename from sys-kernel/gentoo-sources-6.10.3/0100-glitched-additional-timer-tick-frequencies.patch rename to sys-kernel/gentoo-sources-6.19/0101-glitched-additional-timer-tick-frequencies.patch diff --git a/sys-kernel/gentoo-sources-6.6/0001-bbr3.patch b/sys-kernel/gentoo-sources-6.6/0001-bbr3.patch new file mode 100644 index 0000000..7d79ef2 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.6/0001-bbr3.patch @@ -0,0 +1,3352 @@ +From 0588576f1ca7bc2757bb90e1fac439eccf10afc9 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Fri, 15 Mar 2024 20:30:45 +0100 +Subject: [PATCH 1/5] bbr3 + +Signed-off-by: Peter Jung +--- + include/linux/tcp.h | 4 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 72 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2231 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 1 + + 15 files changed, 1934 insertions(+), 551 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 3c5efeeb024f..a0d4afd221d8 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -257,7 +257,9 @@ struct tcp_sock { + u8 compressed_ack; + u8 dup_ack_counter:2, + tlp_retrans:1, /* TLP is a retransmission */ +- unused:5; ++ fast_ack_mode:2, /* which fast ack mode ? */ ++ tlp_orig_data_app_limited:1, /* app-limited before TLP rtx? 
*/ ++ unused:2; + u32 chrono_start; /* Start time in jiffies of a TCP chrono */ + u32 chrono_stat[3]; /* Time in jiffies for chrono_stat stats */ + u8 chrono_type:2, /* current chronograph type */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index fee1e5650551..1d069d636117 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -135,8 +135,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 71af24410443..9c92be8fe029 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -372,6 +372,8 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_QUEUE_CWR 2 + #define TCP_ECN_DEMAND_CWR 4 + #define TCP_ECN_SEEN 8 ++#define TCP_ECN_LOW 16 ++#define TCP_ECN_ECT_PERMANENT 32 + + enum tcp_tw_status { + TCP_TW_SUCCESS = 0, +@@ -724,6 +726,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + tcp_fast_path_on(tp); + } + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + u32 tcp_delack_max(const struct sock *sk); + + /* Compute the actual rto_min value */ +@@ -822,6 +833,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + static inline u32 tcp_skb_timestamp(const struct sk_buff *skb) + { + return tcp_ns_to_ts(skb->skb_mstamp_ns); +@@ -897,9 +913,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1003,6 +1024,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1025,7 +1047,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED 0x1 + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN 0x2 +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
*/ ++#define TCP_CONG_WANTS_CE_EVENTS 0x4 ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1045,10 +1071,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1059,7 +1088,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1083,8 +1114,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1150,6 +1184,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1169,6 +1211,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1181,6 +1224,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2212,7 +2270,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 50655de04c9b..82f8bd8f0d16 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index 51c13cf9c5ae..de8dcba26bec 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -506,9 +506,11 @@ enum { + #define RTAX_FEATURE_SACK (1 << 1) + #define RTAX_FEATURE_TIMESTAMP (1 << 2) + #define RTAX_FEATURE_ALLFRAG (1 << 3) ++#define RTAX_FEATURE_ECN_LOW (1 << 4) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | RTAX_FEATURE_SACK | \ +- RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG) ++ RTAX_FEATURE_TIMESTAMP | RTAX_FEATURE_ALLFRAG \ ++ | RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index 879eeb0a084b..77270053a5e3 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -170,6 +170,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_ECN 8 /* ECN was negociated at TCP session init */ + #define TCPI_OPT_ECN_SEEN 16 /* we received at least one packet with ECT */ + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ ++#define TCPI_OPT_ECN_LOW 64 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 2dfb12230f08..2e14db3bee70 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -668,15 +668,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 91c3d8264059..4a5e0abfe8c1 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3099,6 +3099,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -3790,6 +3791,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 146792cd26fe..f4f477a69917 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. + * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. 
bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? */ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? 
*/ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* v2: refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,121 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ return (tcp_sk(sk)->ecn_flags & TCP_ECN_OK) && ++ (tcp_sk(sk)->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +382,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +409,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +433,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, sk->sk_max_pacing_rate); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -278,7 +455,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + } + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); +- sk->sk_pacing_rate = bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain); ++ sk->sk_pacing_rate = ++ bbr_bw_to_pacing_rate(sk, bw, bbr_param(sk, startup_pacing_gain)); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -294,26 +472,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + sk->sk_pacing_rate = rate; + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = sk->sk_pacing_rate >> sk->sk_pacing_shift; ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return sk->sk_pacing_rate < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. */ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. 
+- */ +- bytes = min_t(unsigned long, +- sk->sk_pacing_rate >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -333,7 +533,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -344,6 +546,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -366,10 +578,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -386,23 +598,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. */ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. 
*/ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -457,10 +669,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -468,66 +680,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -536,74 +709,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -613,191 +738,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -811,7 +794,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -819,15 +802,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -861,49 +848,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -913,9 +857,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -941,23 +885,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -966,9 +922,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -989,18 +945,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1012,144 +970,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; + +- bbr_update_model(sk, rs); +- +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * which can be used in either v1 or v2, and the PROBE_UP phase of v2, which ++ * uses it to notice when loss/ECN rates suggest inflight is too high. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
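++ * (The downward adaptation is applied at most once per probe, in
++ * bbr_handle_inflight_too_high(); the upward adaptation is gradual, in
++ * bbr_probe_inflight_hi_upward() while in PROBE_UP.)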
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; ++ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. 
To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
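++ * For example, with the default 25% threshold a full_bw estimate of 1000
++ * needs a sample of at least 1250 (full_bw * 5/4 in BBR_UNIT fixed point)
++ * to reset the counter; otherwise full_bw_cnt advances once per round.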
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
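++ * (try_fast_path is cleared whenever the cycle phase changes, on exiting
++ * loss recovery, and on cwnd undo, so the fast path is only taken while
++ * the control state from the previous ACK is still valid.)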
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc void bbr_main(struct sock *sk, const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 0; ++ bbr->undo_inflight_lo = 
0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1160,10 +2361,11 @@ BTF_SET8_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + #endif + #endif +@@ -1198,5 +2400,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index 1b34050a7538..66d40449b3f4 100644 +--- a/net/ipv4/tcp_cong.c ++++ b/net/ipv4/tcp_cong.c +@@ -241,6 +241,7 
@@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index c2e4dac42453..62e765afcb2a 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -371,7 +371,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -382,7 +382,7 @@ static void __tcp_ecn_check_ce(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1096,7 +1096,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1477,6 +1482,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3732,7 +3748,8 @@ static void tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in draft-ietf-tcpm-rack + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3749,6 +3766,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3759,6 +3777,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
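++		 * Flag this in the rate sample so that congestion control
++		 * modules can treat the delivery signals from this ACK with
++		 * care (the sample may reflect the TLP probe itself).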
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3867,6 +3890,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -3941,7 +3965,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_rack_update_reo_wnd(sk, &rs); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -3965,6 +3989,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -3984,7 +4009,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5596,13 +5621,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 260bfb9ada38..0381cbdb9a2c 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -435,6 +435,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index 5631041ae12c..2125f3ab098e 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -332,10 +332,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -347,6 +346,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tp->ecn_flags = TCP_ECN_OK; + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -384,7 +386,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1546,7 +1549,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + int nlen; + u8 flags; +@@ -1621,6 +1624,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -1996,13 +2023,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2701,6 +2727,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2914,6 +2941,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index 64bcf384e9dd..e8b1adf17e3a 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -664,6 +664,7 @@ void tcp_write_timer_handler(struct sock *sk) + return; + } + ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.46.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.6/0001-bore.patch b/sys-kernel/gentoo-sources-6.6/0001-bore.patch new file mode 100644 index 0000000..23b02f0 --- /dev/null +++ b/sys-kernel/gentoo-sources-6.6/0001-bore.patch @@ -0,0 +1,825 @@ +From 90ecff92d4efa9ac452e3e199235f7b7d16a1d80 Mon Sep 17 00:00:00 2001 +From: Masahito S +Date: Thu, 1 Aug 2024 02:38:58 +0900 +Subject: [PATCH] linux6.6.30-bore5.1.8 + +--- + include/linux/sched.h | 10 ++ + init/Kconfig | 17 +++ + kernel/Kconfig.hz | 16 +++ + kernel/sched/core.c | 143 +++++++++++++++++++++ + kernel/sched/debug.c | 60 ++++++++- + kernel/sched/fair.c | 270 +++++++++++++++++++++++++++++++++++++++- + kernel/sched/features.h | 4 + + kernel/sched/sched.h | 7 ++ + 8 files changed, 524 insertions(+), 3 deletions(-) + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 77f01ac385..20fe8ee925 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -559,6 +559,16 @@ struct sched_entity { + u64 sum_exec_runtime; + u64 prev_sum_exec_runtime; + u64 vruntime; ++#ifdef CONFIG_SCHED_BORE ++ u64 burst_time; ++ u8 prev_burst_penalty; ++ u8 curr_burst_penalty; ++ u8 burst_penalty; ++ u8 burst_score; ++ u8 child_burst; ++ u32 child_burst_cnt; ++ u64 child_burst_last_cached; ++#endif // CONFIG_SCHED_BORE + s64 vlag; + u64 slice; + +diff --git a/init/Kconfig b/init/Kconfig +index e403a29256..06c028a327 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1267,6 +1267,23 @@ 
config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index 38ef6d0688..5f6eecd1e6 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -55,5 +55,21 @@ config HZ + default 300 if HZ_300 + default 1000 if HZ_1000 + ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = max(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. ++ + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 8208809605..dad676f6fd 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -4487,6 +4487,138 @@ int wake_up_state(struct task_struct *p, unsigned int state) + return try_to_wake_up(p, state, 0); + } + ++#ifdef CONFIG_SCHED_BORE ++extern u8 sched_burst_fork_atavistic; ++extern uint sched_burst_cache_lifetime; ++ ++static void __init sched_init_bore(void) { ++ init_task.se.burst_time = 0; ++ init_task.se.prev_burst_penalty = 0; ++ init_task.se.curr_burst_penalty = 0; ++ init_task.se.burst_penalty = 0; ++ init_task.se.burst_score = 0; ++ init_task.se.child_burst_last_cached = 0; ++} ++ ++inline void sched_fork_bore(struct task_struct *p) { ++ p->se.burst_time = 0; ++ p->se.curr_burst_penalty = 0; ++ p->se.burst_score = 0; ++ p->se.child_burst_last_cached = 0; ++} ++ ++static u32 count_child_tasks(struct task_struct *p) { ++ struct task_struct *child; ++ u32 cnt = 0; ++ list_for_each_entry(child, &p->children, sibling) {cnt++;} ++ return cnt; ++} ++ ++static inline bool task_is_inheritable(struct task_struct *p) { ++ return (p->sched_class == &fair_sched_class); ++} ++ ++static inline bool child_burst_cache_expired(struct task_struct *p, u64 now) { ++ u64 expiration_time = ++ p->se.child_burst_last_cached + sched_burst_cache_lifetime; ++ return ((s64)(expiration_time - now) < 0); ++} ++ ++static void __update_child_burst_cache( ++ struct task_struct *p, u32 cnt, u32 sum, u64 now) { ++ u8 avg = 0; ++ if (cnt) avg = sum / cnt; ++ p->se.child_burst = max(avg, p->se.burst_penalty); ++ p->se.child_burst_cnt = cnt; ++ p->se.child_burst_last_cached = now; ++} ++ ++static inline void update_child_burst_direct(struct task_struct *p, u64 now) { ++ struct task_struct *child; ++ u32 cnt = 0; ++ u32 sum = 0; ++ ++ list_for_each_entry(child, 
&p->children, sibling) { ++ if (!task_is_inheritable(child)) continue; ++ cnt++; ++ sum += child->se.burst_penalty; ++ } ++ ++ __update_child_burst_cache(p, cnt, sum, now); ++} ++ ++static inline u8 __inherit_burst_direct(struct task_struct *p, u64 now) { ++ struct task_struct *parent = p->real_parent; ++ if (child_burst_cache_expired(parent, now)) ++ update_child_burst_direct(parent, now); ++ ++ return parent->se.child_burst; ++} ++ ++static void update_child_burst_topological( ++ struct task_struct *p, u64 now, u32 depth, u32 *acnt, u32 *asum) { ++ struct task_struct *child, *dec; ++ u32 cnt = 0, dcnt = 0; ++ u32 sum = 0; ++ ++ list_for_each_entry(child, &p->children, sibling) { ++ dec = child; ++ while ((dcnt = count_child_tasks(dec)) == 1) ++ dec = list_first_entry(&dec->children, struct task_struct, sibling); ++ ++ if (!dcnt || !depth) { ++ if (!task_is_inheritable(dec)) continue; ++ cnt++; ++ sum += dec->se.burst_penalty; ++ continue; ++ } ++ if (!child_burst_cache_expired(dec, now)) { ++ cnt += dec->se.child_burst_cnt; ++ sum += (u32)dec->se.child_burst * dec->se.child_burst_cnt; ++ continue; ++ } ++ update_child_burst_topological(dec, now, depth - 1, &cnt, &sum); ++ } ++ ++ __update_child_burst_cache(p, cnt, sum, now); ++ *acnt += cnt; ++ *asum += sum; ++} ++ ++static inline u8 __inherit_burst_topological(struct task_struct *p, u64 now) { ++ struct task_struct *anc = p->real_parent; ++ u32 cnt = 0, sum = 0; ++ ++ while (anc->real_parent != anc && count_child_tasks(anc) == 1) ++ anc = anc->real_parent; ++ ++ if (child_burst_cache_expired(anc, now)) ++ update_child_burst_topological( ++ anc, now, sched_burst_fork_atavistic - 1, &cnt, &sum); ++ ++ return anc->se.child_burst; ++} ++ ++static inline void inherit_burst(struct task_struct *p) { ++ u8 burst_cache; ++ u64 now = ktime_get_ns(); ++ ++ read_lock(&tasklist_lock); ++ burst_cache = likely(sched_burst_fork_atavistic)? ++ __inherit_burst_topological(p, now): ++ __inherit_burst_direct(p, now); ++ read_unlock(&tasklist_lock); ++ ++ p->se.prev_burst_penalty = max(p->se.prev_burst_penalty, burst_cache); ++} ++ ++static void sched_post_fork_bore(struct task_struct *p) { ++ if (p->sched_class == &fair_sched_class) ++ inherit_burst(p); ++ p->se.burst_penalty = p->se.prev_burst_penalty; ++} ++#endif // CONFIG_SCHED_BORE ++ + /* + * Perform scheduler related setup for a newly forked process p. + * p is forked by current. 
+@@ -4503,6 +4635,9 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) + p->se.prev_sum_exec_runtime = 0; + p->se.nr_migrations = 0; + p->se.vruntime = 0; ++#ifdef CONFIG_SCHED_BORE ++ sched_fork_bore(p); ++#endif // CONFIG_SCHED_BORE + p->se.vlag = 0; + p->se.slice = sysctl_sched_base_slice; + INIT_LIST_HEAD(&p->se.group_node); +@@ -4822,6 +4957,9 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) + + void sched_post_fork(struct task_struct *p) + { ++#ifdef CONFIG_SCHED_BORE ++ sched_post_fork_bore(p); ++#endif // CONFIG_SCHED_BORE + uclamp_post_fork(p); + } + +@@ -9925,6 +10063,11 @@ void __init sched_init(void) + BUG_ON(&dl_sched_class != &stop_sched_class + 1); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_init_bore(); ++ printk(KERN_INFO "BORE (Burst-Oriented Response Enhancer) CPU Scheduler modification 5.1.8 by Masahito Suzuki"); ++#endif // CONFIG_SCHED_BORE ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 4c3d0d9f3d..02c8816c26 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -167,7 +167,52 @@ static const struct file_operations sched_feat_fops = { + }; + + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++static ssize_t sched_min_base_slice_write(struct file *filp, const char __user *ubuf, ++ size_t cnt, loff_t *ppos) ++{ ++ char buf[16]; ++ unsigned int value; ++ ++ if (cnt > 15) ++ cnt = 15; ++ ++ if (copy_from_user(&buf, ubuf, cnt)) ++ return -EFAULT; ++ buf[cnt] = '\0'; ++ ++ if (kstrtouint(buf, 10, &value)) ++ return -EINVAL; + ++ if (!value) ++ return -EINVAL; ++ ++ sysctl_sched_min_base_slice = value; ++ sched_update_min_base_slice(); ++ ++ *ppos += cnt; ++ return cnt; ++} ++ ++static int sched_min_base_slice_show(struct seq_file *m, void *v) ++{ ++ seq_printf(m, "%d\n", sysctl_sched_min_base_slice); ++ return 0; ++} ++ ++static int sched_min_base_slice_open(struct inode *inode, struct file *filp) ++{ ++ return single_open(filp, sched_min_base_slice_show, NULL); ++} ++ ++static const struct file_operations sched_min_base_slice_fops = { ++ .open = sched_min_base_slice_open, ++ .write = sched_min_base_slice_write, ++ .read = seq_read, ++ .llseek = seq_lseek, ++ .release = single_release, ++}; ++#else // !CONFIG_SCHED_BORE + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -213,7 +258,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; +- ++#endif // CONFIG_SCHED_BORE + #endif /* SMP */ + + #ifdef CONFIG_PREEMPT_DYNAMIC +@@ -347,13 +392,20 @@ static __init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0400, debugfs_sched, &sysctl_sched_base_slice); ++#else // !CONFIG_SCHED_BORE + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif // CONFIG_SCHED_BORE + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif // CONFIG_SCHED_BORE + 
debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -595,6 +647,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->se.burst_score); ++#endif // CONFIG_SCHED_BORE + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1068,6 +1123,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + + P(se.load.weight); + #ifdef CONFIG_SMP ++#ifdef CONFIG_SCHED_BORE ++ P(se.burst_score); ++#endif // CONFIG_SCHED_BORE + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index d3d0a1c933..caae4061a7 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -19,6 +19,9 @@ + * + * Adaptive scheduling granularity, math enhancements by Peter Zijlstra + * Copyright (C) 2007 Red Hat, Inc., Peter Zijlstra ++ * ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2024 Masahito Suzuki + */ + #include + #include +@@ -66,17 +69,29 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmical, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * (BORE default SCHED_TUNABLESCALING_NONE = *1 constant) ++ * (EEVDF default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif // CONFIG_SCHED_BORE + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * (BORE default: max(1 sec / HZ, min_base_slice) constant, units: nanoseconds) ++ * (EEVDF default: 0.75 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_base_slice = 1000000000ULL / HZ; ++static unsigned int configured_sched_base_slice = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++#else // !CONFIG_SCHED_BORE + unsigned int sysctl_sched_base_slice = 750000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 750000ULL; ++#endif // CONFIG_SCHED_BORE + + /* + * After fork, child runs first. 
If set to 0 (default) then +@@ -86,6 +101,120 @@ unsigned int sysctl_sched_child_runs_first __read_mostly; + + const_debug unsigned int sysctl_sched_migration_cost = 500000UL; + ++#ifdef CONFIG_SCHED_BORE ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_exclude_kthreads = 1; ++u8 __read_mostly sched_burst_smoothness_long = 1; ++u8 __read_mostly sched_burst_smoothness_short = 0; ++u8 __read_mostly sched_burst_fork_atavistic = 2; ++u8 __read_mostly sched_burst_penalty_offset = 22; ++uint __read_mostly sched_burst_penalty_scale = 1280; ++uint __read_mostly sched_burst_cache_lifetime = 60000000; ++uint __read_mostly sched_deadline_boost_mask = ENQUEUE_INITIAL ++ | ENQUEUE_WAKEUP; ++uint __read_mostly sched_deadline_preserve_mask = ENQUEUE_RESTORE ++ | ENQUEUE_MIGRATED; ++static int __maybe_unused sixty_four = 64; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define MAX_BURST_PENALTY (39U <<2) ++ ++static inline u32 log2plus1_u64_u32f8(u64 v) { ++ u32 msb = fls64(v); ++ s32 excess_bits = msb - 9; ++ u8 fractional = (0 <= excess_bits)? v >> excess_bits: v << -excess_bits; ++ return msb << 8 | fractional; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed, tolerance, penalty, scaled_penalty; ++ ++ greed = log2plus1_u64_u32f8(burst_time); ++ tolerance = sched_burst_penalty_offset << 8; ++ penalty = max(0, (s32)greed - (s32)tolerance); ++ scaled_penalty = penalty * sched_burst_penalty_scale >> 16; ++ ++ return min(MAX_BURST_PENALTY, scaled_penalty); ++} ++ ++static inline u64 scale_slice(u64 delta, struct sched_entity *se) { ++ return mul_u64_u32_shr(delta, sched_prio_to_wmult[se->burst_score], 22); ++} ++ ++static inline u64 __unscale_slice(u64 delta, u8 score) { ++ return mul_u64_u32_shr(delta, sched_prio_to_weight[score], 10); ++} ++ ++static inline u64 unscale_slice(u64 delta, struct sched_entity *se) { ++ return __unscale_slice(delta, se->burst_score); ++} ++ ++static void reweight_entity( ++ struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); ++ ++static void renice_task(struct task_struct *p, int prio) ++{ ++ struct sched_entity *se = &p->se; ++ struct cfs_rq *cfs_rq = cfs_rq_of(se); ++ struct load_weight *load = &se->load; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ reweight_entity(cfs_rq, se, weight); ++ load->inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++static void update_burst_score(struct sched_entity *se) { ++ if (!entity_is_task(se)) return; ++ struct task_struct *p = task_of(se); ++ u8 prio = p->static_prio - MAX_RT_PRIO; ++ u8 prev_prio = min(39, prio + se->burst_score); ++ ++ u8 burst_score = 0; ++ if (!(sched_burst_exclude_kthreads && (p->flags & PF_KTHREAD))) ++ burst_score = se->burst_penalty >> 2; ++ ++ se->burst_score = burst_score; ++ ++ u8 new_prio = min(39, prio + se->burst_score); ++ if (new_prio != prev_prio) ++ renice_task(p, new_prio); ++} ++ ++static void update_burst_penalty(struct sched_entity *se) { ++ se->curr_burst_penalty = calc_burst_penalty(se->burst_time); ++ se->burst_penalty = max(se->prev_burst_penalty, se->curr_burst_penalty); ++ update_burst_score(se); ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old) { ++ int increment = new - old; ++ return (0 <= increment)? 
++ old + ( increment >> (int)sched_burst_smoothness_long): ++ old - (-increment >> (int)sched_burst_smoothness_short); ++} ++ ++static void restart_burst(struct sched_entity *se) { ++ se->burst_penalty = se->prev_burst_penalty = ++ binary_smooth(se->curr_burst_penalty, se->prev_burst_penalty); ++ se->curr_burst_penalty = 0; ++ se->burst_time = 0; ++ update_burst_score(se); ++} ++ ++static void restart_burst_rescale_deadline(struct sched_entity *se) { ++ s64 vscaled, wremain, vremain = se->deadline - se->vruntime; ++ u8 prev_score = se->burst_score; ++ restart_burst(se); ++ if (prev_score > se->burst_score) { ++ wremain = __unscale_slice(abs(vremain), prev_score); ++ vscaled = scale_slice(wremain, se); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++#endif // CONFIG_SCHED_BORE ++ + int sched_thermal_decay_shift; + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -145,6 +274,92 @@ static unsigned int sysctl_numa_balancing_promote_rate_limit = 65536; + + #ifdef CONFIG_SYSCTL + static struct ctl_table sched_fair_sysctls[] = { ++#ifdef CONFIG_SCHED_BORE ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ONE, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_exclude_kthreads", ++ .data = &sched_burst_exclude_kthreads, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_long", ++ .data = &sched_burst_smoothness_long, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_smoothness_short", ++ .data = &sched_burst_smoothness_short, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_fork_atavistic", ++ .data = &sched_burst_fork_atavistic, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &sixty_four, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_boost_mask", ++ .data = &sched_deadline_boost_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++ { ++ .procname = "sched_deadline_preserve_mask", ++ .data = &sched_deadline_preserve_mask, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++#endif // CONFIG_SCHED_BORE + { + .procname = "sched_child_runs_first", + .data = &sysctl_sched_child_runs_first, +@@ -210,6 +425,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef 
CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = ++ max(sysctl_sched_min_base_slice, configured_sched_base_slice); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else // !CONFIG_SCHED_BORE + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -240,6 +462,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif // CONFIG_SCHED_BORE + + void __init sched_init_granularity(void) + { +@@ -713,6 +936,9 @@ static s64 entity_lag(u64 avruntime, struct sched_entity *se) + + vlag = avruntime - se->vruntime; + limit = calc_delta_fair(max_t(u64, 2*se->slice, TICK_NSEC), se); ++#ifdef CONFIG_SCHED_BORE ++ limit >>= 1; ++#endif // CONFIG_SCHED_BORE + + return clamp(vlag, -limit, limit); + } +@@ -1002,6 +1228,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + * Scheduling class statistics methods: + */ + #ifdef CONFIG_SMP ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1013,6 +1240,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif // CONFIG_SCHED_BORE + #endif + #endif + +@@ -1179,7 +1407,13 @@ static void update_curr(struct cfs_rq *cfs_rq) + curr->sum_exec_runtime += delta_exec; + schedstat_add(cfs_rq->exec_clock, delta_exec); + ++#ifdef CONFIG_SCHED_BORE ++ curr->burst_time += delta_exec; ++ update_burst_penalty(curr); ++ curr->vruntime += max(1ULL, calc_delta_fair(delta_exec, curr)); ++#else // !CONFIG_SCHED_BORE + curr->vruntime += calc_delta_fair(delta_exec, curr); ++#endif // CONFIG_SCHED_BORE + update_deadline(cfs_rq, curr); + update_min_vruntime(cfs_rq); + +@@ -5072,6 +5306,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + s64 lag = 0; + + se->slice = sysctl_sched_base_slice; ++#ifdef CONFIG_SCHED_BORE ++ if (flags & ~sched_deadline_boost_mask & sched_deadline_preserve_mask) ++ vslice = se->deadline - se->vruntime; ++ else ++#endif // CONFIG_SCHED_BORE + vslice = calc_delta_fair(se->slice, se); + + /* +@@ -5082,6 +5321,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * + * EEVDF: placement strategy #1 / #2 + */ ++#ifdef CONFIG_SCHED_BORE ++ if (se->vlag) ++#endif // CONFIG_SCHED_BORE + if (sched_feat(PLACE_LAG) && cfs_rq->nr_running) { + struct sched_entity *curr = cfs_rq->curr; + unsigned long load; +@@ -5157,7 +5399,11 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + * on average, halfway through their slice, as such start tasks + * off with half a slice to ease into the competition. + */ ++#if !defined(CONFIG_SCHED_BORE) + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) ++#else // CONFIG_SCHED_BORE ++ if (flags & sched_deadline_boost_mask) ++#endif // CONFIG_SCHED_BORE + vslice /= 2; + + /* +@@ -6724,6 +6970,14 @@ static void dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + bool was_sched_idle = sched_idle_rq(rq); + + util_est_dequeue(&rq->cfs, p); ++#ifdef CONFIG_SCHED_BORE ++ if (task_sleep) { ++ cfs_rq = cfs_rq_of(se); ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst(se); ++ } ++#endif // CONFIG_SCHED_BORE + + for_each_sched_entity(se) { + cfs_rq = cfs_rq_of(se); +@@ -8461,16 +8715,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task in the tree? 
+ */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline(se); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif // CONFIG_SCHED_BORE + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -12536,6 +12799,9 @@ static void task_fork_fair(struct task_struct *p) + curr = cfs_rq->curr; + if (curr) + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ update_burst_score(se); ++#endif // CONFIG_SCHED_BORE + place_entity(cfs_rq, se, ENQUEUE_INITIAL); + rq_unlock(rq, &rf); + } +diff --git a/kernel/sched/features.h b/kernel/sched/features.h +index f770168230..3711c7700d 100644 +--- a/kernel/sched/features.h ++++ b/kernel/sched/features.h +@@ -5,8 +5,12 @@ + * sleep+wake cycles. EEVDF placement strategy #1, #2 if disabled. + */ + SCHED_FEAT(PLACE_LAG, true) ++#if !defined(CONFIG_SCHED_BORE) + SCHED_FEAT(PLACE_DEADLINE_INITIAL, true) + SCHED_FEAT(RUN_TO_PARITY, true) ++#else // CONFIG_SCHED_BORE ++SCHED_FEAT(RUN_TO_PARITY, false) ++#endif // CONFIG_SCHED_BORE + + /* + * Prefer to schedule the task we woke last (assuming it failed +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 2e8f26a919..e8d5ce2027 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -1948,7 +1948,11 @@ static inline void dirty_sched_domain_sysctl(int cpu) + } + #endif + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else // !CONFIG_SCHED_BORE + extern int sched_update_scaling(void); ++#endif // CONFIG_SCHED_BORE + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -2528,6 +2532,9 @@ extern const_debug unsigned int sysctl_sched_nr_migrate; + extern const_debug unsigned int sysctl_sched_migration_cost; + + extern unsigned int sysctl_sched_base_slice; ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++#endif // CONFIG_SCHED_BORE + + #ifdef CONFIG_SCHED_DEBUG + extern int sysctl_resched_latency_warn_ms; +-- +2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0005-zstd.patch b/sys-kernel/gentoo-sources-6.6/0005-zstd.patch new file mode 100644 index 0000000..e351b2a --- /dev/null +++ b/sys-kernel/gentoo-sources-6.6/0005-zstd.patch @@ -0,0 +1,13833 @@ +From 2c4b02d9a1c3640cde85fcf17bf264f01975f0e0 Mon Sep 17 00:00:00 2001 +From: Peter Jung +Date: Sat, 16 Dec 2023 09:03:26 +0100 +Subject: [PATCH 5/5] zstd + +Signed-off-by: Peter Jung +--- + include/linux/zstd.h | 2 +- + include/linux/zstd_errors.h | 23 +- + include/linux/zstd_lib.h | 697 +++++-- + lib/zstd/Makefile | 2 +- + lib/zstd/common/allocations.h | 56 + + lib/zstd/common/bits.h | 149 ++ + lib/zstd/common/bitstream.h | 53 +- + lib/zstd/common/compiler.h | 14 +- + lib/zstd/common/cpu.h | 3 +- + lib/zstd/common/debug.c | 5 +- + lib/zstd/common/debug.h | 3 +- + lib/zstd/common/entropy_common.c | 42 +- + lib/zstd/common/error_private.c | 12 +- + lib/zstd/common/error_private.h | 3 +- + lib/zstd/common/fse.h | 89 +- + lib/zstd/common/fse_decompress.c | 94 +- + lib/zstd/common/huf.h | 222 +-- + lib/zstd/common/mem.h | 2 +- + lib/zstd/common/portability_macros.h | 26 +- + lib/zstd/common/zstd_common.c | 38 +- + lib/zstd/common/zstd_deps.h | 16 +- + lib/zstd/common/zstd_internal.h | 99 +- + lib/zstd/compress/clevels.h | 3 
+- + lib/zstd/compress/fse_compress.c | 59 +- + lib/zstd/compress/hist.c | 3 +- + lib/zstd/compress/hist.h | 3 +- + lib/zstd/compress/huf_compress.c | 372 ++-- + lib/zstd/compress/zstd_compress.c | 1762 ++++++++++++----- + lib/zstd/compress/zstd_compress_internal.h | 333 +++- + lib/zstd/compress/zstd_compress_literals.c | 155 +- + lib/zstd/compress/zstd_compress_literals.h | 25 +- + lib/zstd/compress/zstd_compress_sequences.c | 7 +- + lib/zstd/compress/zstd_compress_sequences.h | 3 +- + lib/zstd/compress/zstd_compress_superblock.c | 47 +- + lib/zstd/compress/zstd_compress_superblock.h | 3 +- + lib/zstd/compress/zstd_cwksp.h | 149 +- + lib/zstd/compress/zstd_double_fast.c | 129 +- + lib/zstd/compress/zstd_double_fast.h | 6 +- + lib/zstd/compress/zstd_fast.c | 582 ++++-- + lib/zstd/compress/zstd_fast.h | 6 +- + lib/zstd/compress/zstd_lazy.c | 518 ++--- + lib/zstd/compress/zstd_lazy.h | 7 +- + lib/zstd/compress/zstd_ldm.c | 11 +- + lib/zstd/compress/zstd_ldm.h | 3 +- + lib/zstd/compress/zstd_ldm_geartab.h | 3 +- + lib/zstd/compress/zstd_opt.c | 187 +- + lib/zstd/compress/zstd_opt.h | 3 +- + lib/zstd/decompress/huf_decompress.c | 770 ++++--- + lib/zstd/decompress/zstd_ddict.c | 9 +- + lib/zstd/decompress/zstd_ddict.h | 3 +- + lib/zstd/decompress/zstd_decompress.c | 261 ++- + lib/zstd/decompress/zstd_decompress_block.c | 283 ++- + lib/zstd/decompress/zstd_decompress_block.h | 8 +- + .../decompress/zstd_decompress_internal.h | 7 +- + lib/zstd/decompress_sources.h | 2 +- + lib/zstd/zstd_common_module.c | 5 +- + lib/zstd/zstd_compress_module.c | 2 +- + lib/zstd/zstd_decompress_module.c | 4 +- + 58 files changed, 4789 insertions(+), 2594 deletions(-) + create mode 100644 lib/zstd/common/allocations.h + create mode 100644 lib/zstd/common/bits.h + +diff --git a/include/linux/zstd.h b/include/linux/zstd.h +index 113408eef6ec..f109d49f43f8 100644 +--- a/include/linux/zstd.h ++++ b/include/linux/zstd.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/include/linux/zstd_errors.h b/include/linux/zstd_errors.h +index 58b6dd45a969..6d5cf55f0bf3 100644 +--- a/include/linux/zstd_errors.h ++++ b/include/linux/zstd_errors.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -17,8 +18,17 @@ + + + /* ===== ZSTDERRORLIB_API : control library symbols visibility ===== */ +-#define ZSTDERRORLIB_VISIBILITY +-#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBILITY ++#define ZSTDERRORLIB_VISIBLE ++ ++#ifndef ZSTDERRORLIB_HIDDEN ++# if (__GNUC__ >= 4) && !defined(__MINGW32__) ++# define ZSTDERRORLIB_HIDDEN __attribute__ ((visibility ("hidden"))) ++# else ++# define ZSTDERRORLIB_HIDDEN ++# endif ++#endif ++ ++#define ZSTDERRORLIB_API ZSTDERRORLIB_VISIBLE + + /*-********************************************* + * Error codes list +@@ -43,14 +53,17 @@ typedef enum { + ZSTD_error_frameParameter_windowTooLarge = 16, + ZSTD_error_corruption_detected = 20, + ZSTD_error_checksum_wrong = 22, ++ ZSTD_error_literals_headerWrong = 24, + ZSTD_error_dictionary_corrupted = 30, + ZSTD_error_dictionary_wrong = 32, + ZSTD_error_dictionaryCreation_failed = 34, + ZSTD_error_parameter_unsupported = 40, ++ ZSTD_error_parameter_combination_unsupported = 41, + ZSTD_error_parameter_outOfBound = 42, + ZSTD_error_tableLog_tooLarge = 44, + ZSTD_error_maxSymbolValue_tooLarge = 46, + ZSTD_error_maxSymbolValue_tooSmall = 48, ++ ZSTD_error_stabilityCondition_notRespected = 50, + ZSTD_error_stage_wrong = 60, + ZSTD_error_init_missing = 62, + ZSTD_error_memory_allocation = 64, +@@ -58,11 +71,15 @@ typedef enum { + ZSTD_error_dstSize_tooSmall = 70, + ZSTD_error_srcSize_wrong = 72, + ZSTD_error_dstBuffer_null = 74, ++ ZSTD_error_noForwardProgress_destFull = 80, ++ ZSTD_error_noForwardProgress_inputEmpty = 82, + /* following error codes are __NOT STABLE__, they can be removed or changed in future versions */ + ZSTD_error_frameIndex_tooLarge = 100, + ZSTD_error_seekableIO = 102, + ZSTD_error_dstBuffer_wrong = 104, + ZSTD_error_srcBuffer_wrong = 105, ++ ZSTD_error_sequenceProducer_failed = 106, ++ ZSTD_error_externalSequences_invalid = 107, + ZSTD_error_maxCode = 120 /* never EVER use this value directly, it can change in future versions! Use ZSTD_isError() instead */ + } ZSTD_ErrorCode; + +diff --git a/include/linux/zstd_lib.h b/include/linux/zstd_lib.h +index 79d55465d5c1..8b4ffe649df5 100644 +--- a/include/linux/zstd_lib.h ++++ b/include/linux/zstd_lib.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,23 +12,42 @@ + #ifndef ZSTD_H_235446 + #define ZSTD_H_235446 + +-/* ====== Dependency ======*/ ++/* ====== Dependencies ======*/ + #include /* INT_MAX */ + #include /* size_t */ + + + /* ===== ZSTDLIB_API : control library symbols visibility ===== */ +-#ifndef ZSTDLIB_VISIBLE ++#define ZSTDLIB_VISIBLE ++ ++#ifndef ZSTDLIB_HIDDEN + # if (__GNUC__ >= 4) && !defined(__MINGW32__) +-# define ZSTDLIB_VISIBLE __attribute__ ((visibility ("default"))) + # define ZSTDLIB_HIDDEN __attribute__ ((visibility ("hidden"))) + # else +-# define ZSTDLIB_VISIBLE + # define ZSTDLIB_HIDDEN + # endif + #endif ++ + #define ZSTDLIB_API ZSTDLIB_VISIBLE + ++/* Deprecation warnings : ++ * Should these warnings be a problem, it is generally possible to disable them, ++ * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. ++ * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
++ */ ++#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS ++# define ZSTD_DEPRECATED(message) /* disable deprecation warnings */ ++#else ++# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated(message))) ++# elif (__GNUC__ >= 3) ++# define ZSTD_DEPRECATED(message) __attribute__((deprecated)) ++# else ++# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") ++# define ZSTD_DEPRECATED(message) ++# endif ++#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ ++ + + /* ***************************************************************************** + Introduction +@@ -65,7 +85,7 @@ + /*------ Version ------*/ + #define ZSTD_VERSION_MAJOR 1 + #define ZSTD_VERSION_MINOR 5 +-#define ZSTD_VERSION_RELEASE 2 ++#define ZSTD_VERSION_RELEASE 5 + #define ZSTD_VERSION_NUMBER (ZSTD_VERSION_MAJOR *100*100 + ZSTD_VERSION_MINOR *100 + ZSTD_VERSION_RELEASE) + + /*! ZSTD_versionNumber() : +@@ -107,7 +127,8 @@ ZSTDLIB_API const char* ZSTD_versionString(void); + ***************************************/ + /*! ZSTD_compress() : + * Compresses `src` content as a single zstd compressed frame into already allocated `dst`. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data. + * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). */ + ZSTDLIB_API size_t ZSTD_compress( void* dst, size_t dstCapacity, +@@ -156,7 +177,9 @@ ZSTDLIB_API unsigned long long ZSTD_getFrameContentSize(const void *src, size_t + * "empty", "unknown" and "error" results to the same return value (0), + * while ZSTD_getFrameContentSize() gives them separate return values. + * @return : decompressed size of `src` frame content _if known and not empty_, 0 otherwise. */ +-ZSTDLIB_API unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); ++ZSTD_DEPRECATED("Replaced by ZSTD_getFrameContentSize") ++ZSTDLIB_API ++unsigned long long ZSTD_getDecompressedSize(const void* src, size_t srcSize); + + /*! ZSTD_findFrameCompressedSize() : Requires v1.4.0+ + * `src` should point to the start of a ZSTD frame or skippable frame. +@@ -168,8 +191,30 @@ ZSTDLIB_API size_t ZSTD_findFrameCompressedSize(const void* src, size_t srcSize) + + + /*====== Helper functions ======*/ +-#define ZSTD_COMPRESSBOUND(srcSize) ((srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ +-ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_compressBound() : ++ * maximum compressed size in worst case single-pass scenario. ++ * When invoking `ZSTD_compress()` or any other one-pass compression function, ++ * it's recommended to provide @dstCapacity >= ZSTD_compressBound(srcSize) ++ * as it eliminates one potential failure scenario, ++ * aka not enough room in dst buffer to write the compressed frame. ++ * Note : ZSTD_compressBound() itself can fail, if @srcSize > ZSTD_MAX_INPUT_SIZE . ++ * In which case, ZSTD_compressBound() will return an error code ++ * which can be tested using ZSTD_isError(). 
++ * ++ * ZSTD_COMPRESSBOUND() : ++ * same as ZSTD_compressBound(), but as a macro. ++ * It can be used to produce constants, which can be useful for static allocation, ++ * for example to size a static array on stack. ++ * Will produce constant value 0 if srcSize too large. ++ */ ++#define ZSTD_MAX_INPUT_SIZE ((sizeof(size_t)==8) ? 0xFF00FF00FF00FF00LLU : 0xFF00FF00U) ++#define ZSTD_COMPRESSBOUND(srcSize) (((size_t)(srcSize) >= ZSTD_MAX_INPUT_SIZE) ? 0 : (srcSize) + ((srcSize)>>8) + (((srcSize) < (128<<10)) ? (((128<<10) - (srcSize)) >> 11) /* margin, from 64 to 0 */ : 0)) /* this formula ensures that bound(A) + bound(B) <= bound(A+B) as long as A and B >= 128 KB */ ++ZSTDLIB_API size_t ZSTD_compressBound(size_t srcSize); /*!< maximum compressed size in worst case single-pass scenario */ ++/* ZSTD_isError() : ++ * Most ZSTD_* functions returning a size_t value can be tested for error, ++ * using ZSTD_isError(). ++ * @return 1 if error, 0 otherwise ++ */ + ZSTDLIB_API unsigned ZSTD_isError(size_t code); /*!< tells if a `size_t` function result is an error code */ + ZSTDLIB_API const char* ZSTD_getErrorName(size_t code); /*!< provides readable string from an error code */ + ZSTDLIB_API int ZSTD_minCLevel(void); /*!< minimum negative compression level allowed, requires v1.4.0+ */ +@@ -412,6 +457,9 @@ typedef enum { + * ZSTD_c_validateSequences + * ZSTD_c_useBlockSplitter + * ZSTD_c_useRowMatchFinder ++ * ZSTD_c_prefetchCDictTables ++ * ZSTD_c_enableSeqProducerFallback ++ * ZSTD_c_maxBlockSize + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly; + * also, the enums values themselves are unstable and can still change. +@@ -430,7 +478,11 @@ typedef enum { + ZSTD_c_experimentalParam12=1009, + ZSTD_c_experimentalParam13=1010, + ZSTD_c_experimentalParam14=1011, +- ZSTD_c_experimentalParam15=1012 ++ ZSTD_c_experimentalParam15=1012, ++ ZSTD_c_experimentalParam16=1013, ++ ZSTD_c_experimentalParam17=1014, ++ ZSTD_c_experimentalParam18=1015, ++ ZSTD_c_experimentalParam19=1016 + } ZSTD_cParameter; + + typedef struct { +@@ -493,7 +545,7 @@ typedef enum { + * They will be used to compress next frame. + * Resetting session never fails. + * - The parameters : changes all parameters back to "default". +- * This removes any reference to any dictionary too. ++ * This also removes any reference to any dictionary or external sequence producer. + * Parameters can only be changed between 2 sessions (i.e. no compression is currently ongoing) + * otherwise the reset fails, and function returns an error value (which can be tested using ZSTD_isError()) + * - Both : similar to resetting the session, followed by resetting parameters. +@@ -506,7 +558,8 @@ ZSTDLIB_API size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset); + * Should cctx hold data from a previously unfinished frame, everything about it is forgotten. + * - Compression parameters are pushed into CCtx before starting compression, using ZSTD_CCtx_set*() + * - The function is always blocking, returns when compression is completed. +- * Hint : compression runs faster if `dstCapacity` >= `ZSTD_compressBound(srcSize)`. ++ * NOTE: Providing `dstCapacity >= ZSTD_compressBound(srcSize)` guarantees that zstd will have ++ * enough space to successfully compress the data, though it is possible it fails for other reasons. 
+ * @return : compressed size written into `dst` (<= `dstCapacity), + * or an error code if it fails (which can be tested using ZSTD_isError()). + */ +@@ -543,13 +596,15 @@ typedef enum { + * ZSTD_d_stableOutBuffer + * ZSTD_d_forceIgnoreChecksum + * ZSTD_d_refMultipleDDicts ++ * ZSTD_d_disableHuffmanAssembly + * Because they are not stable, it's necessary to define ZSTD_STATIC_LINKING_ONLY to access them. + * note : never ever use experimentalParam? names directly + */ + ZSTD_d_experimentalParam1=1000, + ZSTD_d_experimentalParam2=1001, + ZSTD_d_experimentalParam3=1002, +- ZSTD_d_experimentalParam4=1003 ++ ZSTD_d_experimentalParam4=1003, ++ ZSTD_d_experimentalParam5=1004 + + } ZSTD_dParameter; + +@@ -728,8 +783,6 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * This following is a legacy streaming API, available since v1.0+ . + * It can be replaced by ZSTD_CCtx_reset() and ZSTD_compressStream2(). + * It is redundant, but remains fully supported. +- * Streaming in combination with advanced parameters and dictionary compression +- * can only be used through the new API. + ******************************************************************************/ + + /*! +@@ -738,6 +791,9 @@ ZSTDLIB_API size_t ZSTD_CStreamOutSize(void); /*< recommended size for output + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); + * ZSTD_CCtx_refCDict(zcs, NULL); // clear the dictionary (if any) + * ZSTD_CCtx_setParameter(zcs, ZSTD_c_compressionLevel, compressionLevel); ++ * ++ * Note that ZSTD_initCStream() clears any previously set dictionary. Use the new API ++ * to compress with a dictionary. + */ + ZSTDLIB_API size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel); + /*! +@@ -788,13 +844,31 @@ ZSTDLIB_API size_t ZSTD_freeDStream(ZSTD_DStream* zds); /* accept NULL pointer + + /*===== Streaming decompression functions =====*/ + +-/* This function is redundant with the advanced API and equivalent to: ++/*! ZSTD_initDStream() : ++ * Initialize/reset DStream state for new decompression operation. ++ * Call before new decompression operation using same DStream. + * ++ * Note : This function is redundant with the advanced API and equivalent to: + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * ZSTD_DCtx_refDDict(zds, NULL); + */ + ZSTDLIB_API size_t ZSTD_initDStream(ZSTD_DStream* zds); + ++/*! ZSTD_decompressStream() : ++ * Streaming decompression function. ++ * Call repetitively to consume full input updating it as necessary. ++ * Function will update both input and output `pos` fields exposing current state via these fields: ++ * - `input.pos < input.size`, some input remaining and caller should provide remaining input ++ * on the next call. ++ * - `output.pos < output.size`, decoder finished and flushed all remaining buffers. ++ * - `output.pos == output.size`, potentially uncflushed data present in the internal buffers, ++ * call ZSTD_decompressStream() again to flush remaining data to output. ++ * Note : with no additional input, amount of data flushed <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * @return : 0 when a frame is completely decoded and fully flushed, ++ * or an error code, which can be tested using ZSTD_isError(), ++ * or any other value > 0, which means there is some decoding or flushing to do to complete current frame. 
++ */ + ZSTDLIB_API size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inBuffer* input); + + ZSTDLIB_API size_t ZSTD_DStreamInSize(void); /*!< recommended size for input buffer */ +@@ -913,7 +987,7 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict); + * If @return == 0, the dictID could not be decoded. + * This could for one of the following reasons : + * - The frame does not require a dictionary to be decoded (most common case). +- * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden information. ++ * - The frame was built with dictID intentionally removed. Whatever dictionary is necessary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, the frame header could not be decoded (only possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`). + * - This is not a Zstandard frame. +@@ -925,9 +999,11 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Advanced dictionary and prefix API (Requires v1.4.0+) + * + * This API allows dictionaries to be used with ZSTD_compress2(), +- * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). Dictionaries are sticky, and +- * only reset with the context is reset with ZSTD_reset_parameters or +- * ZSTD_reset_session_and_parameters. Prefixes are single-use. ++ * ZSTD_compressStream2(), and ZSTD_decompressDCtx(). ++ * Dictionaries are sticky, they remain valid when same context is re-used, ++ * they only reset when the context is reset ++ * with ZSTD_reset_parameters or ZSTD_reset_session_and_parameters. ++ * In contrast, Prefixes are single-use. + ******************************************************************************/ + + +@@ -937,8 +1013,9 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Loading a NULL (or 0-size) dictionary invalidates previous dictionary, + * meaning "return to no-dictionary mode". +- * Note 1 : Dictionary is sticky, it will be used for all future compressed frames. +- * To return to "no-dictionary" situation, load a NULL dictionary (or reset parameters). ++ * Note 1 : Dictionary is sticky, it will be used for all future compressed frames, ++ * until parameters are reset, a new dictionary is loaded, or the dictionary ++ * is explicitly invalidated by loading a NULL dictionary. + * Note 2 : Loading a dictionary involves building tables. + * It's also a CPU consuming operation, with non-negligible impact on latency. + * Tables are dependent on compression parameters, and for this reason, +@@ -947,11 +1024,15 @@ ZSTDLIB_API unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize); + * Use experimental ZSTD_CCtx_loadDictionary_byReference() to reference content instead. + * In such a case, dictionary buffer must outlive its users. + * Note 4 : Use ZSTD_CCtx_loadDictionary_advanced() +- * to precisely select how dictionary content must be interpreted. */ ++ * to precisely select how dictionary content must be interpreted. ++ * Note 5 : This method does not benefit from LDM (long distance mode). ++ * If you want to employ LDM on some large dictionary content, ++ * prefer employing ZSTD_CCtx_refPrefix() described below. ++ */ + ZSTDLIB_API size_t ZSTD_CCtx_loadDictionary(ZSTD_CCtx* cctx, const void* dict, size_t dictSize); + + /*! 
ZSTD_CCtx_refCDict() : Requires v1.4.0+ +- * Reference a prepared dictionary, to be used for all next compressed frames. ++ * Reference a prepared dictionary, to be used for all future compressed frames. + * Note that compression parameters are enforced from within CDict, + * and supersede any compression parameter previously set within CCtx. + * The parameters ignored are labelled as "superseded-by-cdict" in the ZSTD_cParameter enum docs. +@@ -970,6 +1051,7 @@ ZSTDLIB_API size_t ZSTD_CCtx_refCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); + * Decompression will need same prefix to properly regenerate data. + * Compressing with a prefix is similar in outcome as performing a diff and compressing it, + * but performs much faster, especially during decompression (compression speed is tunable with compression level). ++ * This method is compatible with LDM (long distance mode). + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special: Adding any prefix (including NULL) invalidates any previous prefix or dictionary + * Note 1 : Prefix buffer is referenced. It **must** outlive compression. +@@ -986,9 +1068,9 @@ ZSTDLIB_API size_t ZSTD_CCtx_refPrefix(ZSTD_CCtx* cctx, + const void* prefix, size_t prefixSize); + + /*! ZSTD_DCtx_loadDictionary() : Requires v1.4.0+ +- * Create an internal DDict from dict buffer, +- * to be used to decompress next frames. +- * The dictionary remains valid for all future frames, until explicitly invalidated. ++ * Create an internal DDict from dict buffer, to be used to decompress all future frames. ++ * The dictionary remains valid for all future frames, until explicitly invalidated, or ++ * a new dictionary is loaded. + * @result : 0, or an error code (which can be tested with ZSTD_isError()). + * Special : Adding a NULL (or 0-size) dictionary invalidates any previous dictionary, + * meaning "return to no-dictionary mode". +@@ -1012,9 +1094,10 @@ ZSTDLIB_API size_t ZSTD_DCtx_loadDictionary(ZSTD_DCtx* dctx, const void* dict, s + * The memory for the table is allocated on the first call to refDDict, and can be + * freed with ZSTD_freeDCtx(). + * ++ * If called with ZSTD_d_refMultipleDDicts disabled (the default), only one dictionary ++ * will be managed, and referencing a dictionary effectively "discards" any previous one. ++ * + * @result : 0, or an error code (which can be tested with ZSTD_isError()). +- * Note 1 : Currently, only one dictionary can be managed. +- * Referencing a new dictionary effectively "discards" any previous one. + * Special: referencing a NULL DDict means "return to no-dictionary mode". + * Note 2 : DDict is just referenced, its lifetime must outlive its usage from DCtx. + */ +@@ -1071,24 +1154,6 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTDLIB_STATIC_API ZSTDLIB_VISIBLE + #endif + +-/* Deprecation warnings : +- * Should these warnings be a problem, it is generally possible to disable them, +- * typically with -Wno-deprecated-declarations for gcc or _CRT_SECURE_NO_WARNINGS in Visual. +- * Otherwise, it's also possible to define ZSTD_DISABLE_DEPRECATE_WARNINGS. 
+- */ +-#ifdef ZSTD_DISABLE_DEPRECATE_WARNINGS +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API /* disable deprecation warnings */ +-#else +-# if (defined(GNUC) && (GNUC > 4 || (GNUC == 4 && GNUC_MINOR >= 5))) || defined(__clang__) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated(message))) +-# elif (__GNUC__ >= 3) +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API __attribute__((deprecated)) +-# else +-# pragma message("WARNING: You need to implement ZSTD_DEPRECATED for this compiler") +-# define ZSTD_DEPRECATED(message) ZSTDLIB_STATIC_API +-# endif +-#endif /* ZSTD_DISABLE_DEPRECATE_WARNINGS */ +- + /* ************************************************************************************** + * experimental API (static linking only) + **************************************************************************************** +@@ -1123,6 +1188,7 @@ ZSTDLIB_API size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict); + #define ZSTD_TARGETLENGTH_MIN 0 /* note : comparing this constant to an unsigned results in a tautological test */ + #define ZSTD_STRATEGY_MIN ZSTD_fast + #define ZSTD_STRATEGY_MAX ZSTD_btultra2 ++#define ZSTD_BLOCKSIZE_MAX_MIN (1 << 10) /* The minimum valid max blocksize. Maximum blocksizes smaller than this make compressBound() inaccurate. */ + + + #define ZSTD_OVERLAPLOG_MIN 0 +@@ -1303,7 +1369,7 @@ typedef enum { + } ZSTD_paramSwitch_e; + + /* ************************************* +-* Frame size functions ++* Frame header and size functions + ***************************************/ + + /*! ZSTD_findDecompressedSize() : +@@ -1350,29 +1416,109 @@ ZSTDLIB_STATIC_API unsigned long long ZSTD_decompressBound(const void* src, size + * or an error code (if srcSize is too small) */ + ZSTDLIB_STATIC_API size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize); + ++typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; ++typedef struct { ++ unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ ++ unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ ++ unsigned blockSizeMax; ++ ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ ++ unsigned headerSize; ++ unsigned dictID; ++ unsigned checksumFlag; ++ unsigned _reserved1; ++ unsigned _reserved2; ++} ZSTD_frameHeader; ++ ++/*! ZSTD_getFrameHeader() : ++ * decode Frame Header, or requires larger `srcSize`. ++ * @return : 0, `zfhPtr` is correctly filled, ++ * >0, `srcSize` is too small, value is wanted `srcSize` amount, ++ * or an error code, which can be tested using ZSTD_isError() */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ ++/*! ZSTD_getFrameHeader_advanced() : ++ * same as ZSTD_getFrameHeader(), ++ * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ ++ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); ++ ++/*! ZSTD_decompressionMargin() : ++ * Zstd supports in-place decompression, where the input and output buffers overlap. ++ * In this case, the output buffer must be at least (Margin + Output_Size) bytes large, ++ * and the input buffer must be at the end of the output buffer. 
++ * ++ * _______________________ Output Buffer ________________________ ++ * | | ++ * | ____ Input Buffer ____| ++ * | | | ++ * v v v ++ * |---------------------------------------|-----------|----------| ++ * ^ ^ ^ ++ * |___________________ Output_Size ___________________|_ Margin _| ++ * ++ * NOTE: See also ZSTD_DECOMPRESSION_MARGIN(). ++ * NOTE: This applies only to single-pass decompression through ZSTD_decompress() or ++ * ZSTD_decompressDCtx(). ++ * NOTE: This function supports multi-frame input. ++ * ++ * @param src The compressed frame(s) ++ * @param srcSize The size of the compressed frame(s) ++ * @returns The decompression margin or an error that can be checked with ZSTD_isError(). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_decompressionMargin(const void* src, size_t srcSize); ++ ++/*! ZSTD_DECOMPRESS_MARGIN() : ++ * Similar to ZSTD_decompressionMargin(), but instead of computing the margin from ++ * the compressed frame, compute it from the original size and the blockSizeLog. ++ * See ZSTD_decompressionMargin() for details. ++ * ++ * WARNING: This macro does not support multi-frame input, the input must be a single ++ * zstd frame. If you need that support use the function, or implement it yourself. ++ * ++ * @param originalSize The original uncompressed size of the data. ++ * @param blockSize The block size == MIN(windowSize, ZSTD_BLOCKSIZE_MAX). ++ * Unless you explicitly set the windowLog smaller than ++ * ZSTD_BLOCKSIZELOG_MAX you can just use ZSTD_BLOCKSIZE_MAX. ++ */ ++#define ZSTD_DECOMPRESSION_MARGIN(originalSize, blockSize) ((size_t)( \ ++ ZSTD_FRAMEHEADERSIZE_MAX /* Frame header */ + \ ++ 4 /* checksum */ + \ ++ ((originalSize) == 0 ? 0 : 3 * (((originalSize) + (blockSize) - 1) / blockSize)) /* 3 bytes per block */ + \ ++ (blockSize) /* One block of margin */ \ ++ )) ++ + typedef enum { + ZSTD_sf_noBlockDelimiters = 0, /* Representation of ZSTD_Sequence has no block delimiters, sequences only */ + ZSTD_sf_explicitBlockDelimiters = 1 /* Representation of ZSTD_Sequence contains explicit block delimiters */ + } ZSTD_sequenceFormat_e; + ++/*! ZSTD_sequenceBound() : ++ * `srcSize` : size of the input buffer ++ * @return : upper-bound for the number of sequences that can be generated ++ * from a buffer of srcSize bytes ++ * ++ * note : returns number of sequences - to get bytes, multiply by sizeof(ZSTD_Sequence). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_sequenceBound(size_t srcSize); ++ + /*! ZSTD_generateSequences() : +- * Generate sequences using ZSTD_compress2, given a source buffer. ++ * Generate sequences using ZSTD_compress2(), given a source buffer. + * + * Each block will end with a dummy sequence + * with offset == 0, matchLength == 0, and litLength == length of last literals. + * litLength may be == 0, and if so, then the sequence of (of: 0 ml: 0 ll: 0) + * simply acts as a block delimiter. + * +- * zc can be used to insert custom compression params. +- * This function invokes ZSTD_compress2 ++ * @zc can be used to insert custom compression params. ++ * This function invokes ZSTD_compress2(). 
+ * + * The output of this function can be fed into ZSTD_compressSequences() with CCtx + * setting of ZSTD_c_blockDelimiters as ZSTD_sf_explicitBlockDelimiters + * @return : number of sequences generated + */ + +-ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, +- size_t outSeqsSize, const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_generateSequences( ZSTD_CCtx* zc, ++ ZSTD_Sequence* outSeqs, size_t outSeqsSize, ++ const void* src, size_t srcSize); + + /*! ZSTD_mergeBlockDelimiters() : + * Given an array of ZSTD_Sequence, remove all sequences that represent block delimiters/last literals +@@ -1388,7 +1534,9 @@ ZSTDLIB_STATIC_API size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* o + ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, size_t seqsSize); + + /*! ZSTD_compressSequences() : +- * Compress an array of ZSTD_Sequence, generated from the original source buffer, into dst. ++ * Compress an array of ZSTD_Sequence, associated with @src buffer, into dst. ++ * @src contains the entire input (not just the literals). ++ * If @srcSize > sum(sequence.length), the remaining bytes are considered all literals + * If a dictionary is included, then the cctx should reference the dict. (see: ZSTD_CCtx_refCDict(), ZSTD_CCtx_loadDictionary(), etc.) + * The entire source is compressed into a single frame. + * +@@ -1413,11 +1561,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_mergeBlockDelimiters(ZSTD_Sequence* sequences, si + * Note: Repcodes are, as of now, always re-calculated within this function, so ZSTD_Sequence::rep is unused. + * Note 2: Once we integrate ability to ingest repcodes, the explicit block delims mode must respect those repcodes exactly, + * and cannot emit an RLE block that disagrees with the repcode history +- * @return : final compressed size or a ZSTD error. ++ * @return : final compressed size, or a ZSTD error code. + */ +-ZSTDLIB_STATIC_API size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstSize, +- const ZSTD_Sequence* inSeqs, size_t inSeqsSize, +- const void* src, size_t srcSize); ++ZSTDLIB_STATIC_API size_t ++ZSTD_compressSequences( ZSTD_CCtx* cctx, void* dst, size_t dstSize, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ++ const void* src, size_t srcSize); + + + /*! ZSTD_writeSkippableFrame() : +@@ -1481,8 +1630,11 @@ ZSTDLIB_API unsigned ZSTD_isSkippableFrame(const void* buffer, size_t size); + * and ZSTD_estimateCCtxSize_usingCCtxParams(), which can be used in tandem with ZSTD_CCtxParams_setParameter(). + * Both can be used to estimate memory using custom compression parameters and arbitrary srcSize limits. + * +- * Note 2 : only single-threaded compression is supported. ++ * Note : only single-threaded compression is supported. + * ZSTD_estimateCCtxSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * ++ * Note 2 : ZSTD_estimateCCtxSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. 
+ */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams); +@@ -1501,7 +1653,12 @@ ZSTDLIB_STATIC_API size_t ZSTD_estimateDCtxSize(void); + * or deducted from a valid frame Header, using ZSTD_estimateDStreamSize_fromFrame(); + * Note : if streaming is init with function ZSTD_init?Stream_usingDict(), + * an internal ?Dict will be created, which additional size is not estimated here. +- * In this case, get total size by adding ZSTD_estimate?DictSize */ ++ * In this case, get total size by adding ZSTD_estimate?DictSize ++ * Note 2 : only single-threaded compression is supported. ++ * ZSTD_estimateCStreamSize_usingCCtxParams() will return an error code if ZSTD_c_nbWorkers is >= 1. ++ * Note 3 : ZSTD_estimateCStreamSize* functions are not compatible with the Block-Level Sequence Producer API at this time. ++ * Size estimates assume that no external sequence producer is registered. ++ */ + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize(int compressionLevel); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCParams(ZSTD_compressionParameters cParams); + ZSTDLIB_STATIC_API size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params); +@@ -1649,22 +1806,45 @@ ZSTDLIB_STATIC_API size_t ZSTD_checkCParams(ZSTD_compressionParameters params); + * This function never fails (wide contract) */ + ZSTDLIB_STATIC_API ZSTD_compressionParameters ZSTD_adjustCParams(ZSTD_compressionParameters cPar, unsigned long long srcSize, size_t dictSize); + ++/*! ZSTD_CCtx_setCParams() : ++ * Set all parameters provided within @p cparams into the working @p cctx. ++ * Note : if modifying parameters during compression (MT mode only), ++ * note that changes to the .windowLog parameter will be ignored. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ * On failure, no parameters are updated. ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams); ++ ++/*! ZSTD_CCtx_setFParams() : ++ * Set all parameters provided within @p fparams into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams); ++ ++/*! ZSTD_CCtx_setParams() : ++ * Set all parameters provided within @p params into the working @p cctx. ++ * @return 0 on success, or an error code (can be checked with ZSTD_isError()). ++ */ ++ZSTDLIB_STATIC_API size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params); ++ + /*! ZSTD_compress_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_setParameter() and other parameter setters. + * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_advanced(ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- const void* dict,size_t dictSize, +- ZSTD_parameters params); ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ const void* dict,size_t dictSize, ++ ZSTD_parameters params); + + /*! ZSTD_compress_usingCDict_advanced() : + * Note : this function is now DEPRECATED. + * It can be replaced by ZSTD_compress2(), in combination with ZSTD_CCtx_loadDictionary() and other parameter setters. 
+ * This prototype will generate compilation warnings. */ + ZSTD_DEPRECATED("use ZSTD_compress2 with ZSTD_CCtx_loadDictionary") ++ZSTDLIB_STATIC_API + size_t ZSTD_compress_usingCDict_advanced(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -1808,13 +1988,16 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Experimental parameter. + * Default is 0 == disabled. Set to 1 to enable. + * +- * Tells the compressor that the ZSTD_inBuffer will ALWAYS be the same +- * between calls, except for the modifications that zstd makes to pos (the +- * caller must not modify pos). This is checked by the compressor, and +- * compression will fail if it ever changes. This means the only flush +- * mode that makes sense is ZSTD_e_end, so zstd will error if ZSTD_e_end +- * is not used. The data in the ZSTD_inBuffer in the range [src, src + pos) +- * MUST not be modified during compression or you will get data corruption. ++ * Tells the compressor that input data presented with ZSTD_inBuffer ++ * will ALWAYS be the same between calls. ++ * Technically, the @src pointer must never be changed, ++ * and the @pos field can only be updated by zstd. ++ * However, it's possible to increase the @size field, ++ * allowing scenarios where more data can be appended after compressions starts. ++ * These conditions are checked by the compressor, ++ * and compression will fail if they are not respected. ++ * Also, data in the ZSTD_inBuffer within the range [src, src + pos) ++ * MUST not be modified during compression or it will result in data corruption. + * + * When this flag is enabled zstd won't allocate an input window buffer, + * because the user guarantees it can reference the ZSTD_inBuffer until +@@ -1822,18 +2005,15 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * large enough to fit a block (see ZSTD_c_stableOutBuffer). This will also + * avoid the memcpy() from the input buffer to the input window buffer. + * +- * NOTE: ZSTD_compressStream2() will error if ZSTD_e_end is not used. +- * That means this flag cannot be used with ZSTD_compressStream(). +- * + * NOTE: So long as the ZSTD_inBuffer always points to valid memory, using + * this flag is ALWAYS memory safe, and will never access out-of-bounds +- * memory. However, compression WILL fail if you violate the preconditions. ++ * memory. However, compression WILL fail if conditions are not respected. + * +- * WARNING: The data in the ZSTD_inBuffer in the range [dst, dst + pos) MUST +- * not be modified during compression or you will get data corruption. This +- * is because zstd needs to reference data in the ZSTD_inBuffer to find ++ * WARNING: The data in the ZSTD_inBuffer in the range [src, src + pos) MUST ++ * not be modified during compression or it will result in data corruption. ++ * This is because zstd needs to reference data in the ZSTD_inBuffer to find + * matches. Normally zstd maintains its own window buffer for this purpose, +- * but passing this flag tells zstd to use the user provided buffer. ++ * but passing this flag tells zstd to rely on user provided buffer instead. + */ + #define ZSTD_c_stableInBuffer ZSTD_c_experimentalParam9 + +@@ -1878,7 +2058,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + * Without validation, providing a sequence that does not conform to the zstd spec will cause + * undefined behavior, and may produce a corrupted block. 
+ * +- * With validation enabled, a if sequence is invalid (see doc/zstd_compression_format.md for ++ * With validation enabled, if sequence is invalid (see doc/zstd_compression_format.md for + * specifics regarding offset/matchlength requirements) then the function will bail out and + * return an error. + * +@@ -1928,6 +2108,79 @@ ZSTDLIB_STATIC_API size_t ZSTD_CCtx_refPrefix_advanced(ZSTD_CCtx* cctx, const vo + */ + #define ZSTD_c_deterministicRefPrefix ZSTD_c_experimentalParam15 + ++/* ZSTD_c_prefetchCDictTables ++ * Controlled with ZSTD_paramSwitch_e enum. Default is ZSTD_ps_auto. ++ * ++ * In some situations, zstd uses CDict tables in-place rather than copying them ++ * into the working context. (See docs on ZSTD_dictAttachPref_e above for details). ++ * In such situations, compression speed is seriously impacted when CDict tables are ++ * "cold" (outside CPU cache). This parameter instructs zstd to prefetch CDict tables ++ * when they are used in-place. ++ * ++ * For sufficiently small inputs, the cost of the prefetch will outweigh the benefit. ++ * For sufficiently large inputs, zstd will by default memcpy() CDict tables ++ * into the working context, so there is no need to prefetch. This parameter is ++ * targeted at a middle range of input sizes, where a prefetch is cheap enough to be ++ * useful but memcpy() is too expensive. The exact range of input sizes where this ++ * makes sense is best determined by careful experimentation. ++ * ++ * Note: for this parameter, ZSTD_ps_auto is currently equivalent to ZSTD_ps_disable, ++ * but in the future zstd may conditionally enable this feature via an auto-detection ++ * heuristic for cold CDicts. ++ * Use ZSTD_ps_disable to opt out of prefetching under any circumstances. ++ */ ++#define ZSTD_c_prefetchCDictTables ZSTD_c_experimentalParam16 ++ ++/* ZSTD_c_enableSeqProducerFallback ++ * Allowed values are 0 (disable) and 1 (enable). The default setting is 0. ++ * ++ * Controls whether zstd will fall back to an internal sequence producer if an ++ * external sequence producer is registered and returns an error code. This fallback ++ * is block-by-block: the internal sequence producer will only be called for blocks ++ * where the external sequence producer returns an error code. Fallback parsing will ++ * follow any other cParam settings, such as compression level, the same as in a ++ * normal (fully-internal) compression operation. ++ * ++ * The user is strongly encouraged to read the full Block-Level Sequence Producer API ++ * documentation (below) before setting this parameter. */ ++#define ZSTD_c_enableSeqProducerFallback ZSTD_c_experimentalParam17 ++ ++/* ZSTD_c_maxBlockSize ++ * Allowed values are between 1KB and ZSTD_BLOCKSIZE_MAX (128KB). ++ * The default is ZSTD_BLOCKSIZE_MAX, and setting to 0 will set to the default. ++ * ++ * This parameter can be used to set an upper bound on the blocksize ++ * that overrides the default ZSTD_BLOCKSIZE_MAX. It cannot be used to set upper ++ * bounds greater than ZSTD_BLOCKSIZE_MAX or bounds lower than 1KB (will make ++ * compressBound() inaccurate). Only currently meant to be used for testing. ++ * ++ */ ++#define ZSTD_c_maxBlockSize ZSTD_c_experimentalParam18 ++ ++/* ZSTD_c_searchForExternalRepcodes ++ * This parameter affects how zstd parses external sequences, such as sequences ++ * provided through the compressSequences() API or from an external block-level ++ * sequence producer. 
++ * ++ * If set to ZSTD_ps_enable, the library will check for repeated offsets in ++ * external sequences, even if those repcodes are not explicitly indicated in ++ * the "rep" field. Note that this is the only way to exploit repcode matches ++ * while using compressSequences() or an external sequence producer, since zstd ++ * currently ignores the "rep" field of external sequences. ++ * ++ * If set to ZSTD_ps_disable, the library will not exploit repeated offsets in ++ * external sequences, regardless of whether the "rep" field has been set. This ++ * reduces sequence compression overhead by about 25% while sacrificing some ++ * compression ratio. ++ * ++ * The default value is ZSTD_ps_auto, for which the library will enable/disable ++ * based on compression level. ++ * ++ * Note: for now, this param only has an effect if ZSTD_c_blockDelimiters is ++ * set to ZSTD_sf_explicitBlockDelimiters. That may change in the future. ++ */ ++#define ZSTD_c_searchForExternalRepcodes ZSTD_c_experimentalParam19 ++ + /*! ZSTD_CCtx_getParameter() : + * Get the requested compression parameter value, selected by enum ZSTD_cParameter, + * and store it into int* value. +@@ -2084,7 +2337,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * in the range [dst, dst + pos) MUST not be modified during decompression + * or you will get data corruption. + * +- * When this flags is enabled zstd won't allocate an output buffer, because ++ * When this flag is enabled zstd won't allocate an output buffer, because + * it can write directly to the ZSTD_outBuffer, but it will still allocate + * an input buffer large enough to fit any compressed block. This will also + * avoid the memcpy() from the internal output buffer to the ZSTD_outBuffer. +@@ -2137,6 +2390,17 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + */ + #define ZSTD_d_refMultipleDDicts ZSTD_d_experimentalParam4 + ++/* ZSTD_d_disableHuffmanAssembly ++ * Set to 1 to disable the Huffman assembly implementation. ++ * The default value is 0, which allows zstd to use the Huffman assembly ++ * implementation if available. ++ * ++ * This parameter can be used to disable Huffman assembly at runtime. ++ * If you want to disable it at compile time you can define the macro ++ * ZSTD_DISABLE_ASM. ++ */ ++#define ZSTD_d_disableHuffmanAssembly ZSTD_d_experimentalParam5 ++ + + /*! ZSTD_DCtx_setFormat() : + * This function is REDUNDANT. Prefer ZSTD_DCtx_setParameter(). +@@ -2145,6 +2409,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParamete + * such ZSTD_f_zstd1_magicless for example. + * @return : 0, or an error code (which can be tested using ZSTD_isError()). */ + ZSTD_DEPRECATED("use ZSTD_DCtx_setParameter() instead") ++ZSTDLIB_STATIC_API + size_t ZSTD_DCtx_setFormat(ZSTD_DCtx* dctx, ZSTD_format_e format); + + /*! ZSTD_decompressStream_simpleArgs() : +@@ -2181,6 +2446,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_decompressStream_simpleArgs ( + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + int compressionLevel, + unsigned long long pledgedSrcSize); +@@ -2198,17 +2464,15 @@ size_t ZSTD_initCStream_srcSize(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. 
+ */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + int compressionLevel); + + /*! ZSTD_initCStream_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd parameter and leave the rest as-is. +- * for ((param, value) : params) { +- * ZSTD_CCtx_setParameter(zcs, param, value); +- * } ++ * ZSTD_CCtx_setParams(zcs, params); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_loadDictionary(zcs, dict, dictSize); + * +@@ -2218,6 +2482,7 @@ size_t ZSTD_initCStream_usingDict(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + const void* dict, size_t dictSize, + ZSTD_parameters params, +@@ -2232,15 +2497,13 @@ size_t ZSTD_initCStream_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + + /*! ZSTD_initCStream_usingCDict_advanced() : +- * This function is DEPRECATED, and is approximately equivalent to: ++ * This function is DEPRECATED, and is equivalent to: + * ZSTD_CCtx_reset(zcs, ZSTD_reset_session_only); +- * // Pseudocode: Set each zstd frame parameter and leave the rest as-is. +- * for ((fParam, value) : fParams) { +- * ZSTD_CCtx_setParameter(zcs, fParam, value); +- * } ++ * ZSTD_CCtx_setFParams(zcs, fParams); + * ZSTD_CCtx_setPledgedSrcSize(zcs, pledgedSrcSize); + * ZSTD_CCtx_refCDict(zcs, cdict); + * +@@ -2250,6 +2513,7 @@ size_t ZSTD_initCStream_usingCDict(ZSTD_CStream* zcs, const ZSTD_CDict* cdict); + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset and ZSTD_CCtx_refCDict, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + const ZSTD_CDict* cdict, + ZSTD_frameParameters fParams, +@@ -2274,6 +2538,7 @@ size_t ZSTD_initCStream_usingCDict_advanced(ZSTD_CStream* zcs, + * This prototype will generate compilation warnings. + */ + ZSTD_DEPRECATED("use ZSTD_CCtx_reset, see zstd.h for detailed instructions") ++ZSTDLIB_STATIC_API + size_t ZSTD_resetCStream(ZSTD_CStream* zcs, unsigned long long pledgedSrcSize); + + +@@ -2319,8 +2584,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_toFlushNow(ZSTD_CCtx* cctx); + * ZSTD_DCtx_loadDictionary(zds, dict, dictSize); + * + * note: no dictionary will be used if dict == NULL or dictSize < 8 +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_loadDictionary, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t dictSize); + + /*! 
+@@ -2330,8 +2595,8 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const vo + * ZSTD_DCtx_refDDict(zds, ddict); + * + * note : ddict is referenced, it must outlive decompression session +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset + ZSTD_DCtx_refDDict, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const ZSTD_DDict* ddict); + + /*! +@@ -2340,17 +2605,185 @@ ZSTDLIB_STATIC_API size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* zds, const Z + * ZSTD_DCtx_reset(zds, ZSTD_reset_session_only); + * + * re-use decompression parameters from previous init; saves dictionary loading +- * Note : this prototype will be marked as deprecated and generate compilation warnings on reaching v1.5.x + */ ++ZSTD_DEPRECATED("use ZSTD_DCtx_reset, see zstd.h for detailed instructions") + ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + ++/* ********************* BLOCK-LEVEL SEQUENCE PRODUCER API ********************* ++ * ++ * *** OVERVIEW *** ++ * The Block-Level Sequence Producer API allows users to provide their own custom ++ * sequence producer which libzstd invokes to process each block. The produced list ++ * of sequences (literals and matches) is then post-processed by libzstd to produce ++ * valid compressed blocks. ++ * ++ * This block-level offload API is a more granular complement of the existing ++ * frame-level offload API compressSequences() (introduced in v1.5.1). It offers ++ * an easier migration story for applications already integrated with libzstd: the ++ * user application continues to invoke the same compression functions ++ * ZSTD_compress2() or ZSTD_compressStream2() as usual, and transparently benefits ++ * from the specific advantages of the external sequence producer. For example, ++ * the sequence producer could be tuned to take advantage of known characteristics ++ * of the input, to offer better speed / ratio, or could leverage hardware ++ * acceleration not available within libzstd itself. ++ * ++ * See contrib/externalSequenceProducer for an example program employing the ++ * Block-Level Sequence Producer API. ++ * ++ * *** USAGE *** ++ * The user is responsible for implementing a function of type ++ * ZSTD_sequenceProducer_F. For each block, zstd will pass the following ++ * arguments to the user-provided function: ++ * ++ * - sequenceProducerState: a pointer to a user-managed state for the sequence ++ * producer. ++ * ++ * - outSeqs, outSeqsCapacity: an output buffer for the sequence producer. ++ * outSeqsCapacity is guaranteed >= ZSTD_sequenceBound(srcSize). The memory ++ * backing outSeqs is managed by the CCtx. ++ * ++ * - src, srcSize: an input buffer for the sequence producer to parse. ++ * srcSize is guaranteed to be <= ZSTD_BLOCKSIZE_MAX. ++ * ++ * - dict, dictSize: a history buffer, which may be empty, which the sequence ++ * producer may reference as it parses the src buffer. Currently, zstd will ++ * always pass dictSize == 0 into external sequence producers, but this will ++ * change in the future. ++ * ++ * - compressionLevel: a signed integer representing the zstd compression level ++ * set by the user for the current operation. The sequence producer may choose ++ * to use this information to change its compression strategy and speed/ratio ++ * tradeoff. Note: the compression level does not reflect zstd parameters set ++ * through the advanced API. 
++ * ++ * - windowSize: a size_t representing the maximum allowed offset for external ++ * sequences. Note that sequence offsets are sometimes allowed to exceed the ++ * windowSize if a dictionary is present, see doc/zstd_compression_format.md ++ * for details. ++ * ++ * The user-provided function shall return a size_t representing the number of ++ * sequences written to outSeqs. This return value will be treated as an error ++ * code if it is greater than outSeqsCapacity. The return value must be non-zero ++ * if srcSize is non-zero. The ZSTD_SEQUENCE_PRODUCER_ERROR macro is provided ++ * for convenience, but any value greater than outSeqsCapacity will be treated as ++ * an error code. ++ * ++ * If the user-provided function does not return an error code, the sequences ++ * written to outSeqs must be a valid parse of the src buffer. Data corruption may ++ * occur if the parse is not valid. A parse is defined to be valid if the ++ * following conditions hold: ++ * - The sum of matchLengths and literalLengths must equal srcSize. ++ * - All sequences in the parse, except for the final sequence, must have ++ * matchLength >= ZSTD_MINMATCH_MIN. The final sequence must have ++ * matchLength >= ZSTD_MINMATCH_MIN or matchLength == 0. ++ * - All offsets must respect the windowSize parameter as specified in ++ * doc/zstd_compression_format.md. ++ * - If the final sequence has matchLength == 0, it must also have offset == 0. ++ * ++ * zstd will only validate these conditions (and fail compression if they do not ++ * hold) if the ZSTD_c_validateSequences cParam is enabled. Note that sequence ++ * validation has a performance cost. ++ * ++ * If the user-provided function returns an error, zstd will either fall back ++ * to an internal sequence producer or fail the compression operation. The user can ++ * choose between the two behaviors by setting the ZSTD_c_enableSeqProducerFallback ++ * cParam. Fallback compression will follow any other cParam settings, such as ++ * compression level, the same as in a normal compression operation. ++ * ++ * The user shall instruct zstd to use a particular ZSTD_sequenceProducer_F ++ * function by calling ++ * ZSTD_registerSequenceProducer(cctx, ++ * sequenceProducerState, ++ * sequenceProducer) ++ * This setting will persist until the next parameter reset of the CCtx. ++ * ++ * The sequenceProducerState must be initialized by the user before calling ++ * ZSTD_registerSequenceProducer(). The user is responsible for destroying the ++ * sequenceProducerState. ++ * ++ * *** LIMITATIONS *** ++ * This API is compatible with all zstd compression APIs which respect advanced parameters. ++ * However, there are three limitations: ++ * ++ * First, the ZSTD_c_enableLongDistanceMatching cParam is not currently supported. ++ * COMPRESSION WILL FAIL if it is enabled and the user tries to compress with a block-level ++ * external sequence producer. ++ * - Note that ZSTD_c_enableLongDistanceMatching is auto-enabled by default in some ++ * cases (see its documentation for details). Users must explicitly set ++ * ZSTD_c_enableLongDistanceMatching to ZSTD_ps_disable in such cases if an external ++ * sequence producer is registered. ++ * - As of this writing, ZSTD_c_enableLongDistanceMatching is disabled by default ++ * whenever ZSTD_c_windowLog < 128MB, but that's subject to change. Users should ++ * check the docs on ZSTD_c_enableLongDistanceMatching whenever the Block-Level Sequence ++ * Producer API is used in conjunction with advanced settings (like ZSTD_c_windowLog). 
++ * ++ * Second, history buffers are not currently supported. Concretely, zstd will always pass ++ * dictSize == 0 to the external sequence producer (for now). This has two implications: ++ * - Dictionaries are not currently supported. Compression will *not* fail if the user ++ * references a dictionary, but the dictionary won't have any effect. ++ * - Stream history is not currently supported. All advanced compression APIs, including ++ * streaming APIs, work with external sequence producers, but each block is treated as ++ * an independent chunk without history from previous blocks. ++ * ++ * Third, multi-threading within a single compression is not currently supported. In other words, ++ * COMPRESSION WILL FAIL if ZSTD_c_nbWorkers > 0 and an external sequence producer is registered. ++ * Multi-threading across compressions is fine: simply create one CCtx per thread. ++ * ++ * Long-term, we plan to overcome all three limitations. There is no technical blocker to ++ * overcoming them. It is purely a question of engineering effort. ++ */ ++ ++#define ZSTD_SEQUENCE_PRODUCER_ERROR ((size_t)(-1)) ++ ++typedef size_t ZSTD_sequenceProducer_F ( ++ void* sequenceProducerState, ++ ZSTD_Sequence* outSeqs, size_t outSeqsCapacity, ++ const void* src, size_t srcSize, ++ const void* dict, size_t dictSize, ++ int compressionLevel, ++ size_t windowSize ++); ++ ++/*! ZSTD_registerSequenceProducer() : ++ * Instruct zstd to use a block-level external sequence producer function. ++ * ++ * The sequenceProducerState must be initialized by the caller, and the caller is ++ * responsible for managing its lifetime. This parameter is sticky across ++ * compressions. It will remain set until the user explicitly resets compression ++ * parameters. ++ * ++ * Sequence producer registration is considered to be an "advanced parameter", ++ * part of the "advanced API". This means it will only have an effect on compression ++ * APIs which respect advanced parameters, such as compress2() and compressStream2(). ++ * Older compression APIs such as compressCCtx(), which predate the introduction of ++ * "advanced parameters", will ignore any external sequence producer setting. ++ * ++ * The sequence producer can be "cleared" by registering a NULL function pointer. This ++ * removes all limitations described above in the "LIMITATIONS" section of the API docs. ++ * ++ * The user is strongly encouraged to read the full API documentation (above) before ++ * calling this function. */ ++ZSTDLIB_STATIC_API void ++ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* cctx, ++ void* sequenceProducerState, ++ ZSTD_sequenceProducer_F* sequenceProducer ++); ++ ++ + /* ******************************************************************* +-* Buffer-less and synchronous inner streaming functions ++* Buffer-less and synchronous inner streaming functions (DEPRECATED) ++* ++* This API is deprecated, and will be removed in a future version. ++* It allows streaming (de)compression with user allocated buffers. ++* However, it is hard to use, and not as well tested as the rest of ++* our API. + * +-* This is an advanced API, giving full control over buffer management, for users which need direct control over memory. +-* But it's also a complex one, with several restrictions, documented below. +-* Prefer normal streaming API for an easier experience. ++* Please use the normal streaming API instead: ZSTD_compressStream2, ++* and ZSTD_decompressStream. ++* If there is functionality that you need, but it doesn't provide, ++* please open an issue on our GitHub. 
+ ********************************************************************* */ + + /* +@@ -2362,7 +2795,6 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + + Start by initializing a context. + Use ZSTD_compressBegin(), or ZSTD_compressBegin_usingDict() for dictionary compression. +- It's also possible to duplicate a reference context which has already been initialized, using ZSTD_copyCCtx() + + Then, consume your input using ZSTD_compressContinue(). + There are some important considerations to keep in mind when using this advanced function : +@@ -2384,18 +2816,28 @@ ZSTDLIB_STATIC_API size_t ZSTD_resetDStream(ZSTD_DStream* zds); + */ + + /*===== Buffer-less streaming compression functions =====*/ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); /*< note: fails if cdict==NULL */ +-ZSTDLIB_STATIC_API size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ + ++ZSTD_DEPRECATED("This function will likely be removed in a future release. It is misleading and has very limited utility.") ++ZSTDLIB_STATIC_API ++size_t ZSTD_copyCCtx(ZSTD_CCtx* cctx, const ZSTD_CCtx* preparedCCtx, unsigned long long pledgedSrcSize); /*< note: if pledgedSrcSize is not known, use ZSTD_CONTENTSIZE_UNKNOWN */ ++ ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The buffer-less API is deprecated in favor of the normal streaming API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* The ZSTD_compressBegin_advanced() and ZSTD_compressBegin_usingCDict_advanced() are now DEPRECATED and will generate a compiler warning */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, ZSTD_parameters params, unsigned long long pledgedSrcSize); /*< pledgedSrcSize : If srcSize is not known at init time, use ZSTD_CONTENTSIZE_UNKNOWN */ + ZSTD_DEPRECATED("use advanced API to access custom parameters") ++ZSTDLIB_STATIC_API + size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_CDict* const cdict, ZSTD_frameParameters const fParams, unsigned long long const pledgedSrcSize); /* compression parameters are already set within cdict. pledgedSrcSize must be correct. 
If srcSize is not known, use macro ZSTD_CONTENTSIZE_UNKNOWN */ + /* + Buffer-less streaming decompression (synchronous mode) +@@ -2408,8 +2850,8 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + Frame header is extracted from the beginning of compressed frame, so providing only the frame's beginning is enough. + Data fragment must be large enough to ensure successful decoding. + `ZSTD_frameHeaderSize_max` bytes is guaranteed to always be large enough. +- @result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. +- >0 : `srcSize` is too small, please provide at least @result bytes on next attempt. ++ result : 0 : successful decoding, the `ZSTD_frameHeader` structure is correctly filled. ++ >0 : `srcSize` is too small, please provide at least result bytes on next attempt. + errorCode, which can be tested using ZSTD_isError(). + + It fills a ZSTD_frameHeader structure with important information to correctly decode the frame, +@@ -2428,7 +2870,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + + The most memory efficient way is to use a round buffer of sufficient size. + Sufficient size is determined by invoking ZSTD_decodingBufferSize_min(), +- which can @return an error code if required value is too large for current system (in 32-bits mode). ++ which can return an error code if required value is too large for current system (in 32-bits mode). + In a round buffer methodology, ZSTD_decompressContinue() decompresses each block next to previous one, + up to the moment there is not enough room left in the buffer to guarantee decoding another full block, + which maximum size is provided in `ZSTD_frameHeader` structure, field `blockSizeMax`. +@@ -2448,7 +2890,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + ZSTD_nextSrcSizeToDecompress() tells how many bytes to provide as 'srcSize' to ZSTD_decompressContinue(). + ZSTD_decompressContinue() requires this _exact_ amount of bytes, or it will fail. + +- @result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). ++ result of ZSTD_decompressContinue() is the number of bytes regenerated within 'dst' (necessarily <= dstCapacity). + It can be zero : it just means ZSTD_decompressContinue() has decoded some metadata item. + It can also be an error code, which can be tested with ZSTD_isError(). + +@@ -2471,27 +2913,7 @@ size_t ZSTD_compressBegin_usingCDict_advanced(ZSTD_CCtx* const cctx, const ZSTD_ + */ + + /*===== Buffer-less streaming decompression functions =====*/ +-typedef enum { ZSTD_frame, ZSTD_skippableFrame } ZSTD_frameType_e; +-typedef struct { +- unsigned long long frameContentSize; /* if == ZSTD_CONTENTSIZE_UNKNOWN, it means this field is not available. 0 means "empty" */ +- unsigned long long windowSize; /* can be very large, up to <= frameContentSize */ +- unsigned blockSizeMax; +- ZSTD_frameType_e frameType; /* if == ZSTD_skippableFrame, frameContentSize is the size of skippable content */ +- unsigned headerSize; +- unsigned dictID; +- unsigned checksumFlag; +-} ZSTD_frameHeader; + +-/*! ZSTD_getFrameHeader() : +- * decode Frame Header, or requires larger `srcSize`. 
+- * @return : 0, `zfhPtr` is correctly filled, +- * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize); /*< doesn't consume input */ +-/*! ZSTD_getFrameHeader_advanced() : +- * same as ZSTD_getFrameHeader(), +- * with added capability to select a format (like ZSTD_f_zstd1_magicless) */ +-ZSTDLIB_STATIC_API size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format); + ZSTDLIB_STATIC_API size_t ZSTD_decodingBufferSize_min(unsigned long long windowSize, unsigned long long frameContentSize); /*< when frame content size is not known, pass in frameContentSize == ZSTD_CONTENTSIZE_UNKNOWN */ + + ZSTDLIB_STATIC_API size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx); +@@ -2502,6 +2924,7 @@ ZSTDLIB_STATIC_API size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx); + ZSTDLIB_STATIC_API size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); + + /* misc */ ++ZSTD_DEPRECATED("This function will likely be removed in the next minor release. It is misleading and has very limited utility.") + ZSTDLIB_STATIC_API void ZSTD_copyDCtx(ZSTD_DCtx* dctx, const ZSTD_DCtx* preparedDCtx); + typedef enum { ZSTDnit_frameHeader, ZSTDnit_blockHeader, ZSTDnit_block, ZSTDnit_lastBlock, ZSTDnit_checksum, ZSTDnit_skippableFrame } ZSTD_nextInputType_e; + ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); +@@ -2509,11 +2932,23 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + + + +-/* ============================ */ +-/* Block level API */ +-/* ============================ */ ++/* ========================================= */ ++/* Block level API (DEPRECATED) */ ++/* ========================================= */ + + /*! ++ ++ This API is deprecated in favor of the regular compression API. ++ You can get the frame header down to 2 bytes by setting: ++ - ZSTD_c_format = ZSTD_f_zstd1_magicless ++ - ZSTD_c_contentSizeFlag = 0 ++ - ZSTD_c_checksumFlag = 0 ++ - ZSTD_c_dictIDFlag = 0 ++ ++ This API is not as well tested as our normal API, so we recommend not using it. ++ We will be removing it in a future version. If the normal API doesn't provide ++ the functionality you need, please open a GitHub issue. ++ + Block functions produce and decode raw zstd blocks, without frame metadata. + Frame metadata cost is typically ~12 bytes, which can be non-negligible for very small blocks (< 100 bytes). + But users will have to take in charge needed metadata to regenerate data, such as compressed and content sizes. +@@ -2524,7 +2959,6 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + - It is necessary to init context before starting + + compression : any ZSTD_compressBegin*() variant, including with dictionary + + decompression : any ZSTD_decompressBegin*() variant, including with dictionary +- + copyCCtx() and copyDCtx() can be used too + - Block size is limited, it must be <= ZSTD_getBlockSize() <= ZSTD_BLOCKSIZE_MAX == 128 KB + + If input is larger than a block size, it's necessary to split input data into multiple blocks + + For inputs larger than a single block, consider using regular ZSTD_compress() instead. 
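The hunk above deprecates the raw block API and points users at the regular compression API with a trimmed-down frame header. As a minimal sketch of that replacement (the helper name and error handling are illustrative only, not part of the patch; it relies on the static-linking-only ZSTD_c_format parameter declared earlier in this header):

    /* Sketch: compress with a ~2-byte frame header instead of raw blocks,
     * using exactly the four settings listed in the deprecation note above.
     * Return-value checks on ZSTD_CCtx_setParameter() are omitted for brevity. */
    static size_t compress_minimal_header(ZSTD_CCtx *cctx,
                                          void *dst, size_t dstCapacity,
                                          const void *src, size_t srcSize,
                                          int level)
    {
        ZSTD_CCtx_reset(cctx, ZSTD_reset_session_and_parameters);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_format, ZSTD_f_zstd1_magicless); /* no magic number */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, 0);             /* no content size field */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, 0);                /* no checksum */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, 0);                  /* no dictID */
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_compressionLevel, level);
        return ZSTD_compress2(cctx, dst, dstCapacity, src, srcSize);
    }

The matching decoder then has to be told about the missing magic number as well, e.g. ZSTD_DCtx_setParameter(dctx, ZSTD_d_format, ZSTD_f_zstd1_magicless) before decompressing.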
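Similarly, the Block-Level Sequence Producer API documented earlier in this header can be exercised with a trivial producer that covers each block with a single literals-only sequence, which the parse rules above explicitly allow (litLength sums to srcSize, and the final sequence has matchLength == 0 and offset == 0). This is a hedged sketch only; the function names are illustrative, and a real producer would of course emit actual matches:

    /* Sketch: degenerate producer emitting one literals-only sequence per block. */
    static size_t literalsOnlyProducer(void *state,
                                       ZSTD_Sequence *outSeqs, size_t outSeqsCapacity,
                                       const void *src, size_t srcSize,
                                       const void *dict, size_t dictSize,
                                       int compressionLevel, size_t windowSize)
    {
        (void)state; (void)src; (void)dict; (void)dictSize;
        (void)compressionLevel; (void)windowSize;
        if (outSeqsCapacity < 1)
            return ZSTD_SEQUENCE_PRODUCER_ERROR;   /* treated as an error by zstd */
        outSeqs[0].offset = 0;                     /* final sequence: offset must be 0 */
        outSeqs[0].litLength = (unsigned)srcSize;  /* all bytes emitted as literals */
        outSeqs[0].matchLength = 0;
        outSeqs[0].rep = 0;                        /* currently ignored by zstd */
        return 1;                                  /* number of sequences written */
    }

    static void register_producer(ZSTD_CCtx *cctx)
    {
        /* Stateless producer, so no sequenceProducerState is needed. Fallback is
         * enabled so the internal parser takes over if the producer errors out. */
        ZSTD_registerSequenceProducer(cctx, NULL, literalsOnlyProducer);
        ZSTD_CCtx_setParameter(cctx, ZSTD_c_enableSeqProducerFallback, 1);
    }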
+@@ -2541,11 +2975,14 @@ ZSTDLIB_STATIC_API ZSTD_nextInputType_e ZSTD_nextInputType(ZSTD_DCtx* dctx); + */ + + /*===== Raw zstd block functions =====*/ ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_getBlockSize (const ZSTD_CCtx* cctx); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_compressBlock (ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ZSTD_DEPRECATED("The block API is deprecated in favor of the normal compression API. See docs.") + ZSTDLIB_STATIC_API size_t ZSTD_insertBlock (ZSTD_DCtx* dctx, const void* blockStart, size_t blockSize); /*< insert uncompressed block into `dctx` history. Useful for multi-blocks decompression. */ + +- + #endif /* ZSTD_H_ZSTD_STATIC_LINKING_ONLY */ + +diff --git a/lib/zstd/Makefile b/lib/zstd/Makefile +index 20f08c644b71..464c410b2768 100644 +--- a/lib/zstd/Makefile ++++ b/lib/zstd/Makefile +@@ -1,6 +1,6 @@ + # SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + # ################################################################ +-# Copyright (c) Facebook, Inc. ++# Copyright (c) Meta Platforms, Inc. and affiliates. + # All rights reserved. + # + # This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/allocations.h b/lib/zstd/common/allocations.h +new file mode 100644 +index 000000000000..05adbbeccaa9 +--- /dev/null ++++ b/lib/zstd/common/allocations.h +@@ -0,0 +1,56 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ ++/* ++ * Copyright (c) Meta Platforms, Inc. and affiliates. ++ * All rights reserved. ++ * ++ * This source code is licensed under both the BSD-style license (found in the ++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found ++ * in the COPYING file in the root directory of this source tree). ++ * You may select, at your option, one of the above-listed licenses. 
++ */
++
++/* This file provides custom allocation primitives
++ */
++
++#define ZSTD_DEPS_NEED_MALLOC
++#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
++
++#include "mem.h" /* MEM_STATIC */
++#define ZSTD_STATIC_LINKING_ONLY
++#include <linux/zstd.h> /* ZSTD_customMem */
++
++#ifndef ZSTD_ALLOCATIONS_H
++#define ZSTD_ALLOCATIONS_H
++
++/* custom memory allocation functions */
++
++MEM_STATIC void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc)
++ return customMem.customAlloc(customMem.opaque, size);
++ return ZSTD_malloc(size);
++}
++
++MEM_STATIC void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
++{
++ if (customMem.customAlloc) {
++ /* calloc implemented as malloc+memset;
++ * not as efficient as calloc, but next best guess for custom malloc */
++ void* const ptr = customMem.customAlloc(customMem.opaque, size);
++ ZSTD_memset(ptr, 0, size);
++ return ptr;
++ }
++ return ZSTD_calloc(1, size);
++}
++
++MEM_STATIC void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
++{
++ if (ptr!=NULL) {
++ if (customMem.customFree)
++ customMem.customFree(customMem.opaque, ptr);
++ else
++ ZSTD_free(ptr);
++ }
++}
++
++#endif /* ZSTD_ALLOCATIONS_H */
+diff --git a/lib/zstd/common/bits.h b/lib/zstd/common/bits.h
+new file mode 100644
+index 000000000000..aa3487ec4b6a
+--- /dev/null
++++ b/lib/zstd/common/bits.h
+@@ -0,0 +1,149 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
++/*
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
++ * All rights reserved.
++ *
++ * This source code is licensed under both the BSD-style license (found in the
++ * LICENSE file in the root directory of this source tree) and the GPLv2 (found
++ * in the COPYING file in the root directory of this source tree).
++ * You may select, at your option, one of the above-listed licenses.
++ */ ++ ++#ifndef ZSTD_BITS_H ++#define ZSTD_BITS_H ++ ++#include "mem.h" ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32_fallback(U32 val) ++{ ++ assert(val != 0); ++ { ++ static const U32 DeBruijnBytePos[32] = {0, 1, 28, 2, 29, 14, 24, 3, ++ 30, 22, 20, 15, 25, 17, 4, 8, ++ 31, 27, 13, 23, 21, 19, 16, 7, ++ 26, 12, 18, 6, 11, 5, 10, 9}; ++ return DeBruijnBytePos[((U32) ((val & -(S32) val) * 0x077CB531U)) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_ctz(val); ++# else ++ return ZSTD_countTrailingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32_fallback(U32 val) { ++ assert(val != 0); ++ { ++ static const U32 DeBruijnClz[32] = {0, 9, 1, 10, 13, 21, 2, 29, ++ 11, 14, 16, 18, 22, 25, 3, 30, ++ 8, 12, 20, 28, 15, 17, 24, 7, ++ 19, 27, 23, 6, 26, 5, 4, 31}; ++ val |= val >> 1; ++ val |= val >> 2; ++ val |= val >> 4; ++ val |= val >> 8; ++ val |= val >> 16; ++ return 31 - DeBruijnClz[(val * 0x07C4ACDDU) >> 27]; ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros32(U32 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)__builtin_clz(val); ++# else ++ return ZSTD_countLeadingZeros32_fallback(val); ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countTrailingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) && defined(__LP64__) ++ return (unsigned)__builtin_ctzll(val); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (leastSignificantWord == 0) { ++ return 32 + ZSTD_countTrailingZeros32(mostSignificantWord); ++ } else { ++ return ZSTD_countTrailingZeros32(leastSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_countLeadingZeros64(U64 val) ++{ ++ assert(val != 0); ++# if (__GNUC__ >= 4) ++ return (unsigned)(__builtin_clzll(val)); ++# else ++ { ++ U32 mostSignificantWord = (U32)(val >> 32); ++ U32 leastSignificantWord = (U32)val; ++ if (mostSignificantWord == 0) { ++ return 32 + ZSTD_countLeadingZeros32(leastSignificantWord); ++ } else { ++ return ZSTD_countLeadingZeros32(mostSignificantWord); ++ } ++ } ++# endif ++} ++ ++MEM_STATIC unsigned ZSTD_NbCommonBytes(size_t val) ++{ ++ if (MEM_isLittleEndian()) { ++ if (MEM_64bits()) { ++ return ZSTD_countTrailingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countTrailingZeros32((U32)val) >> 3; ++ } ++ } else { /* Big Endian CPU */ ++ if (MEM_64bits()) { ++ return ZSTD_countLeadingZeros64((U64)val) >> 3; ++ } else { ++ return ZSTD_countLeadingZeros32((U32)val) >> 3; ++ } ++ } ++} ++ ++MEM_STATIC unsigned ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ ++{ ++ assert(val != 0); ++ return 31 - ZSTD_countLeadingZeros32(val); ++} ++ ++/* ZSTD_rotateRight_*(): ++ * Rotates a bitfield to the right by "count" bits. 
++ * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts ++ */ ++MEM_STATIC ++U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { ++ assert(count < 64); ++ count &= 0x3F; /* for fickle pattern recognition */ ++ return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); ++} ++ ++MEM_STATIC ++U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { ++ assert(count < 32); ++ count &= 0x1F; /* for fickle pattern recognition */ ++ return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); ++} ++ ++MEM_STATIC ++U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { ++ assert(count < 16); ++ count &= 0x0F; /* for fickle pattern recognition */ ++ return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++} ++ ++#endif /* ZSTD_BITS_H */ +diff --git a/lib/zstd/common/bitstream.h b/lib/zstd/common/bitstream.h +index feef3a1b1d60..444dc4f85c64 100644 +--- a/lib/zstd/common/bitstream.h ++++ b/lib/zstd/common/bitstream.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * bitstream + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -27,6 +28,7 @@ + #include "compiler.h" /* UNLIKELY() */ + #include "debug.h" /* assert(), DEBUGLOG(), RAWLOG() */ + #include "error_private.h" /* error codes and messages */ ++#include "bits.h" /* ZSTD_highbit32 */ + + + /*========================================= +@@ -122,33 +124,6 @@ MEM_STATIC void BIT_flushBitsFast(BIT_CStream_t* bitC); + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits); + /* faster, but works only if nbBits >= 1 */ + +- +- +-/*-************************************************************** +-* Internal functions +-****************************************************************/ +-MEM_STATIC unsigned BIT_highbit32 (U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* Use GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const unsigned DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, +- 11, 14, 16, 18, 22, 25, 3, 30, +- 8, 12, 20, 28, 15, 17, 24, 7, +- 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[ (U32) (v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- + /*===== Local Constants =====*/ + static const unsigned BIT_mask[] = { + 0, 1, 3, 7, 0xF, 0x1F, +@@ -178,6 +153,12 @@ MEM_STATIC size_t BIT_initCStream(BIT_CStream_t* bitC, + return 0; + } + ++MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) ++{ ++ assert(nbBits < BIT_MASK_SIZE); ++ return bitContainer & BIT_mask[nbBits]; ++} ++ + /*! BIT_addBits() : + * can add up to 31 bits into `bitC`. + * Note : does not check for register overflow ! 
*/ +@@ -187,7 +168,7 @@ MEM_STATIC void BIT_addBits(BIT_CStream_t* bitC, + DEBUG_STATIC_ASSERT(BIT_MASK_SIZE == 32); + assert(nbBits < BIT_MASK_SIZE); + assert(nbBits + bitC->bitPos < sizeof(bitC->bitContainer) * 8); +- bitC->bitContainer |= (value & BIT_mask[nbBits]) << bitC->bitPos; ++ bitC->bitContainer |= BIT_getLowerBits(value, nbBits) << bitC->bitPos; + bitC->bitPos += nbBits; + } + +@@ -266,7 +247,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + bitD->ptr = (const char*)srcBuffer + srcSize - sizeof(bitD->bitContainer); + bitD->bitContainer = MEM_readLEST(bitD->ptr); + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; /* ensures bitsConsumed is always set */ + if (lastByte == 0) return ERROR(GENERIC); /* endMark not present */ } + } else { + bitD->ptr = bitD->start; +@@ -294,7 +275,7 @@ MEM_STATIC size_t BIT_initDStream(BIT_DStream_t* bitD, const void* srcBuffer, si + default: break; + } + { BYTE const lastByte = ((const BYTE*)srcBuffer)[srcSize-1]; +- bitD->bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ bitD->bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + if (lastByte == 0) return ERROR(corruption_detected); /* endMark not present */ + } + bitD->bitsConsumed += (U32)(sizeof(bitD->bitContainer) - srcSize)*8; +@@ -325,12 +306,6 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getMiddleBits(size_t bitContainer, U32 c + #endif + } + +-MEM_STATIC FORCE_INLINE_ATTR size_t BIT_getLowerBits(size_t bitContainer, U32 const nbBits) +-{ +- assert(nbBits < BIT_MASK_SIZE); +- return bitContainer & BIT_mask[nbBits]; +-} +- + /*! BIT_lookBits() : + * Provides next n bits from local register. + * local register is not modified. +@@ -377,7 +352,7 @@ MEM_STATIC FORCE_INLINE_ATTR size_t BIT_readBits(BIT_DStream_t* bitD, unsigned n + } + + /*! BIT_readBitsFast() : +- * unsafe version; only works only if nbBits >= 1 */ ++ * unsafe version; only works if nbBits >= 1 */ + MEM_STATIC size_t BIT_readBitsFast(BIT_DStream_t* bitD, unsigned nbBits) + { + size_t const value = BIT_lookBitsFast(bitD, nbBits); +@@ -408,7 +383,7 @@ MEM_STATIC BIT_DStream_status BIT_reloadDStreamFast(BIT_DStream_t* bitD) + * This function is safe, it guarantees it will not read beyond src buffer. + * @return : status of `BIT_DStream_t` internal register. + * when status == BIT_DStream_unfinished, internal register is filled with at least 25 or 57 bits */ +-MEM_STATIC BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) ++MEM_STATIC FORCE_INLINE_ATTR BIT_DStream_status BIT_reloadDStream(BIT_DStream_t* bitD) + { + if (bitD->bitsConsumed > (sizeof(bitD->bitContainer)*8)) /* overflow detected, like end of stream */ + return BIT_DStream_overflow; +diff --git a/lib/zstd/common/compiler.h b/lib/zstd/common/compiler.h +index c42d39faf9bd..c437e0975575 100644 +--- a/lib/zstd/common/compiler.h ++++ b/lib/zstd/common/compiler.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -179,6 +180,17 @@ + * Sanitizer + *****************************************************************/ + ++/* Issue #3240 reports an ASAN failure on an llvm-mingw build. 
Out of an ++ * abundance of caution, disable our custom poisoning on mingw. */ ++#ifdef __MINGW32__ ++#ifndef ZSTD_ASAN_DONT_POISON_WORKSPACE ++#define ZSTD_ASAN_DONT_POISON_WORKSPACE 1 ++#endif ++#ifndef ZSTD_MSAN_DONT_POISON_WORKSPACE ++#define ZSTD_MSAN_DONT_POISON_WORKSPACE 1 ++#endif ++#endif ++ + + + #endif /* ZSTD_COMPILER_H */ +diff --git a/lib/zstd/common/cpu.h b/lib/zstd/common/cpu.h +index 0db7b42407ee..d8319a2bef4c 100644 +--- a/lib/zstd/common/cpu.h ++++ b/lib/zstd/common/cpu.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/debug.c b/lib/zstd/common/debug.c +index bb863c9ea616..d77926cbad14 100644 +--- a/lib/zstd/common/debug.c ++++ b/lib/zstd/common/debug.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -21,4 +22,6 @@ + + #include "debug.h" + ++#if (DEBUGLEVEL>=2) + int g_debuglevel = DEBUGLEVEL; ++#endif +diff --git a/lib/zstd/common/debug.h b/lib/zstd/common/debug.h +index 6dd88d1fbd02..da0dbfc614b8 100644 +--- a/lib/zstd/common/debug.h ++++ b/lib/zstd/common/debug.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * debug + * Part of FSE library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +diff --git a/lib/zstd/common/entropy_common.c b/lib/zstd/common/entropy_common.c +index fef67056f052..6cdd82233fb5 100644 +--- a/lib/zstd/common/entropy_common.c ++++ b/lib/zstd/common/entropy_common.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * Common functions of New Generation Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,8 +20,8 @@ + #include "error_private.h" /* ERR_*, ERROR */ + #define FSE_STATIC_LINKING_ONLY /* FSE_MIN_TABLELOG */ + #include "fse.h" +-#define HUF_STATIC_LINKING_ONLY /* HUF_TABLELOG_ABSOLUTEMAX */ + #include "huf.h" ++#include "bits.h" /* ZSDT_highbit32, ZSTD_countTrailingZeros32 */ + + + /*=== Version ===*/ +@@ -38,23 +39,6 @@ const char* HUF_getErrorName(size_t code) { return ERR_getErrorName(code); } + /*-************************************************************** + * FSE NCount encoding-decoding + ****************************************************************/ +-static U32 FSE_ctz(U32 val) +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_ctz(val); +-# else /* Software version */ +- U32 count = 0; +- while ((val & 1) == 0) { +- val >>= 1; +- ++count; +- } +- return count; +-# endif +- } +-} +- + FORCE_INLINE_TEMPLATE + size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigned* tableLogPtr, + const void* headerBuffer, size_t hbSize) +@@ -102,7 +86,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * repeat. + * Avoid UB by setting the high bit to 1. + */ +- int repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ int repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + while (repeats >= 12) { + charnum += 3 * 12; + if (LIKELY(ip <= iend-7)) { +@@ -113,7 +97,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + ip = iend - 4; + } + bitStream = MEM_readLE32(ip) >> bitCount; +- repeats = FSE_ctz(~bitStream | 0x80000000) >> 1; ++ repeats = ZSTD_countTrailingZeros32(~bitStream | 0x80000000) >> 1; + } + charnum += 3 * repeats; + bitStream >>= 2 * repeats; +@@ -178,7 +162,7 @@ size_t FSE_readNCount_body(short* normalizedCounter, unsigned* maxSVPtr, unsigne + * know that threshold > 1. 
+ */ + if (remaining <= 1) break; +- nbBits = BIT_highbit32(remaining) + 1; ++ nbBits = ZSTD_highbit32(remaining) + 1; + threshold = 1 << (nbBits - 1); + } + if (charnum >= maxSV1) break; +@@ -253,7 +237,7 @@ size_t HUF_readStats(BYTE* huffWeight, size_t hwSize, U32* rankStats, + const void* src, size_t srcSize) + { + U32 wksp[HUF_READ_STATS_WORKSPACE_SIZE_U32]; +- return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* bmi2 */ 0); ++ return HUF_readStats_wksp(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, wksp, sizeof(wksp), /* flags */ 0); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -301,14 +285,14 @@ HUF_readStats_body(BYTE* huffWeight, size_t hwSize, U32* rankStats, + if (weightTotal == 0) return ERROR(corruption_detected); + + /* get last non-null symbol weight (implied, total must be 2^n) */ +- { U32 const tableLog = BIT_highbit32(weightTotal) + 1; ++ { U32 const tableLog = ZSTD_highbit32(weightTotal) + 1; + if (tableLog > HUF_TABLELOG_MAX) return ERROR(corruption_detected); + *tableLogPtr = tableLog; + /* determine last weight */ + { U32 const total = 1 << tableLog; + U32 const rest = total - weightTotal; +- U32 const verif = 1 << BIT_highbit32(rest); +- U32 const lastWeight = BIT_highbit32(rest) + 1; ++ U32 const verif = 1 << ZSTD_highbit32(rest); ++ U32 const lastWeight = ZSTD_highbit32(rest) + 1; + if (verif != rest) return ERROR(corruption_detected); /* last value must be a clean power of 2 */ + huffWeight[oSize] = (BYTE)lastWeight; + rankStats[lastWeight]++; +@@ -345,13 +329,13 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, U32* rankStats, + U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workSpace, size_t wkspSize, +- int bmi2) ++ int flags) + { + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_readStats_body_bmi2(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } + #endif +- (void)bmi2; ++ (void)flags; + return HUF_readStats_body_default(huffWeight, hwSize, rankStats, nbSymbolsPtr, tableLogPtr, src, srcSize, workSpace, wkspSize); + } +diff --git a/lib/zstd/common/error_private.c b/lib/zstd/common/error_private.c +index 6d1135f8c373..a4062d30d170 100644 +--- a/lib/zstd/common/error_private.c ++++ b/lib/zstd/common/error_private.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -27,9 +28,11 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(version_unsupported): return "Version not supported"; + case PREFIX(frameParameter_unsupported): return "Unsupported frame parameter"; + case PREFIX(frameParameter_windowTooLarge): return "Frame requires too much memory for decoding"; +- case PREFIX(corruption_detected): return "Corrupted block detected"; ++ case PREFIX(corruption_detected): return "Data corruption detected"; + case PREFIX(checksum_wrong): return "Restored data doesn't match checksum"; ++ case PREFIX(literals_headerWrong): return "Header of Literals' block doesn't respect format specification"; + case PREFIX(parameter_unsupported): return "Unsupported parameter"; ++ case PREFIX(parameter_combination_unsupported): return "Unsupported combination of parameters"; + case PREFIX(parameter_outOfBound): return "Parameter is out of bound"; + case PREFIX(init_missing): return "Context should be init first"; + case PREFIX(memory_allocation): return "Allocation error : not enough memory"; +@@ -38,17 +41,22 @@ const char* ERR_getErrorString(ERR_enum code) + case PREFIX(tableLog_tooLarge): return "tableLog requires too much memory : unsupported"; + case PREFIX(maxSymbolValue_tooLarge): return "Unsupported max Symbol Value : too large"; + case PREFIX(maxSymbolValue_tooSmall): return "Specified maxSymbolValue is too small"; ++ case PREFIX(stabilityCondition_notRespected): return "pledged buffer stability condition is not respected"; + case PREFIX(dictionary_corrupted): return "Dictionary is corrupted"; + case PREFIX(dictionary_wrong): return "Dictionary mismatch"; + case PREFIX(dictionaryCreation_failed): return "Cannot create Dictionary from provided samples"; + case PREFIX(dstSize_tooSmall): return "Destination buffer is too small"; + case PREFIX(srcSize_wrong): return "Src size is incorrect"; + case PREFIX(dstBuffer_null): return "Operation on NULL destination buffer"; ++ case PREFIX(noForwardProgress_destFull): return "Operation made no progress over multiple calls, due to output buffer being full"; ++ case PREFIX(noForwardProgress_inputEmpty): return "Operation made no progress over multiple calls, due to input being empty"; + /* following error codes are not stable and may be removed or changed in a future version */ + case PREFIX(frameIndex_tooLarge): return "Frame index is too large"; + case PREFIX(seekableIO): return "An I/O error occurred when reading/seeking"; + case PREFIX(dstBuffer_wrong): return "Destination buffer is wrong"; + case PREFIX(srcBuffer_wrong): return "Source buffer is wrong"; ++ case PREFIX(sequenceProducer_failed): return "Block-level external sequence producer returned an error code"; ++ case PREFIX(externalSequences_invalid): return "External sequences are not valid"; + case PREFIX(maxCode): + default: return notErrorCode; + } +diff --git a/lib/zstd/common/error_private.h b/lib/zstd/common/error_private.h +index ca5101e542fa..9a4699a38a88 100644 +--- a/lib/zstd/common/error_private.h ++++ b/lib/zstd/common/error_private.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/fse.h b/lib/zstd/common/fse.h +index 4507043b2287..c4e25a219142 100644 +--- a/lib/zstd/common/fse.h ++++ b/lib/zstd/common/fse.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * FSE : Finite State Entropy codec + * Public Prototypes declaration +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -50,34 +51,6 @@ + FSE_PUBLIC_API unsigned FSE_versionNumber(void); /*< library version number; to be used when checking dll version */ + + +-/*-**************************************** +-* FSE simple functions +-******************************************/ +-/*! FSE_compress() : +- Compress content of buffer 'src', of size 'srcSize', into destination buffer 'dst'. +- 'dst' buffer must be already allocated. Compression runs faster is dstCapacity >= FSE_compressBound(srcSize). +- @return : size of compressed data (<= dstCapacity). +- Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression instead. +- if FSE_isError(return), compression failed (more details using FSE_getErrorName()) +-*/ +-FSE_PUBLIC_API size_t FSE_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/*! FSE_decompress(): +- Decompress FSE data from buffer 'cSrc', of size 'cSrcSize', +- into already allocated destination buffer 'dst', of size 'dstCapacity'. +- @return : size of regenerated data (<= maxDstSize), +- or an error code, which can be tested using FSE_isError() . +- +- ** Important ** : FSE_decompress() does not decompress non-compressible nor RLE data !!! +- Why ? : making this distinction requires a header. +- Header management is intentionally delegated to the user layer, which can better manage special cases. +-*/ +-FSE_PUBLIC_API size_t FSE_decompress(void* dst, size_t dstCapacity, +- const void* cSrc, size_t cSrcSize); +- +- + /*-***************************************** + * Tool functions + ******************************************/ +@@ -88,20 +61,6 @@ FSE_PUBLIC_API unsigned FSE_isError(size_t code); /* tells if a return + FSE_PUBLIC_API const char* FSE_getErrorName(size_t code); /* provides error code string (useful for debugging) */ + + +-/*-***************************************** +-* FSE advanced functions +-******************************************/ +-/*! FSE_compress2() : +- Same as FSE_compress(), but allows the selection of 'maxSymbolValue' and 'tableLog' +- Both parameters can be defined as '0' to mean : use default value +- @return : size of compressed data +- Special values : if return == 0, srcData is not compressible => Nothing is stored within cSrc !!! +- if return == 1, srcData is a single byte symbol * srcSize times. Use RLE compression. +- if FSE_isError(return), it's an error code. +-*/ +-FSE_PUBLIC_API size_t FSE_compress2 (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +- +- + /*-***************************************** + * FSE detailed API + ******************************************/ +@@ -161,8 +120,6 @@ FSE_PUBLIC_API size_t FSE_writeNCount (void* buffer, size_t bufferSize, + /*! 
Constructor and Destructor of FSE_CTable. + Note that FSE_CTable size depends on 'tableLog' and 'maxSymbolValue' */ + typedef unsigned FSE_CTable; /* don't allocate that. It's only meant to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeCTable (FSE_CTable* ct); + + /*! FSE_buildCTable(): + Builds `ct`, which must be already allocated, using FSE_createCTable(). +@@ -238,23 +195,7 @@ FSE_PUBLIC_API size_t FSE_readNCount_bmi2(short* normalizedCounter, + unsigned* maxSymbolValuePtr, unsigned* tableLogPtr, + const void* rBuffer, size_t rBuffSize, int bmi2); + +-/*! Constructor and Destructor of FSE_DTable. +- Note that its size depends on 'tableLog' */ + typedef unsigned FSE_DTable; /* don't allocate that. It's just a way to be more restrictive than void* */ +-FSE_PUBLIC_API FSE_DTable* FSE_createDTable(unsigned tableLog); +-FSE_PUBLIC_API void FSE_freeDTable(FSE_DTable* dt); +- +-/*! FSE_buildDTable(): +- Builds 'dt', which must be already allocated, using FSE_createDTable(). +- return : 0, or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_buildDTable (FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog); +- +-/*! FSE_decompress_usingDTable(): +- Decompress compressed source `cSrc` of size `cSrcSize` using `dt` +- into `dst` which must be already allocated. +- @return : size of regenerated data (necessarily <= `dstCapacity`), +- or an errorCode, which can be tested using FSE_isError() */ +-FSE_PUBLIC_API size_t FSE_decompress_usingDTable(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, const FSE_DTable* dt); + + /*! + Tutorial : +@@ -317,16 +258,6 @@ If there is an error, the function will return an error code, which can be teste + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus); + /*< same as FSE_optimalTableLog(), which used `minus==2` */ + +-/* FSE_compress_wksp() : +- * Same as FSE_compress2(), but using an externally allocated scratch buffer (`workSpace`). +- * FSE_COMPRESS_WKSP_SIZE_U32() provides the minimum size required for `workSpace` as a table of FSE_CTable. +- */ +-#define FSE_COMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) ( FSE_CTABLE_SIZE_U32(maxTableLog, maxSymbolValue) + ((maxTableLog > 12) ? 
(1 << (maxTableLog - 2)) : 1024) ) +-size_t FSE_compress_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); +- +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits); +-/*< build a fake FSE_CTable, designed for a flat distribution, where each symbol uses nbBits */ +- + size_t FSE_buildCTable_rle (FSE_CTable* ct, unsigned char symbolValue); + /*< build a fake FSE_CTable, designed to compress always the same symbolValue */ + +@@ -344,19 +275,11 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, const short* normalizedCounter, unsi + FSE_PUBLIC_API size_t FSE_buildDTable_wksp(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); + /*< Same as FSE_buildDTable(), using an externally allocated `workspace` produced with `FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxSymbolValue)` */ + +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits); +-/*< build a fake FSE_DTable, designed to read a flat distribution where each symbol uses nbBits */ +- +-size_t FSE_buildDTable_rle (FSE_DTable* dt, unsigned char symbolValue); +-/*< build a fake FSE_DTable, designed to always generate the same symbolValue */ +- +-#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) ++#define FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) (FSE_DTABLE_SIZE_U32(maxTableLog) + 1 + FSE_BUILD_DTABLE_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) + (FSE_MAX_SYMBOL_VALUE + 1) / 2 + 1) + #define FSE_DECOMPRESS_WKSP_SIZE(maxTableLog, maxSymbolValue) (FSE_DECOMPRESS_WKSP_SIZE_U32(maxTableLog, maxSymbolValue) * sizeof(unsigned)) +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize); +-/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)` */ +- + size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize, int bmi2); +-/*< Same as FSE_decompress_wksp() but with dynamic BMI2 support. Pass 1 if your CPU supports BMI2 or 0 if it doesn't. */ ++/*< same as FSE_decompress(), using an externally allocated `workSpace` produced with `FSE_DECOMPRESS_WKSP_SIZE_U32(maxLog, maxSymbolValue)`. ++ * Set bmi2 to 1 if your CPU supports BMI2 or 0 if it doesn't */ + + typedef enum { + FSE_repeat_none, /*< Cannot use the previous table */ +@@ -552,7 +475,7 @@ MEM_STATIC void FSE_flushCState(BIT_CStream_t* bitC, const FSE_CState_t* statePt + + /* FSE_getMaxNbBits() : + * Approximate maximum cost of a symbol, in bits. +- * Fractional get rounded up (i.e : a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) ++ * Fractional get rounded up (i.e. 
a symbol with a normalized frequency of 3 gives the same result as a frequency of 2) + * note 1 : assume symbolValue is valid (<= maxSymbolValue) + * note 2 : if freq[symbolValue]==0, @return a fake cost of tableLog+1 bits */ + MEM_STATIC U32 FSE_getMaxNbBits(const void* symbolTTPtr, U32 symbolValue) +diff --git a/lib/zstd/common/fse_decompress.c b/lib/zstd/common/fse_decompress.c +index 8dcb8ca39767..99ce8fa54d08 100644 +--- a/lib/zstd/common/fse_decompress.c ++++ b/lib/zstd/common/fse_decompress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy decoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -24,6 +25,7 @@ + #include "error_private.h" + #define ZSTD_DEPS_NEED_MALLOC + #include "zstd_deps.h" ++#include "bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -55,19 +57,6 @@ + #define FSE_FUNCTION_NAME(X,Y) FSE_CAT(X,Y) + #define FSE_TYPE_NAME(X,Y) FSE_CAT(X,Y) + +- +-/* Function templates */ +-FSE_DTable* FSE_createDTable (unsigned tableLog) +-{ +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- return (FSE_DTable*)ZSTD_malloc( FSE_DTABLE_SIZE_U32(tableLog) * sizeof (U32) ); +-} +- +-void FSE_freeDTable (FSE_DTable* dt) +-{ +- ZSTD_free(dt); +-} +- + static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCounter, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize) + { + void* const tdPtr = dt+1; /* because *dt is unsigned, 32-bits aligned on 32-bits */ +@@ -127,10 +116,10 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + } + } + /* Now we spread those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. 
+ */ + { + size_t position = 0; +@@ -166,7 +155,7 @@ static size_t FSE_buildDTable_internal(FSE_DTable* dt, const short* normalizedCo + for (u=0; utableLog = 0; +- DTableH->fastMode = 0; +- +- cell->newState = 0; +- cell->symbol = symbolValue; +- cell->nbBits = 0; +- +- return 0; +-} +- +- +-size_t FSE_buildDTable_raw (FSE_DTable* dt, unsigned nbBits) +-{ +- void* ptr = dt; +- FSE_DTableHeader* const DTableH = (FSE_DTableHeader*)ptr; +- void* dPtr = dt + 1; +- FSE_decode_t* const dinfo = (FSE_decode_t*)dPtr; +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSV1 = tableMask+1; +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* Build Decoding Table */ +- DTableH->tableLog = (U16)nbBits; +- DTableH->fastMode = 1; +- for (s=0; sfastMode; +- +- /* select fast mode (static) */ +- if (fastMode) return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 1); +- return FSE_decompress_usingDTable_generic(dst, originalSize, cSrc, cSrcSize, dt, 0); +-} +- +- +-size_t FSE_decompress_wksp(void* dst, size_t dstCapacity, const void* cSrc, size_t cSrcSize, unsigned maxLog, void* workSpace, size_t wkspSize) +-{ +- return FSE_decompress_wksp_bmi2(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize, /* bmi2 */ 0); +-} +- + typedef struct { + short ncount[FSE_MAX_SYMBOL_VALUE + 1]; + FSE_DTable dtable[]; /* Dynamically sized */ +@@ -342,7 +268,8 @@ FORCE_INLINE_TEMPLATE size_t FSE_decompress_wksp_body( + } + + if (FSE_DECOMPRESS_WKSP_SIZE(tableLog, maxSymbolValue) > wkspSize) return ERROR(tableLog_tooLarge); +- workSpace = wksp->dtable + FSE_DTABLE_SIZE_U32(tableLog); ++ assert(sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog) <= wkspSize); ++ workSpace = (BYTE*)workSpace + sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + wkspSize -= sizeof(*wksp) + FSE_DTABLE_SIZE(tableLog); + + CHECK_F( FSE_buildDTable_internal(wksp->dtable, wksp->ncount, maxSymbolValue, tableLog, workSpace, wkspSize) ); +@@ -382,9 +309,4 @@ size_t FSE_decompress_wksp_bmi2(void* dst, size_t dstCapacity, const void* cSrc, + return FSE_decompress_wksp_body_default(dst, dstCapacity, cSrc, cSrcSize, maxLog, workSpace, wkspSize); + } + +- +-typedef FSE_DTable DTable_max_t[FSE_DTABLE_SIZE_U32(FSE_MAX_TABLELOG)]; +- +- +- + #endif /* FSE_COMMONDEFS_ONLY */ +diff --git a/lib/zstd/common/huf.h b/lib/zstd/common/huf.h +index 5042ff870308..8e7943092ed1 100644 +--- a/lib/zstd/common/huf.h ++++ b/lib/zstd/common/huf.h +@@ -1,7 +1,8 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* ****************************************************************** + * huff0 huffman codec, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * + * You can contact the author at : + * - Source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -18,99 +19,22 @@ + + /* *** Dependencies *** */ + #include "zstd_deps.h" /* size_t */ +- +- +-/* *** library symbols visibility *** */ +-/* Note : when linking with -fvisibility=hidden on gcc, or by default on Visual, +- * HUF symbols remain "private" (internal symbols for library only). 
+- * Set macro FSE_DLL_EXPORT to 1 if you want HUF symbols visible on DLL interface */ +-#if defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) && defined(__GNUC__) && (__GNUC__ >= 4) +-# define HUF_PUBLIC_API __attribute__ ((visibility ("default"))) +-#elif defined(FSE_DLL_EXPORT) && (FSE_DLL_EXPORT==1) /* Visual expected */ +-# define HUF_PUBLIC_API __declspec(dllexport) +-#elif defined(FSE_DLL_IMPORT) && (FSE_DLL_IMPORT==1) +-# define HUF_PUBLIC_API __declspec(dllimport) /* not required, just to generate faster code (saves a function pointer load from IAT and an indirect jump) */ +-#else +-# define HUF_PUBLIC_API +-#endif +- +- +-/* ========================== */ +-/* *** simple functions *** */ +-/* ========================== */ +- +-/* HUF_compress() : +- * Compress content from buffer 'src', of size 'srcSize', into buffer 'dst'. +- * 'dst' buffer must be already allocated. +- * Compression runs faster if `dstCapacity` >= HUF_compressBound(srcSize). +- * `srcSize` must be <= `HUF_BLOCKSIZE_MAX` == 128 KB. +- * @return : size of compressed data (<= `dstCapacity`). +- * Special values : if return == 0, srcData is not compressible => Nothing is stored within dst !!! +- * if HUF_isError(return), compression failed (more details using HUF_getErrorName()) +- */ +-HUF_PUBLIC_API size_t HUF_compress(void* dst, size_t dstCapacity, +- const void* src, size_t srcSize); +- +-/* HUF_decompress() : +- * Decompress HUF data from buffer 'cSrc', of size 'cSrcSize', +- * into already allocated buffer 'dst', of minimum size 'dstSize'. +- * `originalSize` : **must** be the ***exact*** size of original (uncompressed) data. +- * Note : in contrast with FSE, HUF_decompress can regenerate +- * RLE (cSrcSize==1) and uncompressed (cSrcSize==dstSize) data, +- * because it knows size to regenerate (originalSize). +- * @return : size of regenerated data (== originalSize), +- * or an error code, which can be tested using HUF_isError() +- */ +-HUF_PUBLIC_API size_t HUF_decompress(void* dst, size_t originalSize, +- const void* cSrc, size_t cSrcSize); ++#include "mem.h" /* U32 */ ++#define FSE_STATIC_LINKING_ONLY ++#include "fse.h" + + + /* *** Tool functions *** */ +-#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ +-HUF_PUBLIC_API size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ ++#define HUF_BLOCKSIZE_MAX (128 * 1024) /*< maximum input size for a single block compressed with HUF_compress */ ++size_t HUF_compressBound(size_t size); /*< maximum compressed size (worst case) */ + + /* Error Management */ +-HUF_PUBLIC_API unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ +-HUF_PUBLIC_API const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ ++unsigned HUF_isError(size_t code); /*< tells if a return value is an error code */ ++const char* HUF_getErrorName(size_t code); /*< provides error code string (useful for debugging) */ + + +-/* *** Advanced function *** */ +- +-/* HUF_compress2() : +- * Same as HUF_compress(), but offers control over `maxSymbolValue` and `tableLog`. +- * `maxSymbolValue` must be <= HUF_SYMBOLVALUE_MAX . +- * `tableLog` must be `<= HUF_TABLELOG_MAX` . */ +-HUF_PUBLIC_API size_t HUF_compress2 (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog); +- +-/* HUF_compress4X_wksp() : +- * Same as HUF_compress2(), but uses externally allocated `workSpace`. 
+- * `workspace` must be at least as large as HUF_WORKSPACE_SIZE */ + #define HUF_WORKSPACE_SIZE ((8 << 10) + 512 /* sorting scratch space */) + #define HUF_WORKSPACE_SIZE_U64 (HUF_WORKSPACE_SIZE / sizeof(U64)) +-HUF_PUBLIC_API size_t HUF_compress4X_wksp (void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned tableLog, +- void* workSpace, size_t wkspSize); +- +-#endif /* HUF_H_298734234 */ +- +-/* ****************************************************************** +- * WARNING !! +- * The following section contains advanced and experimental definitions +- * which shall never be used in the context of a dynamic library, +- * because they are not guaranteed to remain stable in the future. +- * Only consider them in association with static linking. +- * *****************************************************************/ +-#if !defined(HUF_H_HUF_STATIC_LINKING_ONLY) +-#define HUF_H_HUF_STATIC_LINKING_ONLY +- +-/* *** Dependencies *** */ +-#include "mem.h" /* U32 */ +-#define FSE_STATIC_LINKING_ONLY +-#include "fse.h" +- + + /* *** Constants *** */ + #define HUF_TABLELOG_MAX 12 /* max runtime value of tableLog (due to static allocation); can be modified up to HUF_TABLELOG_ABSOLUTEMAX */ +@@ -151,25 +75,49 @@ typedef U32 HUF_DTable; + /* **************************************** + * Advanced decompression functions + ******************************************/ +-size_t HUF_decompress4X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-#endif + +-size_t HUF_decompress4X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< decodes RLE and uncompressed */ +-size_t HUF_decompress4X_hufOnly(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< considers RLE and uncompressed as errors */ +-size_t HUF_decompress4X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++/* ++ * Huffman flags bitset. ++ * For all flags, 0 is the default value. ++ */ ++typedef enum { ++ /* ++ * If compiled with DYNAMIC_BMI2: Set flag only if the CPU supports BMI2 at runtime. ++ * Otherwise: Ignored. ++ */ ++ HUF_flags_bmi2 = (1 << 0), ++ /* ++ * If set: Test possible table depths to find the one that produces the smallest header + encoded size. ++ * If unset: Use heuristic to find the table depth. ++ */ ++ HUF_flags_optimalDepth = (1 << 1), ++ /* ++ * If set: If the previous table can encode the input, always reuse the previous table. 
++ * If unset: If the previous table can encode the input, reuse the previous table if it results in a smaller output. ++ */ ++ HUF_flags_preferRepeat = (1 << 2), ++ /* ++ * If set: Sample the input and check if the sample is uncompressible, if it is then don't attempt to compress. ++ * If unset: Always histogram the entire input. ++ */ ++ HUF_flags_suspectUncompressible = (1 << 3), ++ /* ++ * If set: Don't use assembly implementations ++ * If unset: Allow using assembly implementations ++ */ ++ HUF_flags_disableAsm = (1 << 4), ++ /* ++ * If set: Don't use the fast decoding loop, always use the fallback decoding loop. ++ * If unset: Use the fast decoding loop when possible. ++ */ ++ HUF_flags_disableFast = (1 << 5) ++} HUF_flags_e; + + + /* **************************************** + * HUF detailed API + * ****************************************/ ++#define HUF_OPTIMAL_DEPTH_THRESHOLD ZSTD_btultra + + /*! HUF_compress() does the following: + * 1. count symbol occurrence from source[] into table count[] using FSE_count() (exposed within "fse.h") +@@ -182,12 +130,12 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + * For example, it's possible to compress several blocks using the same 'CTable', + * or to save and regenerate 'CTable' using external methods. + */ +-unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue); +-size_t HUF_buildCTable (HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue, unsigned maxNbBits); /* @return : maxNbBits; CTable and count can overlap. In which case, CTable will overwrite count content */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog); ++unsigned HUF_minTableLog(unsigned symbolCardinality); ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue); ++unsigned HUF_optimalTableLog(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, void* workSpace, ++ size_t wkspSize, HUF_CElt* table, const unsigned* count, int flags); /* table is used as scratch space for building and testing tables, not a return value */ + size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog, void* workspace, size_t workspaceSize); +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + size_t HUF_estimateCompressedSize(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + int HUF_validateCTable(const HUF_CElt* CTable, const unsigned* count, unsigned maxSymbolValue); + +@@ -196,6 +144,7 @@ typedef enum { + HUF_repeat_check, /*< Can use the previous table but it must be checked. Note : The previous table must have been constructed by HUF_compress{1, 4}X_repeat */ + HUF_repeat_valid /*< Can use the previous table and it is assumed to be valid */ + } HUF_repeat; ++ + /* HUF_compress4X_repeat() : + * Same as HUF_compress4X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. + * If it uses hufTable it does not modify hufTable or repeat. 
+@@ -206,13 +155,13 @@ size_t HUF_compress4X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and its size must be >= HUF_CTABLE_WORKSPACE_SIZE. + */ +-#define HUF_CTABLE_WORKSPACE_SIZE_U32 (2*HUF_SYMBOLVALUE_MAX +1 +1) ++#define HUF_CTABLE_WORKSPACE_SIZE_U32 ((4 * (HUF_SYMBOLVALUE_MAX + 1)) + 192) + #define HUF_CTABLE_WORKSPACE_SIZE (HUF_CTABLE_WORKSPACE_SIZE_U32 * sizeof(unsigned)) + size_t HUF_buildCTable_wksp (HUF_CElt* tree, + const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, +@@ -238,7 +187,7 @@ size_t HUF_readStats_wksp(BYTE* huffWeight, size_t hwSize, + U32* rankStats, U32* nbSymbolsPtr, U32* tableLogPtr, + const void* src, size_t srcSize, + void* workspace, size_t wkspSize, +- int bmi2); ++ int flags); + + /* HUF_readCTable() : + * Loading a CTable saved with HUF_writeCTable() */ +@@ -276,32 +225,12 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize); + #define HUF_DECOMPRESS_WORKSPACE_SIZE ((2 << 10) + (1 << 9)) + #define HUF_DECOMPRESS_WORKSPACE_SIZE_U32 (HUF_DECOMPRESS_WORKSPACE_SIZE / sizeof(U32)) + +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX1_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2 (HUF_DTable* DTable, const void* src, size_t srcSize); +-size_t HUF_readDTableX2_wksp (HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize); +-#endif +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress4X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress4X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif +- + + /* ====================== */ + /* single stream variants */ + /* ====================== */ + +-size_t HUF_compress1X (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog); +-size_t HUF_compress1X_wksp (void* dst, size_t dstSize, const void* src, size_t srcSize, unsigned maxSymbolValue, unsigned tableLog, void* workSpace, size_t wkspSize); /*< `workSpace` must be a table of at least HUF_WORKSPACE_SIZE_U64 U64 */ +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable); +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2); ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags); + /* HUF_compress1X_repeat() : + * Same as HUF_compress1X_wksp(), but considers using hufTable if *repeat != HUF_repeat_none. 
+ * If it uses hufTable it does not modify hufTable or repeat. +@@ -312,47 +241,28 @@ size_t HUF_compress1X_repeat(void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned tableLog, + void* workSpace, size_t wkspSize, /*< `workSpace` must be aligned on 4-bytes boundaries, `wkspSize` must be >= HUF_WORKSPACE_SIZE */ +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible); +- +-size_t HUF_decompress1X1 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* single-symbol decoder */ +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2 (void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /* double-symbol decoder */ +-#endif +- +-size_t HUF_decompress1X_DCtx (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); +-size_t HUF_decompress1X_DCtx_wksp (HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< single-symbol decoder */ +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< single-symbol decoder */ +-#endif +-#ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_DCtx(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize); /*< double-symbols decoder */ +-size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize); /*< double-symbols decoder */ +-#endif ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags); + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); /*< automatic selection of sing or double symbol decoder, based on DTable */ +-#ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); +-#endif ++size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_decompress1X2_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable); ++size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); /*< double-symbols decoder */ + #endif + + /* BMI2 variants. + * If the CPU has BMI2 support, pass bmi2=1, otherwise pass bmi2=0. 
+ */ +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #endif +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2); +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags); ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags); + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + #ifndef HUF_FORCE_DECOMPRESS_X1 +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2); ++size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags); + #endif + +-#endif /* HUF_STATIC_LINKING_ONLY */ ++#endif /* HUF_H_298734234 */ + +diff --git a/lib/zstd/common/mem.h b/lib/zstd/common/mem.h +index 1d9cc03924ca..a7231822b6e3 100644 +--- a/lib/zstd/common/mem.h ++++ b/lib/zstd/common/mem.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/common/portability_macros.h b/lib/zstd/common/portability_macros.h +index 0e3b2c0a527d..7ede8cf1ffe5 100644 +--- a/lib/zstd/common/portability_macros.h ++++ b/lib/zstd/common/portability_macros.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -12,7 +13,7 @@ + #define ZSTD_PORTABILITY_MACROS_H + + /* +- * This header file contains macro defintions to support portability. ++ * This header file contains macro definitions to support portability. + * This header is shared between C and ASM code, so it MUST only + * contain macro definitions. It MUST not contain any C code. + * +@@ -65,7 +66,7 @@ + #endif + + /* +- * Only enable assembly for GNUC comptabile compilers, ++ * Only enable assembly for GNUC compatible compilers, + * because other platforms may not support GAS assembly syntax. 
+ *
+ * Only enable assembly for Linux / MacOS, other platforms may
+@@ -90,4 +91,23 @@
+ */
+ #define ZSTD_ENABLE_ASM_X86_64_BMI2 0
+
++/*
++ * For x86 ELF targets, add .note.gnu.property section for Intel CET in
++ * assembly sources when CET is enabled.
++ *
++ * Additionally, any function that may be called indirectly must begin
++ * with ZSTD_CET_ENDBRANCH.
++ */
++#if defined(__ELF__) && (defined(__x86_64__) || defined(__i386__)) \
++ && defined(__has_include)
++# if __has_include(<cet.h>)
++# include <cet.h>
++# define ZSTD_CET_ENDBRANCH _CET_ENDBR
++# endif
++#endif
++
++#ifndef ZSTD_CET_ENDBRANCH
++# define ZSTD_CET_ENDBRANCH
++#endif
++
+ #endif /* ZSTD_PORTABILITY_MACROS_H */
+diff --git a/lib/zstd/common/zstd_common.c b/lib/zstd/common/zstd_common.c
+index 3d7e35b309b5..44b95b25344a 100644
+--- a/lib/zstd/common/zstd_common.c
++++ b/lib/zstd/common/zstd_common.c
+@@ -1,5 +1,6 @@
++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -14,7 +15,6 @@
+ * Dependencies
+ ***************************************/
+ #define ZSTD_DEPS_NEED_MALLOC
+-#include "zstd_deps.h" /* ZSTD_malloc, ZSTD_calloc, ZSTD_free, ZSTD_memset */
+ #include "error_private.h"
+ #include "zstd_internal.h"
+
+@@ -47,37 +47,3 @@ ZSTD_ErrorCode ZSTD_getErrorCode(size_t code) { return ERR_getErrorCode(code); }
+ /*! ZSTD_getErrorString() :
+ * provides error code string from enum */
+ const char* ZSTD_getErrorString(ZSTD_ErrorCode code) { return ERR_getErrorString(code); }
+-
+-
+-
+-/*=**************************************************************
+-* Custom allocator
+-****************************************************************/
+-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem)
+-{
+- if (customMem.customAlloc)
+- return customMem.customAlloc(customMem.opaque, size);
+- return ZSTD_malloc(size);
+-}
+-
+-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem)
+-{
+- if (customMem.customAlloc) {
+- /* calloc implemented as malloc+memset;
+- * not as efficient as calloc, but next best guess for custom malloc */
+- void* const ptr = customMem.customAlloc(customMem.opaque, size);
+- ZSTD_memset(ptr, 0, size);
+- return ptr;
+- }
+- return ZSTD_calloc(1, size);
+-}
+-
+-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem)
+-{
+- if (ptr!=NULL) {
+- if (customMem.customFree)
+- customMem.customFree(customMem.opaque, ptr);
+- else
+- ZSTD_free(ptr);
+- }
+-}
+diff --git a/lib/zstd/common/zstd_deps.h b/lib/zstd/common/zstd_deps.h
+index 2c34e8a33a1c..f931f7d0e294 100644
+--- a/lib/zstd/common/zstd_deps.h
++++ b/lib/zstd/common/zstd_deps.h
+@@ -1,6 +1,6 @@
+ /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -105,3 +105,17 @@ static uint64_t ZSTD_div64(uint64_t dividend, uint32_t divisor) {
+
+ #endif /* ZSTD_DEPS_IO */
+ #endif /* ZSTD_DEPS_NEED_IO */
++
++/*
++ * Only requested when MSAN is enabled.
++ * Need:
++ * intptr_t
++ */
++#ifdef ZSTD_DEPS_NEED_STDINT
++#ifndef ZSTD_DEPS_STDINT
++#define ZSTD_DEPS_STDINT
++
++/* intptr_t already provided by ZSTD_DEPS_COMMON */
++
++#endif /* ZSTD_DEPS_STDINT */
++#endif /* ZSTD_DEPS_NEED_STDINT */
+diff --git a/lib/zstd/common/zstd_internal.h b/lib/zstd/common/zstd_internal.h
+index 93305d9b41bb..7f023e4d4774 100644
+--- a/lib/zstd/common/zstd_internal.h
++++ b/lib/zstd/common/zstd_internal.h
+@@ -1,5 +1,6 @@
++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */
+ /*
+- * Copyright (c) Yann Collet, Facebook, Inc.
++ * Copyright (c) Meta Platforms, Inc. and affiliates.
+ * All rights reserved.
+ *
+ * This source code is licensed under both the BSD-style license (found in the
+@@ -28,7 +29,6 @@
+ #include
+ #define FSE_STATIC_LINKING_ONLY
+ #include "fse.h"
+-#define HUF_STATIC_LINKING_ONLY
+ #include "huf.h"
+ #include <linux/xxhash.h> /* XXH_reset, update, digest */
+ #define ZSTD_TRACE 0
+@@ -83,9 +83,9 @@ typedef enum { bt_raw, bt_rle, bt_compressed, bt_reserved } blockType_e;
+ #define ZSTD_FRAMECHECKSUMSIZE 4
+
+ #define MIN_SEQUENCES_SIZE 1 /* nbSeq==0 */
+-#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */ + MIN_SEQUENCES_SIZE /* nbSeq==0 */) /* for a non-null block */
++#define MIN_CBLOCK_SIZE (1 /*litCSize*/ + 1 /* RLE or RAW */) /* for a non-null block */
++#define MIN_LITERALS_FOR_4_STREAMS 6
+
+-#define HufLog 12
+ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingType_e;
+
+ #define LONGNBSEQ 0x7F00
+@@ -93,6 +93,7 @@ typedef enum { set_basic, set_rle, set_compressed, set_repeat } symbolEncodingTy
+ #define MINMATCH 3
+
+ #define Litbits 8
++#define LitHufLog 11
+ #define MaxLit ((1<= length) return;
+ op += 16;
+@@ -240,7 +237,6 @@ void ZSTD_wildcopy(void* dst, const void* src, ptrdiff_t length, ZSTD_overlap_e
+ COPY16(op, ip);
+ }
+ while (op < oend);
+-#endif
+ }
+ }
+
+@@ -289,11 +285,11 @@ typedef enum {
+ typedef struct {
+ seqDef* sequencesStart;
+ seqDef* sequences; /* ptr to end of sequences */
+- BYTE* litStart;
+- BYTE* lit; /* ptr to end of literals */
+- BYTE* llCode;
+- BYTE* mlCode;
+- BYTE* ofCode;
++ BYTE* litStart;
++ BYTE* lit; /* ptr to end of literals */
++ BYTE* llCode;
++ BYTE* mlCode;
++ BYTE* ofCode;
+ size_t maxNbSeq;
+ size_t maxNbLit;
+
+@@ -301,8 +297,8 @@ typedef struct {
+ * in the seqStore that has a value larger than U16 (if it exists). To do so, we increment
+ * the existing value of the litLength or matchLength by 0x10000.
+ */ +- ZSTD_longLengthType_e longLengthType; +- U32 longLengthPos; /* Index of the sequence to apply long length modification to */ ++ ZSTD_longLengthType_e longLengthType; ++ U32 longLengthPos; /* Index of the sequence to apply long length modification to */ + } seqStore_t; + + typedef struct { +@@ -321,10 +317,10 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + seqLen.matchLength = seq->mlBase + MINMATCH; + if (seqStore->longLengthPos == (U32)(seq - seqStore->sequencesStart)) { + if (seqStore->longLengthType == ZSTD_llt_literalLength) { +- seqLen.litLength += 0xFFFF; ++ seqLen.litLength += 0x10000; + } + if (seqStore->longLengthType == ZSTD_llt_matchLength) { +- seqLen.matchLength += 0xFFFF; ++ seqLen.matchLength += 0x10000; + } + } + return seqLen; +@@ -337,72 +333,13 @@ MEM_STATIC ZSTD_sequenceLength ZSTD_getSequenceLength(seqStore_t const* seqStore + * `decompressedBound != ZSTD_CONTENTSIZE_ERROR` + */ + typedef struct { ++ size_t nbBlocks; + size_t compressedSize; + unsigned long long decompressedBound; + } ZSTD_frameSizeInfo; /* decompress & legacy */ + + const seqStore_t* ZSTD_getSeqStore(const ZSTD_CCtx* ctx); /* compress & dictBuilder */ +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ +- +-/* custom memory allocation functions */ +-void* ZSTD_customMalloc(size_t size, ZSTD_customMem customMem); +-void* ZSTD_customCalloc(size_t size, ZSTD_customMem customMem); +-void ZSTD_customFree(void* ptr, ZSTD_customMem customMem); +- +- +-MEM_STATIC U32 ZSTD_highbit32(U32 val) /* compress, dictBuilder, decodeCorpus */ +-{ +- assert(val != 0); +- { +-# if (__GNUC__ >= 3) /* GCC Intrinsic */ +- return __builtin_clz (val) ^ 31; +-# else /* Software version */ +- static const U32 DeBruijnClz[32] = { 0, 9, 1, 10, 13, 21, 2, 29, 11, 14, 16, 18, 22, 25, 3, 30, 8, 12, 20, 28, 15, 17, 24, 7, 19, 27, 23, 6, 26, 5, 4, 31 }; +- U32 v = val; +- v |= v >> 1; +- v |= v >> 2; +- v |= v >> 4; +- v |= v >> 8; +- v |= v >> 16; +- return DeBruijnClz[(v * 0x07C4ACDDU) >> 27]; +-# endif +- } +-} +- +-/* +- * Counts the number of trailing zeros of a `size_t`. +- * Most compilers should support CTZ as a builtin. A backup +- * implementation is provided if the builtin isn't supported, but +- * it may not be terribly efficient. 
+- */ +-MEM_STATIC unsigned ZSTD_countTrailingZeros(size_t val) +-{ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return __builtin_ctzll((U64)val); +-# else +- static const int DeBruijnBytePos[64] = { 0, 1, 2, 7, 3, 13, 8, 19, +- 4, 25, 14, 28, 9, 34, 20, 56, +- 5, 17, 26, 54, 15, 41, 29, 43, +- 10, 31, 38, 35, 21, 45, 49, 57, +- 63, 6, 12, 18, 24, 27, 33, 55, +- 16, 53, 40, 42, 30, 37, 44, 48, +- 62, 11, 23, 32, 52, 39, 36, 47, +- 61, 22, 51, 46, 60, 50, 59, 58 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return __builtin_ctz((U32)val); +-# else +- static const int DeBruijnBytePos[32] = { 0, 1, 28, 2, 29, 14, 24, 3, +- 30, 22, 20, 15, 25, 17, 4, 8, +- 31, 27, 13, 23, 21, 19, 16, 7, +- 26, 12, 18, 6, 11, 5, 10, 9 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +-} ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr); /* compress, dictBuilder, decodeCorpus (shouldn't get its definition from here) */ + + + /* ZSTD_invalidateRepCodes() : +diff --git a/lib/zstd/compress/clevels.h b/lib/zstd/compress/clevels.h +index d9a76112ec3a..6ab8be6532ef 100644 +--- a/lib/zstd/compress/clevels.h ++++ b/lib/zstd/compress/clevels.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/fse_compress.c b/lib/zstd/compress/fse_compress.c +index ec5b1ca6d71a..e46ca6621b48 100644 +--- a/lib/zstd/compress/fse_compress.c ++++ b/lib/zstd/compress/fse_compress.c +@@ -1,6 +1,7 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * FSE : Finite State Entropy encoder +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
+ * + * You can contact the author at : + * - FSE source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -26,6 +27,7 @@ + #define ZSTD_DEPS_NEED_MALLOC + #define ZSTD_DEPS_NEED_MATH64 + #include "../common/zstd_deps.h" /* ZSTD_malloc, ZSTD_free, ZSTD_memcpy, ZSTD_memset */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + /* ************************************************************** +@@ -90,7 +92,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + assert(tableLog < 16); /* required for threshold strategy to work */ + + /* For explanations on how to distribute symbol values over the table : +- * http://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ ++ * https://fastcompression.blogspot.fr/2014/02/fse-distributing-symbol-values.html */ + + #ifdef __clang_analyzer__ + ZSTD_memset(tableSymbol, 0, sizeof(*tableSymbol) * tableSize); /* useless initialization, just to keep scan-build happy */ +@@ -191,7 +193,7 @@ size_t FSE_buildCTable_wksp(FSE_CTable* ct, + break; + default : + assert(normalizedCounter[s] > 1); +- { U32 const maxBitsOut = tableLog - BIT_highbit32 ((U32)normalizedCounter[s]-1); ++ { U32 const maxBitsOut = tableLog - ZSTD_highbit32 ((U32)normalizedCounter[s]-1); + U32 const minStatePlus = (U32)normalizedCounter[s] << maxBitsOut; + symbolTT[s].deltaNbBits = (maxBitsOut << 16) - minStatePlus; + symbolTT[s].deltaFindState = (int)(total - (unsigned)normalizedCounter[s]); +@@ -342,21 +344,11 @@ size_t FSE_writeNCount (void* buffer, size_t bufferSize, + * FSE Compression Code + ****************************************************************/ + +-FSE_CTable* FSE_createCTable (unsigned maxSymbolValue, unsigned tableLog) +-{ +- size_t size; +- if (tableLog > FSE_TABLELOG_ABSOLUTE_MAX) tableLog = FSE_TABLELOG_ABSOLUTE_MAX; +- size = FSE_CTABLE_SIZE_U32 (tableLog, maxSymbolValue) * sizeof(U32); +- return (FSE_CTable*)ZSTD_malloc(size); +-} +- +-void FSE_freeCTable (FSE_CTable* ct) { ZSTD_free(ct); } +- + /* provides the minimum logSize to safely represent a distribution */ + static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + { +- U32 minBitsSrc = BIT_highbit32((U32)(srcSize)) + 1; +- U32 minBitsSymbols = BIT_highbit32(maxSymbolValue) + 2; ++ U32 minBitsSrc = ZSTD_highbit32((U32)(srcSize)) + 1; ++ U32 minBitsSymbols = ZSTD_highbit32(maxSymbolValue) + 2; + U32 minBits = minBitsSrc < minBitsSymbols ? 
minBitsSrc : minBitsSymbols; + assert(srcSize > 1); /* Not supported, RLE should be used instead */ + return minBits; +@@ -364,7 +356,7 @@ static unsigned FSE_minTableLog(size_t srcSize, unsigned maxSymbolValue) + + unsigned FSE_optimalTableLog_internal(unsigned maxTableLog, size_t srcSize, unsigned maxSymbolValue, unsigned minus) + { +- U32 maxBitsSrc = BIT_highbit32((U32)(srcSize - 1)) - minus; ++ U32 maxBitsSrc = ZSTD_highbit32((U32)(srcSize - 1)) - minus; + U32 tableLog = maxTableLog; + U32 minBits = FSE_minTableLog(srcSize, maxSymbolValue); + assert(srcSize > 1); /* Not supported, RLE should be used instead */ +@@ -532,40 +524,6 @@ size_t FSE_normalizeCount (short* normalizedCounter, unsigned tableLog, + return tableLog; + } + +- +-/* fake FSE_CTable, for raw (uncompressed) input */ +-size_t FSE_buildCTable_raw (FSE_CTable* ct, unsigned nbBits) +-{ +- const unsigned tableSize = 1 << nbBits; +- const unsigned tableMask = tableSize - 1; +- const unsigned maxSymbolValue = tableMask; +- void* const ptr = ct; +- U16* const tableU16 = ( (U16*) ptr) + 2; +- void* const FSCT = ((U32*)ptr) + 1 /* header */ + (tableSize>>1); /* assumption : tableLog >= 1 */ +- FSE_symbolCompressionTransform* const symbolTT = (FSE_symbolCompressionTransform*) (FSCT); +- unsigned s; +- +- /* Sanity checks */ +- if (nbBits < 1) return ERROR(GENERIC); /* min size */ +- +- /* header */ +- tableU16[-2] = (U16) nbBits; +- tableU16[-1] = (U16) maxSymbolValue; +- +- /* Build table */ +- for (s=0; s= 2 ++ ++static size_t showU32(const U32* arr, size_t size) ++{ ++ size_t u; ++ for (u=0; u= sizeof(HUF_WriteCTableWksp)); ++ + /* check conditions */ + if (workspaceSize < sizeof(HUF_WriteCTableWksp)) return ERROR(GENERIC); + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) return ERROR(maxSymbolValue_tooLarge); +@@ -204,16 +264,6 @@ size_t HUF_writeCTable_wksp(void* dst, size_t maxDstSize, + return ((maxSymbolValue+1)/2) + 1; + } + +-/*! HUF_writeCTable() : +- `CTable` : Huffman tree to save, using huf representation. +- @return : size of saved CTable */ +-size_t HUF_writeCTable (void* dst, size_t maxDstSize, +- const HUF_CElt* CTable, unsigned maxSymbolValue, unsigned huffLog) +-{ +- HUF_WriteCTableWksp wksp; +- return HUF_writeCTable_wksp(dst, maxDstSize, CTable, maxSymbolValue, huffLog, &wksp, sizeof(wksp)); +-} +- + + size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void* src, size_t srcSize, unsigned* hasZeroWeights) + { +@@ -269,68 +319,64 @@ size_t HUF_readCTable (HUF_CElt* CTable, unsigned* maxSymbolValuePtr, const void + + U32 HUF_getNbBitsFromCTable(HUF_CElt const* CTable, U32 symbolValue) + { +- const HUF_CElt* ct = CTable + 1; ++ const HUF_CElt* const ct = CTable + 1; + assert(symbolValue <= HUF_SYMBOLVALUE_MAX); + return (U32)HUF_getNbBits(ct[symbolValue]); + } + + +-typedef struct nodeElt_s { +- U32 count; +- U16 parent; +- BYTE byte; +- BYTE nbBits; +-} nodeElt; +- + /* + * HUF_setMaxHeight(): +- * Enforces maxNbBits on the Huffman tree described in huffNode. ++ * Try to enforce @targetNbBits on the Huffman tree described in @huffNode. + * +- * It sets all nodes with nbBits > maxNbBits to be maxNbBits. Then it adjusts +- * the tree to so that it is a valid canonical Huffman tree. ++ * It attempts to convert all nodes with nbBits > @targetNbBits ++ * to employ @targetNbBits instead. Then it adjusts the tree ++ * so that it remains a valid canonical Huffman tree. + * + * @pre The sum of the ranks of each symbol == 2^largestBits, + * where largestBits == huffNode[lastNonNull].nbBits. 
+ * @post The sum of the ranks of each symbol == 2^largestBits, +- * where largestBits is the return value <= maxNbBits. ++ * where largestBits is the return value (expected <= targetNbBits). + * +- * @param huffNode The Huffman tree modified in place to enforce maxNbBits. ++ * @param huffNode The Huffman tree modified in place to enforce targetNbBits. ++ * It's presumed sorted, from most frequent to rarest symbol. + * @param lastNonNull The symbol with the lowest count in the Huffman tree. +- * @param maxNbBits The maximum allowed number of bits, which the Huffman tree ++ * @param targetNbBits The allowed number of bits, which the Huffman tree + * may not respect. After this function the Huffman tree will +- * respect maxNbBits. +- * @return The maximum number of bits of the Huffman tree after adjustment, +- * necessarily no more than maxNbBits. ++ * respect targetNbBits. ++ * @return The maximum number of bits of the Huffman tree after adjustment. + */ +-static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) ++static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 targetNbBits) + { + const U32 largestBits = huffNode[lastNonNull].nbBits; +- /* early exit : no elt > maxNbBits, so the tree is already valid. */ +- if (largestBits <= maxNbBits) return largestBits; ++ /* early exit : no elt > targetNbBits, so the tree is already valid. */ ++ if (largestBits <= targetNbBits) return largestBits; ++ ++ DEBUGLOG(5, "HUF_setMaxHeight (targetNbBits = %u)", targetNbBits); + + /* there are several too large elements (at least >= 2) */ + { int totalCost = 0; +- const U32 baseCost = 1 << (largestBits - maxNbBits); ++ const U32 baseCost = 1 << (largestBits - targetNbBits); + int n = (int)lastNonNull; + +- /* Adjust any ranks > maxNbBits to maxNbBits. ++ /* Adjust any ranks > targetNbBits to targetNbBits. + * Compute totalCost, which is how far the sum of the ranks is + * we are over 2^largestBits after adjust the offending ranks. + */ +- while (huffNode[n].nbBits > maxNbBits) { ++ while (huffNode[n].nbBits > targetNbBits) { + totalCost += baseCost - (1 << (largestBits - huffNode[n].nbBits)); +- huffNode[n].nbBits = (BYTE)maxNbBits; ++ huffNode[n].nbBits = (BYTE)targetNbBits; + n--; + } +- /* n stops at huffNode[n].nbBits <= maxNbBits */ +- assert(huffNode[n].nbBits <= maxNbBits); +- /* n end at index of smallest symbol using < maxNbBits */ +- while (huffNode[n].nbBits == maxNbBits) --n; ++ /* n stops at huffNode[n].nbBits <= targetNbBits */ ++ assert(huffNode[n].nbBits <= targetNbBits); ++ /* n end at index of smallest symbol using < targetNbBits */ ++ while (huffNode[n].nbBits == targetNbBits) --n; + +- /* renorm totalCost from 2^largestBits to 2^maxNbBits ++ /* renorm totalCost from 2^largestBits to 2^targetNbBits + * note : totalCost is necessarily a multiple of baseCost */ +- assert((totalCost & (baseCost - 1)) == 0); +- totalCost >>= (largestBits - maxNbBits); ++ assert(((U32)totalCost & (baseCost - 1)) == 0); ++ totalCost >>= (largestBits - targetNbBits); + assert(totalCost > 0); + + /* repay normalized cost */ +@@ -339,19 +385,19 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + + /* Get pos of last (smallest = lowest cum. 
count) symbol per rank */ + ZSTD_memset(rankLast, 0xF0, sizeof(rankLast)); +- { U32 currentNbBits = maxNbBits; ++ { U32 currentNbBits = targetNbBits; + int pos; + for (pos=n ; pos >= 0; pos--) { + if (huffNode[pos].nbBits >= currentNbBits) continue; +- currentNbBits = huffNode[pos].nbBits; /* < maxNbBits */ +- rankLast[maxNbBits-currentNbBits] = (U32)pos; ++ currentNbBits = huffNode[pos].nbBits; /* < targetNbBits */ ++ rankLast[targetNbBits-currentNbBits] = (U32)pos; + } } + + while (totalCost > 0) { + /* Try to reduce the next power of 2 above totalCost because we + * gain back half the rank. + */ +- U32 nBitsToDecrease = BIT_highbit32((U32)totalCost) + 1; ++ U32 nBitsToDecrease = ZSTD_highbit32((U32)totalCost) + 1; + for ( ; nBitsToDecrease > 1; nBitsToDecrease--) { + U32 const highPos = rankLast[nBitsToDecrease]; + U32 const lowPos = rankLast[nBitsToDecrease-1]; +@@ -391,7 +437,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + rankLast[nBitsToDecrease] = noSymbol; + else { + rankLast[nBitsToDecrease]--; +- if (huffNode[rankLast[nBitsToDecrease]].nbBits != maxNbBits-nBitsToDecrease) ++ if (huffNode[rankLast[nBitsToDecrease]].nbBits != targetNbBits-nBitsToDecrease) + rankLast[nBitsToDecrease] = noSymbol; /* this rank is now empty */ + } + } /* while (totalCost > 0) */ +@@ -403,11 +449,11 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + * TODO. + */ + while (totalCost < 0) { /* Sometimes, cost correction overshoot */ +- /* special case : no rank 1 symbol (using maxNbBits-1); +- * let's create one from largest rank 0 (using maxNbBits). ++ /* special case : no rank 1 symbol (using targetNbBits-1); ++ * let's create one from largest rank 0 (using targetNbBits). + */ + if (rankLast[1] == noSymbol) { +- while (huffNode[n].nbBits == maxNbBits) n--; ++ while (huffNode[n].nbBits == targetNbBits) n--; + huffNode[n+1].nbBits--; + assert(n >= 0); + rankLast[1] = (U32)(n+1); +@@ -421,7 +467,7 @@ static U32 HUF_setMaxHeight(nodeElt* huffNode, U32 lastNonNull, U32 maxNbBits) + } /* repay normalized cost */ + } /* there are several too large elements (at least >= 2) */ + +- return maxNbBits; ++ return targetNbBits; + } + + typedef struct { +@@ -429,7 +475,7 @@ typedef struct { + U16 curr; + } rankPos; + +-typedef nodeElt huffNodeTable[HUF_CTABLE_WORKSPACE_SIZE_U32]; ++typedef nodeElt huffNodeTable[2 * (HUF_SYMBOLVALUE_MAX + 1)]; + + /* Number of buckets available for HUF_sort() */ + #define RANK_POSITION_TABLE_SIZE 192 +@@ -448,8 +494,8 @@ typedef struct { + * Let buckets 166 to 192 represent all remaining counts up to RANK_POSITION_MAX_COUNT_LOG using log2 bucketing. + */ + #define RANK_POSITION_MAX_COUNT_LOG 32 +-#define RANK_POSITION_LOG_BUCKETS_BEGIN (RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */ +-#define RANK_POSITION_DISTINCT_COUNT_CUTOFF RANK_POSITION_LOG_BUCKETS_BEGIN + BIT_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */ ++#define RANK_POSITION_LOG_BUCKETS_BEGIN ((RANK_POSITION_TABLE_SIZE - 1) - RANK_POSITION_MAX_COUNT_LOG - 1 /* == 158 */) ++#define RANK_POSITION_DISTINCT_COUNT_CUTOFF (RANK_POSITION_LOG_BUCKETS_BEGIN + ZSTD_highbit32(RANK_POSITION_LOG_BUCKETS_BEGIN) /* == 166 */) + + /* Return the appropriate bucket index for a given count. See definition of + * RANK_POSITION_DISTINCT_COUNT_CUTOFF for explanation of bucketing strategy. +@@ -457,7 +503,7 @@ typedef struct { + static U32 HUF_getIndex(U32 const count) { + return (count < RANK_POSITION_DISTINCT_COUNT_CUTOFF) + ? 
count +- : BIT_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; ++ : ZSTD_highbit32(count) + RANK_POSITION_LOG_BUCKETS_BEGIN; + } + + /* Helper swap function for HUF_quickSortPartition() */ +@@ -580,7 +626,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + + /* Sort each bucket. */ + for (n = RANK_POSITION_DISTINCT_COUNT_CUTOFF; n < RANK_POSITION_TABLE_SIZE - 1; ++n) { +- U32 const bucketSize = rankPosition[n].curr-rankPosition[n].base; ++ int const bucketSize = rankPosition[n].curr - rankPosition[n].base; + U32 const bucketStartIdx = rankPosition[n].base; + if (bucketSize > 1) { + assert(bucketStartIdx < maxSymbolValue1); +@@ -591,6 +637,7 @@ static void HUF_sort(nodeElt huffNode[], const unsigned count[], U32 const maxSy + assert(HUF_isSorted(huffNode, maxSymbolValue1)); + } + ++ + /* HUF_buildCTable_wksp() : + * Same as HUF_buildCTable(), but using externally allocated scratch buffer. + * `workSpace` must be aligned on 4-bytes boundaries, and be at least as large as sizeof(HUF_buildCTable_wksp_tables). +@@ -611,6 +658,7 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + int lowS, lowN; + int nodeNb = STARTNODE; + int n, nodeRoot; ++ DEBUGLOG(5, "HUF_buildTree (alphabet size = %u)", maxSymbolValue + 1); + /* init for parents */ + nonNullRank = (int)maxSymbolValue; + while(huffNode[nonNullRank].count == 0) nonNullRank--; +@@ -637,6 +685,8 @@ static int HUF_buildTree(nodeElt* huffNode, U32 maxSymbolValue) + for (n=0; n<=nonNullRank; n++) + huffNode[n].nbBits = huffNode[ huffNode[n].parent ].nbBits + 1; + ++ DEBUGLOG(6, "Initial distribution of bits completed (%zu sorted symbols)", showHNodeBits(huffNode, maxSymbolValue+1)); ++ + return nonNullRank; + } + +@@ -674,28 +724,36 @@ static void HUF_buildCTableFromTree(HUF_CElt* CTable, nodeElt const* huffNode, i + CTable[0] = maxNbBits; + } + +-size_t HUF_buildCTable_wksp (HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, void* workSpace, size_t wkspSize) ++size_t ++HUF_buildCTable_wksp(HUF_CElt* CTable, const unsigned* count, U32 maxSymbolValue, U32 maxNbBits, ++ void* workSpace, size_t wkspSize) + { +- HUF_buildCTable_wksp_tables* const wksp_tables = (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); ++ HUF_buildCTable_wksp_tables* const wksp_tables = ++ (HUF_buildCTable_wksp_tables*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(U32)); + nodeElt* const huffNode0 = wksp_tables->huffNodeTbl; + nodeElt* const huffNode = huffNode0+1; + int nonNullRank; + ++ HUF_STATIC_ASSERT(HUF_CTABLE_WORKSPACE_SIZE == sizeof(HUF_buildCTable_wksp_tables)); ++ ++ DEBUGLOG(5, "HUF_buildCTable_wksp (alphabet size = %u)", maxSymbolValue+1); ++ + /* safety checks */ + if (wkspSize < sizeof(HUF_buildCTable_wksp_tables)) +- return ERROR(workSpace_tooSmall); ++ return ERROR(workSpace_tooSmall); + if (maxNbBits == 0) maxNbBits = HUF_TABLELOG_DEFAULT; + if (maxSymbolValue > HUF_SYMBOLVALUE_MAX) +- return ERROR(maxSymbolValue_tooLarge); ++ return ERROR(maxSymbolValue_tooLarge); + ZSTD_memset(huffNode0, 0, sizeof(huffNodeTable)); + + /* sort, decreasing order */ + HUF_sort(huffNode, count, maxSymbolValue, wksp_tables->rankPosition); ++ DEBUGLOG(6, "sorted symbols completed (%zu symbols)", showHNodeSymbols(huffNode, maxSymbolValue+1)); + + /* build tree */ + nonNullRank = HUF_buildTree(huffNode, maxSymbolValue); + +- /* enforce maxTableLog */ ++ /* determine and enforce maxTableLog */ + maxNbBits = HUF_setMaxHeight(huffNode, (U32)nonNullRank, 
maxNbBits); + if (maxNbBits > HUF_TABLELOG_MAX) return ERROR(GENERIC); /* check fit into table */ + +@@ -804,7 +862,7 @@ FORCE_INLINE_TEMPLATE void HUF_addBits(HUF_CStream_t* bitC, HUF_CElt elt, int id + #if DEBUGLEVEL >= 1 + { + size_t const nbBits = HUF_getNbBits(elt); +- size_t const dirtyBits = nbBits == 0 ? 0 : BIT_highbit32((U32)nbBits) + 1; ++ size_t const dirtyBits = nbBits == 0 ? 0 : ZSTD_highbit32((U32)nbBits) + 1; + (void)dirtyBits; + /* Middle bits are 0. */ + assert(((elt >> dirtyBits) << (dirtyBits + nbBits)) == 0); +@@ -884,7 +942,7 @@ static size_t HUF_closeCStream(HUF_CStream_t* bitC) + { + size_t const nbBits = bitC->bitPos[0] & 0xFF; + if (bitC->ptr >= bitC->endPtr) return 0; /* overflow detected */ +- return (bitC->ptr - bitC->startPtr) + (nbBits > 0); ++ return (size_t)(bitC->ptr - bitC->startPtr) + (nbBits > 0); + } + } + +@@ -1045,9 +1103,9 @@ HUF_compress1X_usingCTable_internal_default(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { + return HUF_compress1X_usingCTable_internal_bmi2(dst, dstSize, src, srcSize, CTable); + } + return HUF_compress1X_usingCTable_internal_default(dst, dstSize, src, srcSize, CTable); +@@ -1058,28 +1116,23 @@ HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + static size_t + HUF_compress1X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, const int bmi2) ++ const HUF_CElt* CTable, const int flags) + { +- (void)bmi2; ++ (void)flags; + return HUF_compress1X_usingCTable_internal_body(dst, dstSize, src, srcSize, CTable); + } + + #endif + +-size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) +-{ +- return HUF_compress1X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress1X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) ++size_t HUF_compress1X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress1X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + static size_t + HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + const void* src, size_t srcSize, +- const HUF_CElt* CTable, int bmi2) ++ const HUF_CElt* CTable, int flags) + { + size_t const segmentSize = (srcSize+3)/4; /* first 3 segments */ + const BYTE* ip = (const BYTE*) src; +@@ -1093,7 +1146,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + op += 6; /* jumpTable */ + + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart, (U16)cSize); + op += cSize; +@@ -1101,7 +1154,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, 
HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+2, (U16)cSize); + op += cSize; +@@ -1109,7 +1162,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + + ip += segmentSize; + assert(op <= oend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, segmentSize, CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + MEM_writeLE16(ostart+4, (U16)cSize); + op += cSize; +@@ -1118,7 +1171,7 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + ip += segmentSize; + assert(op <= oend); + assert(ip <= iend); +- { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, bmi2) ); ++ { CHECK_V_F(cSize, HUF_compress1X_usingCTable_internal(op, (size_t)(oend-op), ip, (size_t)(iend-ip), CTable, flags) ); + if (cSize == 0 || cSize > 65535) return 0; + op += cSize; + } +@@ -1126,14 +1179,9 @@ HUF_compress4X_usingCTable_internal(void* dst, size_t dstSize, + return (size_t)(op-ostart); + } + +-size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable) ++size_t HUF_compress4X_usingCTable(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int flags) + { +- return HUF_compress4X_usingCTable_bmi2(dst, dstSize, src, srcSize, CTable, /* bmi2 */ 0); +-} +- +-size_t HUF_compress4X_usingCTable_bmi2(void* dst, size_t dstSize, const void* src, size_t srcSize, const HUF_CElt* CTable, int bmi2) +-{ +- return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, bmi2); ++ return HUF_compress4X_usingCTable_internal(dst, dstSize, src, srcSize, CTable, flags); + } + + typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; +@@ -1141,11 +1189,11 @@ typedef enum { HUF_singleStream, HUF_fourStreams } HUF_nbStreams_e; + static size_t HUF_compressCTable_internal( + BYTE* const ostart, BYTE* op, BYTE* const oend, + const void* src, size_t srcSize, +- HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int bmi2) ++ HUF_nbStreams_e nbStreams, const HUF_CElt* CTable, const int flags) + { + size_t const cSize = (nbStreams==HUF_singleStream) ? 
+- HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2) : +- HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, bmi2); ++ HUF_compress1X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags) : ++ HUF_compress4X_usingCTable_internal(op, (size_t)(oend - op), src, srcSize, CTable, flags); + if (HUF_isError(cSize)) { return cSize; } + if (cSize==0) { return 0; } /* uncompressible */ + op += cSize; +@@ -1168,6 +1216,79 @@ typedef struct { + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE 4096 + #define SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO 10 /* Must be >= 2 */ + ++unsigned HUF_cardinality(const unsigned* count, unsigned maxSymbolValue) ++{ ++ unsigned cardinality = 0; ++ unsigned i; ++ ++ for (i = 0; i < maxSymbolValue + 1; i++) { ++ if (count[i] != 0) cardinality += 1; ++ } ++ ++ return cardinality; ++} ++ ++unsigned HUF_minTableLog(unsigned symbolCardinality) ++{ ++ U32 minBitsSymbols = ZSTD_highbit32(symbolCardinality) + 1; ++ return minBitsSymbols; ++} ++ ++unsigned HUF_optimalTableLog( ++ unsigned maxTableLog, ++ size_t srcSize, ++ unsigned maxSymbolValue, ++ void* workSpace, size_t wkspSize, ++ HUF_CElt* table, ++ const unsigned* count, ++ int flags) ++{ ++ assert(srcSize > 1); /* Not supported, RLE should be used instead */ ++ assert(wkspSize >= sizeof(HUF_buildCTable_wksp_tables)); ++ ++ if (!(flags & HUF_flags_optimalDepth)) { ++ /* cheap evaluation, based on FSE */ ++ return FSE_optimalTableLog_internal(maxTableLog, srcSize, maxSymbolValue, 1); ++ } ++ ++ { BYTE* dst = (BYTE*)workSpace + sizeof(HUF_WriteCTableWksp); ++ size_t dstSize = wkspSize - sizeof(HUF_WriteCTableWksp); ++ size_t maxBits, hSize, newSize; ++ const unsigned symbolCardinality = HUF_cardinality(count, maxSymbolValue); ++ const unsigned minTableLog = HUF_minTableLog(symbolCardinality); ++ size_t optSize = ((size_t) ~0) - 1; ++ unsigned optLog = maxTableLog, optLogGuess; ++ ++ DEBUGLOG(6, "HUF_optimalTableLog: probing huf depth (srcSize=%zu)", srcSize); ++ ++ /* Search until size increases */ ++ for (optLogGuess = minTableLog; optLogGuess <= maxTableLog; optLogGuess++) { ++ DEBUGLOG(7, "checking for huffLog=%u", optLogGuess); ++ maxBits = HUF_buildCTable_wksp(table, count, maxSymbolValue, optLogGuess, workSpace, wkspSize); ++ if (ERR_isError(maxBits)) continue; ++ ++ if (maxBits < optLogGuess && optLogGuess > minTableLog) break; ++ ++ hSize = HUF_writeCTable_wksp(dst, dstSize, table, maxSymbolValue, (U32)maxBits, workSpace, wkspSize); ++ ++ if (ERR_isError(hSize)) continue; ++ ++ newSize = HUF_estimateCompressedSize(table, count, maxSymbolValue) + hSize; ++ ++ if (newSize > optSize + 1) { ++ break; ++ } ++ ++ if (newSize < optSize) { ++ optSize = newSize; ++ optLog = optLogGuess; ++ } ++ } ++ assert(optLog <= HUF_TABLELOG_MAX); ++ return optLog; ++ } ++} ++ + /* HUF_compress_internal() : + * `workSpace_align4` must be aligned on 4-bytes boundaries, + * and occupies the same space as a table of HUF_WORKSPACE_SIZE_U64 unsigned */ +@@ -1177,14 +1298,14 @@ HUF_compress_internal (void* dst, size_t dstSize, + unsigned maxSymbolValue, unsigned huffLog, + HUF_nbStreams_e nbStreams, + void* workSpace, size_t wkspSize, +- HUF_CElt* oldHufTable, HUF_repeat* repeat, int preferRepeat, +- const int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* oldHufTable, HUF_repeat* repeat, int flags) + { + HUF_compress_tables_t* const table = (HUF_compress_tables_t*)HUF_alignUpWorkspace(workSpace, &wkspSize, ZSTD_ALIGNOF(size_t)); + BYTE* const ostart = 
(BYTE*)dst; + BYTE* const oend = ostart + dstSize; + BYTE* op = ostart; + ++ DEBUGLOG(5, "HUF_compress_internal (srcSize=%zu)", srcSize); + HUF_STATIC_ASSERT(sizeof(*table) + HUF_WORKSPACE_MAX_ALIGNMENT <= HUF_WORKSPACE_SIZE); + + /* checks & inits */ +@@ -1198,16 +1319,17 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (!huffLog) huffLog = HUF_TABLELOG_DEFAULT; + + /* Heuristic : If old table is valid, use it for small inputs */ +- if (preferRepeat && repeat && *repeat == HUF_repeat_valid) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat == HUF_repeat_valid) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* If uncompressible data is suspected, do a smaller sampling first */ + DEBUG_STATIC_ASSERT(SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO >= 2); +- if (suspectUncompressible && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { ++ if ((flags & HUF_flags_suspectUncompressible) && srcSize >= (SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE * SUSPECT_INCOMPRESSIBLE_SAMPLE_RATIO)) { + size_t largestTotal = 0; ++ DEBUGLOG(5, "input suspected incompressible : sampling to check"); + { unsigned maxSymbolValueBegin = maxSymbolValue; + CHECK_V_F(largestBegin, HIST_count_simple (table->count, &maxSymbolValueBegin, (const BYTE*)src, SUSPECT_INCOMPRESSIBLE_SAMPLE_SIZE) ); + largestTotal += largestBegin; +@@ -1224,6 +1346,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (largest == srcSize) { *ostart = ((const BYTE*)src)[0]; return 1; } /* single symbol, rle */ + if (largest <= (srcSize >> 7)+4) return 0; /* heuristic : probably not compressible enough */ + } ++ DEBUGLOG(6, "histogram detail completed (%zu symbols)", showU32(table->count, maxSymbolValue+1)); + + /* Check validity of previous table */ + if ( repeat +@@ -1232,19 +1355,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + *repeat = HUF_repeat_none; + } + /* Heuristic : use existing table for small inputs */ +- if (preferRepeat && repeat && *repeat != HUF_repeat_none) { ++ if ((flags & HUF_flags_preferRepeat) && repeat && *repeat != HUF_repeat_none) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } + + /* Build Huffman Tree */ +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, &table->wksps, sizeof(table->wksps), table->CTable, table->count, flags); + { size_t const maxBits = HUF_buildCTable_wksp(table->CTable, table->count, + maxSymbolValue, huffLog, + &table->wksps.buildCTable_wksp, sizeof(table->wksps.buildCTable_wksp)); + CHECK_F(maxBits); + huffLog = (U32)maxBits; ++ DEBUGLOG(6, "bit distribution completed (%zu symbols)", showCTableBits(table->CTable + 1, maxSymbolValue+1)); + } + /* Zero unused symbols in CTable, so we can check it for validity */ + { +@@ -1263,7 +1387,7 @@ HUF_compress_internal (void* dst, size_t dstSize, + if (oldSize <= hSize + newSize || hSize + 12 >= srcSize) { + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, oldHufTable, bmi2); ++ nbStreams, oldHufTable, flags); + } } + + /* Use the new huffman table */ +@@ -1275,46 +1399,20 @@ HUF_compress_internal (void* dst, size_t dstSize, + } + return HUF_compressCTable_internal(ostart, op, oend, + src, srcSize, +- nbStreams, table->CTable, bmi2); +-} +- +- +-size_t HUF_compress1X_wksp (void* dst, size_t 
dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_singleStream, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ nbStreams, table->CTable, flags); + } + + size_t HUF_compress1X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, +- int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress1X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_singleStream, + workSpace, wkspSize, hufTable, +- repeat, preferRepeat, bmi2, suspectUncompressible); +-} +- +-/* HUF_compress4X_repeat(): +- * compress input using 4 streams. +- * provide workspace to generate compression tables */ +-size_t HUF_compress4X_wksp (void* dst, size_t dstSize, +- const void* src, size_t srcSize, +- unsigned maxSymbolValue, unsigned huffLog, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_compress_internal(dst, dstSize, src, srcSize, +- maxSymbolValue, huffLog, HUF_fourStreams, +- workSpace, wkspSize, +- NULL, NULL, 0, 0 /*bmi2*/, 0); ++ repeat, flags); + } + + /* HUF_compress4X_repeat(): +@@ -1325,11 +1423,11 @@ size_t HUF_compress4X_repeat (void* dst, size_t dstSize, + const void* src, size_t srcSize, + unsigned maxSymbolValue, unsigned huffLog, + void* workSpace, size_t wkspSize, +- HUF_CElt* hufTable, HUF_repeat* repeat, int preferRepeat, int bmi2, unsigned suspectUncompressible) ++ HUF_CElt* hufTable, HUF_repeat* repeat, int flags) + { ++ DEBUGLOG(5, "HUF_compress4X_repeat (srcSize = %zu)", srcSize); + return HUF_compress_internal(dst, dstSize, src, srcSize, + maxSymbolValue, huffLog, HUF_fourStreams, + workSpace, wkspSize, +- hufTable, repeat, preferRepeat, bmi2, suspectUncompressible); ++ hufTable, repeat, flags); + } +- +diff --git a/lib/zstd/compress/zstd_compress.c b/lib/zstd/compress/zstd_compress.c +index f620cafca633..c1c316e9e289 100644 +--- a/lib/zstd/compress/zstd_compress.c ++++ b/lib/zstd/compress/zstd_compress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,12 +12,12 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* INT_MAX, ZSTD_memset, ZSTD_memcpy */ + #include "../common/mem.h" + #include "hist.h" /* HIST_countFast_wksp */ + #define FSE_STATIC_LINKING_ONLY /* FSE_encodeSymbol */ + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_compress_internal.h" + #include "zstd_compress_sequences.h" +@@ -27,6 +28,7 @@ + #include "zstd_opt.h" + #include "zstd_ldm.h" + #include "zstd_compress_superblock.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_rotateRight_U64 */ + + /* *************************************************************** + * Tuning parameters +@@ -55,14 +57,17 @@ + * Helper functions + ***************************************/ + /* ZSTD_compressBound() +- * Note that the result from this function is only compatible with the "normal" +- * full-block strategy. +- * When there are a lot of small blocks due to frequent flush in streaming mode +- * the overhead of headers can make the compressed data to be larger than the +- * return value of ZSTD_compressBound(). ++ * Note that the result from this function is only valid for ++ * the one-pass compression functions. ++ * When employing the streaming mode, ++ * if flushes are frequently altering the size of blocks, ++ * the overhead from block headers can make the compressed data larger ++ * than the return value of ZSTD_compressBound(). + */ + size_t ZSTD_compressBound(size_t srcSize) { +- return ZSTD_COMPRESSBOUND(srcSize); ++ size_t const r = ZSTD_COMPRESSBOUND(srcSize); ++ if (r==0) return ERROR(srcSize_wrong); ++ return r; + } + + +@@ -171,12 +176,9 @@ size_t ZSTD_freeCCtx(ZSTD_CCtx* cctx) + if (cctx==NULL) return 0; /* support free on NULL */ + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, + "not compatible with static CCtx"); +- { +- int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); ++ { int cctxInWorkspace = ZSTD_cwksp_owns_buffer(&cctx->workspace, cctx); + ZSTD_freeCCtxContent(cctx); +- if (!cctxInWorkspace) { +- ZSTD_customFree(cctx, cctx->customMem); +- } ++ if (!cctxInWorkspace) ZSTD_customFree(cctx, cctx->customMem); + } + return 0; + } +@@ -257,9 +259,9 @@ static int ZSTD_allocateChainTable(const ZSTD_strategy strategy, + return forDDSDict || ((strategy != ZSTD_fast) && !ZSTD_rowMatchFinderUsed(strategy, useRowMatchFinder)); + } + +-/* Returns 1 if compression parameters are such that we should ++/* Returns ZSTD_ps_enable if compression parameters are such that we should + * enable long distance matching (wlog >= 27, strategy >= btopt). +- * Returns 0 otherwise. ++ * Returns ZSTD_ps_disable otherwise. + */ + static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + const ZSTD_compressionParameters* const cParams) { +@@ -267,6 +269,34 @@ static ZSTD_paramSwitch_e ZSTD_resolveEnableLdm(ZSTD_paramSwitch_e mode, + return (cParams->strategy >= ZSTD_btopt && cParams->windowLog >= 27) ? ZSTD_ps_enable : ZSTD_ps_disable; + } + ++static int ZSTD_resolveExternalSequenceValidation(int mode) { ++ return mode; ++} ++ ++/* Resolves maxBlockSize to the default if no value is present. 
*/ ++static size_t ZSTD_resolveMaxBlockSize(size_t maxBlockSize) { ++ if (maxBlockSize == 0) { ++ return ZSTD_BLOCKSIZE_MAX; ++ } else { ++ return maxBlockSize; ++ } ++} ++ ++static ZSTD_paramSwitch_e ZSTD_resolveExternalRepcodeSearch(ZSTD_paramSwitch_e value, int cLevel) { ++ if (value != ZSTD_ps_auto) return value; ++ if (cLevel < 10) { ++ return ZSTD_ps_disable; ++ } else { ++ return ZSTD_ps_enable; ++ } ++} ++ ++/* Returns 1 if compression parameters are such that CDict hashtable and chaintable indices are tagged. ++ * If so, the tags need to be removed in ZSTD_resetCCtx_byCopyingCDict. */ ++static int ZSTD_CDictIndicesAreTagged(const ZSTD_compressionParameters* const cParams) { ++ return cParams->strategy == ZSTD_fast || cParams->strategy == ZSTD_dfast; ++} ++ + static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + ZSTD_compressionParameters cParams) + { +@@ -284,6 +314,10 @@ static ZSTD_CCtx_params ZSTD_makeCCtxParamsFromCParams( + } + cctxParams.useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams.useBlockSplitter, &cParams); + cctxParams.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams.useRowMatchFinder, &cParams); ++ cctxParams.validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams.validateSequences); ++ cctxParams.maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams.maxBlockSize); ++ cctxParams.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams.searchForExternalRepcodes, ++ cctxParams.compressionLevel); + assert(!ZSTD_checkCParams(cParams)); + return cctxParams; + } +@@ -329,10 +363,13 @@ size_t ZSTD_CCtxParams_init(ZSTD_CCtx_params* cctxParams, int compressionLevel) + #define ZSTD_NO_CLEVEL 0 + + /* +- * Initializes the cctxParams from params and compressionLevel. ++ * Initializes `cctxParams` from `params` and `compressionLevel`. + * @param compressionLevel If params are derived from a compression level then that compression level, otherwise ZSTD_NO_CLEVEL. + */ +-static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_parameters const* params, int compressionLevel) ++static void ++ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ++ const ZSTD_parameters* params, ++ int compressionLevel) + { + assert(!ZSTD_checkCParams(params->cParams)); + ZSTD_memset(cctxParams, 0, sizeof(*cctxParams)); +@@ -345,6 +382,9 @@ static void ZSTD_CCtxParams_init_internal(ZSTD_CCtx_params* cctxParams, ZSTD_par + cctxParams->useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(cctxParams->useRowMatchFinder, ¶ms->cParams); + cctxParams->useBlockSplitter = ZSTD_resolveBlockSplitterMode(cctxParams->useBlockSplitter, ¶ms->cParams); + cctxParams->ldmParams.enableLdm = ZSTD_resolveEnableLdm(cctxParams->ldmParams.enableLdm, ¶ms->cParams); ++ cctxParams->validateSequences = ZSTD_resolveExternalSequenceValidation(cctxParams->validateSequences); ++ cctxParams->maxBlockSize = ZSTD_resolveMaxBlockSize(cctxParams->maxBlockSize); ++ cctxParams->searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(cctxParams->searchForExternalRepcodes, compressionLevel); + DEBUGLOG(4, "ZSTD_CCtxParams_init_internal: useRowMatchFinder=%d, useBlockSplitter=%d ldm=%d", + cctxParams->useRowMatchFinder, cctxParams->useBlockSplitter, cctxParams->ldmParams.enableLdm); + } +@@ -359,7 +399,7 @@ size_t ZSTD_CCtxParams_init_advanced(ZSTD_CCtx_params* cctxParams, ZSTD_paramete + + /* + * Sets cctxParams' cParams and fParams from params, but otherwise leaves them alone. +- * @param param Validated zstd parameters. 
++ * @param params Validated zstd parameters. + */ + static void ZSTD_CCtxParams_setZstdParams( + ZSTD_CCtx_params* cctxParams, const ZSTD_parameters* params) +@@ -455,8 +495,8 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + return bounds; + + case ZSTD_c_enableLongDistanceMatching: +- bounds.lowerBound = 0; +- bounds.upperBound = 1; ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; + return bounds; + + case ZSTD_c_ldmHashLog: +@@ -549,6 +589,26 @@ ZSTD_bounds ZSTD_cParam_getBounds(ZSTD_cParameter param) + bounds.upperBound = 1; + return bounds; + ++ case ZSTD_c_prefetchCDictTables: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ ++ case ZSTD_c_maxBlockSize: ++ bounds.lowerBound = ZSTD_BLOCKSIZE_MAX_MIN; ++ bounds.upperBound = ZSTD_BLOCKSIZE_MAX; ++ return bounds; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ bounds.lowerBound = (int)ZSTD_ps_auto; ++ bounds.upperBound = (int)ZSTD_ps_disable; ++ return bounds; ++ + default: + bounds.error = ERROR(parameter_unsupported); + return bounds; +@@ -613,6 +673,10 @@ static int ZSTD_isUpdateAuthorized(ZSTD_cParameter param) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + default: + return 0; + } +@@ -625,7 +689,7 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + if (ZSTD_isUpdateAuthorized(param)) { + cctx->cParamsChanged = 1; + } else { +- RETURN_ERROR(stage_wrong, "can only set params in ctx init stage"); ++ RETURN_ERROR(stage_wrong, "can only set params in cctx init stage"); + } } + + switch(param) +@@ -668,6 +732,10 @@ size_t ZSTD_CCtx_setParameter(ZSTD_CCtx* cctx, ZSTD_cParameter param, int value) + case ZSTD_c_useBlockSplitter: + case ZSTD_c_useRowMatchFinder: + case ZSTD_c_deterministicRefPrefix: ++ case ZSTD_c_prefetchCDictTables: ++ case ZSTD_c_enableSeqProducerFallback: ++ case ZSTD_c_maxBlockSize: ++ case ZSTD_c_searchForExternalRepcodes: + break; + + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); +@@ -723,12 +791,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + case ZSTD_c_minMatch : + if (value!=0) /* 0 => use default */ + BOUNDCHECK(ZSTD_c_minMatch, value); +- CCtxParams->cParams.minMatch = value; ++ CCtxParams->cParams.minMatch = (U32)value; + return CCtxParams->cParams.minMatch; + + case ZSTD_c_targetLength : + BOUNDCHECK(ZSTD_c_targetLength, value); +- CCtxParams->cParams.targetLength = value; ++ CCtxParams->cParams.targetLength = (U32)value; + return CCtxParams->cParams.targetLength; + + case ZSTD_c_strategy : +@@ -741,12 +809,12 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + /* Content size written in frame header _when known_ (default:1) */ + DEBUGLOG(4, "set content size flag = %u", (value!=0)); + CCtxParams->fParams.contentSizeFlag = value != 0; +- return CCtxParams->fParams.contentSizeFlag; ++ return (size_t)CCtxParams->fParams.contentSizeFlag; + + case ZSTD_c_checksumFlag : + /* A 32-bits content checksum will be calculated and written at end of frame (default:0) */ + CCtxParams->fParams.checksumFlag = value != 0; +- return CCtxParams->fParams.checksumFlag; ++ return 
(size_t)CCtxParams->fParams.checksumFlag; + + case ZSTD_c_dictIDFlag : /* When applicable, dictionary's dictID is provided in frame header (default:1) */ + DEBUGLOG(4, "set dictIDFlag = %u", (value!=0)); +@@ -755,18 +823,18 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_forceMaxWindow : + CCtxParams->forceWindow = (value != 0); +- return CCtxParams->forceWindow; ++ return (size_t)CCtxParams->forceWindow; + + case ZSTD_c_forceAttachDict : { + const ZSTD_dictAttachPref_e pref = (ZSTD_dictAttachPref_e)value; +- BOUNDCHECK(ZSTD_c_forceAttachDict, pref); ++ BOUNDCHECK(ZSTD_c_forceAttachDict, (int)pref); + CCtxParams->attachDictPref = pref; + return CCtxParams->attachDictPref; + } + + case ZSTD_c_literalCompressionMode : { + const ZSTD_paramSwitch_e lcm = (ZSTD_paramSwitch_e)value; +- BOUNDCHECK(ZSTD_c_literalCompressionMode, lcm); ++ BOUNDCHECK(ZSTD_c_literalCompressionMode, (int)lcm); + CCtxParams->literalCompressionMode = lcm; + return CCtxParams->literalCompressionMode; + } +@@ -789,47 +857,48 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + + case ZSTD_c_enableDedicatedDictSearch : + CCtxParams->enableDedicatedDictSearch = (value!=0); +- return CCtxParams->enableDedicatedDictSearch; ++ return (size_t)CCtxParams->enableDedicatedDictSearch; + + case ZSTD_c_enableLongDistanceMatching : ++ BOUNDCHECK(ZSTD_c_enableLongDistanceMatching, value); + CCtxParams->ldmParams.enableLdm = (ZSTD_paramSwitch_e)value; + return CCtxParams->ldmParams.enableLdm; + + case ZSTD_c_ldmHashLog : + if (value!=0) /* 0 ==> auto */ + BOUNDCHECK(ZSTD_c_ldmHashLog, value); +- CCtxParams->ldmParams.hashLog = value; ++ CCtxParams->ldmParams.hashLog = (U32)value; + return CCtxParams->ldmParams.hashLog; + + case ZSTD_c_ldmMinMatch : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmMinMatch, value); +- CCtxParams->ldmParams.minMatchLength = value; ++ CCtxParams->ldmParams.minMatchLength = (U32)value; + return CCtxParams->ldmParams.minMatchLength; + + case ZSTD_c_ldmBucketSizeLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmBucketSizeLog, value); +- CCtxParams->ldmParams.bucketSizeLog = value; ++ CCtxParams->ldmParams.bucketSizeLog = (U32)value; + return CCtxParams->ldmParams.bucketSizeLog; + + case ZSTD_c_ldmHashRateLog : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_ldmHashRateLog, value); +- CCtxParams->ldmParams.hashRateLog = value; ++ CCtxParams->ldmParams.hashRateLog = (U32)value; + return CCtxParams->ldmParams.hashRateLog; + + case ZSTD_c_targetCBlockSize : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_targetCBlockSize, value); +- CCtxParams->targetCBlockSize = value; ++ CCtxParams->targetCBlockSize = (U32)value; + return CCtxParams->targetCBlockSize; + + case ZSTD_c_srcSizeHint : + if (value!=0) /* 0 ==> default */ + BOUNDCHECK(ZSTD_c_srcSizeHint, value); + CCtxParams->srcSizeHint = value; +- return CCtxParams->srcSizeHint; ++ return (size_t)CCtxParams->srcSizeHint; + + case ZSTD_c_stableInBuffer: + BOUNDCHECK(ZSTD_c_stableInBuffer, value); +@@ -866,6 +935,27 @@ size_t ZSTD_CCtxParams_setParameter(ZSTD_CCtx_params* CCtxParams, + CCtxParams->deterministicRefPrefix = !!value; + return CCtxParams->deterministicRefPrefix; + ++ case ZSTD_c_prefetchCDictTables: ++ BOUNDCHECK(ZSTD_c_prefetchCDictTables, value); ++ CCtxParams->prefetchCDictTables = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->prefetchCDictTables; ++ ++ case ZSTD_c_enableSeqProducerFallback: ++ BOUNDCHECK(ZSTD_c_enableSeqProducerFallback, value); ++ 
CCtxParams->enableMatchFinderFallback = value; ++ return CCtxParams->enableMatchFinderFallback; ++ ++ case ZSTD_c_maxBlockSize: ++ if (value!=0) /* 0 ==> default */ ++ BOUNDCHECK(ZSTD_c_maxBlockSize, value); ++ CCtxParams->maxBlockSize = value; ++ return CCtxParams->maxBlockSize; ++ ++ case ZSTD_c_searchForExternalRepcodes: ++ BOUNDCHECK(ZSTD_c_searchForExternalRepcodes, value); ++ CCtxParams->searchForExternalRepcodes = (ZSTD_paramSwitch_e)value; ++ return CCtxParams->searchForExternalRepcodes; ++ + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + } +@@ -980,6 +1070,18 @@ size_t ZSTD_CCtxParams_getParameter( + case ZSTD_c_deterministicRefPrefix: + *value = (int)CCtxParams->deterministicRefPrefix; + break; ++ case ZSTD_c_prefetchCDictTables: ++ *value = (int)CCtxParams->prefetchCDictTables; ++ break; ++ case ZSTD_c_enableSeqProducerFallback: ++ *value = CCtxParams->enableMatchFinderFallback; ++ break; ++ case ZSTD_c_maxBlockSize: ++ *value = (int)CCtxParams->maxBlockSize; ++ break; ++ case ZSTD_c_searchForExternalRepcodes: ++ *value = (int)CCtxParams->searchForExternalRepcodes; ++ break; + default: RETURN_ERROR(parameter_unsupported, "unknown parameter"); + } + return 0; +@@ -1006,9 +1108,47 @@ size_t ZSTD_CCtx_setParametersUsingCCtxParams( + return 0; + } + ++size_t ZSTD_CCtx_setCParams(ZSTD_CCtx* cctx, ZSTD_compressionParameters cparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(cparams) == 7 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setCParams"); ++ /* only update if all parameters are valid */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(cparams), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_windowLog, cparams.windowLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_chainLog, cparams.chainLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_hashLog, cparams.hashLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_searchLog, cparams.searchLog), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_minMatch, cparams.minMatch), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_targetLength, cparams.targetLength), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_strategy, cparams.strategy), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setFParams(ZSTD_CCtx* cctx, ZSTD_frameParameters fparams) ++{ ++ ZSTD_STATIC_ASSERT(sizeof(fparams) == 3 * 4 /* all params are listed below */); ++ DEBUGLOG(4, "ZSTD_CCtx_setFParams"); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_contentSizeFlag, fparams.contentSizeFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_checksumFlag, fparams.checksumFlag != 0), ""); ++ FORWARD_IF_ERROR(ZSTD_CCtx_setParameter(cctx, ZSTD_c_dictIDFlag, fparams.noDictIDFlag == 0), ""); ++ return 0; ++} ++ ++size_t ZSTD_CCtx_setParams(ZSTD_CCtx* cctx, ZSTD_parameters params) ++{ ++ DEBUGLOG(4, "ZSTD_CCtx_setParams"); ++ /* First check cParams, because we want to update all or none. */ ++ FORWARD_IF_ERROR(ZSTD_checkCParams(params.cParams), ""); ++ /* Next set fParams, because this could fail if the cctx isn't in init stage. */ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setFParams(cctx, params.fParams), ""); ++ /* Finally set cParams, which should succeed. 
*/ ++ FORWARD_IF_ERROR(ZSTD_CCtx_setCParams(cctx, params.cParams), ""); ++ return 0; ++} ++ + size_t ZSTD_CCtx_setPledgedSrcSize(ZSTD_CCtx* cctx, unsigned long long pledgedSrcSize) + { +- DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %u bytes", (U32)pledgedSrcSize); ++ DEBUGLOG(4, "ZSTD_CCtx_setPledgedSrcSize to %llu bytes", pledgedSrcSize); + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, + "Can't set pledgedSrcSize when not in init stage."); + cctx->pledgedSrcSizePlusOne = pledgedSrcSize+1; +@@ -1024,9 +1164,9 @@ static void ZSTD_dedicatedDictSearch_revertCParams( + ZSTD_compressionParameters* cParams); + + /* +- * Initializes the local dict using the requested parameters. +- * NOTE: This does not use the pledged src size, because it may be used for more +- * than one compression. ++ * Initializes the local dictionary using requested parameters. ++ * NOTE: Initialization does not employ the pledged src size, ++ * because the dictionary may be used for multiple compressions. + */ + static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + { +@@ -1039,8 +1179,8 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + return 0; + } + if (dl->cdict != NULL) { +- assert(cctx->cdict == dl->cdict); + /* Local dictionary already initialized. */ ++ assert(cctx->cdict == dl->cdict); + return 0; + } + assert(dl->dictSize > 0); +@@ -1060,26 +1200,30 @@ static size_t ZSTD_initLocalDict(ZSTD_CCtx* cctx) + } + + size_t ZSTD_CCtx_loadDictionary_advanced( +- ZSTD_CCtx* cctx, const void* dict, size_t dictSize, +- ZSTD_dictLoadMethod_e dictLoadMethod, ZSTD_dictContentType_e dictContentType) ++ ZSTD_CCtx* cctx, ++ const void* dict, size_t dictSize, ++ ZSTD_dictLoadMethod_e dictLoadMethod, ++ ZSTD_dictContentType_e dictContentType) + { +- RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't load a dictionary when ctx is not in init stage."); + DEBUGLOG(4, "ZSTD_CCtx_loadDictionary_advanced (size: %u)", (U32)dictSize); +- ZSTD_clearAllDicts(cctx); /* in case one already exists */ +- if (dict == NULL || dictSize == 0) /* no dictionary mode */ ++ RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, ++ "Can't load a dictionary when cctx is not in init stage."); ++ ZSTD_clearAllDicts(cctx); /* erase any previously set dictionary */ ++ if (dict == NULL || dictSize == 0) /* no dictionary */ + return 0; + if (dictLoadMethod == ZSTD_dlm_byRef) { + cctx->localDict.dict = dict; + } else { ++ /* copy dictionary content inside CCtx to own its lifetime */ + void* dictBuffer; + RETURN_ERROR_IF(cctx->staticSize, memory_allocation, +- "no malloc for static CCtx"); ++ "static CCtx can't allocate for an internal copy of dictionary"); + dictBuffer = ZSTD_customMalloc(dictSize, cctx->customMem); +- RETURN_ERROR_IF(!dictBuffer, memory_allocation, "NULL pointer!"); ++ RETURN_ERROR_IF(dictBuffer==NULL, memory_allocation, ++ "allocation failed for dictionary content"); + ZSTD_memcpy(dictBuffer, dict, dictSize); +- cctx->localDict.dictBuffer = dictBuffer; +- cctx->localDict.dict = dictBuffer; ++ cctx->localDict.dictBuffer = dictBuffer; /* owned ptr to free */ ++ cctx->localDict.dict = dictBuffer; /* read-only reference */ + } + cctx->localDict.dictSize = dictSize; + cctx->localDict.dictContentType = dictContentType; +@@ -1149,8 +1293,9 @@ size_t ZSTD_CCtx_reset(ZSTD_CCtx* cctx, ZSTD_ResetDirective reset) + if ( (reset == ZSTD_reset_parameters) + || (reset == ZSTD_reset_session_and_parameters) ) { + RETURN_ERROR_IF(cctx->streamStage != zcss_init, stage_wrong, +- "Can't reset parameters only when not in 
init stage."); ++ "Reset parameters is only possible during init stage."); + ZSTD_clearAllDicts(cctx); ++ ZSTD_memset(&cctx->externalMatchCtx, 0, sizeof(cctx->externalMatchCtx)); + return ZSTD_CCtxParams_reset(&cctx->requestedParams); + } + return 0; +@@ -1247,7 +1392,8 @@ static ZSTD_compressionParameters + ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + unsigned long long srcSize, + size_t dictSize, +- ZSTD_cParamMode_e mode) ++ ZSTD_cParamMode_e mode, ++ ZSTD_paramSwitch_e useRowMatchFinder) + { + const U64 minSrcSize = 513; /* (1<<9) + 1 */ + const U64 maxWindowResize = 1ULL << (ZSTD_WINDOWLOG_MAX-1); +@@ -1281,8 +1427,8 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + } + + /* resize windowLog if input is small enough, to use less memory */ +- if ( (srcSize < maxWindowResize) +- && (dictSize < maxWindowResize) ) { ++ if ( (srcSize <= maxWindowResize) ++ && (dictSize <= maxWindowResize) ) { + U32 const tSize = (U32)(srcSize + dictSize); + static U32 const hashSizeMin = 1 << ZSTD_HASHLOG_MIN; + U32 const srcLog = (tSize < hashSizeMin) ? ZSTD_HASHLOG_MIN : +@@ -1300,6 +1446,42 @@ ZSTD_adjustCParams_internal(ZSTD_compressionParameters cPar, + if (cPar.windowLog < ZSTD_WINDOWLOG_ABSOLUTEMIN) + cPar.windowLog = ZSTD_WINDOWLOG_ABSOLUTEMIN; /* minimum wlog required for valid frame header */ + ++ /* We can't use more than 32 bits of hash in total, so that means that we require: ++ * (hashLog + 8) <= 32 && (chainLog + 8) <= 32 ++ */ ++ if (mode == ZSTD_cpm_createCDict && ZSTD_CDictIndicesAreTagged(&cPar)) { ++ U32 const maxShortCacheHashLog = 32 - ZSTD_SHORT_CACHE_TAG_BITS; ++ if (cPar.hashLog > maxShortCacheHashLog) { ++ cPar.hashLog = maxShortCacheHashLog; ++ } ++ if (cPar.chainLog > maxShortCacheHashLog) { ++ cPar.chainLog = maxShortCacheHashLog; ++ } ++ } ++ ++ ++ /* At this point, we aren't 100% sure if we are using the row match finder. ++ * Unless it is explicitly disabled, conservatively assume that it is enabled. ++ * In this case it will only be disabled for small sources, so shrinking the ++ * hash log a little bit shouldn't result in any ratio loss. ++ */ ++ if (useRowMatchFinder == ZSTD_ps_auto) ++ useRowMatchFinder = ZSTD_ps_enable; ++ ++ /* We can't hash more than 32-bits in total. 
So that means that we require: ++ * (hashLog - rowLog + 8) <= 32 ++ */ ++ if (ZSTD_rowMatchFinderUsed(cPar.strategy, useRowMatchFinder)) { ++ /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cPar.searchLog, 6); ++ U32 const maxRowHashLog = 32 - ZSTD_ROW_HASH_TAG_BITS; ++ U32 const maxHashLog = maxRowHashLog + rowLog; ++ assert(cPar.hashLog >= rowLog); ++ if (cPar.hashLog > maxHashLog) { ++ cPar.hashLog = maxHashLog; ++ } ++ } ++ + return cPar; + } + +@@ -1310,7 +1492,7 @@ ZSTD_adjustCParams(ZSTD_compressionParameters cPar, + { + cPar = ZSTD_clampCParams(cPar); /* resulting cPar is necessarily valid (all parameters within range) */ + if (srcSize == 0) srcSize = ZSTD_CONTENTSIZE_UNKNOWN; +- return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown); ++ return ZSTD_adjustCParams_internal(cPar, srcSize, dictSize, ZSTD_cpm_unknown, ZSTD_ps_auto); + } + + static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, unsigned long long srcSizeHint, size_t dictSize, ZSTD_cParamMode_e mode); +@@ -1341,7 +1523,7 @@ ZSTD_compressionParameters ZSTD_getCParamsFromCCtxParams( + ZSTD_overrideCParams(&cParams, &CCtxParams->cParams); + assert(!ZSTD_checkCParams(cParams)); + /* srcSizeHint == 0 means 0 */ +- return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cParams, srcSizeHint, dictSize, mode, CCtxParams->useRowMatchFinder); + } + + static size_t +@@ -1370,7 +1552,7 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_match_t)) + + ZSTD_cwksp_aligned_alloc_size((ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + size_t const lazyAdditionalSpace = ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder) +- ? ZSTD_cwksp_aligned_alloc_size(hSize*sizeof(U16)) ++ ? ZSTD_cwksp_aligned_alloc_size(hSize) + : 0; + size_t const optSpace = (forCCtx && (cParams->strategy >= ZSTD_btopt)) + ? optPotentialSpace +@@ -1386,6 +1568,13 @@ ZSTD_sizeof_matchState(const ZSTD_compressionParameters* const cParams, + return tableSpace + optSpace + slackSpace + lazyAdditionalSpace; + } + ++/* Helper function for calculating memory requirements. ++ * Gives a tighter bound than ZSTD_sequenceBound() by taking minMatch into account. */ ++static size_t ZSTD_maxNbSeq(size_t blockSize, unsigned minMatch, int useSequenceProducer) { ++ U32 const divider = (minMatch==3 || useSequenceProducer) ? 3 : 4; ++ return blockSize / divider; ++} ++ + static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_compressionParameters* cParams, + const ldmParams_t* ldmParams, +@@ -1393,12 +1582,13 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + const ZSTD_paramSwitch_e useRowMatchFinder, + const size_t buffInSize, + const size_t buffOutSize, +- const U64 pledgedSrcSize) ++ const U64 pledgedSrcSize, ++ int useSequenceProducer, ++ size_t maxBlockSize) + { + size_t const windowSize = (size_t) BOUNDED(1ULL, 1ULL << cParams->windowLog, pledgedSrcSize); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (cParams->minMatch==3) ? 
3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(maxBlockSize), windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, cParams->minMatch, useSequenceProducer); + size_t const tokenSpace = ZSTD_cwksp_alloc_size(WILDCOPY_OVERLENGTH + blockSize) + + ZSTD_cwksp_aligned_alloc_size(maxNbSeq * sizeof(seqDef)) + + 3 * ZSTD_cwksp_alloc_size(maxNbSeq * sizeof(BYTE)); +@@ -1417,6 +1607,11 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + + size_t const cctxSpace = isStatic ? ZSTD_cwksp_alloc_size(sizeof(ZSTD_CCtx)) : 0; + ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ size_t const externalSeqSpace = useSequenceProducer ++ ? ZSTD_cwksp_aligned_alloc_size(maxNbExternalSeq * sizeof(ZSTD_Sequence)) ++ : 0; ++ + size_t const neededSpace = + cctxSpace + + entropySpace + +@@ -1425,7 +1620,8 @@ static size_t ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ldmSeqSpace + + matchStateSize + + tokenSpace + +- bufferSpace; ++ bufferSpace + ++ externalSeqSpace; + + DEBUGLOG(5, "estimate workspace : %u", (U32)neededSpace); + return neededSpace; +@@ -1443,7 +1639,7 @@ size_t ZSTD_estimateCCtxSize_usingCCtxParams(const ZSTD_CCtx_params* params) + * be needed. However, we still allocate two 0-sized buffers, which can + * take space under ASAN. */ + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( +- &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN); ++ &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, 0, 0, ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + + size_t ZSTD_estimateCCtxSize_usingCParams(ZSTD_compressionParameters cParams) +@@ -1493,7 +1689,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + RETURN_ERROR_IF(params->nbWorkers > 0, GENERIC, "Estimate CCtx size is supported for single-threaded compression only."); + { ZSTD_compressionParameters const cParams = + ZSTD_getCParamsFromCCtxParams(params, ZSTD_CONTENTSIZE_UNKNOWN, 0, ZSTD_cpm_noAttachDict); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, (size_t)1 << cParams.windowLog); ++ size_t const blockSize = MIN(ZSTD_resolveMaxBlockSize(params->maxBlockSize), (size_t)1 << cParams.windowLog); + size_t const inBuffSize = (params->inBufferMode == ZSTD_bm_buffered) + ? 
((size_t)1 << cParams.windowLog) + blockSize + : 0; +@@ -1504,7 +1700,7 @@ size_t ZSTD_estimateCStreamSize_usingCCtxParams(const ZSTD_CCtx_params* params) + + return ZSTD_estimateCCtxSize_usingCCtxParams_internal( + &cParams, ¶ms->ldmParams, 1, useRowMatchFinder, inBuffSize, outBuffSize, +- ZSTD_CONTENTSIZE_UNKNOWN); ++ ZSTD_CONTENTSIZE_UNKNOWN, params->useSequenceProducer, params->maxBlockSize); + } + } + +@@ -1637,6 +1833,19 @@ typedef enum { + ZSTD_resetTarget_CCtx + } ZSTD_resetTarget_e; + ++/* Mixes bits in a 64 bits in a value, based on XXH3_rrmxmx */ ++static U64 ZSTD_bitmix(U64 val, U64 len) { ++ val ^= ZSTD_rotateRight_U64(val, 49) ^ ZSTD_rotateRight_U64(val, 24); ++ val *= 0x9FB21C651E98DF25ULL; ++ val ^= (val >> 35) + len ; ++ val *= 0x9FB21C651E98DF25ULL; ++ return val ^ (val >> 28); ++} ++ ++/* Mixes in the hashSalt and hashSaltEntropy to create a new hashSalt */ ++static void ZSTD_advanceHashSalt(ZSTD_matchState_t* ms) { ++ ms->hashSalt = ZSTD_bitmix(ms->hashSalt, 8) ^ ZSTD_bitmix((U64) ms->hashSaltEntropy, 4); ++} + + static size_t + ZSTD_reset_matchState(ZSTD_matchState_t* ms, +@@ -1664,6 +1873,7 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + } + + ms->hashLog3 = hashLog3; ++ ms->lazySkipping = 0; + + ZSTD_invalidateMatchState(ms); + +@@ -1685,6 +1895,27 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ZSTD_cwksp_clean_tables(ws); + } + ++ if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { ++ /* Row match finder needs an additional table of hashes ("tags") */ ++ size_t const tagTableSize = hSize; ++ /* We want to generate a new salt in case we reset a Cctx, but we always want to use ++ * 0 when we reset a Cdict */ ++ if(forWho == ZSTD_resetTarget_CCtx) { ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned_init_once(ws, tagTableSize); ++ ZSTD_advanceHashSalt(ms); ++ } else { ++ /* When we are not salting we want to always memset the memory */ ++ ms->tagTable = (BYTE*) ZSTD_cwksp_reserve_aligned(ws, tagTableSize); ++ ZSTD_memset(ms->tagTable, 0, tagTableSize); ++ ms->hashSalt = 0; ++ } ++ { /* Switch to 32-entry rows if searchLog is 5 (or more) */ ++ U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); ++ assert(cParams->hashLog >= rowLog); ++ ms->rowHashLog = cParams->hashLog - rowLog; ++ } ++ } ++ + /* opt parser space */ + if ((forWho == ZSTD_resetTarget_CCtx) && (cParams->strategy >= ZSTD_btopt)) { + DEBUGLOG(4, "reserving optimal parser space"); +@@ -1696,19 +1927,6 @@ ZSTD_reset_matchState(ZSTD_matchState_t* ms, + ms->opt.priceTable = (ZSTD_optimal_t*)ZSTD_cwksp_reserve_aligned(ws, (ZSTD_OPT_NUM+1) * sizeof(ZSTD_optimal_t)); + } + +- if (ZSTD_rowMatchFinderUsed(cParams->strategy, useRowMatchFinder)) { +- { /* Row match finder needs an additional table of hashes ("tags") */ +- size_t const tagTableSize = hSize*sizeof(U16); +- ms->tagTable = (U16*)ZSTD_cwksp_reserve_aligned(ws, tagTableSize); +- if (ms->tagTable) ZSTD_memset(ms->tagTable, 0, tagTableSize); +- } +- { /* Switch to 32-entry rows if searchLog is 5 (or more) */ +- U32 const rowLog = BOUNDED(4, cParams->searchLog, 6); +- assert(cParams->hashLog >= rowLog); +- ms->rowHashLog = cParams->hashLog - rowLog; +- } +- } +- + ms->cParams = *cParams; + + RETURN_ERROR_IF(ZSTD_cwksp_reserve_failed(ws), memory_allocation, +@@ -1768,6 +1986,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + assert(params->useRowMatchFinder != ZSTD_ps_auto); + assert(params->useBlockSplitter != ZSTD_ps_auto); + assert(params->ldmParams.enableLdm != ZSTD_ps_auto); ++ assert(params->maxBlockSize != 0); + if 
(params->ldmParams.enableLdm == ZSTD_ps_enable) { + /* Adjust long distance matching parameters */ + ZSTD_ldm_adjustParameters(&zc->appliedParams.ldmParams, ¶ms->cParams); +@@ -1776,9 +1995,8 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + } + + { size_t const windowSize = MAX(1, (size_t)MIN(((U64)1 << params->cParams.windowLog), pledgedSrcSize)); +- size_t const blockSize = MIN(ZSTD_BLOCKSIZE_MAX, windowSize); +- U32 const divider = (params->cParams.minMatch==3) ? 3 : 4; +- size_t const maxNbSeq = blockSize / divider; ++ size_t const blockSize = MIN(params->maxBlockSize, windowSize); ++ size_t const maxNbSeq = ZSTD_maxNbSeq(blockSize, params->cParams.minMatch, params->useSequenceProducer); + size_t const buffOutSize = (zbuff == ZSTDb_buffered && params->outBufferMode == ZSTD_bm_buffered) + ? ZSTD_compressBound(blockSize) + 1 + : 0; +@@ -1795,7 +2013,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + size_t const neededSpace = + ZSTD_estimateCCtxSize_usingCCtxParams_internal( + ¶ms->cParams, ¶ms->ldmParams, zc->staticSize != 0, params->useRowMatchFinder, +- buffInSize, buffOutSize, pledgedSrcSize); ++ buffInSize, buffOutSize, pledgedSrcSize, params->useSequenceProducer, params->maxBlockSize); + int resizeWorkspace; + + FORWARD_IF_ERROR(neededSpace, "cctx size estimate failed!"); +@@ -1838,6 +2056,7 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + /* init params */ + zc->blockState.matchState.cParams = params->cParams; ++ zc->blockState.matchState.prefetchCDictTables = params->prefetchCDictTables == ZSTD_ps_enable; + zc->pledgedSrcSizePlusOne = pledgedSrcSize+1; + zc->consumedSrcSize = 0; + zc->producedCSize = 0; +@@ -1854,13 +2073,46 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + + ZSTD_reset_compressedBlockState(zc->blockState.prevCBlock); + ++ FORWARD_IF_ERROR(ZSTD_reset_matchState( ++ &zc->blockState.matchState, ++ ws, ++ ¶ms->cParams, ++ params->useRowMatchFinder, ++ crp, ++ needsIndexReset, ++ ZSTD_resetTarget_CCtx), ""); ++ ++ zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); ++ ++ /* ldm hash table */ ++ if (params->ldmParams.enableLdm == ZSTD_ps_enable) { ++ /* TODO: avoid memset? */ ++ size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; ++ zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); ++ ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); ++ zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); ++ zc->maxNbLdmSequences = maxNbLdmSeq; ++ ++ ZSTD_window_init(&zc->ldmState.window); ++ zc->ldmState.loadedDictEnd = 0; ++ } ++ ++ /* reserve space for block-level external sequences */ ++ if (params->useSequenceProducer) { ++ size_t const maxNbExternalSeq = ZSTD_sequenceBound(blockSize); ++ zc->externalMatchCtx.seqBufferCapacity = maxNbExternalSeq; ++ zc->externalMatchCtx.seqBuffer = ++ (ZSTD_Sequence*)ZSTD_cwksp_reserve_aligned(ws, maxNbExternalSeq * sizeof(ZSTD_Sequence)); ++ } ++ ++ /* buffers */ ++ + /* ZSTD_wildcopy() is used to copy into the literals buffer, + * so we have to oversize the buffer by WILDCOPY_OVERLENGTH bytes. 
+ */ + zc->seqStore.litStart = ZSTD_cwksp_reserve_buffer(ws, blockSize + WILDCOPY_OVERLENGTH); + zc->seqStore.maxNbLit = blockSize; + +- /* buffers */ + zc->bufferedPolicy = zbuff; + zc->inBuffSize = buffInSize; + zc->inBuff = (char*)ZSTD_cwksp_reserve_buffer(ws, buffInSize); +@@ -1883,32 +2135,9 @@ static size_t ZSTD_resetCCtx_internal(ZSTD_CCtx* zc, + zc->seqStore.llCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.mlCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); + zc->seqStore.ofCode = ZSTD_cwksp_reserve_buffer(ws, maxNbSeq * sizeof(BYTE)); +- zc->seqStore.sequencesStart = (seqDef*)ZSTD_cwksp_reserve_aligned(ws, maxNbSeq * sizeof(seqDef)); +- +- FORWARD_IF_ERROR(ZSTD_reset_matchState( +- &zc->blockState.matchState, +- ws, +- ¶ms->cParams, +- params->useRowMatchFinder, +- crp, +- needsIndexReset, +- ZSTD_resetTarget_CCtx), ""); +- +- /* ldm hash table */ +- if (params->ldmParams.enableLdm == ZSTD_ps_enable) { +- /* TODO: avoid memset? */ +- size_t const ldmHSize = ((size_t)1) << params->ldmParams.hashLog; +- zc->ldmState.hashTable = (ldmEntry_t*)ZSTD_cwksp_reserve_aligned(ws, ldmHSize * sizeof(ldmEntry_t)); +- ZSTD_memset(zc->ldmState.hashTable, 0, ldmHSize * sizeof(ldmEntry_t)); +- zc->ldmSequences = (rawSeq*)ZSTD_cwksp_reserve_aligned(ws, maxNbLdmSeq * sizeof(rawSeq)); +- zc->maxNbLdmSequences = maxNbLdmSeq; +- +- ZSTD_window_init(&zc->ldmState.window); +- zc->ldmState.loadedDictEnd = 0; +- } + + DEBUGLOG(3, "wksp: finished allocating, %zd bytes remain available", ZSTD_cwksp_available_space(ws)); +- assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace, resizeWorkspace)); ++ assert(ZSTD_cwksp_estimated_space_within_bounds(ws, neededSpace)); + + zc->initialized = 1; + +@@ -1980,7 +2209,8 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + } + + params.cParams = ZSTD_adjustCParams_internal(adjusted_cdict_cParams, pledgedSrcSize, +- cdict->dictContentSize, ZSTD_cpm_attachDict); ++ cdict->dictContentSize, ZSTD_cpm_attachDict, ++ params.useRowMatchFinder); + params.cParams.windowLog = windowLog; + params.useRowMatchFinder = cdict->useRowMatchFinder; /* cdict overrides */ + FORWARD_IF_ERROR(ZSTD_resetCCtx_internal(cctx, ¶ms, pledgedSrcSize, +@@ -2019,6 +2249,22 @@ ZSTD_resetCCtx_byAttachingCDict(ZSTD_CCtx* cctx, + return 0; + } + ++static void ZSTD_copyCDictTableIntoCCtx(U32* dst, U32 const* src, size_t tableSize, ++ ZSTD_compressionParameters const* cParams) { ++ if (ZSTD_CDictIndicesAreTagged(cParams)){ ++ /* Remove tags from the CDict table if they are present. ++ * See docs on "short cache" in zstd_compress_internal.h for context. 
*/ ++ size_t i; ++ for (i = 0; i < tableSize; i++) { ++ U32 const taggedIndex = src[i]; ++ U32 const index = taggedIndex >> ZSTD_SHORT_CACHE_TAG_BITS; ++ dst[i] = index; ++ } ++ } else { ++ ZSTD_memcpy(dst, src, tableSize * sizeof(U32)); ++ } ++} ++ + static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, + ZSTD_CCtx_params params, +@@ -2054,21 +2300,23 @@ static size_t ZSTD_resetCCtx_byCopyingCDict(ZSTD_CCtx* cctx, + : 0; + size_t const hSize = (size_t)1 << cdict_cParams->hashLog; + +- ZSTD_memcpy(cctx->blockState.matchState.hashTable, +- cdict->matchState.hashTable, +- hSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.hashTable, ++ cdict->matchState.hashTable, ++ hSize, cdict_cParams); ++ + /* Do not copy cdict's chainTable if cctx has parameters such that it would not use chainTable */ + if (ZSTD_allocateChainTable(cctx->appliedParams.cParams.strategy, cctx->appliedParams.useRowMatchFinder, 0 /* forDDSDict */)) { +- ZSTD_memcpy(cctx->blockState.matchState.chainTable, +- cdict->matchState.chainTable, +- chainSize * sizeof(U32)); ++ ZSTD_copyCDictTableIntoCCtx(cctx->blockState.matchState.chainTable, ++ cdict->matchState.chainTable, ++ chainSize, cdict_cParams); + } + /* copy tag table */ + if (ZSTD_rowMatchFinderUsed(cdict_cParams->strategy, cdict->useRowMatchFinder)) { +- size_t const tagTableSize = hSize*sizeof(U16); ++ size_t const tagTableSize = hSize; + ZSTD_memcpy(cctx->blockState.matchState.tagTable, +- cdict->matchState.tagTable, +- tagTableSize); ++ cdict->matchState.tagTable, ++ tagTableSize); ++ cctx->blockState.matchState.hashSalt = cdict->matchState.hashSalt; + } + } + +@@ -2147,6 +2395,7 @@ static size_t ZSTD_copyCCtx_internal(ZSTD_CCtx* dstCCtx, + params.useBlockSplitter = srcCCtx->appliedParams.useBlockSplitter; + params.ldmParams = srcCCtx->appliedParams.ldmParams; + params.fParams = fParams; ++ params.maxBlockSize = srcCCtx->appliedParams.maxBlockSize; + ZSTD_resetCCtx_internal(dstCCtx, ¶ms, pledgedSrcSize, + /* loadedDictSize */ 0, + ZSTDcrp_leaveDirty, zbuff); +@@ -2294,7 +2543,7 @@ static void ZSTD_reduceIndex (ZSTD_matchState_t* ms, ZSTD_CCtx_params const* par + + /* See doc/zstd_compression_format.md for detailed format description */ + +-void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) ++int ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + { + const seqDef* const sequences = seqStorePtr->sequencesStart; + BYTE* const llCodeTable = seqStorePtr->llCode; +@@ -2302,18 +2551,24 @@ void ZSTD_seqToCodes(const seqStore_t* seqStorePtr) + BYTE* const mlCodeTable = seqStorePtr->mlCode; + U32 const nbSeq = (U32)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + U32 u; ++ int longOffsets = 0; + assert(nbSeq <= seqStorePtr->maxNbSeq); + for (u=0; u= STREAM_ACCUMULATOR_MIN)); ++ if (MEM_32bits() && ofCode >= STREAM_ACCUMULATOR_MIN) ++ longOffsets = 1; + } + if (seqStorePtr->longLengthType==ZSTD_llt_literalLength) + llCodeTable[seqStorePtr->longLengthPos] = MaxLL; + if (seqStorePtr->longLengthType==ZSTD_llt_matchLength) + mlCodeTable[seqStorePtr->longLengthPos] = MaxML; ++ return longOffsets; + } + + /* ZSTD_useTargetCBlockSize(): +@@ -2347,6 +2602,7 @@ typedef struct { + U32 MLtype; + size_t size; + size_t lastCountSize; /* Accounts for bug in 1.3.4. 
More detail in ZSTD_entropyCompressSeqStore_internal() */ ++ int longOffsets; + } ZSTD_symbolEncodingTypeStats_t; + + /* ZSTD_buildSequencesStatistics(): +@@ -2357,11 +2613,13 @@ typedef struct { + * entropyWkspSize must be of size at least ENTROPY_WORKSPACE_SIZE - (MaxSeq + 1)*sizeof(U32) + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, +- const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, +- BYTE* dst, const BYTE* const dstEnd, +- ZSTD_strategy strategy, unsigned* countWorkspace, +- void* entropyWorkspace, size_t entropyWkspSize) { ++ZSTD_buildSequencesStatistics( ++ const seqStore_t* seqStorePtr, size_t nbSeq, ++ const ZSTD_fseCTables_t* prevEntropy, ZSTD_fseCTables_t* nextEntropy, ++ BYTE* dst, const BYTE* const dstEnd, ++ ZSTD_strategy strategy, unsigned* countWorkspace, ++ void* entropyWorkspace, size_t entropyWkspSize) ++{ + BYTE* const ostart = dst; + const BYTE* const oend = dstEnd; + BYTE* op = ostart; +@@ -2375,7 +2633,7 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + + stats.lastCountSize = 0; + /* convert length/distances into codes */ +- ZSTD_seqToCodes(seqStorePtr); ++ stats.longOffsets = ZSTD_seqToCodes(seqStorePtr); + assert(op <= oend); + assert(nbSeq != 0); /* ZSTD_selectEncodingType() divides by nbSeq */ + /* build CTable for Literal Lengths */ +@@ -2480,22 +2738,22 @@ ZSTD_buildSequencesStatistics(seqStore_t* seqStorePtr, size_t nbSeq, + */ + #define SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO 20 + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- void* entropyWorkspace, size_t entropyWkspSize, +- const int bmi2) ++ZSTD_entropyCompressSeqStore_internal( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ const int bmi2) + { +- const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + ZSTD_strategy const strategy = cctxParams->cParams.strategy; + unsigned* count = (unsigned*)entropyWorkspace; + FSE_CTable* CTable_LitLength = nextEntropy->fse.litlengthCTable; + FSE_CTable* CTable_OffsetBits = nextEntropy->fse.offcodeCTable; + FSE_CTable* CTable_MatchLength = nextEntropy->fse.matchlengthCTable; + const seqDef* const sequences = seqStorePtr->sequencesStart; +- const size_t nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ const size_t nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + const BYTE* const ofCodeTable = seqStorePtr->ofCode; + const BYTE* const llCodeTable = seqStorePtr->llCode; + const BYTE* const mlCodeTable = seqStorePtr->mlCode; +@@ -2503,29 +2761,31 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + BYTE* const oend = ostart + dstCapacity; + BYTE* op = ostart; + size_t lastCountSize; ++ int longOffsets = 0; + + entropyWorkspace = count + (MaxSeq + 1); + entropyWkspSize -= (MaxSeq + 1) * sizeof(*count); + +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu)", nbSeq); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore_internal (nbSeq=%zu, dstCapacity=%zu)", nbSeq, dstCapacity); + ZSTD_STATIC_ASSERT(HUF_WORKSPACE_SIZE >= (1<<MAX(MLFSELog,LLFSELog))); + assert(entropyWkspSize >= HUF_WORKSPACE_SIZE); + + /* Compress literals */ + { const BYTE*
const literals = seqStorePtr->litStart; +- size_t const numSequences = seqStorePtr->sequences - seqStorePtr->sequencesStart; +- size_t const numLiterals = seqStorePtr->lit - seqStorePtr->litStart; ++ size_t const numSequences = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); ++ size_t const numLiterals = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); + /* Base suspicion of uncompressibility on ratio of literals to sequences */ + unsigned const suspectUncompressible = (numSequences == 0) || (numLiterals / numSequences >= SUSPECT_UNCOMPRESSIBLE_LITERAL_RATIO); + size_t const litSize = (size_t)(seqStorePtr->lit - literals); ++ + size_t const cSize = ZSTD_compressLiterals( +- &prevEntropy->huf, &nextEntropy->huf, +- cctxParams->cParams.strategy, +- ZSTD_literalsCompressionIsDisabled(cctxParams), + op, dstCapacity, + literals, litSize, + entropyWorkspace, entropyWkspSize, +- bmi2, suspectUncompressible); ++ &prevEntropy->huf, &nextEntropy->huf, ++ cctxParams->cParams.strategy, ++ ZSTD_literalsCompressionIsDisabled(cctxParams), ++ suspectUncompressible, bmi2); + FORWARD_IF_ERROR(cSize, "ZSTD_compressLiterals failed"); + assert(cSize <= dstCapacity); + op += cSize; +@@ -2551,11 +2811,10 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + ZSTD_memcpy(&nextEntropy->fse, &prevEntropy->fse, sizeof(prevEntropy->fse)); + return (size_t)(op - ostart); + } +- { +- ZSTD_symbolEncodingTypeStats_t stats; +- BYTE* seqHead = op++; ++ { BYTE* const seqHead = op++; + /* build stats for sequences */ +- stats = ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, ++ const ZSTD_symbolEncodingTypeStats_t stats = ++ ZSTD_buildSequencesStatistics(seqStorePtr, nbSeq, + &prevEntropy->fse, &nextEntropy->fse, + op, oend, + strategy, count, +@@ -2564,6 +2823,7 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + *seqHead = (BYTE)((stats.LLtype<<6) + (stats.Offtype<<4) + (stats.MLtype<<2)); + lastCountSize = stats.lastCountSize; + op += stats.size; ++ longOffsets = stats.longOffsets; + } + + { size_t const bitstreamSize = ZSTD_encodeSequences( +@@ -2598,14 +2858,15 @@ ZSTD_entropyCompressSeqStore_internal(seqStore_t* seqStorePtr, + } + + MEM_STATIC size_t +-ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- size_t srcSize, +- void* entropyWorkspace, size_t entropyWkspSize, +- int bmi2) ++ZSTD_entropyCompressSeqStore( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ size_t srcSize, ++ void* entropyWorkspace, size_t entropyWkspSize, ++ int bmi2) + { + size_t const cSize = ZSTD_entropyCompressSeqStore_internal( + seqStorePtr, prevEntropy, nextEntropy, cctxParams, +@@ -2615,15 +2876,21 @@ ZSTD_entropyCompressSeqStore(seqStore_t* seqStorePtr, + /* When srcSize <= dstCapacity, there is enough space to write a raw uncompressed block. + * Since we ran out of space, block must be not compressible, so fall back to raw uncompressed block. 
+ */ +- if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) ++ if ((cSize == ERROR(dstSize_tooSmall)) & (srcSize <= dstCapacity)) { ++ DEBUGLOG(4, "not enough dstCapacity (%zu) for ZSTD_entropyCompressSeqStore_internal()=> do not compress block", dstCapacity); + return 0; /* block not compressed */ ++ } + FORWARD_IF_ERROR(cSize, "ZSTD_entropyCompressSeqStore_internal failed"); + + /* Check compressibility */ + { size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, cctxParams->cParams.strategy); + if (cSize >= maxCSize) return 0; /* block not compressed */ + } +- DEBUGLOG(4, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ DEBUGLOG(5, "ZSTD_entropyCompressSeqStore() cSize: %zu", cSize); ++ /* libzstd decoder before > v1.5.4 is not compatible with compressed blocks of size ZSTD_BLOCKSIZE_MAX exactly. ++ * This restriction is indirectly already fulfilled by respecting ZSTD_minGain() condition above. ++ */ ++ assert(cSize < ZSTD_BLOCKSIZE_MAX); + return cSize; + } + +@@ -2718,6 +2985,72 @@ void ZSTD_resetSeqStore(seqStore_t* ssPtr) + ssPtr->longLengthType = ZSTD_llt_none; + } + ++/* ZSTD_postProcessSequenceProducerResult() : ++ * Validates and post-processes sequences obtained through the external matchfinder API: ++ * - Checks whether nbExternalSeqs represents an error condition. ++ * - Appends a block delimiter to outSeqs if one is not already present. ++ * See zstd.h for context regarding block delimiters. ++ * Returns the number of sequences after post-processing, or an error code. */ ++static size_t ZSTD_postProcessSequenceProducerResult( ++ ZSTD_Sequence* outSeqs, size_t nbExternalSeqs, size_t outSeqsCapacity, size_t srcSize ++) { ++ RETURN_ERROR_IF( ++ nbExternalSeqs > outSeqsCapacity, ++ sequenceProducer_failed, ++ "External sequence producer returned error code %lu", ++ (unsigned long)nbExternalSeqs ++ ); ++ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == 0 && srcSize > 0, ++ sequenceProducer_failed, ++ "Got zero sequences from external sequence producer for a non-empty src buffer!" ++ ); ++ ++ if (srcSize == 0) { ++ ZSTD_memset(&outSeqs[0], 0, sizeof(ZSTD_Sequence)); ++ return 1; ++ } ++ ++ { ++ ZSTD_Sequence const lastSeq = outSeqs[nbExternalSeqs - 1]; ++ ++ /* We can return early if lastSeq is already a block delimiter. */ ++ if (lastSeq.offset == 0 && lastSeq.matchLength == 0) { ++ return nbExternalSeqs; ++ } ++ ++ /* This error condition is only possible if the external matchfinder ++ * produced an invalid parse, by definition of ZSTD_sequenceBound(). */ ++ RETURN_ERROR_IF( ++ nbExternalSeqs == outSeqsCapacity, ++ sequenceProducer_failed, ++ "nbExternalSeqs == outSeqsCapacity but lastSeq is not a block delimiter!" ++ ); ++ ++ /* lastSeq is not a block delimiter, so we need to append one. */ ++ ZSTD_memset(&outSeqs[nbExternalSeqs], 0, sizeof(ZSTD_Sequence)); ++ return nbExternalSeqs + 1; ++ } ++} ++ ++/* ZSTD_fastSequenceLengthSum() : ++ * Returns sum(litLen) + sum(matchLen) + lastLits for *seqBuf*. ++ * Similar to another function in zstd_compress.c (determine_blockSize), ++ * except it doesn't check for a block delimiter to end summation. ++ * Removing the early exit allows the compiler to auto-vectorize (https://godbolt.org/z/cY1cajz9P). ++ * This function can be deleted and replaced by determine_blockSize after we resolve issue #3456. 
*/ ++static size_t ZSTD_fastSequenceLengthSum(ZSTD_Sequence const* seqBuf, size_t seqBufSize) { ++ size_t matchLenSum, litLenSum, i; ++ matchLenSum = 0; ++ litLenSum = 0; ++ for (i = 0; i < seqBufSize; i++) { ++ litLenSum += seqBuf[i].litLength; ++ matchLenSum += seqBuf[i].matchLength; ++ } ++ return litLenSum + matchLenSum; ++} ++ + typedef enum { ZSTDbss_compress, ZSTDbss_noCompress } ZSTD_buildSeqStore_e; + + static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) +@@ -2727,7 +3060,9 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + /* Assert that we have correctly flushed the ctx params into the ms's copy */ + ZSTD_assertEqualCParams(zc->appliedParams.cParams, ms->cParams); +- if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (srcSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + if (zc->appliedParams.cParams.strategy >= ZSTD_btopt) { + ZSTD_ldm_skipRawSeqStoreBytes(&zc->externSeqStore, srcSize); + } else { +@@ -2763,6 +3098,15 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } + if (zc->externSeqStore.pos < zc->externSeqStore.size) { + assert(zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_disable); ++ ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." ++ ); ++ + /* Updates ldmSeqStore.pos */ + lastLLSize = + ZSTD_ldm_blockCompress(&zc->externSeqStore, +@@ -2774,6 +3118,14 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + } else if (zc->appliedParams.ldmParams.enableLdm == ZSTD_ps_enable) { + rawSeqStore_t ldmSeqStore = kNullRawSeqStore; + ++ /* External matchfinder + LDM is technically possible, just not implemented yet. ++ * We need to revisit soon and implement it. */ ++ RETURN_ERROR_IF( ++ zc->appliedParams.useSequenceProducer, ++ parameter_combination_unsupported, ++ "Long-distance matching with external sequence producer enabled is not currently supported." 
++ ); ++ + ldmSeqStore.seq = zc->ldmSequences; + ldmSeqStore.capacity = zc->maxNbLdmSequences; + /* Updates ldmSeqStore.size */ +@@ -2788,7 +3140,68 @@ static size_t ZSTD_buildSeqStore(ZSTD_CCtx* zc, const void* src, size_t srcSize) + zc->appliedParams.useRowMatchFinder, + src, srcSize); + assert(ldmSeqStore.pos == ldmSeqStore.size); +- } else { /* not long range mode */ ++ } else if (zc->appliedParams.useSequenceProducer) { ++ assert( ++ zc->externalMatchCtx.seqBufferCapacity >= ZSTD_sequenceBound(srcSize) ++ ); ++ assert(zc->externalMatchCtx.mFinder != NULL); ++ ++ { U32 const windowSize = (U32)1 << zc->appliedParams.cParams.windowLog; ++ ++ size_t const nbExternalSeqs = (zc->externalMatchCtx.mFinder)( ++ zc->externalMatchCtx.mState, ++ zc->externalMatchCtx.seqBuffer, ++ zc->externalMatchCtx.seqBufferCapacity, ++ src, srcSize, ++ NULL, 0, /* dict and dictSize, currently not supported */ ++ zc->appliedParams.compressionLevel, ++ windowSize ++ ); ++ ++ size_t const nbPostProcessedSeqs = ZSTD_postProcessSequenceProducerResult( ++ zc->externalMatchCtx.seqBuffer, ++ nbExternalSeqs, ++ zc->externalMatchCtx.seqBufferCapacity, ++ srcSize ++ ); ++ ++ /* Return early if there is no error, since we don't need to worry about last literals */ ++ if (!ZSTD_isError(nbPostProcessedSeqs)) { ++ ZSTD_sequencePosition seqPos = {0,0,0}; ++ size_t const seqLenSum = ZSTD_fastSequenceLengthSum(zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs); ++ RETURN_ERROR_IF(seqLenSum > srcSize, externalSequences_invalid, "External sequences imply too large a block!"); ++ FORWARD_IF_ERROR( ++ ZSTD_copySequencesToSeqStoreExplicitBlockDelim( ++ zc, &seqPos, ++ zc->externalMatchCtx.seqBuffer, nbPostProcessedSeqs, ++ src, srcSize, ++ zc->appliedParams.searchForExternalRepcodes ++ ), ++ "Failed to copy external sequences to seqStore!" ++ ); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG(5, "Copied %lu sequences from external sequence producer to internal seqStore.", (unsigned long)nbExternalSeqs); ++ return ZSTDbss_compress; ++ } ++ ++ /* Propagate the error if fallback is disabled */ ++ if (!zc->appliedParams.enableMatchFinderFallback) { ++ return nbPostProcessedSeqs; ++ } ++ ++ /* Fallback to software matchfinder */ ++ { ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, ++ zc->appliedParams.useRowMatchFinder, ++ dictMode); ++ ms->ldmSeqStore = NULL; ++ DEBUGLOG( ++ 5, ++ "External sequence producer returned error code %lu. 
Falling back to internal parser.", ++ (unsigned long)nbExternalSeqs ++ ); ++ lastLLSize = blockCompressor(ms, &zc->seqStore, zc->blockState.nextCBlock->rep, src, srcSize); ++ } } ++ } else { /* not long range mode and no external matchfinder */ + ZSTD_blockCompressor const blockCompressor = ZSTD_selectBlockCompressor(zc->appliedParams.cParams.strategy, + zc->appliedParams.useRowMatchFinder, + dictMode); +@@ -2849,7 +3262,7 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + /* seqStoreSeqs[i].offset == offCode+1, and ZSTD_updateRep() expects offCode + so we provide seqStoreSeqs[i].offset - 1 */ + ZSTD_updateRep(updatedRepcodes.rep, +- seqStoreSeqs[i].offBase - 1, ++ seqStoreSeqs[i].offBase, + seqStoreSeqs[i].litLength == 0); + literalsRead += outSeqs[i].litLength; + } +@@ -2865,6 +3278,10 @@ static void ZSTD_copyBlockSequences(ZSTD_CCtx* zc) + zc->seqCollector.seqIndex += seqStoreSeqSize; + } + ++size_t ZSTD_sequenceBound(size_t srcSize) { ++ return (srcSize / ZSTD_MINMATCH_MIN) + 1; ++} ++ + size_t ZSTD_generateSequences(ZSTD_CCtx* zc, ZSTD_Sequence* outSeqs, + size_t outSeqsSize, const void* src, size_t srcSize) + { +@@ -2910,19 +3327,17 @@ static int ZSTD_isRLE(const BYTE* src, size_t length) { + const size_t unrollMask = unrollSize - 1; + const size_t prefixLength = length & unrollMask; + size_t i; +- size_t u; + if (length == 1) return 1; + /* Check if prefix is RLE first before using unrolled loop */ + if (prefixLength && ZSTD_count(ip+1, ip, ip+prefixLength) != prefixLength-1) { + return 0; + } + for (i = prefixLength; i != length; i += unrollSize) { ++ size_t u; + for (u = 0; u < unrollSize; u += sizeof(size_t)) { + if (MEM_readST(ip + i + u) != valueST) { + return 0; +- } +- } +- } ++ } } } + return 1; + } + +@@ -2938,7 +3353,8 @@ static int ZSTD_maybeRLE(seqStore_t const* seqStore) + return nbSeqs < 4 && nbLits < 10; + } + +-static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) ++static void ++ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* const bs) + { + ZSTD_compressedBlockState_t* const tmp = bs->prevCBlock; + bs->prevCBlock = bs->nextCBlock; +@@ -2946,7 +3362,9 @@ static void ZSTD_blockState_confirmRepcodesAndEntropyTables(ZSTD_blockState_t* c + } + + /* Writes the block header */ +-static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) { ++static void ++writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastBlock) ++{ + U32 const cBlockHeader = cSize == 1 ? + lastBlock + (((U32)bt_rle)<<1) + (U32)(blockSize << 3) : + lastBlock + (((U32)bt_compressed)<<1) + (U32)(cSize << 3); +@@ -2959,13 +3377,16 @@ static void writeBlockHeader(void* op, size_t cSize, size_t blockSize, U32 lastB + * Stores literals block type (raw, rle, compressed, repeat) and + * huffman description table to hufMetadata. 
+ * Requires ENTROPY_WORKSPACE_SIZE workspace +- * @return : size of huffman description table or error code */ +-static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, +- const ZSTD_hufCTables_t* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_hufCTablesMetadata_t* hufMetadata, +- const int literalsCompressionIsDisabled, +- void* workspace, size_t wkspSize) ++ * @return : size of huffman description table, or an error code ++ */ ++static size_t ++ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const int literalsCompressionIsDisabled, ++ void* workspace, size_t wkspSize, ++ int hufFlags) + { + BYTE* const wkspStart = (BYTE*)workspace; + BYTE* const wkspEnd = wkspStart + wkspSize; +@@ -2973,9 +3394,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + unsigned* const countWksp = (unsigned*)workspace; + const size_t countWkspSize = (HUF_SYMBOLVALUE_MAX + 1) * sizeof(unsigned); + BYTE* const nodeWksp = countWkspStart + countWkspSize; +- const size_t nodeWkspSize = wkspEnd-nodeWksp; ++ const size_t nodeWkspSize = (size_t)(wkspEnd - nodeWksp); + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +- unsigned huffLog = HUF_TABLELOG_DEFAULT; ++ unsigned huffLog = LitHufLog; + HUF_repeat repeat = prevHuf->repeatMode; + DEBUGLOG(5, "ZSTD_buildBlockEntropyStats_literals (srcSize=%zu)", srcSize); + +@@ -2990,73 +3411,77 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + + /* small ? don't even attempt compression (speed opt) */ + #ifndef COMPRESS_LITERALS_SIZE_MIN +-#define COMPRESS_LITERALS_SIZE_MIN 63 ++# define COMPRESS_LITERALS_SIZE_MIN 63 /* heuristic */ + #endif + { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 
6 : COMPRESS_LITERALS_SIZE_MIN; + if (srcSize <= minLitSize) { + DEBUGLOG(5, "set_basic - too small"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Scan input and build symbol stats */ +- { size_t const largest = HIST_count_wksp (countWksp, &maxSymbolValue, (const BYTE*)src, srcSize, workspace, wkspSize); ++ { size_t const largest = ++ HIST_count_wksp (countWksp, &maxSymbolValue, ++ (const BYTE*)src, srcSize, ++ workspace, wkspSize); + FORWARD_IF_ERROR(largest, "HIST_count_wksp failed"); + if (largest == srcSize) { ++ /* only one literal symbol */ + DEBUGLOG(5, "set_rle"); + hufMetadata->hType = set_rle; + return 0; + } + if (largest <= (srcSize >> 7)+4) { ++ /* heuristic: likely not compressible */ + DEBUGLOG(5, "set_basic - no gain"); + hufMetadata->hType = set_basic; + return 0; +- } +- } ++ } } + + /* Validate the previous Huffman table */ +- if (repeat == HUF_repeat_check && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { ++ if (repeat == HUF_repeat_check ++ && !HUF_validateCTable((HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue)) { + repeat = HUF_repeat_none; + } + + /* Build Huffman Tree */ + ZSTD_memset(nextHuf->CTable, 0, sizeof(nextHuf->CTable)); +- huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue); ++ huffLog = HUF_optimalTableLog(huffLog, srcSize, maxSymbolValue, nodeWksp, nodeWkspSize, nextHuf->CTable, countWksp, hufFlags); ++ assert(huffLog <= LitHufLog); + { size_t const maxBits = HUF_buildCTable_wksp((HUF_CElt*)nextHuf->CTable, countWksp, + maxSymbolValue, huffLog, + nodeWksp, nodeWkspSize); + FORWARD_IF_ERROR(maxBits, "HUF_buildCTable_wksp"); + huffLog = (U32)maxBits; +- { /* Build and write the CTable */ +- size_t const newCSize = HUF_estimateCompressedSize( +- (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); +- size_t const hSize = HUF_writeCTable_wksp( +- hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), +- (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, +- nodeWksp, nodeWkspSize); +- /* Check against repeating the previous CTable */ +- if (repeat != HUF_repeat_none) { +- size_t const oldCSize = HUF_estimateCompressedSize( +- (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); +- if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { +- DEBUGLOG(5, "set_repeat - smaller"); +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_repeat; +- return 0; +- } +- } +- if (newCSize + hSize >= srcSize) { +- DEBUGLOG(5, "set_basic - no gains"); ++ } ++ { /* Build and write the CTable */ ++ size_t const newCSize = HUF_estimateCompressedSize( ++ (HUF_CElt*)nextHuf->CTable, countWksp, maxSymbolValue); ++ size_t const hSize = HUF_writeCTable_wksp( ++ hufMetadata->hufDesBuffer, sizeof(hufMetadata->hufDesBuffer), ++ (HUF_CElt*)nextHuf->CTable, maxSymbolValue, huffLog, ++ nodeWksp, nodeWkspSize); ++ /* Check against repeating the previous CTable */ ++ if (repeat != HUF_repeat_none) { ++ size_t const oldCSize = HUF_estimateCompressedSize( ++ (HUF_CElt const*)prevHuf->CTable, countWksp, maxSymbolValue); ++ if (oldCSize < srcSize && (oldCSize <= hSize + newCSize || hSize + 12 >= srcSize)) { ++ DEBUGLOG(5, "set_repeat - smaller"); + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- hufMetadata->hType = set_basic; ++ hufMetadata->hType = set_repeat; + return 0; +- } +- DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); +- hufMetadata->hType = set_compressed; +- nextHuf->repeatMode = HUF_repeat_check; +- return 
hSize; ++ } } ++ if (newCSize + hSize >= srcSize) { ++ DEBUGLOG(5, "set_basic - no gains"); ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ hufMetadata->hType = set_basic; ++ return 0; + } ++ DEBUGLOG(5, "set_compressed (hSize=%u)", (U32)hSize); ++ hufMetadata->hType = set_compressed; ++ nextHuf->repeatMode = HUF_repeat_check; ++ return hSize; + } + } + +@@ -3066,8 +3491,9 @@ static size_t ZSTD_buildBlockEntropyStats_literals(void* const src, size_t srcSi + * and updates nextEntropy to the appropriate repeatMode. + */ + static ZSTD_symbolEncodingTypeStats_t +-ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { +- ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0}; ++ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) ++{ ++ ZSTD_symbolEncodingTypeStats_t stats = {set_basic, set_basic, set_basic, 0, 0, 0}; + nextEntropy->litlength_repeatMode = FSE_repeat_none; + nextEntropy->offcode_repeatMode = FSE_repeat_none; + nextEntropy->matchlength_repeatMode = FSE_repeat_none; +@@ -3078,16 +3504,18 @@ ZSTD_buildDummySequencesStatistics(ZSTD_fseCTables_t* nextEntropy) { + * Builds entropy for the sequences. + * Stores symbol compression modes and fse table to fseMetadata. + * Requires ENTROPY_WORKSPACE_SIZE wksp. +- * @return : size of fse tables or error code */ +-static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, +- const ZSTD_fseCTables_t* prevEntropy, +- ZSTD_fseCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize) ++ * @return : size of fse tables or error code */ ++static size_t ++ZSTD_buildBlockEntropyStats_sequences( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_fseCTables_t* prevEntropy, ++ ZSTD_fseCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize) + { + ZSTD_strategy const strategy = cctxParams->cParams.strategy; +- size_t const nbSeq = seqStorePtr->sequences - seqStorePtr->sequencesStart; ++ size_t const nbSeq = (size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart); + BYTE* const ostart = fseMetadata->fseTablesBuffer; + BYTE* const oend = ostart + sizeof(fseMetadata->fseTablesBuffer); + BYTE* op = ostart; +@@ -3114,23 +3542,28 @@ static size_t ZSTD_buildBlockEntropyStats_sequences(seqStore_t* seqStorePtr, + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. + * Requires workspace size ENTROPY_WORKSPACE_SIZE +- * +- * @return : 0 on success or error code ++ * @return : 0 on success, or an error code ++ * Note : also employed in superblock + */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize) +-{ +- size_t const litSize = seqStorePtr->lit - seqStorePtr->litStart; ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize) ++{ ++ size_t const litSize = (size_t)(seqStorePtr->lit - seqStorePtr->litStart); ++ int const huf_useOptDepth = (cctxParams->cParams.strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD); ++ int const hufFlags = huf_useOptDepth ? 
HUF_flags_optimalDepth : 0; ++ + entropyMetadata->hufMetadata.hufDesSize = + ZSTD_buildBlockEntropyStats_literals(seqStorePtr->litStart, litSize, + &prevEntropy->huf, &nextEntropy->huf, + &entropyMetadata->hufMetadata, + ZSTD_literalsCompressionIsDisabled(cctxParams), +- workspace, wkspSize); ++ workspace, wkspSize, hufFlags); ++ + FORWARD_IF_ERROR(entropyMetadata->hufMetadata.hufDesSize, "ZSTD_buildBlockEntropyStats_literals failed"); + entropyMetadata->fseMetadata.fseTablesSize = + ZSTD_buildBlockEntropyStats_sequences(seqStorePtr, +@@ -3143,11 +3576,12 @@ size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, + } + + /* Returns the size estimate for the literals section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, +- const ZSTD_hufCTables_t* huf, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSize, ++ const ZSTD_hufCTables_t* huf, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + unsigned* const countWksp = (unsigned*)workspace; + unsigned maxSymbolValue = HUF_SYMBOLVALUE_MAX; +@@ -3169,12 +3603,13 @@ static size_t ZSTD_estimateBlockSize_literal(const BYTE* literals, size_t litSiz + } + + /* Returns the size estimate for the FSE-compressed symbols (of, ml, ll) of a block */ +-static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, +- const BYTE* codeTable, size_t nbSeq, unsigned maxCode, +- const FSE_CTable* fseCTable, +- const U8* additionalBits, +- short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, +- void* workspace, size_t wkspSize) ++static size_t ++ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, ++ const BYTE* codeTable, size_t nbSeq, unsigned maxCode, ++ const FSE_CTable* fseCTable, ++ const U8* additionalBits, ++ short const* defaultNorm, U32 defaultNormLog, U32 defaultMax, ++ void* workspace, size_t wkspSize) + { + unsigned* const countWksp = (unsigned*)workspace; + const BYTE* ctp = codeTable; +@@ -3206,99 +3641,107 @@ static size_t ZSTD_estimateBlockSize_symbolType(symbolEncodingType_e type, + } + + /* Returns the size estimate for the sequences section (header + content) of a block */ +-static size_t ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- void* workspace, size_t wkspSize, +- int writeEntropy) ++static size_t ++ZSTD_estimateBlockSize_sequences(const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ void* workspace, size_t wkspSize, ++ int writeEntropy) + { + size_t sequencesSectionHeaderSize = 1 /* seqHead */ + 1 /* min seqSize size */ + (nbSeq >= 128) + (nbSeq >= LONGNBSEQ); + size_t cSeqSizeEstimate = 0; + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->ofType, ofCodeTable, nbSeq, MaxOff, +- fseTables->offcodeCTable, NULL, +- OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, +- workspace, wkspSize); ++ fseTables->offcodeCTable, NULL, ++ OF_defaultNorm, OF_defaultNormLog, DefaultMaxOff, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->llType, llCodeTable, nbSeq, MaxLL, +- 
fseTables->litlengthCTable, LL_bits, +- LL_defaultNorm, LL_defaultNormLog, MaxLL, +- workspace, wkspSize); ++ fseTables->litlengthCTable, LL_bits, ++ LL_defaultNorm, LL_defaultNormLog, MaxLL, ++ workspace, wkspSize); + cSeqSizeEstimate += ZSTD_estimateBlockSize_symbolType(fseMetadata->mlType, mlCodeTable, nbSeq, MaxML, +- fseTables->matchlengthCTable, ML_bits, +- ML_defaultNorm, ML_defaultNormLog, MaxML, +- workspace, wkspSize); ++ fseTables->matchlengthCTable, ML_bits, ++ ML_defaultNorm, ML_defaultNormLog, MaxML, ++ workspace, wkspSize); + if (writeEntropy) cSeqSizeEstimate += fseMetadata->fseTablesSize; + return cSeqSizeEstimate + sequencesSectionHeaderSize; + } + + /* Returns the size estimate for a given stream of literals, of, ll, ml */ +-static size_t ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, +- const BYTE* ofCodeTable, +- const BYTE* llCodeTable, +- const BYTE* mlCodeTable, +- size_t nbSeq, +- const ZSTD_entropyCTables_t* entropy, +- const ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize, +- int writeLitEntropy, int writeSeqEntropy) { ++static size_t ++ZSTD_estimateBlockSize(const BYTE* literals, size_t litSize, ++ const BYTE* ofCodeTable, ++ const BYTE* llCodeTable, ++ const BYTE* mlCodeTable, ++ size_t nbSeq, ++ const ZSTD_entropyCTables_t* entropy, ++ const ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize, ++ int writeLitEntropy, int writeSeqEntropy) ++{ + size_t const literalsSize = ZSTD_estimateBlockSize_literal(literals, litSize, +- &entropy->huf, &entropyMetadata->hufMetadata, +- workspace, wkspSize, writeLitEntropy); ++ &entropy->huf, &entropyMetadata->hufMetadata, ++ workspace, wkspSize, writeLitEntropy); + size_t const seqSize = ZSTD_estimateBlockSize_sequences(ofCodeTable, llCodeTable, mlCodeTable, +- nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, +- workspace, wkspSize, writeSeqEntropy); ++ nbSeq, &entropy->fse, &entropyMetadata->fseMetadata, ++ workspace, wkspSize, writeSeqEntropy); + return seqSize + literalsSize + ZSTD_blockHeaderSize; + } + + /* Builds entropy statistics and uses them for blocksize estimation. + * +- * Returns the estimated compressed size of the seqStore, or a zstd error. ++ * @return: estimated compressed size of the seqStore, or a zstd error. 
+ */ +-static size_t ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) { +- ZSTD_entropyCTablesMetadata_t* entropyMetadata = &zc->blockSplitCtx.entropyMetadata; ++static size_t ++ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(seqStore_t* seqStore, ZSTD_CCtx* zc) ++{ ++ ZSTD_entropyCTablesMetadata_t* const entropyMetadata = &zc->blockSplitCtx.entropyMetadata; + DEBUGLOG(6, "ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize()"); + FORWARD_IF_ERROR(ZSTD_buildBlockEntropyStats(seqStore, + &zc->blockState.prevCBlock->entropy, + &zc->blockState.nextCBlock->entropy, + &zc->appliedParams, + entropyMetadata, +- zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */), ""); +- return ZSTD_estimateBlockSize(seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE), ""); ++ return ZSTD_estimateBlockSize( ++ seqStore->litStart, (size_t)(seqStore->lit - seqStore->litStart), + seqStore->ofCode, seqStore->llCode, seqStore->mlCode, + (size_t)(seqStore->sequences - seqStore->sequencesStart), +- &zc->blockState.nextCBlock->entropy, entropyMetadata, zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, ++ &zc->blockState.nextCBlock->entropy, ++ entropyMetadata, ++ zc->entropyWorkspace, ENTROPY_WORKSPACE_SIZE, + (int)(entropyMetadata->hufMetadata.hType == set_compressed), 1); + } + + /* Returns literals bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreLiteralsBytes(const seqStore_t* const seqStore) ++{ + size_t literalsBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { +- seqDef seq = seqStore->sequencesStart[i]; ++ seqDef const seq = seqStore->sequencesStart[i]; + literalsBytes += seq.litLength; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_literalLength) { + literalsBytes += 0x10000; +- } +- } ++ } } + return literalsBytes; + } + + /* Returns match bytes represented in a seqStore */ +-static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { ++static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) ++{ + size_t matchBytes = 0; +- size_t const nbSeqs = seqStore->sequences - seqStore->sequencesStart; ++ size_t const nbSeqs = (size_t)(seqStore->sequences - seqStore->sequencesStart); + size_t i; + for (i = 0; i < nbSeqs; ++i) { + seqDef seq = seqStore->sequencesStart[i]; + matchBytes += seq.mlBase + MINMATCH; + if (i == seqStore->longLengthPos && seqStore->longLengthType == ZSTD_llt_matchLength) { + matchBytes += 0x10000; +- } +- } ++ } } + return matchBytes; + } + +@@ -3307,15 +3750,12 @@ static size_t ZSTD_countSeqStoreMatchBytes(const seqStore_t* const seqStore) { + */ + static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + const seqStore_t* originalSeqStore, +- size_t startIdx, size_t endIdx) { +- BYTE* const litEnd = originalSeqStore->lit; +- size_t literalsBytes; +- size_t literalsBytesPreceding = 0; +- ++ size_t startIdx, size_t endIdx) ++{ + *resultSeqStore = *originalSeqStore; + if (startIdx > 0) { + resultSeqStore->sequences = originalSeqStore->sequencesStart + startIdx; +- literalsBytesPreceding = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->litStart += ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); + } + + /* Move 
longLengthPos into the correct position if necessary */ +@@ -3328,13 +3768,12 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + resultSeqStore->sequencesStart = originalSeqStore->sequencesStart + startIdx; + resultSeqStore->sequences = originalSeqStore->sequencesStart + endIdx; +- literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); +- resultSeqStore->litStart += literalsBytesPreceding; + if (endIdx == (size_t)(originalSeqStore->sequences - originalSeqStore->sequencesStart)) { + /* This accounts for possible last literals if the derived chunk reaches the end of the block */ +- resultSeqStore->lit = litEnd; ++ assert(resultSeqStore->lit == originalSeqStore->lit); + } else { +- resultSeqStore->lit = resultSeqStore->litStart+literalsBytes; ++ size_t const literalsBytes = ZSTD_countSeqStoreLiteralsBytes(resultSeqStore); ++ resultSeqStore->lit = resultSeqStore->litStart + literalsBytes; + } + resultSeqStore->llCode += startIdx; + resultSeqStore->mlCode += startIdx; +@@ -3342,20 +3781,26 @@ static void ZSTD_deriveSeqStoreChunk(seqStore_t* resultSeqStore, + } + + /* +- * Returns the raw offset represented by the combination of offCode, ll0, and repcode history. +- * offCode must represent a repcode in the numeric representation of ZSTD_storeSeq(). ++ * Returns the raw offset represented by the combination of offBase, ll0, and repcode history. ++ * offBase must represent a repcode in the numeric representation of ZSTD_storeSeq(). + */ + static U32 +-ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, const U32 ll0) +-{ +- U32 const adjustedOffCode = STORED_REPCODE(offCode) - 1 + ll0; /* [ 0 - 3 ] */ +- assert(STORED_IS_REPCODE(offCode)); +- if (adjustedOffCode == ZSTD_REP_NUM) { +- /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 */ +- assert(rep[0] > 0); ++ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offBase, const U32 ll0) ++{ ++ U32 const adjustedRepCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; /* [ 0 - 3 ] */ ++ assert(OFFBASE_IS_REPCODE(offBase)); ++ if (adjustedRepCode == ZSTD_REP_NUM) { ++ assert(ll0); ++ /* litlength == 0 and offCode == 2 implies selection of first repcode - 1 ++ * This is only valid if it results in a valid offset value, aka > 0. ++ * Note : it may happen that `rep[0]==1` in exceptional circumstances. ++ * In which case this function will return 0, which is an invalid offset. ++ * It's not an issue though, since this value will be ++ * compared and discarded within ZSTD_seqStore_resolveOffCodes(). ++ */ + return rep[0] - 1; + } +- return rep[adjustedOffCode]; ++ return rep[adjustedRepCode]; + } + + /* +@@ -3371,30 +3816,33 @@ ZSTD_resolveRepcodeToRawOffset(const U32 rep[ZSTD_REP_NUM], const U32 offCode, c + * 1-3 : repcode 1-3 + * 4+ : real_offset+3 + */ +-static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, +- seqStore_t* const seqStore, U32 const nbSeq) { ++static void ++ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_t* const cRepcodes, ++ const seqStore_t* const seqStore, U32 const nbSeq) ++{ + U32 idx = 0; ++ U32 const longLitLenIdx = seqStore->longLengthType == ZSTD_llt_literalLength ? 
seqStore->longLengthPos : nbSeq; + for (; idx < nbSeq; ++idx) { + seqDef* const seq = seqStore->sequencesStart + idx; +- U32 const ll0 = (seq->litLength == 0); +- U32 const offCode = OFFBASE_TO_STORED(seq->offBase); +- assert(seq->offBase > 0); +- if (STORED_IS_REPCODE(offCode)) { +- U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offCode, ll0); +- U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offCode, ll0); ++ U32 const ll0 = (seq->litLength == 0) && (idx != longLitLenIdx); ++ U32 const offBase = seq->offBase; ++ assert(offBase > 0); ++ if (OFFBASE_IS_REPCODE(offBase)) { ++ U32 const dRawOffset = ZSTD_resolveRepcodeToRawOffset(dRepcodes->rep, offBase, ll0); ++ U32 const cRawOffset = ZSTD_resolveRepcodeToRawOffset(cRepcodes->rep, offBase, ll0); + /* Adjust simulated decompression repcode history if we come across a mismatch. Replace + * the repcode with the offset it actually references, determined by the compression + * repcode history. + */ + if (dRawOffset != cRawOffset) { +- seq->offBase = cRawOffset + ZSTD_REP_NUM; ++ seq->offBase = OFFSET_TO_OFFBASE(cRawOffset); + } + } + /* Compression repcode history is always updated with values directly from the unmodified seqStore. + * Decompression repcode history may use modified seq->offset value taken from compression repcode history. + */ +- ZSTD_updateRep(dRepcodes->rep, OFFBASE_TO_STORED(seq->offBase), ll0); +- ZSTD_updateRep(cRepcodes->rep, offCode, ll0); ++ ZSTD_updateRep(dRepcodes->rep, seq->offBase, ll0); ++ ZSTD_updateRep(cRepcodes->rep, offBase, ll0); + } + } + +@@ -3404,10 +3852,11 @@ static void ZSTD_seqStore_resolveOffCodes(repcodes_t* const dRepcodes, repcodes_ + * Returns the total size of that block (including header) or a ZSTD error code. + */ + static size_t +-ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, seqStore_t* const seqStore, ++ZSTD_compressSeqStore_singleBlock(ZSTD_CCtx* zc, ++ const seqStore_t* const seqStore, + repcodes_t* const dRep, repcodes_t* const cRep, + void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, ++ const void* src, size_t srcSize, + U32 lastBlock, U32 isPartition) + { + const U32 rleMaxLength = 25; +@@ -3481,45 +3930,49 @@ typedef struct { + + /* Helper function to perform the recursive search for block splits. + * Estimates the cost of seqStore prior to split, and estimates the cost of splitting the sequences in half. +- * If advantageous to split, then we recurse down the two sub-blocks. If not, or if an error occurred in estimation, then +- * we do not recurse. ++ * If advantageous to split, then we recurse down the two sub-blocks. ++ * If not, or if an error occurred in estimation, then we do not recurse. + * +- * Note: The recursion depth is capped by a heuristic minimum number of sequences, defined by MIN_SEQUENCES_BLOCK_SPLITTING. ++ * Note: The recursion depth is capped by a heuristic minimum number of sequences, ++ * defined by MIN_SEQUENCES_BLOCK_SPLITTING. + * In theory, this means the absolute largest recursion depth is 10 == log2(maxNbSeqInBlock/MIN_SEQUENCES_BLOCK_SPLITTING). + * In practice, recursion depth usually doesn't go beyond 4. + * +- * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize ++ * Furthermore, the number of splits is capped by ZSTD_MAX_NB_BLOCK_SPLITS. ++ * At ZSTD_MAX_NB_BLOCK_SPLITS == 196 with the current existing blockSize + * maximum of 128 KB, this value is actually impossible to reach. 
+ */ + static void + ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t endIdx, + ZSTD_CCtx* zc, const seqStore_t* origSeqStore) + { +- seqStore_t* fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; +- seqStore_t* firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; +- seqStore_t* secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; ++ seqStore_t* const fullSeqStoreChunk = &zc->blockSplitCtx.fullSeqStoreChunk; ++ seqStore_t* const firstHalfSeqStore = &zc->blockSplitCtx.firstHalfSeqStore; ++ seqStore_t* const secondHalfSeqStore = &zc->blockSplitCtx.secondHalfSeqStore; + size_t estimatedOriginalSize; + size_t estimatedFirstHalfSize; + size_t estimatedSecondHalfSize; + size_t midIdx = (startIdx + endIdx)/2; + ++ DEBUGLOG(5, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); ++ assert(endIdx >= startIdx); + if (endIdx - startIdx < MIN_SEQUENCES_BLOCK_SPLITTING || splits->idx >= ZSTD_MAX_NB_BLOCK_SPLITS) { +- DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences"); ++ DEBUGLOG(6, "ZSTD_deriveBlockSplitsHelper: Too few sequences (%zu)", endIdx - startIdx); + return; + } +- DEBUGLOG(4, "ZSTD_deriveBlockSplitsHelper: startIdx=%zu endIdx=%zu", startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(fullSeqStoreChunk, origSeqStore, startIdx, endIdx); + ZSTD_deriveSeqStoreChunk(firstHalfSeqStore, origSeqStore, startIdx, midIdx); + ZSTD_deriveSeqStoreChunk(secondHalfSeqStore, origSeqStore, midIdx, endIdx); + estimatedOriginalSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(fullSeqStoreChunk, zc); + estimatedFirstHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(firstHalfSeqStore, zc); + estimatedSecondHalfSize = ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(secondHalfSeqStore, zc); +- DEBUGLOG(4, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", ++ DEBUGLOG(5, "Estimated original block size: %zu -- First half split: %zu -- Second half split: %zu", + estimatedOriginalSize, estimatedFirstHalfSize, estimatedSecondHalfSize); + if (ZSTD_isError(estimatedOriginalSize) || ZSTD_isError(estimatedFirstHalfSize) || ZSTD_isError(estimatedSecondHalfSize)) { + return; + } + if (estimatedFirstHalfSize + estimatedSecondHalfSize < estimatedOriginalSize) { ++ DEBUGLOG(5, "split decided at seqNb:%zu", midIdx); + ZSTD_deriveBlockSplitsHelper(splits, startIdx, midIdx, zc, origSeqStore); + splits->splitLocations[splits->idx] = (U32)midIdx; + splits->idx++; +@@ -3527,14 +3980,18 @@ ZSTD_deriveBlockSplitsHelper(seqStoreSplits* splits, size_t startIdx, size_t end + } + } + +-/* Base recursive function. Populates a table with intra-block partition indices that can improve compression ratio. ++/* Base recursive function. ++ * Populates a table with intra-block partition indices that can improve compression ratio. + * +- * Returns the number of splits made (which equals the size of the partition table - 1). ++ * @return: number of splits made (which equals the size of the partition table - 1). 
+ */ +-static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) { +- seqStoreSplits splits = {partitions, 0}; ++static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) ++{ ++ seqStoreSplits splits; ++ splits.splitLocations = partitions; ++ splits.idx = 0; + if (nbSeq <= 4) { +- DEBUGLOG(4, "ZSTD_deriveBlockSplits: Too few sequences to split"); ++ DEBUGLOG(5, "ZSTD_deriveBlockSplits: Too few sequences to split (%u <= 4)", nbSeq); + /* Refuse to try and split anything with less than 4 sequences */ + return 0; + } +@@ -3550,18 +4007,20 @@ static size_t ZSTD_deriveBlockSplits(ZSTD_CCtx* zc, U32 partitions[], U32 nbSeq) + * Returns combined size of all blocks (which includes headers), or a ZSTD error code. + */ + static size_t +-ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapacity, +- const void* src, size_t blockSize, U32 lastBlock, U32 nbSeq) ++ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t blockSize, ++ U32 lastBlock, U32 nbSeq) + { + size_t cSize = 0; + const BYTE* ip = (const BYTE*)src; + BYTE* op = (BYTE*)dst; + size_t i = 0; + size_t srcBytesTotal = 0; +- U32* partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ +- seqStore_t* nextSeqStore = &zc->blockSplitCtx.nextSeqStore; +- seqStore_t* currSeqStore = &zc->blockSplitCtx.currSeqStore; +- size_t numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); ++ U32* const partitions = zc->blockSplitCtx.partitions; /* size == ZSTD_MAX_NB_BLOCK_SPLITS */ ++ seqStore_t* const nextSeqStore = &zc->blockSplitCtx.nextSeqStore; ++ seqStore_t* const currSeqStore = &zc->blockSplitCtx.currSeqStore; ++ size_t const numSplits = ZSTD_deriveBlockSplits(zc, partitions, nbSeq); + + /* If a block is split and some partitions are emitted as RLE/uncompressed, then repcode history + * may become invalid. 
In order to reconcile potentially invalid repcodes, we keep track of two +@@ -3583,30 +4042,31 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + ZSTD_memcpy(cRep.rep, zc->blockState.prevCBlock->rep, sizeof(repcodes_t)); + ZSTD_memset(nextSeqStore, 0, sizeof(seqStore_t)); + +- DEBUGLOG(4, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", ++ DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal (dstCapacity=%u, dictLimit=%u, nextToUpdate=%u)", + (unsigned)dstCapacity, (unsigned)zc->blockState.matchState.window.dictLimit, + (unsigned)zc->blockState.matchState.nextToUpdate); + + if (numSplits == 0) { +- size_t cSizeSingleBlock = ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, +- &dRep, &cRep, +- op, dstCapacity, +- ip, blockSize, +- lastBlock, 0 /* isPartition */); ++ size_t cSizeSingleBlock = ++ ZSTD_compressSeqStore_singleBlock(zc, &zc->seqStore, ++ &dRep, &cRep, ++ op, dstCapacity, ++ ip, blockSize, ++ lastBlock, 0 /* isPartition */); + FORWARD_IF_ERROR(cSizeSingleBlock, "Compressing single block from splitBlock_internal() failed!"); + DEBUGLOG(5, "ZSTD_compressBlock_splitBlock_internal: No splits"); +- assert(cSizeSingleBlock <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(zc->blockSize <= ZSTD_BLOCKSIZE_MAX); ++ assert(cSizeSingleBlock <= zc->blockSize + ZSTD_blockHeaderSize); + return cSizeSingleBlock; + } + + ZSTD_deriveSeqStoreChunk(currSeqStore, &zc->seqStore, 0, partitions[0]); + for (i = 0; i <= numSplits; ++i) { +- size_t srcBytes; + size_t cSizeChunk; + U32 const lastPartition = (i == numSplits); + U32 lastBlockEntireSrc = 0; + +- srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); ++ size_t srcBytes = ZSTD_countSeqStoreLiteralsBytes(currSeqStore) + ZSTD_countSeqStoreMatchBytes(currSeqStore); + srcBytesTotal += srcBytes; + if (lastPartition) { + /* This is the final partition, need to account for possible last literals */ +@@ -3621,7 +4081,8 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + op, dstCapacity, + ip, srcBytes, + lastBlockEntireSrc, 1 /* isPartition */); +- DEBUGLOG(5, "Estimated size: %zu actual size: %zu", ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); ++ DEBUGLOG(5, "Estimated size: %zu vs %zu : actual size", ++ ZSTD_buildEntropyStatisticsAndEstimateSubBlockSize(currSeqStore, zc), cSizeChunk); + FORWARD_IF_ERROR(cSizeChunk, "Compressing chunk failed!"); + + ip += srcBytes; +@@ -3629,10 +4090,10 @@ ZSTD_compressBlock_splitBlock_internal(ZSTD_CCtx* zc, void* dst, size_t dstCapac + dstCapacity -= cSizeChunk; + cSize += cSizeChunk; + *currSeqStore = *nextSeqStore; +- assert(cSizeChunk <= ZSTD_BLOCKSIZE_MAX + ZSTD_blockHeaderSize); ++ assert(cSizeChunk <= zc->blockSize + ZSTD_blockHeaderSize); + } +- /* cRep and dRep may have diverged during the compression. If so, we use the dRep repcodes +- * for the next block. ++ /* cRep and dRep may have diverged during the compression. ++ * If so, we use the dRep repcodes for the next block. 
+ */ + ZSTD_memcpy(zc->blockState.prevCBlock->rep, dRep.rep, sizeof(repcodes_t)); + return cSize; +@@ -3643,8 +4104,6 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 lastBlock) + { +- const BYTE* ip = (const BYTE*)src; +- BYTE* op = (BYTE*)dst; + U32 nbSeq; + size_t cSize; + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock"); +@@ -3655,7 +4114,7 @@ ZSTD_compressBlock_splitBlock(ZSTD_CCtx* zc, + if (bss == ZSTDbss_noCompress) { + if (zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode == FSE_repeat_valid) + zc->blockState.prevCBlock->entropy.fse.offcode_repeatMode = FSE_repeat_check; +- cSize = ZSTD_noCompressBlock(op, dstCapacity, ip, srcSize, lastBlock); ++ cSize = ZSTD_noCompressBlock(dst, dstCapacity, src, srcSize, lastBlock); + FORWARD_IF_ERROR(cSize, "ZSTD_noCompressBlock failed"); + DEBUGLOG(4, "ZSTD_compressBlock_splitBlock: Nocompress block"); + return cSize; +@@ -3673,9 +4132,9 @@ ZSTD_compressBlock_internal(ZSTD_CCtx* zc, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, U32 frame) + { +- /* This the upper bound for the length of an rle block. +- * This isn't the actual upper bound. Finding the real threshold +- * needs further investigation. ++ /* This is an estimated upper bound for the length of an rle block. ++ * This isn't the actual upper bound. ++ * Finding the real threshold needs further investigation. + */ + const U32 rleMaxLength = 25; + size_t cSize; +@@ -3767,10 +4226,11 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + * * cSize >= blockBound(srcSize): We have expanded the block too much so + * emit an uncompressed block. + */ +- { +- size_t const cSize = ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); ++ { size_t const cSize = ++ ZSTD_compressSuperBlock(zc, dst, dstCapacity, src, srcSize, lastBlock); + if (cSize != ERROR(dstSize_tooSmall)) { +- size_t const maxCSize = srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); ++ size_t const maxCSize = ++ srcSize - ZSTD_minGain(srcSize, zc->appliedParams.cParams.strategy); + FORWARD_IF_ERROR(cSize, "ZSTD_compressSuperBlock failed"); + if (cSize != 0 && cSize < maxCSize + ZSTD_blockHeaderSize) { + ZSTD_blockState_confirmRepcodesAndEntropyTables(&zc->blockState); +@@ -3778,7 +4238,7 @@ static size_t ZSTD_compressBlock_targetCBlockSize_body(ZSTD_CCtx* zc, + } + } + } +- } ++ } /* if (bss == ZSTDbss_compress)*/ + + DEBUGLOG(6, "Resorting to ZSTD_noCompressBlock()"); + /* Superblock compression failed, attempt to emit a single no compress block. +@@ -3836,7 +4296,7 @@ static void ZSTD_overflowCorrectIfNeeded(ZSTD_matchState_t* ms, + * All blocks will be terminated, all input will be consumed. + * Function will issue an error if there is not enough `dstCapacity` to hold the compressed content. + * Frame is supposed already started (header already produced) +-* @return : compressed size, or an error code ++* @return : compressed size, or an error code + */ + static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, +@@ -3860,7 +4320,9 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + ZSTD_matchState_t* const ms = &cctx->blockState.matchState; + U32 const lastBlock = lastFrameChunk & (blockSize >= remaining); + +- RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE, ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. 
We need to revisit and change this logic to be more consistent */ ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize + MIN_CBLOCK_SIZE + 1, + dstSize_tooSmall, + "not enough space to store compressed block"); + if (remaining < blockSize) blockSize = remaining; +@@ -3899,7 +4361,7 @@ static size_t ZSTD_compress_frameChunk(ZSTD_CCtx* cctx, + MEM_writeLE24(op, cBlockHeader); + cSize += ZSTD_blockHeaderSize; + } +- } ++ } /* if (ZSTD_useTargetCBlockSize(&cctx->appliedParams))*/ + + + ip += blockSize; +@@ -4078,31 +4540,51 @@ static size_t ZSTD_compressContinue_internal (ZSTD_CCtx* cctx, + } + } + +-size_t ZSTD_compressContinue (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressContinue (srcSize=%u)", (unsigned)srcSize); + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 1 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressContinue_public() */ ++size_t ZSTD_compressContinue(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressContinue_public(cctx, dst, dstCapacity, src, srcSize); ++} + +-size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++static size_t ZSTD_getBlockSize_deprecated(const ZSTD_CCtx* cctx) + { + ZSTD_compressionParameters const cParams = cctx->appliedParams.cParams; + assert(!ZSTD_checkCParams(cParams)); +- return MIN (ZSTD_BLOCKSIZE_MAX, (U32)1 << cParams.windowLog); ++ return MIN(cctx->appliedParams.maxBlockSize, (size_t)1 << cParams.windowLog); + } + +-size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++/* NOTE: Must just wrap ZSTD_getBlockSize_deprecated() */ ++size_t ZSTD_getBlockSize(const ZSTD_CCtx* cctx) ++{ ++ return ZSTD_getBlockSize_deprecated(cctx); ++} ++ ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) + { + DEBUGLOG(5, "ZSTD_compressBlock: srcSize = %u", (unsigned)srcSize); +- { size_t const blockSizeMax = ZSTD_getBlockSize(cctx); ++ { size_t const blockSizeMax = ZSTD_getBlockSize_deprecated(cctx); + RETURN_ERROR_IF(srcSize > blockSizeMax, srcSize_wrong, "input is larger than a block"); } + + return ZSTD_compressContinue_internal(cctx, dst, dstCapacity, src, srcSize, 0 /* frame mode */, 0 /* last chunk */); + } + ++/* NOTE: Must just wrap ZSTD_compressBlock_deprecated() */ ++size_t ZSTD_compressBlock(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize) ++{ ++ return ZSTD_compressBlock_deprecated(cctx, dst, dstCapacity, src, srcSize); ++} ++ + /*! 
ZSTD_loadDictionaryContent() : + * @return : 0, or an error code + */ +@@ -4111,25 +4593,36 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + ZSTD_cwksp* ws, + ZSTD_CCtx_params const* params, + const void* src, size_t srcSize, +- ZSTD_dictTableLoadMethod_e dtlm) ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) + { + const BYTE* ip = (const BYTE*) src; + const BYTE* const iend = ip + srcSize; + int const loadLdmDict = params->ldmParams.enableLdm == ZSTD_ps_enable && ls != NULL; + +- /* Assert that we the ms params match the params we're being given */ ++ /* Assert that the ms params match the params we're being given */ + ZSTD_assertEqualCParams(params->cParams, ms->cParams); + +- if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ { /* Ensure large dictionaries can't cause index overflow */ ++ + /* Allow the dictionary to set indices up to exactly ZSTD_CURRENT_MAX. + * Dictionaries right at the edge will immediately trigger overflow + * correction, but I don't want to insert extra constraints here. + */ +- U32 const maxDictSize = ZSTD_CURRENT_MAX - 1; +- /* We must have cleared our windows when our source is this large. */ +- assert(ZSTD_window_isEmpty(ms->window)); +- if (loadLdmDict) +- assert(ZSTD_window_isEmpty(ls->window)); ++ U32 maxDictSize = ZSTD_CURRENT_MAX - ZSTD_WINDOW_START_INDEX; ++ ++ int const CDictTaggedIndices = ZSTD_CDictIndicesAreTagged(¶ms->cParams); ++ if (CDictTaggedIndices && tfp == ZSTD_tfp_forCDict) { ++ /* Some dictionary matchfinders in zstd use "short cache", ++ * which treats the lower ZSTD_SHORT_CACHE_TAG_BITS of each ++ * CDict hashtable entry as a tag rather than as part of an index. ++ * When short cache is used, we need to truncate the dictionary ++ * so that its indices don't overlap with the tag. */ ++ U32 const shortCacheMaxDictSize = (1u << (32 - ZSTD_SHORT_CACHE_TAG_BITS)) - ZSTD_WINDOW_START_INDEX; ++ maxDictSize = MIN(maxDictSize, shortCacheMaxDictSize); ++ assert(!loadLdmDict); ++ } ++ + /* If the dictionary is too large, only load the suffix of the dictionary. */ + if (srcSize > maxDictSize) { + ip = iend - maxDictSize; +@@ -4138,30 +4631,46 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } + } + +- DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ if (srcSize > ZSTD_CHUNKSIZE_MAX) { ++ /* We must have cleared our windows when our source is this large. */ ++ assert(ZSTD_window_isEmpty(ms->window)); ++ if (loadLdmDict) assert(ZSTD_window_isEmpty(ls->window)); ++ } + ZSTD_window_update(&ms->window, src, srcSize, /* forceNonContiguous */ 0); +- ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); +- ms->forceNonContiguous = params->deterministicRefPrefix; + +- if (loadLdmDict) { ++ DEBUGLOG(4, "ZSTD_loadDictionaryContent(): useRowMatchFinder=%d", (int)params->useRowMatchFinder); ++ ++ if (loadLdmDict) { /* Load the entire dict into LDM matchfinders. */ + ZSTD_window_update(&ls->window, src, srcSize, /* forceNonContiguous */ 0); + ls->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ls->window.base); ++ ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); ++ } ++ ++ /* If the dict is larger than we can reasonably index in our tables, only load the suffix. 
*/ ++ if (params->cParams.strategy < ZSTD_btultra) { ++ U32 maxDictSize = 8U << MIN(MAX(params->cParams.hashLog, params->cParams.chainLog), 28); ++ if (srcSize > maxDictSize) { ++ ip = iend - maxDictSize; ++ src = ip; ++ srcSize = maxDictSize; ++ } + } + ++ ms->nextToUpdate = (U32)(ip - ms->window.base); ++ ms->loadedDictEnd = params->forceWindow ? 0 : (U32)(iend - ms->window.base); ++ ms->forceNonContiguous = params->deterministicRefPrefix; ++ + if (srcSize <= HASH_READ_SIZE) return 0; + + ZSTD_overflowCorrectIfNeeded(ms, ws, params, ip, iend); + +- if (loadLdmDict) +- ZSTD_ldm_fillHashTable(ls, ip, iend, ¶ms->ldmParams); +- + switch(params->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, dtlm); ++ ZSTD_fillHashTable(ms, iend, dtlm, tfp); + break; + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, dtlm); ++ ZSTD_fillDoubleHashTable(ms, iend, dtlm, tfp); + break; + + case ZSTD_greedy: +@@ -4174,7 +4683,7 @@ static size_t ZSTD_loadDictionaryContent(ZSTD_matchState_t* ms, + } else { + assert(params->useRowMatchFinder != ZSTD_ps_auto); + if (params->useRowMatchFinder == ZSTD_ps_enable) { +- size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog) * sizeof(U16); ++ size_t const tagTableSize = ((size_t)1 << params->cParams.hashLog); + ZSTD_memset(ms->tagTable, 0, tagTableSize); + ZSTD_row_update(ms, iend-HASH_READ_SIZE); + DEBUGLOG(4, "Using row-based hash table for lazy dict"); +@@ -4327,6 +4836,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + ZSTD_CCtx_params const* params, + const void* dict, size_t dictSize, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + const BYTE* dictPtr = (const BYTE*)dict; +@@ -4345,7 +4855,7 @@ static size_t ZSTD_loadZstdDictionary(ZSTD_compressedBlockState_t* bs, + { + size_t const dictContentSize = (size_t)(dictEnd - dictPtr); + FORWARD_IF_ERROR(ZSTD_loadDictionaryContent( +- ms, NULL, ws, params, dictPtr, dictContentSize, dtlm), ""); ++ ms, NULL, ws, params, dictPtr, dictContentSize, dtlm, tfp), ""); + } + return dictID; + } +@@ -4361,6 +4871,7 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + const void* dict, size_t dictSize, + ZSTD_dictContentType_e dictContentType, + ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp, + void* workspace) + { + DEBUGLOG(4, "ZSTD_compress_insertDictionary (dictSize=%u)", (U32)dictSize); +@@ -4373,13 +4884,13 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict restricted modes */ + if (dictContentType == ZSTD_dct_rawContent) +- return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm); ++ return ZSTD_loadDictionaryContent(ms, ls, ws, params, dict, dictSize, dtlm, tfp); + + if (MEM_readLE32(dict) != ZSTD_MAGIC_DICTIONARY) { + if (dictContentType == ZSTD_dct_auto) { + DEBUGLOG(4, "raw content dictionary detected"); + return ZSTD_loadDictionaryContent( +- ms, ls, ws, params, dict, dictSize, dtlm); ++ ms, ls, ws, params, dict, dictSize, dtlm, tfp); + } + RETURN_ERROR_IF(dictContentType == ZSTD_dct_fullDict, dictionary_wrong, ""); + assert(0); /* impossible */ +@@ -4387,13 +4898,14 @@ ZSTD_compress_insertDictionary(ZSTD_compressedBlockState_t* bs, + + /* dict as full zstd dictionary */ + return ZSTD_loadZstdDictionary( +- bs, ms, ws, params, dict, dictSize, dtlm, workspace); ++ bs, ms, ws, params, dict, dictSize, dtlm, tfp, workspace); + } + + #define ZSTD_USE_CDICT_PARAMS_SRCSIZE_CUTOFF (128 KB) + #define ZSTD_USE_CDICT_PARAMS_DICTSIZE_MULTIPLIER 
(6ULL) + + /*! ZSTD_compressBegin_internal() : ++ * Assumption : either @dict OR @cdict (or none) is non-NULL, never both + * @return : 0, or an error code */ + static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + const void* dict, size_t dictSize, +@@ -4426,11 +4938,11 @@ static size_t ZSTD_compressBegin_internal(ZSTD_CCtx* cctx, + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, cdict->dictContent, + cdict->dictContentSize, cdict->dictContentType, dtlm, +- cctx->entropyWorkspace) ++ ZSTD_tfp_forCCtx, cctx->entropyWorkspace) + : ZSTD_compress_insertDictionary( + cctx->blockState.prevCBlock, &cctx->blockState.matchState, + &cctx->ldmState, &cctx->workspace, &cctx->appliedParams, dict, dictSize, +- dictContentType, dtlm, cctx->entropyWorkspace); ++ dictContentType, dtlm, ZSTD_tfp_forCCtx, cctx->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= UINT_MAX); + cctx->dictID = (U32)dictID; +@@ -4471,11 +4983,11 @@ size_t ZSTD_compressBegin_advanced(ZSTD_CCtx* cctx, + &cctxParams, pledgedSrcSize); + } + +-size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++static size_t ++ZSTD_compressBegin_usingDict_deprecated(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) + { + ZSTD_CCtx_params cctxParams; +- { +- ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); ++ { ZSTD_parameters const params = ZSTD_getParams_internal(compressionLevel, ZSTD_CONTENTSIZE_UNKNOWN, dictSize, ZSTD_cpm_noAttachDict); + ZSTD_CCtxParams_init_internal(&cctxParams, ¶ms, (compressionLevel == 0) ? 
ZSTD_CLEVEL_DEFAULT : compressionLevel); + } + DEBUGLOG(4, "ZSTD_compressBegin_usingDict (dictSize=%u)", (unsigned)dictSize); +@@ -4483,9 +4995,15 @@ size_t ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t di + &cctxParams, ZSTD_CONTENTSIZE_UNKNOWN, ZSTDb_not_buffered); + } + ++size_t ++ZSTD_compressBegin_usingDict(ZSTD_CCtx* cctx, const void* dict, size_t dictSize, int compressionLevel) ++{ ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, dict, dictSize, compressionLevel); ++} ++ + size_t ZSTD_compressBegin(ZSTD_CCtx* cctx, int compressionLevel) + { +- return ZSTD_compressBegin_usingDict(cctx, NULL, 0, compressionLevel); ++ return ZSTD_compressBegin_usingDict_deprecated(cctx, NULL, 0, compressionLevel); + } + + +@@ -4537,9 +5055,9 @@ void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize) + (void)extraCSize; + } + +-size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t endResult; + size_t const cSize = ZSTD_compressContinue_internal(cctx, +@@ -4563,6 +5081,14 @@ size_t ZSTD_compressEnd (ZSTD_CCtx* cctx, + return cSize + endResult; + } + ++/* NOTE: Must just wrap ZSTD_compressEnd_public() */ ++size_t ZSTD_compressEnd(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); ++} ++ + size_t ZSTD_compress_advanced (ZSTD_CCtx* cctx, + void* dst, size_t dstCapacity, + const void* src, size_t srcSize, +@@ -4591,7 +5117,7 @@ size_t ZSTD_compress_advanced_internal( + FORWARD_IF_ERROR( ZSTD_compressBegin_internal(cctx, + dict, dictSize, ZSTD_dct_auto, ZSTD_dtlm_fast, NULL, + params, srcSize, ZSTDb_not_buffered) , ""); +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + size_t ZSTD_compress_usingDict(ZSTD_CCtx* cctx, +@@ -4709,7 +5235,7 @@ static size_t ZSTD_initCDict_internal( + { size_t const dictID = ZSTD_compress_insertDictionary( + &cdict->cBlockState, &cdict->matchState, NULL, &cdict->workspace, + ¶ms, cdict->dictContent, cdict->dictContentSize, +- dictContentType, ZSTD_dtlm_full, cdict->entropyWorkspace); ++ dictContentType, ZSTD_dtlm_full, ZSTD_tfp_forCDict, cdict->entropyWorkspace); + FORWARD_IF_ERROR(dictID, "ZSTD_compress_insertDictionary failed"); + assert(dictID <= (size_t)(U32)-1); + cdict->dictID = (U32)dictID; +@@ -4906,6 +5432,7 @@ const ZSTD_CDict* ZSTD_initStaticCDict( + params.cParams = cParams; + params.useRowMatchFinder = useRowMatchFinder; + cdict->useRowMatchFinder = useRowMatchFinder; ++ cdict->compressionLevel = ZSTD_NO_CLEVEL; + + if (ZSTD_isError( ZSTD_initCDict_internal(cdict, + dict, dictSize, +@@ -4985,12 +5512,17 @@ size_t ZSTD_compressBegin_usingCDict_advanced( + + /* ZSTD_compressBegin_usingCDict() : + * cdict must be != NULL */ +-size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) + { + ZSTD_frameParameters const fParams = { 0 /*content*/, 0 /*checksum*/, 0 /*noDictID*/ }; + return ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, ZSTD_CONTENTSIZE_UNKNOWN); + } + ++size_t ZSTD_compressBegin_usingCDict(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict) ++{ ++ return ZSTD_compressBegin_usingCDict_deprecated(cctx, cdict); ++} ++ + /*! 
ZSTD_compress_usingCDict_internal(): + * Implementation of various ZSTD_compress_usingCDict* functions. + */ +@@ -5000,7 +5532,7 @@ static size_t ZSTD_compress_usingCDict_internal(ZSTD_CCtx* cctx, + const ZSTD_CDict* cdict, ZSTD_frameParameters fParams) + { + FORWARD_IF_ERROR(ZSTD_compressBegin_usingCDict_internal(cctx, cdict, fParams, srcSize), ""); /* will check if cdict != NULL */ +- return ZSTD_compressEnd(cctx, dst, dstCapacity, src, srcSize); ++ return ZSTD_compressEnd_public(cctx, dst, dstCapacity, src, srcSize); + } + + /*! ZSTD_compress_usingCDict_advanced(): +@@ -5197,30 +5729,41 @@ size_t ZSTD_initCStream(ZSTD_CStream* zcs, int compressionLevel) + + static size_t ZSTD_nextInputSizeHint(const ZSTD_CCtx* cctx) + { +- size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; +- if (hintInSize==0) hintInSize = cctx->blockSize; +- return hintInSize; ++ if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ return cctx->blockSize - cctx->stableIn_notConsumed; ++ } ++ assert(cctx->appliedParams.inBufferMode == ZSTD_bm_buffered); ++ { size_t hintInSize = cctx->inBuffTarget - cctx->inBuffPos; ++ if (hintInSize==0) hintInSize = cctx->blockSize; ++ return hintInSize; ++ } + } + + /* ZSTD_compressStream_generic(): + * internal function for all *compressStream*() variants +- * non-static, because can be called from zstdmt_compress.c +- * @return : hint size for next input */ ++ * @return : hint size for next input to complete ongoing block */ + static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, + ZSTD_EndDirective const flushMode) + { +- const char* const istart = (const char*)input->src; +- const char* const iend = input->size != 0 ? istart + input->size : istart; +- const char* ip = input->pos != 0 ? istart + input->pos : istart; +- char* const ostart = (char*)output->dst; +- char* const oend = output->size != 0 ? ostart + output->size : ostart; +- char* op = output->pos != 0 ? ostart + output->pos : ostart; ++ const char* const istart = (assert(input != NULL), (const char*)input->src); ++ const char* const iend = (istart != NULL) ? istart + input->size : istart; ++ const char* ip = (istart != NULL) ? istart + input->pos : istart; ++ char* const ostart = (assert(output != NULL), (char*)output->dst); ++ char* const oend = (ostart != NULL) ? ostart + output->size : ostart; ++ char* op = (ostart != NULL) ? 
ostart + output->pos : ostart; + U32 someMoreWork = 1; + + /* check expectations */ +- DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%u", (unsigned)flushMode); ++ DEBUGLOG(5, "ZSTD_compressStream_generic, flush=%i, srcSize = %zu", (int)flushMode, input->size - input->pos); ++ assert(zcs != NULL); ++ if (zcs->appliedParams.inBufferMode == ZSTD_bm_stable) { ++ assert(input->pos >= zcs->stableIn_notConsumed); ++ input->pos -= zcs->stableIn_notConsumed; ++ ip -= zcs->stableIn_notConsumed; ++ zcs->stableIn_notConsumed = 0; ++ } + if (zcs->appliedParams.inBufferMode == ZSTD_bm_buffered) { + assert(zcs->inBuff != NULL); + assert(zcs->inBuffSize > 0); +@@ -5229,8 +5772,10 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + assert(zcs->outBuff != NULL); + assert(zcs->outBuffSize > 0); + } +- assert(output->pos <= output->size); ++ if (input->src == NULL) assert(input->size == 0); + assert(input->pos <= input->size); ++ if (output->dst == NULL) assert(output->size == 0); ++ assert(output->pos <= output->size); + assert((U32)flushMode <= (U32)ZSTD_e_end); + + while (someMoreWork) { +@@ -5245,7 +5790,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) /* OR we are allowed to return dstSizeTooSmall */ + && (zcs->inBuffPos == 0) ) { + /* shortcut to compression pass directly into output buffer */ +- size_t const cSize = ZSTD_compressEnd(zcs, ++ size_t const cSize = ZSTD_compressEnd_public(zcs, + op, oend-op, ip, iend-ip); + DEBUGLOG(4, "ZSTD_compressEnd : cSize=%u", (unsigned)cSize); + FORWARD_IF_ERROR(cSize, "ZSTD_compressEnd failed"); +@@ -5262,8 +5807,7 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + zcs->inBuff + zcs->inBuffPos, toLoad, + ip, iend-ip); + zcs->inBuffPos += loaded; +- if (loaded != 0) +- ip += loaded; ++ if (ip) ip += loaded; + if ( (flushMode == ZSTD_e_continue) + && (zcs->inBuffPos < zcs->inBuffTarget) ) { + /* not enough input to fill full block : stop here */ +@@ -5274,6 +5818,20 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + /* empty */ + someMoreWork = 0; break; + } ++ } else { ++ assert(zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ if ( (flushMode == ZSTD_e_continue) ++ && ( (size_t)(iend - ip) < zcs->blockSize) ) { ++ /* can't compress a full block : stop here */ ++ zcs->stableIn_notConsumed = (size_t)(iend - ip); ++ ip = iend; /* pretend to have consumed input */ ++ someMoreWork = 0; break; ++ } ++ if ( (flushMode == ZSTD_e_flush) ++ && (ip == iend) ) { ++ /* empty */ ++ someMoreWork = 0; break; ++ } + } + /* compress current block (note : this stage cannot be stopped in the middle) */ + DEBUGLOG(5, "stream compression stage (flushMode==%u)", flushMode); +@@ -5281,9 +5839,8 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + void* cDst; + size_t cSize; + size_t oSize = oend-op; +- size_t const iSize = inputBuffered +- ? zcs->inBuffPos - zcs->inToCompress +- : MIN((size_t)(iend - ip), zcs->blockSize); ++ size_t const iSize = inputBuffered ? zcs->inBuffPos - zcs->inToCompress ++ : MIN((size_t)(iend - ip), zcs->blockSize); + if (oSize >= ZSTD_compressBound(iSize) || zcs->appliedParams.outBufferMode == ZSTD_bm_stable) + cDst = op; /* compress into output buffer, to skip flush stage */ + else +@@ -5291,9 +5848,9 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (inputBuffered) { + unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip==iend); + cSize = lastBlock ? 
+- ZSTD_compressEnd(zcs, cDst, oSize, ++ ZSTD_compressEnd_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ++ ZSTD_compressContinue_public(zcs, cDst, oSize, + zcs->inBuff + zcs->inToCompress, iSize); + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +@@ -5306,19 +5863,16 @@ static size_t ZSTD_compressStream_generic(ZSTD_CStream* zcs, + if (!lastBlock) + assert(zcs->inBuffTarget <= zcs->inBuffSize); + zcs->inToCompress = zcs->inBuffPos; +- } else { +- unsigned const lastBlock = (ip + iSize == iend); +- assert(flushMode == ZSTD_e_end /* Already validated */); ++ } else { /* !inputBuffered, hence ZSTD_bm_stable */ ++ unsigned const lastBlock = (flushMode == ZSTD_e_end) && (ip + iSize == iend); + cSize = lastBlock ? +- ZSTD_compressEnd(zcs, cDst, oSize, ip, iSize) : +- ZSTD_compressContinue(zcs, cDst, oSize, ip, iSize); ++ ZSTD_compressEnd_public(zcs, cDst, oSize, ip, iSize) : ++ ZSTD_compressContinue_public(zcs, cDst, oSize, ip, iSize); + /* Consume the input prior to error checking to mirror buffered mode. */ +- if (iSize > 0) +- ip += iSize; ++ if (ip) ip += iSize; + FORWARD_IF_ERROR(cSize, "%s", lastBlock ? "ZSTD_compressEnd failed" : "ZSTD_compressContinue failed"); + zcs->frameEnded = lastBlock; +- if (lastBlock) +- assert(ip == iend); ++ if (lastBlock) assert(ip == iend); + } + if (cDst == op) { /* no need to flush */ + op += cSize; +@@ -5388,8 +5942,10 @@ size_t ZSTD_compressStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output, ZSTD_inBuf + /* After a compression call set the expected input/output buffer. + * This is validated at the start of the next compression call. + */ +-static void ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, ZSTD_outBuffer const* output, ZSTD_inBuffer const* input) ++static void ++ZSTD_setBufferExpectations(ZSTD_CCtx* cctx, const ZSTD_outBuffer* output, const ZSTD_inBuffer* input) + { ++ DEBUGLOG(5, "ZSTD_setBufferExpectations (for advanced stable in/out modes)"); + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + cctx->expectedInBuffer = *input; + } +@@ -5408,22 +5964,22 @@ static size_t ZSTD_checkBufferStability(ZSTD_CCtx const* cctx, + { + if (cctx->appliedParams.inBufferMode == ZSTD_bm_stable) { + ZSTD_inBuffer const expect = cctx->expectedInBuffer; +- if (expect.src != input->src || expect.pos != input->pos || expect.size != input->size) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer enabled but input differs!"); +- if (endOp != ZSTD_e_end) +- RETURN_ERROR(srcBuffer_wrong, "ZSTD_c_stableInBuffer can only be used with ZSTD_e_end!"); ++ if (expect.src != input->src || expect.pos != input->pos) ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableInBuffer enabled but input differs!"); + } ++ (void)endOp; + if (cctx->appliedParams.outBufferMode == ZSTD_bm_stable) { + size_t const outBufferSize = output->size - output->pos; + if (cctx->expectedOutBufferSize != outBufferSize) +- RETURN_ERROR(dstBuffer_wrong, "ZSTD_c_stableOutBuffer enabled but output size differs!"); ++ RETURN_ERROR(stabilityCondition_notRespected, "ZSTD_c_stableOutBuffer enabled but output size differs!"); + } + return 0; + } + + static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + ZSTD_EndDirective endOp, +- size_t inSize) { ++ size_t inSize) ++{ + ZSTD_CCtx_params params = cctx->requestedParams; + ZSTD_prefixDict const prefixDict = cctx->prefixDict; + FORWARD_IF_ERROR( ZSTD_initLocalDict(cctx) , ""); /* Init the 
local dict if present. */ +@@ -5437,9 +5993,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.compressionLevel = cctx->cdict->compressionLevel; + } + DEBUGLOG(4, "ZSTD_compressStream2 : transparent init stage"); +- if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-fix pledgedSrcSize */ +- { +- size_t const dictSize = prefixDict.dict ++ if (endOp == ZSTD_e_end) cctx->pledgedSrcSizePlusOne = inSize + 1; /* auto-determine pledgedSrcSize */ ++ ++ { size_t const dictSize = prefixDict.dict + ? prefixDict.dictSize + : (cctx->cdict ? cctx->cdict->dictContentSize : 0); + ZSTD_cParamMode_e const mode = ZSTD_getCParamMode(cctx->cdict, ¶ms, cctx->pledgedSrcSizePlusOne - 1); +@@ -5451,6 +6007,9 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + params.useBlockSplitter = ZSTD_resolveBlockSplitterMode(params.useBlockSplitter, ¶ms.cParams); + params.ldmParams.enableLdm = ZSTD_resolveEnableLdm(params.ldmParams.enableLdm, ¶ms.cParams); + params.useRowMatchFinder = ZSTD_resolveRowMatchFinderMode(params.useRowMatchFinder, ¶ms.cParams); ++ params.validateSequences = ZSTD_resolveExternalSequenceValidation(params.validateSequences); ++ params.maxBlockSize = ZSTD_resolveMaxBlockSize(params.maxBlockSize); ++ params.searchForExternalRepcodes = ZSTD_resolveExternalRepcodeSearch(params.searchForExternalRepcodes, params.compressionLevel); + + { U64 const pledgedSrcSize = cctx->pledgedSrcSizePlusOne - 1; + assert(!ZSTD_isError(ZSTD_checkCParams(params.cParams))); +@@ -5477,6 +6036,8 @@ static size_t ZSTD_CCtx_init_compressStream2(ZSTD_CCtx* cctx, + return 0; + } + ++/* @return provides a minimum amount of data remaining to be flushed from internal buffers ++ */ + size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + ZSTD_outBuffer* output, + ZSTD_inBuffer* input, +@@ -5491,8 +6052,27 @@ size_t ZSTD_compressStream2( ZSTD_CCtx* cctx, + + /* transparent initialization stage */ + if (cctx->streamStage == zcss_init) { +- FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, input->size), "CompressStream2 initialization failed"); +- ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ ++ size_t const inputSize = input->size - input->pos; /* no obligation to start from pos==0 */ ++ size_t const totalInputSize = inputSize + cctx->stableIn_notConsumed; ++ if ( (cctx->requestedParams.inBufferMode == ZSTD_bm_stable) /* input is presumed stable, across invocations */ ++ && (endOp == ZSTD_e_continue) /* no flush requested, more input to come */ ++ && (totalInputSize < ZSTD_BLOCKSIZE_MAX) ) { /* not even reached one block yet */ ++ if (cctx->stableIn_notConsumed) { /* not the first time */ ++ /* check stable source guarantees */ ++ RETURN_ERROR_IF(input->src != cctx->expectedInBuffer.src, stabilityCondition_notRespected, "stableInBuffer condition not respected: wrong src pointer"); ++ RETURN_ERROR_IF(input->pos != cctx->expectedInBuffer.size, stabilityCondition_notRespected, "stableInBuffer condition not respected: externally modified pos"); ++ } ++ /* pretend input was consumed, to give a sense forward progress */ ++ input->pos = input->size; ++ /* save stable inBuffer, for later control, and flush/end */ ++ cctx->expectedInBuffer = *input; ++ /* but actually input wasn't consumed, so keep track of position from where compression shall resume */ ++ cctx->stableIn_notConsumed += inputSize; ++ /* don't initialize yet, wait for the first block of flush() order, for better parameters adaptation */ ++ return 
ZSTD_FRAMEHEADERSIZE_MIN(cctx->requestedParams.format); /* at least some header to produce */ ++ } ++ FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, endOp, totalInputSize), "compressStream2 initialization failed"); ++ ZSTD_setBufferExpectations(cctx, output, input); /* Set initial buffer expectations now that we've initialized */ + } + /* end of transparent initialization stage */ + +@@ -5510,13 +6090,20 @@ size_t ZSTD_compressStream2_simpleArgs ( + const void* src, size_t srcSize, size_t* srcPos, + ZSTD_EndDirective endOp) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; + /* ZSTD_compressStream2() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ { size_t const cErr = ZSTD_compressStream2(cctx, &output, &input, endOp); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } + + size_t ZSTD_compress2(ZSTD_CCtx* cctx, +@@ -5539,6 +6126,7 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + /* Reset to the original values. */ + cctx->requestedParams.inBufferMode = originalInBufferMode; + cctx->requestedParams.outBufferMode = originalOutBufferMode; ++ + FORWARD_IF_ERROR(result, "ZSTD_compressStream2_simpleArgs failed"); + if (result != 0) { /* compression not completed, due to lack of output space */ + assert(oPos == dstCapacity); +@@ -5549,64 +6137,61 @@ size_t ZSTD_compress2(ZSTD_CCtx* cctx, + } + } + +-typedef struct { +- U32 idx; /* Index in array of ZSTD_Sequence */ +- U32 posInSequence; /* Position within sequence at idx */ +- size_t posInSrc; /* Number of bytes given by sequences provided so far */ +-} ZSTD_sequencePosition; +- + /* ZSTD_validateSequence() : + * @offCode : is presumed to follow format required by ZSTD_storeSeq() + * @returns a ZSTD error code if sequence is not valid + */ + static size_t +-ZSTD_validateSequence(U32 offCode, U32 matchLength, +- size_t posInSrc, U32 windowLog, size_t dictSize) ++ZSTD_validateSequence(U32 offCode, U32 matchLength, U32 minMatch, ++ size_t posInSrc, U32 windowLog, size_t dictSize, int useSequenceProducer) + { +- U32 const windowSize = 1 << windowLog; ++ U32 const windowSize = 1u << windowLog; + /* posInSrc represents the amount of data the decoder would decode up to this point. + * As long as the amount of data decoded is less than or equal to window size, offsets may be + * larger than the total length of output decoded in order to reference the dict, even larger than + * window size. After output surpasses windowSize, we're limited to windowSize offsets again. + */ + size_t const offsetBound = posInSrc > windowSize ? (size_t)windowSize : posInSrc + (size_t)dictSize; +- RETURN_ERROR_IF(offCode > STORE_OFFSET(offsetBound), corruption_detected, "Offset too large!"); +- RETURN_ERROR_IF(matchLength < MINMATCH, corruption_detected, "Matchlength too small"); ++ size_t const matchLenLowerBound = (minMatch == 3 || useSequenceProducer) ? 
3 : 4; ++ RETURN_ERROR_IF(offCode > OFFSET_TO_OFFBASE(offsetBound), externalSequences_invalid, "Offset too large!"); ++ /* Validate maxNbSeq is large enough for the given matchLength and minMatch */ ++ RETURN_ERROR_IF(matchLength < matchLenLowerBound, externalSequences_invalid, "Matchlength too small for the minMatch"); + return 0; + } + + /* Returns an offset code, given a sequence's raw offset, the ongoing repcode array, and whether litLength == 0 */ +-static U32 ZSTD_finalizeOffCode(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) ++static U32 ZSTD_finalizeOffBase(U32 rawOffset, const U32 rep[ZSTD_REP_NUM], U32 ll0) + { +- U32 offCode = STORE_OFFSET(rawOffset); ++ U32 offBase = OFFSET_TO_OFFBASE(rawOffset); + + if (!ll0 && rawOffset == rep[0]) { +- offCode = STORE_REPCODE_1; ++ offBase = REPCODE1_TO_OFFBASE; + } else if (rawOffset == rep[1]) { +- offCode = STORE_REPCODE(2 - ll0); ++ offBase = REPCODE_TO_OFFBASE(2 - ll0); + } else if (rawOffset == rep[2]) { +- offCode = STORE_REPCODE(3 - ll0); ++ offBase = REPCODE_TO_OFFBASE(3 - ll0); + } else if (ll0 && rawOffset == rep[0] - 1) { +- offCode = STORE_REPCODE_3; ++ offBase = REPCODE3_TO_OFFBASE; + } +- return offCode; ++ return offBase; + } + +-/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of +- * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. +- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ++ ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; ++ U32 const startIdx = idx; + BYTE const* ip = (BYTE const*)(src); + const BYTE* const iend = ip + blockSize; + repcodes_t updatedRepcodes; + U32 dictSize; + ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreExplicitBlockDelim (blockSize = %zu)", blockSize); ++ + if (cctx->cdict) { + dictSize = (U32)cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5615,25 +6200,55 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + dictSize = 0; + } + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); +- for (; (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0) && idx < inSeqsSize; ++idx) { ++ for (; idx < inSeqsSize && (inSeqs[idx].matchLength != 0 || inSeqs[idx].offset != 0); ++idx) { + U32 const litLength = inSeqs[idx].litLength; +- U32 const ll0 = (litLength == 0); + U32 const matchLength = inSeqs[idx].matchLength; +- U32 const offCode = ZSTD_finalizeOffCode(inSeqs[idx].offset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ U32 offBase; + +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); ++ if (externalRepSearch == ZSTD_ps_disable) { ++ offBase = OFFSET_TO_OFFBASE(inSeqs[idx].offset); ++ } else { ++ U32 const ll0 = (litLength == 0); ++ offBase = ZSTD_finalizeOffBase(inSeqs[idx].offset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); ++ } ++ ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, 
matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; + } ++ ++ /* If we skipped repcode search while parsing, we need to update repcodes now */ ++ assert(externalRepSearch != ZSTD_ps_auto); ++ assert(idx >= startIdx); ++ if (externalRepSearch == ZSTD_ps_disable && idx != startIdx) { ++ U32* const rep = updatedRepcodes.rep; ++ U32 lastSeqIdx = idx - 1; /* index of last non-block-delimiter sequence */ ++ ++ if (lastSeqIdx >= startIdx + 2) { ++ rep[2] = inSeqs[lastSeqIdx - 2].offset; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else if (lastSeqIdx == startIdx + 1) { ++ rep[2] = rep[0]; ++ rep[1] = inSeqs[lastSeqIdx - 1].offset; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } else { ++ assert(lastSeqIdx == startIdx); ++ rep[2] = rep[1]; ++ rep[1] = rep[0]; ++ rep[0] = inSeqs[lastSeqIdx].offset; ++ } ++ } ++ + ZSTD_memcpy(cctx->blockState.nextCBlock->rep, updatedRepcodes.rep, sizeof(repcodes_t)); + + if (inSeqs[idx].litLength) { +@@ -5642,26 +6257,15 @@ ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, + ip += inSeqs[idx].litLength; + seqPos->posInSrc += inSeqs[idx].litLength; + } +- RETURN_ERROR_IF(ip != iend, corruption_detected, "Blocksize doesn't agree with block delimiter!"); ++ RETURN_ERROR_IF(ip != iend, externalSequences_invalid, "Blocksize doesn't agree with block delimiter!"); + seqPos->idx = idx+1; + return 0; + } + +-/* Returns the number of bytes to move the current read position back by. Only non-zero +- * if we ended up splitting a sequence. Otherwise, it may return a ZSTD error if something +- * went wrong. +- * +- * This function will attempt to scan through blockSize bytes represented by the sequences +- * in inSeqs, storing any (partial) sequences. +- * +- * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to +- * avoid splitting a match, or to avoid splitting a match such that it would produce a match +- * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. 
+- */ +-static size_t ++size_t + ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize) ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch) + { + U32 idx = seqPos->idx; + U32 startPosInSequence = seqPos->posInSequence; +@@ -5673,6 +6277,9 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 bytesAdjustment = 0; + U32 finalMatchSplit = 0; + ++ /* TODO(embg) support fast parsing mode in noBlockDelim mode */ ++ (void)externalRepSearch; ++ + if (cctx->cdict) { + dictSize = cctx->cdict->dictContentSize; + } else if (cctx->prefixDict.dict) { +@@ -5680,7 +6287,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } else { + dictSize = 0; + } +- DEBUGLOG(5, "ZSTD_copySequencesToSeqStore: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); ++ DEBUGLOG(5, "ZSTD_copySequencesToSeqStoreNoBlockDelim: idx: %u PIS: %u blockSize: %zu", idx, startPosInSequence, blockSize); + DEBUGLOG(5, "Start seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + ZSTD_memcpy(updatedRepcodes.rep, cctx->blockState.prevCBlock->rep, sizeof(repcodes_t)); + while (endPosInSequence && idx < inSeqsSize && !finalMatchSplit) { +@@ -5688,7 +6295,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + U32 litLength = currSeq.litLength; + U32 matchLength = currSeq.matchLength; + U32 const rawOffset = currSeq.offset; +- U32 offCode; ++ U32 offBase; + + /* Modify the sequence depending on where endPosInSequence lies */ + if (endPosInSequence >= currSeq.litLength + currSeq.matchLength) { +@@ -5702,7 +6309,6 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + /* Move to the next sequence */ + endPosInSequence -= currSeq.litLength + currSeq.matchLength; + startPosInSequence = 0; +- idx++; + } else { + /* This is the final (partial) sequence we're adding from inSeqs, and endPosInSequence + does not reach the end of the match. So, we have to split the sequence */ +@@ -5742,21 +6348,23 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + } + /* Check if this offset can be represented with a repcode */ + { U32 const ll0 = (litLength == 0); +- offCode = ZSTD_finalizeOffCode(rawOffset, updatedRepcodes.rep, ll0); +- ZSTD_updateRep(updatedRepcodes.rep, offCode, ll0); ++ offBase = ZSTD_finalizeOffBase(rawOffset, updatedRepcodes.rep, ll0); ++ ZSTD_updateRep(updatedRepcodes.rep, offBase, ll0); + } + + if (cctx->appliedParams.validateSequences) { + seqPos->posInSrc += litLength + matchLength; +- FORWARD_IF_ERROR(ZSTD_validateSequence(offCode, matchLength, seqPos->posInSrc, +- cctx->appliedParams.cParams.windowLog, dictSize), ++ FORWARD_IF_ERROR(ZSTD_validateSequence(offBase, matchLength, cctx->appliedParams.cParams.minMatch, seqPos->posInSrc, ++ cctx->appliedParams.cParams.windowLog, dictSize, cctx->appliedParams.useSequenceProducer), + "Sequence validation failed"); + } +- DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offCode, matchLength, litLength); +- RETURN_ERROR_IF(idx - seqPos->idx > cctx->seqStore.maxNbSeq, memory_allocation, ++ DEBUGLOG(6, "Storing sequence: (of: %u, ml: %u, ll: %u)", offBase, matchLength, litLength); ++ RETURN_ERROR_IF(idx - seqPos->idx >= cctx->seqStore.maxNbSeq, externalSequences_invalid, + "Not enough memory allocated. 
Try adjusting ZSTD_c_minMatch."); +- ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offCode, matchLength); ++ ZSTD_storeSeq(&cctx->seqStore, litLength, ip, iend, offBase, matchLength); + ip += matchLength + litLength; ++ if (!finalMatchSplit) ++ idx++; /* Next Sequence */ + } + DEBUGLOG(5, "Ending seq: idx: %u (of: %u ml: %u ll: %u)", idx, inSeqs[idx].offset, inSeqs[idx].matchLength, inSeqs[idx].litLength); + assert(idx == inSeqsSize || endPosInSequence <= inSeqs[idx].litLength + inSeqs[idx].matchLength); +@@ -5779,7 +6387,7 @@ ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* + + typedef size_t (*ZSTD_sequenceCopier) (ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, + const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, +- const void* src, size_t blockSize); ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); + static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + { + ZSTD_sequenceCopier sequenceCopier = NULL; +@@ -5793,6 +6401,57 @@ static ZSTD_sequenceCopier ZSTD_selectSequenceCopier(ZSTD_sequenceFormat_e mode) + return sequenceCopier; + } + ++/* Discover the size of next block by searching for the delimiter. ++ * Note that a block delimiter **must** exist in this mode, ++ * otherwise it's an input error. ++ * The block size retrieved will be later compared to ensure it remains within bounds */ ++static size_t ++blockSize_explicitDelimiter(const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ int end = 0; ++ size_t blockSize = 0; ++ size_t spos = seqPos.idx; ++ DEBUGLOG(6, "blockSize_explicitDelimiter : seq %zu / %zu", spos, inSeqsSize); ++ assert(spos <= inSeqsSize); ++ while (spos < inSeqsSize) { ++ end = (inSeqs[spos].offset == 0); ++ blockSize += inSeqs[spos].litLength + inSeqs[spos].matchLength; ++ if (end) { ++ if (inSeqs[spos].matchLength != 0) ++ RETURN_ERROR(externalSequences_invalid, "delimiter format error : both matchlength and offset must be == 0"); ++ break; ++ } ++ spos++; ++ } ++ if (!end) ++ RETURN_ERROR(externalSequences_invalid, "Reached end of sequences without finding a block delimiter"); ++ return blockSize; ++} ++ ++/* More a "target" block size */ ++static size_t blockSize_noDelimiter(size_t blockSize, size_t remaining) ++{ ++ int const lastBlock = (remaining <= blockSize); ++ return lastBlock ? remaining : blockSize; ++} ++ ++static size_t determine_blockSize(ZSTD_sequenceFormat_e mode, ++ size_t blockSize, size_t remaining, ++ const ZSTD_Sequence* inSeqs, size_t inSeqsSize, ZSTD_sequencePosition seqPos) ++{ ++ DEBUGLOG(6, "determine_blockSize : remainingSize = %zu", remaining); ++ if (mode == ZSTD_sf_noBlockDelimiters) ++ return blockSize_noDelimiter(blockSize, remaining); ++ { size_t const explicitBlockSize = blockSize_explicitDelimiter(inSeqs, inSeqsSize, seqPos); ++ FORWARD_IF_ERROR(explicitBlockSize, "Error while determining block size with explicit delimiters"); ++ if (explicitBlockSize > blockSize) ++ RETURN_ERROR(externalSequences_invalid, "sequences incorrectly define a too large block"); ++ if (explicitBlockSize > remaining) ++ RETURN_ERROR(externalSequences_invalid, "sequences define a frame longer than source"); ++ return explicitBlockSize; ++ } ++} ++ + /* Compress, block-by-block, all of the sequences given. 
+ * + * Returns the cumulative size of all compressed blocks (including their headers), +@@ -5805,9 +6464,6 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + const void* src, size_t srcSize) + { + size_t cSize = 0; +- U32 lastBlock; +- size_t blockSize; +- size_t compressedSeqsSize; + size_t remaining = srcSize; + ZSTD_sequencePosition seqPos = {0, 0, 0}; + +@@ -5827,22 +6483,29 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + } + + while (remaining) { ++ size_t compressedSeqsSize; + size_t cBlockSize; + size_t additionalByteAdjustment; +- lastBlock = remaining <= cctx->blockSize; +- blockSize = lastBlock ? (U32)remaining : (U32)cctx->blockSize; ++ size_t blockSize = determine_blockSize(cctx->appliedParams.blockDelimiters, ++ cctx->blockSize, remaining, ++ inSeqs, inSeqsSize, seqPos); ++ U32 const lastBlock = (blockSize == remaining); ++ FORWARD_IF_ERROR(blockSize, "Error while trying to determine block size"); ++ assert(blockSize <= remaining); + ZSTD_resetSeqStore(&cctx->seqStore); +- DEBUGLOG(4, "Working on new block. Blocksize: %zu", blockSize); ++ DEBUGLOG(5, "Working on new block. Blocksize: %zu (total:%zu)", blockSize, (ip - (const BYTE*)src) + blockSize); + +- additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize); ++ additionalByteAdjustment = sequenceCopier(cctx, &seqPos, inSeqs, inSeqsSize, ip, blockSize, cctx->appliedParams.searchForExternalRepcodes); + FORWARD_IF_ERROR(additionalByteAdjustment, "Bad sequence copy"); + blockSize -= additionalByteAdjustment; + + /* If blocks are too small, emit as a nocompress block */ +- if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1) { ++ /* TODO: See 3090. We reduced MIN_CBLOCK_SIZE from 3 to 2 so to compensate we are adding ++ * additional 1. We need to revisit and change this logic to be more consistent */ ++ if (blockSize < MIN_CBLOCK_SIZE+ZSTD_blockHeaderSize+1+1) { + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); + FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); ++ DEBUGLOG(5, "Block too small, writing out nocompress block: cSize: %zu", cBlockSize); + cSize += cBlockSize; + ip += blockSize; + op += cBlockSize; +@@ -5851,6 +6514,7 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + continue; + } + ++ RETURN_ERROR_IF(dstCapacity < ZSTD_blockHeaderSize, dstSize_tooSmall, "not enough dstCapacity to write a new compressed block"); + compressedSeqsSize = ZSTD_entropyCompressSeqStore(&cctx->seqStore, + &cctx->blockState.prevCBlock->entropy, &cctx->blockState.nextCBlock->entropy, + &cctx->appliedParams, +@@ -5859,11 +6523,11 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cctx->entropyWorkspace, ENTROPY_WORKSPACE_SIZE /* statically allocated in resetCCtx */, + cctx->bmi2); + FORWARD_IF_ERROR(compressedSeqsSize, "Compressing sequences of block failed"); +- DEBUGLOG(4, "Compressed sequences size: %zu", compressedSeqsSize); ++ DEBUGLOG(5, "Compressed sequences size: %zu", compressedSeqsSize); + + if (!cctx->isFirstBlock && + ZSTD_maybeRLE(&cctx->seqStore) && +- ZSTD_isRLE((BYTE const*)src, srcSize)) { ++ ZSTD_isRLE(ip, blockSize)) { + /* We don't want to emit our first block as a RLE even if it qualifies because + * doing so will cause the decoder (cli only) to throw a "should consume all input error." 
+ * This is only an issue for zstd <= v1.4.3 +@@ -5874,12 +6538,12 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + if (compressedSeqsSize == 0) { + /* ZSTD_noCompressBlock writes the block header as well */ + cBlockSize = ZSTD_noCompressBlock(op, dstCapacity, ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "Nocompress block failed"); +- DEBUGLOG(4, "Writing out nocompress block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_noCompressBlock failed"); ++ DEBUGLOG(5, "Writing out nocompress block, size: %zu", cBlockSize); + } else if (compressedSeqsSize == 1) { + cBlockSize = ZSTD_rleCompressBlock(op, dstCapacity, *ip, blockSize, lastBlock); +- FORWARD_IF_ERROR(cBlockSize, "RLE compress block failed"); +- DEBUGLOG(4, "Writing out RLE block, size: %zu", cBlockSize); ++ FORWARD_IF_ERROR(cBlockSize, "ZSTD_rleCompressBlock failed"); ++ DEBUGLOG(5, "Writing out RLE block, size: %zu", cBlockSize); + } else { + U32 cBlockHeader; + /* Error checking and repcodes update */ +@@ -5891,11 +6555,10 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + cBlockHeader = lastBlock + (((U32)bt_compressed)<<1) + (U32)(compressedSeqsSize << 3); + MEM_writeLE24(op, cBlockHeader); + cBlockSize = ZSTD_blockHeaderSize + compressedSeqsSize; +- DEBUGLOG(4, "Writing out compressed block, size: %zu", cBlockSize); ++ DEBUGLOG(5, "Writing out compressed block, size: %zu", cBlockSize); + } + + cSize += cBlockSize; +- DEBUGLOG(4, "cSize running total: %zu", cSize); + + if (lastBlock) { + break; +@@ -5906,12 +6569,15 @@ ZSTD_compressSequences_internal(ZSTD_CCtx* cctx, + dstCapacity -= cBlockSize; + cctx->isFirstBlock = 0; + } ++ DEBUGLOG(5, "cSize running total: %zu (remaining dstCapacity=%zu)", cSize, dstCapacity); + } + ++ DEBUGLOG(4, "cSize final total: %zu", cSize); + return cSize; + } + +-size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapacity, ++size_t ZSTD_compressSequences(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, + const ZSTD_Sequence* inSeqs, size_t inSeqsSize, + const void* src, size_t srcSize) + { +@@ -5921,7 +6587,7 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + size_t frameHeaderSize = 0; + + /* Transparent initialization stage, same as compressStream2() */ +- DEBUGLOG(3, "ZSTD_compressSequences()"); ++ DEBUGLOG(4, "ZSTD_compressSequences (dstCapacity=%zu)", dstCapacity); + assert(cctx != NULL); + FORWARD_IF_ERROR(ZSTD_CCtx_init_compressStream2(cctx, ZSTD_e_end, srcSize), "CCtx initialization failed"); + /* Begin writing output, starting with frame header */ +@@ -5949,26 +6615,34 @@ size_t ZSTD_compressSequences(ZSTD_CCtx* const cctx, void* dst, size_t dstCapaci + cSize += 4; + } + +- DEBUGLOG(3, "Final compressed size: %zu", cSize); ++ DEBUGLOG(4, "Final compressed size: %zu", cSize); + return cSize; + } + + /*====== Finalize ======*/ + ++static ZSTD_inBuffer inBuffer_forEndFlush(const ZSTD_CStream* zcs) ++{ ++ const ZSTD_inBuffer nullInput = { NULL, 0, 0 }; ++ const int stableInput = (zcs->appliedParams.inBufferMode == ZSTD_bm_stable); ++ return stableInput ? zcs->expectedInBuffer : nullInput; ++} ++ + /*! 
ZSTD_flushStream() : + * @return : amount of data remaining to flush */ + size_t ZSTD_flushStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); ++ input.size = input.pos; /* do not ingest more input during flush */ + return ZSTD_compressStream2(zcs, output, &input, ZSTD_e_flush); + } + + + size_t ZSTD_endStream(ZSTD_CStream* zcs, ZSTD_outBuffer* output) + { +- ZSTD_inBuffer input = { NULL, 0, 0 }; ++ ZSTD_inBuffer input = inBuffer_forEndFlush(zcs); + size_t const remainingToFlush = ZSTD_compressStream2(zcs, output, &input, ZSTD_e_end); +- FORWARD_IF_ERROR( remainingToFlush , "ZSTD_compressStream2 failed"); ++ FORWARD_IF_ERROR(remainingToFlush , "ZSTD_compressStream2(,,ZSTD_e_end) failed"); + if (zcs->appliedParams.nbWorkers > 0) return remainingToFlush; /* minimal estimation */ + /* single thread mode : attempt to calculate remaining to flush more precisely */ + { size_t const lastBlockSize = zcs->frameEnded ? 0 : ZSTD_BLOCKHEADERSIZE; +@@ -6090,7 +6764,7 @@ static ZSTD_compressionParameters ZSTD_getCParams_internal(int compressionLevel, + cp.targetLength = (unsigned)(-clampedCompressionLevel); + } + /* refine parameters based on srcSize & dictSize */ +- return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode); ++ return ZSTD_adjustCParams_internal(cp, srcSizeHint, dictSize, mode, ZSTD_ps_auto); + } + } + +@@ -6125,3 +6799,21 @@ ZSTD_parameters ZSTD_getParams(int compressionLevel, unsigned long long srcSizeH + if (srcSizeHint == 0) srcSizeHint = ZSTD_CONTENTSIZE_UNKNOWN; + return ZSTD_getParams_internal(compressionLevel, srcSizeHint, dictSize, ZSTD_cpm_unknown); + } ++ ++void ZSTD_registerSequenceProducer( ++ ZSTD_CCtx* zc, void* mState, ++ ZSTD_sequenceProducer_F* mFinder ++) { ++ if (mFinder != NULL) { ++ ZSTD_externalMatchCtx emctx; ++ emctx.mState = mState; ++ emctx.mFinder = mFinder; ++ emctx.seqBuffer = NULL; ++ emctx.seqBufferCapacity = 0; ++ zc->externalMatchCtx = emctx; ++ zc->requestedParams.useSequenceProducer = 1; ++ } else { ++ ZSTD_memset(&zc->externalMatchCtx, 0, sizeof(zc->externalMatchCtx)); ++ zc->requestedParams.useSequenceProducer = 0; ++ } ++} +diff --git a/lib/zstd/compress/zstd_compress_internal.h b/lib/zstd/compress/zstd_compress_internal.h +index 71697a11ae30..899f5e2de8e9 100644 +--- a/lib/zstd/compress/zstd_compress_internal.h ++++ b/lib/zstd/compress/zstd_compress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,6 +21,7 @@ + ***************************************/ + #include "../common/zstd_internal.h" + #include "zstd_cwksp.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_NbCommonBytes */ + + + /*-************************************* +@@ -111,12 +113,13 @@ typedef struct { + /* ZSTD_buildBlockEntropyStats() : + * Builds entropy for the block. 
+ * @return : 0 on success or error code */ +-size_t ZSTD_buildBlockEntropyStats(seqStore_t* seqStorePtr, +- const ZSTD_entropyCTables_t* prevEntropy, +- ZSTD_entropyCTables_t* nextEntropy, +- const ZSTD_CCtx_params* cctxParams, +- ZSTD_entropyCTablesMetadata_t* entropyMetadata, +- void* workspace, size_t wkspSize); ++size_t ZSTD_buildBlockEntropyStats( ++ const seqStore_t* seqStorePtr, ++ const ZSTD_entropyCTables_t* prevEntropy, ++ ZSTD_entropyCTables_t* nextEntropy, ++ const ZSTD_CCtx_params* cctxParams, ++ ZSTD_entropyCTablesMetadata_t* entropyMetadata, ++ void* workspace, size_t wkspSize); + + /* ******************************* + * Compression internals structs * +@@ -142,6 +145,12 @@ typedef struct { + size_t capacity; /* The capacity starting from `seq` pointer */ + } rawSeqStore_t; + ++typedef struct { ++ U32 idx; /* Index in array of ZSTD_Sequence */ ++ U32 posInSequence; /* Position within sequence at idx */ ++ size_t posInSrc; /* Number of bytes given by sequences provided so far */ ++} ZSTD_sequencePosition; ++ + UNUSED_ATTR static const rawSeqStore_t kNullRawSeqStore = {NULL, 0, 0, 0, 0}; + + typedef struct { +@@ -212,8 +221,10 @@ struct ZSTD_matchState_t { + U32 hashLog3; /* dispatch table for matches of len==3 : larger == faster, more memory */ + + U32 rowHashLog; /* For row-based matchfinder: Hashlog based on nb of rows in the hashTable.*/ +- U16* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ ++ BYTE* tagTable; /* For row-based matchFinder: A row-based table containing the hashes and head index. */ + U32 hashCache[ZSTD_ROW_HASH_CACHE_SIZE]; /* For row-based matchFinder: a cache of hashes to improve speed */ ++ U64 hashSalt; /* For row-based matchFinder: salts the hash for re-use of tag table */ ++ U32 hashSaltEntropy; /* For row-based matchFinder: collects entropy for salt generation */ + + U32* hashTable; + U32* hashTable3; +@@ -228,6 +239,18 @@ struct ZSTD_matchState_t { + const ZSTD_matchState_t* dictMatchState; + ZSTD_compressionParameters cParams; + const rawSeqStore_t* ldmSeqStore; ++ ++ /* Controls prefetching in some dictMatchState matchfinders. ++ * This behavior is controlled from the cctx ms. ++ * This parameter has no effect in the cdict ms. */ ++ int prefetchCDictTables; ++ ++ /* When == 0, lazy match finders insert every position. ++ * When != 0, lazy match finders only insert positions they search. ++ * This allows them to skip much faster over incompressible data, ++ * at a small cost to compression ratio. ++ */ ++ int lazySkipping; + }; + + typedef struct { +@@ -324,6 +347,24 @@ struct ZSTD_CCtx_params_s { + + /* Internal use, for createCCtxParams() and freeCCtxParams() only */ + ZSTD_customMem customMem; ++ ++ /* Controls prefetching in some dictMatchState matchfinders */ ++ ZSTD_paramSwitch_e prefetchCDictTables; ++ ++ /* Controls whether zstd will fall back to an internal matchfinder ++ * if the external matchfinder returns an error code. */ ++ int enableMatchFinderFallback; ++ ++ /* Indicates whether an external matchfinder has been referenced. ++ * Users can't set this externally. ++ * It is set internally in ZSTD_registerSequenceProducer(). 
*/ ++ int useSequenceProducer; ++ ++ /* Adjust the max block size*/ ++ size_t maxBlockSize; ++ ++ /* Controls repcode search in external sequence parsing */ ++ ZSTD_paramSwitch_e searchForExternalRepcodes; + }; /* typedef'd to ZSTD_CCtx_params within "zstd.h" */ + + #define COMPRESS_SEQUENCES_WORKSPACE_SIZE (sizeof(unsigned) * (MaxSeq + 2)) +@@ -355,6 +396,14 @@ typedef struct { + ZSTD_entropyCTablesMetadata_t entropyMetadata; + } ZSTD_blockSplitCtx; + ++/* Context for block-level external matchfinder API */ ++typedef struct { ++ void* mState; ++ ZSTD_sequenceProducer_F* mFinder; ++ ZSTD_Sequence* seqBuffer; ++ size_t seqBufferCapacity; ++} ZSTD_externalMatchCtx; ++ + struct ZSTD_CCtx_s { + ZSTD_compressionStage_e stage; + int cParamsChanged; /* == 1 if cParams(except wlog) or compression level are changed in requestedParams. Triggers transmission of new params to ZSTDMT (if available) then reset to 0. */ +@@ -404,6 +453,7 @@ struct ZSTD_CCtx_s { + + /* Stable in/out buffer verification */ + ZSTD_inBuffer expectedInBuffer; ++ size_t stableIn_notConsumed; /* nb bytes within stable input buffer that are said to be consumed but are not */ + size_t expectedOutBufferSize; + + /* Dictionary */ +@@ -417,9 +467,13 @@ struct ZSTD_CCtx_s { + + /* Workspace for block splitter */ + ZSTD_blockSplitCtx blockSplitCtx; ++ ++ /* Workspace for external matchfinder */ ++ ZSTD_externalMatchCtx externalMatchCtx; + }; + + typedef enum { ZSTD_dtlm_fast, ZSTD_dtlm_full } ZSTD_dictTableLoadMethod_e; ++typedef enum { ZSTD_tfp_forCCtx, ZSTD_tfp_forCDict } ZSTD_tableFillPurpose_e; + + typedef enum { + ZSTD_noDict = 0, +@@ -441,7 +495,7 @@ typedef enum { + * In this mode we take both the source size and the dictionary size + * into account when selecting and adjusting the parameters. + */ +- ZSTD_cpm_unknown = 3, /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. ++ ZSTD_cpm_unknown = 3 /* ZSTD_getCParams, ZSTD_getParams, ZSTD_adjustParams. + * We don't know what these parameters are for. We default to the legacy + * behavior of taking both the source size and the dict size into account + * when selecting and adjusting parameters. +@@ -500,9 +554,11 @@ MEM_STATIC int ZSTD_cParam_withinBounds(ZSTD_cParameter cParam, int value) + /* ZSTD_noCompressBlock() : + * Writes uncompressed block to dst buffer from given src. 
+ * Returns the size of the block */ +-MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_noCompressBlock(void* dst, size_t dstCapacity, const void* src, size_t srcSize, U32 lastBlock) + { + U32 const cBlockHeader24 = lastBlock + (((U32)bt_raw)<<1) + (U32)(srcSize << 3); ++ DEBUGLOG(5, "ZSTD_noCompressBlock (srcSize=%zu, dstCapacity=%zu)", srcSize, dstCapacity); + RETURN_ERROR_IF(srcSize + ZSTD_blockHeaderSize > dstCapacity, + dstSize_tooSmall, "dst buf too small for uncompressed block"); + MEM_writeLE24(dst, cBlockHeader24); +@@ -510,7 +566,8 @@ MEM_STATIC size_t ZSTD_noCompressBlock (void* dst, size_t dstCapacity, const voi + return ZSTD_blockHeaderSize + srcSize; + } + +-MEM_STATIC size_t ZSTD_rleCompressBlock (void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) ++MEM_STATIC size_t ++ZSTD_rleCompressBlock(void* dst, size_t dstCapacity, BYTE src, size_t srcSize, U32 lastBlock) + { + BYTE* const op = (BYTE*)dst; + U32 const cBlockHeader = lastBlock + (((U32)bt_rle)<<1) + (U32)(srcSize << 3); +@@ -529,7 +586,7 @@ MEM_STATIC size_t ZSTD_minGain(size_t srcSize, ZSTD_strategy strat) + { + U32 const minlog = (strat>=ZSTD_btultra) ? (U32)(strat) - 1 : 6; + ZSTD_STATIC_ASSERT(ZSTD_btultra == 8); +- assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, strat)); ++ assert(ZSTD_cParam_withinBounds(ZSTD_c_strategy, (int)strat)); + return (srcSize >> minlog) + 2; + } + +@@ -565,29 +622,27 @@ ZSTD_safecopyLiterals(BYTE* op, BYTE const* ip, BYTE const* const iend, BYTE con + while (ip < iend) *op++ = *ip++; + } + +-#define ZSTD_REP_MOVE (ZSTD_REP_NUM-1) +-#define STORE_REPCODE_1 STORE_REPCODE(1) +-#define STORE_REPCODE_2 STORE_REPCODE(2) +-#define STORE_REPCODE_3 STORE_REPCODE(3) +-#define STORE_REPCODE(r) (assert((r)>=1), assert((r)<=3), (r)-1) +-#define STORE_OFFSET(o) (assert((o)>0), o + ZSTD_REP_MOVE) +-#define STORED_IS_OFFSET(o) ((o) > ZSTD_REP_MOVE) +-#define STORED_IS_REPCODE(o) ((o) <= ZSTD_REP_MOVE) +-#define STORED_OFFSET(o) (assert(STORED_IS_OFFSET(o)), (o)-ZSTD_REP_MOVE) +-#define STORED_REPCODE(o) (assert(STORED_IS_REPCODE(o)), (o)+1) /* returns ID 1,2,3 */ +-#define STORED_TO_OFFBASE(o) ((o)+1) +-#define OFFBASE_TO_STORED(o) ((o)-1) ++ ++#define REPCODE1_TO_OFFBASE REPCODE_TO_OFFBASE(1) ++#define REPCODE2_TO_OFFBASE REPCODE_TO_OFFBASE(2) ++#define REPCODE3_TO_OFFBASE REPCODE_TO_OFFBASE(3) ++#define REPCODE_TO_OFFBASE(r) (assert((r)>=1), assert((r)<=ZSTD_REP_NUM), (r)) /* accepts IDs 1,2,3 */ ++#define OFFSET_TO_OFFBASE(o) (assert((o)>0), o + ZSTD_REP_NUM) ++#define OFFBASE_IS_OFFSET(o) ((o) > ZSTD_REP_NUM) ++#define OFFBASE_IS_REPCODE(o) ( 1 <= (o) && (o) <= ZSTD_REP_NUM) ++#define OFFBASE_TO_OFFSET(o) (assert(OFFBASE_IS_OFFSET(o)), (o) - ZSTD_REP_NUM) ++#define OFFBASE_TO_REPCODE(o) (assert(OFFBASE_IS_REPCODE(o)), (o)) /* returns ID 1,2,3 */ + + /*! ZSTD_storeSeq() : +- * Store a sequence (litlen, litPtr, offCode and matchLength) into seqStore_t. +- * @offBase_minus1 : Users should use employ macros STORE_REPCODE_X and STORE_OFFSET(). ++ * Store a sequence (litlen, litPtr, offBase and matchLength) into seqStore_t. ++ * @offBase : Users should employ macros REPCODE_TO_OFFBASE() and OFFSET_TO_OFFBASE(). + * @matchLength : must be >= MINMATCH +- * Allowed to overread literals up to litLimit. ++ * Allowed to over-read literals up to litLimit. 
+ */ + HINT_INLINE UNUSED_ATTR void + ZSTD_storeSeq(seqStore_t* seqStorePtr, + size_t litLength, const BYTE* literals, const BYTE* litLimit, +- U32 offBase_minus1, ++ U32 offBase, + size_t matchLength) + { + BYTE const* const litLimit_w = litLimit - WILDCOPY_OVERLENGTH; +@@ -596,8 +651,8 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + static const BYTE* g_start = NULL; + if (g_start==NULL) g_start = (const BYTE*)literals; /* note : index only works for compression within a single segment */ + { U32 const pos = (U32)((const BYTE*)literals - g_start); +- DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offCode%7u", +- pos, (U32)litLength, (U32)matchLength, (U32)offBase_minus1); ++ DEBUGLOG(6, "Cpos%7u :%3u literals, match%4u bytes at offBase%7u", ++ pos, (U32)litLength, (U32)matchLength, (U32)offBase); + } + #endif + assert((size_t)(seqStorePtr->sequences - seqStorePtr->sequencesStart) < seqStorePtr->maxNbSeq); +@@ -607,9 +662,9 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + assert(literals + litLength <= litLimit); + if (litEnd <= litLimit_w) { + /* Common case we can use wildcopy. +- * First copy 16 bytes, because literals are likely short. +- */ +- assert(WILDCOPY_OVERLENGTH >= 16); ++ * First copy 16 bytes, because literals are likely short. ++ */ ++ ZSTD_STATIC_ASSERT(WILDCOPY_OVERLENGTH >= 16); + ZSTD_copy16(seqStorePtr->lit, literals); + if (litLength > 16) { + ZSTD_wildcopy(seqStorePtr->lit+16, literals+16, (ptrdiff_t)litLength-16, ZSTD_no_overlap); +@@ -628,7 +683,7 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + seqStorePtr->sequences[0].litLength = (U16)litLength; + + /* match offset */ +- seqStorePtr->sequences[0].offBase = STORED_TO_OFFBASE(offBase_minus1); ++ seqStorePtr->sequences[0].offBase = offBase; + + /* match Length */ + assert(matchLength >= MINMATCH); +@@ -646,17 +701,17 @@ ZSTD_storeSeq(seqStore_t* seqStorePtr, + + /* ZSTD_updateRep() : + * updates in-place @rep (array of repeat offsets) +- * @offBase_minus1 : sum-type, with same numeric representation as ZSTD_storeSeq() ++ * @offBase : sum-type, using numeric representation of ZSTD_storeSeq() + */ + MEM_STATIC void +-ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_updateRep(U32 rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { +- if (STORED_IS_OFFSET(offBase_minus1)) { /* full offset */ ++ if (OFFBASE_IS_OFFSET(offBase)) { /* full offset */ + rep[2] = rep[1]; + rep[1] = rep[0]; +- rep[0] = STORED_OFFSET(offBase_minus1); ++ rep[0] = OFFBASE_TO_OFFSET(offBase); + } else { /* repcode */ +- U32 const repCode = STORED_REPCODE(offBase_minus1) - 1 + ll0; ++ U32 const repCode = OFFBASE_TO_REPCODE(offBase) - 1 + ll0; + if (repCode > 0) { /* note : if repCode==0, no change */ + U32 const currentOffset = (repCode==ZSTD_REP_NUM) ? (rep[0] - 1) : rep[repCode]; + rep[2] = (repCode >= 2) ? 
rep[1] : rep[2]; +@@ -673,11 +728,11 @@ typedef struct repcodes_s { + } repcodes_t; + + MEM_STATIC repcodes_t +-ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0) ++ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase, U32 const ll0) + { + repcodes_t newReps; + ZSTD_memcpy(&newReps, rep, sizeof(newReps)); +- ZSTD_updateRep(newReps.rep, offBase_minus1, ll0); ++ ZSTD_updateRep(newReps.rep, offBase, ll0); + return newReps; + } + +@@ -685,59 +740,6 @@ ZSTD_newRep(U32 const rep[ZSTD_REP_NUM], U32 const offBase_minus1, U32 const ll0 + /*-************************************* + * Match length counter + ***************************************/ +-static unsigned ZSTD_NbCommonBytes (size_t val) +-{ +- if (MEM_isLittleEndian()) { +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_ctzll((U64)val) >> 3); +-# else +- static const int DeBruijnBytePos[64] = { 0, 0, 0, 0, 0, 1, 1, 2, +- 0, 3, 1, 3, 1, 4, 2, 7, +- 0, 2, 3, 6, 1, 5, 3, 5, +- 1, 3, 4, 4, 2, 5, 6, 7, +- 7, 0, 1, 2, 3, 3, 4, 6, +- 2, 6, 5, 5, 3, 4, 5, 6, +- 7, 1, 2, 4, 6, 4, 4, 5, +- 7, 2, 6, 5, 7, 6, 7, 7 }; +- return DeBruijnBytePos[((U64)((val & -(long long)val) * 0x0218A392CDABBD3FULL)) >> 58]; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_ctz((U32)val) >> 3); +-# else +- static const int DeBruijnBytePos[32] = { 0, 0, 3, 0, 3, 1, 3, 0, +- 3, 2, 2, 1, 3, 2, 0, 1, +- 3, 3, 1, 2, 2, 2, 2, 0, +- 3, 1, 2, 0, 1, 0, 1, 1 }; +- return DeBruijnBytePos[((U32)((val & -(S32)val) * 0x077CB531U)) >> 27]; +-# endif +- } +- } else { /* Big Endian CPU */ +- if (MEM_64bits()) { +-# if (__GNUC__ >= 4) +- return (__builtin_clzll(val) >> 3); +-# else +- unsigned r; +- const unsigned n32 = sizeof(size_t)*4; /* calculate this way due to compiler complaining in 32-bits mode */ +- if (!(val>>n32)) { r=4; } else { r=0; val>>=n32; } +- if (!(val>>16)) { r+=2; val>>=8; } else { val>>=24; } +- r += (!val); +- return r; +-# endif +- } else { /* 32 bits */ +-# if (__GNUC__ >= 3) +- return (__builtin_clz((U32)val) >> 3); +-# else +- unsigned r; +- if (!(val>>16)) { r=2; val>>=8; } else { r=0; val>>=24; } +- r += (!val); +- return r; +-# endif +- } } +-} +- +- + MEM_STATIC size_t ZSTD_count(const BYTE* pIn, const BYTE* pMatch, const BYTE* const pInLimit) + { + const BYTE* const pStart = pIn; +@@ -783,32 +785,43 @@ ZSTD_count_2segments(const BYTE* ip, const BYTE* match, + * Hashes + ***************************************/ + static const U32 prime3bytes = 506832829U; +-static U32 ZSTD_hash3(U32 u, U32 h) { return ((u << (32-24)) * prime3bytes) >> (32-h) ; } +-MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h); } /* only in zstd_opt.h */ ++static U32 ZSTD_hash3(U32 u, U32 h, U32 s) { assert(h <= 32); return (((u << (32-24)) * prime3bytes) ^ s) >> (32-h) ; } ++MEM_STATIC size_t ZSTD_hash3Ptr(const void* ptr, U32 h) { return ZSTD_hash3(MEM_readLE32(ptr), h, 0); } /* only in zstd_opt.h */ ++MEM_STATIC size_t ZSTD_hash3PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash3(MEM_readLE32(ptr), h, s); } + + static const U32 prime4bytes = 2654435761U; +-static U32 ZSTD_hash4(U32 u, U32 h) { return (u * prime4bytes) >> (32-h) ; } +-static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_read32(ptr), h); } ++static U32 ZSTD_hash4(U32 u, U32 h, U32 s) { assert(h <= 32); return ((u * prime4bytes) ^ s) >> (32-h) ; } ++static size_t ZSTD_hash4Ptr(const void* ptr, U32 h) { return ZSTD_hash4(MEM_readLE32(ptr), h, 0); } ++static size_t 
ZSTD_hash4PtrS(const void* ptr, U32 h, U32 s) { return ZSTD_hash4(MEM_readLE32(ptr), h, s); } + + static const U64 prime5bytes = 889523592379ULL; +-static size_t ZSTD_hash5(U64 u, U32 h) { return (size_t)(((u << (64-40)) * prime5bytes) >> (64-h)) ; } +-static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h); } ++static size_t ZSTD_hash5(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-40)) * prime5bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash5Ptr(const void* p, U32 h) { return ZSTD_hash5(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash5PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash5(MEM_readLE64(p), h, s); } + + static const U64 prime6bytes = 227718039650203ULL; +-static size_t ZSTD_hash6(U64 u, U32 h) { return (size_t)(((u << (64-48)) * prime6bytes) >> (64-h)) ; } +-static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h); } ++static size_t ZSTD_hash6(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-48)) * prime6bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash6Ptr(const void* p, U32 h) { return ZSTD_hash6(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash6PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash6(MEM_readLE64(p), h, s); } + + static const U64 prime7bytes = 58295818150454627ULL; +-static size_t ZSTD_hash7(U64 u, U32 h) { return (size_t)(((u << (64-56)) * prime7bytes) >> (64-h)) ; } +-static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h); } ++static size_t ZSTD_hash7(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u << (64-56)) * prime7bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash7Ptr(const void* p, U32 h) { return ZSTD_hash7(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash7PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash7(MEM_readLE64(p), h, s); } + + static const U64 prime8bytes = 0xCF1BBCDCB7A56463ULL; +-static size_t ZSTD_hash8(U64 u, U32 h) { return (size_t)(((u) * prime8bytes) >> (64-h)) ; } +-static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h); } ++static size_t ZSTD_hash8(U64 u, U32 h, U64 s) { assert(h <= 64); return (size_t)((((u) * prime8bytes) ^ s) >> (64-h)) ; } ++static size_t ZSTD_hash8Ptr(const void* p, U32 h) { return ZSTD_hash8(MEM_readLE64(p), h, 0); } ++static size_t ZSTD_hash8PtrS(const void* p, U32 h, U64 s) { return ZSTD_hash8(MEM_readLE64(p), h, s); } ++ + + MEM_STATIC FORCE_INLINE_ATTR + size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ + switch(mls) + { + default: +@@ -820,6 +833,24 @@ size_t ZSTD_hashPtr(const void* p, U32 hBits, U32 mls) + } + } + ++MEM_STATIC FORCE_INLINE_ATTR ++size_t ZSTD_hashPtrSalted(const void* p, U32 hBits, U32 mls, const U64 hashSalt) { ++ /* Although some of these hashes do support hBits up to 64, some do not. ++ * To be on the safe side, always avoid hBits > 32. */ ++ assert(hBits <= 32); ++ ++ switch(mls) ++ { ++ default: ++ case 4: return ZSTD_hash4PtrS(p, hBits, (U32)hashSalt); ++ case 5: return ZSTD_hash5PtrS(p, hBits, hashSalt); ++ case 6: return ZSTD_hash6PtrS(p, hBits, hashSalt); ++ case 7: return ZSTD_hash7PtrS(p, hBits, hashSalt); ++ case 8: return ZSTD_hash8PtrS(p, hBits, hashSalt); ++ } ++} ++ ++ + /* ZSTD_ipow() : + * Return base^exponent. 
+ */ +@@ -1167,10 +1198,15 @@ ZSTD_checkDictValidity(const ZSTD_window_t* window, + (unsigned)blockEndIdx, (unsigned)maxDist, (unsigned)loadedDictEnd); + assert(blockEndIdx >= loadedDictEnd); + +- if (blockEndIdx > loadedDictEnd + maxDist) { ++ if (blockEndIdx > loadedDictEnd + maxDist || loadedDictEnd != window->dictLimit) { + /* On reaching window size, dictionaries are invalidated. + * For simplification, if window size is reached anywhere within next block, + * the dictionary is invalidated for the full block. ++ * ++ * We also have to invalidate the dictionary if ZSTD_window_update() has detected ++ * non-contiguous segments, which means that loadedDictEnd != window->dictLimit. ++ * loadedDictEnd may be 0, if forceWindow is true, but in that case we never use ++ * dictMatchState, so setting it to NULL is not a problem. + */ + DEBUGLOG(6, "invalidating dictionary for current block (distance > windowSize)"); + *loadedDictEndPtr = 0; +@@ -1302,6 +1338,42 @@ MEM_STATIC void ZSTD_debugTable(const U32* table, U32 max) + + #endif + ++/* Short Cache */ ++ ++/* Normally, zstd matchfinders follow this flow: ++ * 1. Compute hash at ip ++ * 2. Load index from hashTable[hash] ++ * 3. Check if *ip == *(base + index) ++ * In dictionary compression, loading *(base + index) is often an L2 or even L3 miss. ++ * ++ * Short cache is an optimization which allows us to avoid step 3 most of the time ++ * when the data doesn't actually match. With short cache, the flow becomes: ++ * 1. Compute (hash, currentTag) at ip. currentTag is an 8-bit independent hash at ip. ++ * 2. Load (index, matchTag) from hashTable[hash]. See ZSTD_writeTaggedIndex to understand how this works. ++ * 3. Only if currentTag == matchTag, check *ip == *(base + index). Otherwise, continue. ++ * ++ * Currently, short cache is only implemented in CDict hashtables. Thus, its use is limited to ++ * dictMatchState matchfinders. ++ */ ++#define ZSTD_SHORT_CACHE_TAG_BITS 8 ++#define ZSTD_SHORT_CACHE_TAG_MASK ((1u << ZSTD_SHORT_CACHE_TAG_BITS) - 1) ++ ++/* Helper function for ZSTD_fillHashTable and ZSTD_fillDoubleHashTable. ++ * Unpacks hashAndTag into (hash, tag), then packs (index, tag) into hashTable[hash]. */ ++MEM_STATIC void ZSTD_writeTaggedIndex(U32* const hashTable, size_t hashAndTag, U32 index) { ++ size_t const hash = hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const tag = (U32)(hashAndTag & ZSTD_SHORT_CACHE_TAG_MASK); ++ assert(index >> (32 - ZSTD_SHORT_CACHE_TAG_BITS) == 0); ++ hashTable[hash] = (index << ZSTD_SHORT_CACHE_TAG_BITS) | tag; ++} ++ ++/* Helper function for short cache matchfinders. ++ * Unpacks tag1 and tag2 from lower bits of packedTag1 and packedTag2, then checks if the tags match. */ ++MEM_STATIC int ZSTD_comparePackedTags(size_t packedTag1, size_t packedTag2) { ++ U32 const tag1 = packedTag1 & ZSTD_SHORT_CACHE_TAG_MASK; ++ U32 const tag2 = packedTag2 & ZSTD_SHORT_CACHE_TAG_MASK; ++ return tag1 == tag2; ++} + + + /* =============================================================== +@@ -1396,4 +1468,51 @@ U32 ZSTD_cycleLog(U32 hashLog, ZSTD_strategy strat); + */ + void ZSTD_CCtx_trace(ZSTD_CCtx* cctx, size_t extraCSize); + ++/* Returns 0 on success, and a ZSTD_error otherwise. This function scans through an array of ++ * ZSTD_Sequence, storing the sequences it finds, until it reaches a block delimiter. ++ * Note that the block delimiter must include the last literals of the block. 
++ */ ++size_t ++ZSTD_copySequencesToSeqStoreExplicitBlockDelim(ZSTD_CCtx* cctx, ++ ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++/* Returns the number of bytes to move the current read position back by. ++ * Only non-zero if we ended up splitting a sequence. ++ * Otherwise, it may return a ZSTD error if something went wrong. ++ * ++ * This function will attempt to scan through blockSize bytes ++ * represented by the sequences in @inSeqs, ++ * storing any (partial) sequences. ++ * ++ * Occasionally, we may want to change the actual number of bytes we consumed from inSeqs to ++ * avoid splitting a match, or to avoid splitting a match such that it would produce a match ++ * smaller than MINMATCH. In this case, we return the number of bytes that we didn't read from this block. ++ */ ++size_t ++ZSTD_copySequencesToSeqStoreNoBlockDelim(ZSTD_CCtx* cctx, ZSTD_sequencePosition* seqPos, ++ const ZSTD_Sequence* const inSeqs, size_t inSeqsSize, ++ const void* src, size_t blockSize, ZSTD_paramSwitch_e externalRepSearch); ++ ++ ++/* =============================================================== ++ * Deprecated definitions that are still used internally to avoid ++ * deprecation warnings. These functions are exactly equivalent to ++ * their public variants, but avoid the deprecation warnings. ++ * =============================================================== */ ++ ++size_t ZSTD_compressBegin_usingCDict_deprecated(ZSTD_CCtx* cctx, const ZSTD_CDict* cdict); ++ ++size_t ZSTD_compressContinue_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressEnd_public(ZSTD_CCtx* cctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ ++size_t ZSTD_compressBlock_deprecated(ZSTD_CCtx* cctx, void* dst, size_t dstCapacity, const void* src, size_t srcSize); ++ ++ + #endif /* ZSTD_COMPRESS_H */ +diff --git a/lib/zstd/compress/zstd_compress_literals.c b/lib/zstd/compress/zstd_compress_literals.c +index 52b0a8059aba..3e9ea46a670a 100644 +--- a/lib/zstd/compress/zstd_compress_literals.c ++++ b/lib/zstd/compress/zstd_compress_literals.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -13,11 +14,36 @@ + ***************************************/ + #include "zstd_compress_literals.h" + ++ ++/* ************************************************************** ++* Debug Traces ++****************************************************************/ ++#if DEBUGLEVEL >= 2 ++ ++static size_t showHexa(const void* src, size_t srcSize) ++{ ++ const BYTE* const ip = (const BYTE*)src; ++ size_t u; ++ for (u=0; u31) + (srcSize>4095); + ++ DEBUGLOG(5, "ZSTD_noCompressLiterals: srcSize=%zu, dstCapacity=%zu", srcSize, dstCapacity); ++ + RETURN_ERROR_IF(srcSize + flSize > dstCapacity, dstSize_tooSmall, ""); + + switch(flSize) +@@ -36,16 +62,30 @@ size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, + } + + ZSTD_memcpy(ostart + flSize, src, srcSize); +- DEBUGLOG(5, "Raw literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); ++ DEBUGLOG(5, "Raw (uncompressed) literals: %u -> %u", (U32)srcSize, (U32)(srcSize + flSize)); + return srcSize + flSize; + } + ++static int allBytesIdentical(const void* src, size_t srcSize) ++{ ++ assert(srcSize >= 1); ++ assert(src != NULL); ++ { const BYTE b = ((const BYTE*)src)[0]; ++ size_t p; ++ for (p=1; p31) + (srcSize>4095); + +- (void)dstCapacity; /* dstCapacity already guaranteed to be >=4, hence large enough */ ++ assert(dstCapacity >= 4); (void)dstCapacity; ++ assert(allBytesIdentical(src, srcSize)); + + switch(flSize) + { +@@ -63,28 +103,51 @@ size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* + } + + ostart[flSize] = *(const BYTE*)src; +- DEBUGLOG(5, "RLE literals: %u -> %u", (U32)srcSize, (U32)flSize + 1); ++ DEBUGLOG(5, "RLE : Repeated Literal (%02X: %u times) -> %u bytes encoded", ((const BYTE*)src)[0], (U32)srcSize, (U32)flSize + 1); + return flSize+1; + } + +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize, +- void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible) ++/* ZSTD_minLiteralsToCompress() : ++ * returns minimal amount of literals ++ * for literal compression to even be attempted. ++ * Minimum is made tighter as compression strategy increases. ++ */ ++static size_t ++ZSTD_minLiteralsToCompress(ZSTD_strategy strategy, HUF_repeat huf_repeat) ++{ ++ assert((int)strategy >= 0); ++ assert((int)strategy <= 9); ++ /* btultra2 : min 8 bytes; ++ * then 2x larger for each successive compression strategy ++ * max threshold 64 bytes */ ++ { int const shift = MIN(9-(int)strategy, 3); ++ size_t const mintc = (huf_repeat == HUF_repeat_valid) ? 
6 : (size_t)8 << shift; ++ DEBUGLOG(7, "minLiteralsToCompress = %zu", mintc); ++ return mintc; ++ } ++} ++ ++size_t ZSTD_compressLiterals ( ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize, ++ void* entropyWorkspace, size_t entropyWorkspaceSize, ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, ++ int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2) + { +- size_t const minGain = ZSTD_minGain(srcSize, strategy); + size_t const lhSize = 3 + (srcSize >= 1 KB) + (srcSize >= 16 KB); + BYTE* const ostart = (BYTE*)dst; + U32 singleStream = srcSize < 256; + symbolEncodingType_e hType = set_compressed; + size_t cLitSize; + +- DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i srcSize=%u)", +- disableLiteralCompression, (U32)srcSize); ++ DEBUGLOG(5,"ZSTD_compressLiterals (disableLiteralCompression=%i, srcSize=%u, dstCapacity=%zu)", ++ disableLiteralCompression, (U32)srcSize, dstCapacity); ++ ++ DEBUGLOG(6, "Completed literals listing (%zu bytes)", showHexa(src, srcSize)); + + /* Prepare nextEntropy assuming reusing the existing table */ + ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +@@ -92,40 +155,51 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + if (disableLiteralCompression) + return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + +- /* small ? don't even attempt compression (speed opt) */ +-# define COMPRESS_LITERALS_SIZE_MIN 63 +- { size_t const minLitSize = (prevHuf->repeatMode == HUF_repeat_valid) ? 6 : COMPRESS_LITERALS_SIZE_MIN; +- if (srcSize <= minLitSize) return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ /* if too small, don't even attempt compression (speed opt) */ ++ if (srcSize < ZSTD_minLiteralsToCompress(strategy, prevHuf->repeatMode)) ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); + + RETURN_ERROR_IF(dstCapacity < lhSize+1, dstSize_tooSmall, "not enough space for compression"); + { HUF_repeat repeat = prevHuf->repeatMode; +- int const preferRepeat = strategy < ZSTD_lazy ? srcSize <= 1024 : 0; ++ int const flags = 0 ++ | (bmi2 ? HUF_flags_bmi2 : 0) ++ | (strategy < ZSTD_lazy && srcSize <= 1024 ? HUF_flags_preferRepeat : 0) ++ | (strategy >= HUF_OPTIMAL_DEPTH_THRESHOLD ? HUF_flags_optimalDepth : 0) ++ | (suspectUncompressible ? HUF_flags_suspectUncompressible : 0); ++ ++ typedef size_t (*huf_compress_f)(void*, size_t, const void*, size_t, unsigned, unsigned, void*, size_t, HUF_CElt*, HUF_repeat*, int); ++ huf_compress_f huf_compress; + if (repeat == HUF_repeat_valid && lhSize == 3) singleStream = 1; +- cLitSize = singleStream ? +- HUF_compress1X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible) : +- HUF_compress4X_repeat( +- ostart+lhSize, dstCapacity-lhSize, src, srcSize, +- HUF_SYMBOLVALUE_MAX, HUF_TABLELOG_DEFAULT, entropyWorkspace, entropyWorkspaceSize, +- (HUF_CElt*)nextHuf->CTable, &repeat, preferRepeat, bmi2, suspectUncompressible); ++ huf_compress = singleStream ? 
HUF_compress1X_repeat : HUF_compress4X_repeat; ++ cLitSize = huf_compress(ostart+lhSize, dstCapacity-lhSize, ++ src, srcSize, ++ HUF_SYMBOLVALUE_MAX, LitHufLog, ++ entropyWorkspace, entropyWorkspaceSize, ++ (HUF_CElt*)nextHuf->CTable, ++ &repeat, flags); ++ DEBUGLOG(5, "%zu literals compressed into %zu bytes (before header)", srcSize, cLitSize); + if (repeat != HUF_repeat_none) { + /* reused the existing table */ +- DEBUGLOG(5, "Reusing previous huffman table"); ++ DEBUGLOG(5, "reusing statistics from previous huffman block"); + hType = set_repeat; + } + } + +- if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); +- } ++ { size_t const minGain = ZSTD_minGain(srcSize, strategy); ++ if ((cLitSize==0) || (cLitSize >= srcSize - minGain) || ERR_isError(cLitSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_noCompressLiterals(dst, dstCapacity, src, srcSize); ++ } } + if (cLitSize==1) { +- ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); +- return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); +- } ++ /* A return value of 1 signals that the alphabet consists of a single symbol. ++ * However, in some rare circumstances, it could be the compressed size (a single byte). ++ * For that outcome to have a chance to happen, it's necessary that `srcSize < 8`. ++ * (it's also necessary to not generate statistics). ++ * Therefore, in such a case, actively check that all bytes are identical. */ ++ if ((srcSize >= 8) || allBytesIdentical(src, srcSize)) { ++ ZSTD_memcpy(nextHuf, prevHuf, sizeof(*prevHuf)); ++ return ZSTD_compressRleLiteralsBlock(dst, dstCapacity, src, srcSize); ++ } } + + if (hType == set_compressed) { + /* using a newly constructed table */ +@@ -136,16 +210,19 @@ size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, + switch(lhSize) + { + case 3: /* 2 - 2 - 10 - 10 */ +- { U32 const lhc = hType + ((!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); ++ if (!singleStream) assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); ++ { U32 const lhc = hType + ((U32)(!singleStream) << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<14); + MEM_writeLE24(ostart, lhc); + break; + } + case 4: /* 2 - 2 - 14 - 14 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (2 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<18); + MEM_writeLE32(ostart, lhc); + break; + } + case 5: /* 2 - 2 - 18 - 18 */ ++ assert(srcSize >= MIN_LITERALS_FOR_4_STREAMS); + { U32 const lhc = hType + (3 << 2) + ((U32)srcSize<<4) + ((U32)cLitSize<<22); + MEM_writeLE32(ostart, lhc); + ostart[4] = (BYTE)(cLitSize >> 10); +diff --git a/lib/zstd/compress/zstd_compress_literals.h b/lib/zstd/compress/zstd_compress_literals.h +index 9775fb97cb70..a2a85d6b69e5 100644 +--- a/lib/zstd/compress/zstd_compress_literals.h ++++ b/lib/zstd/compress/zstd_compress_literals.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,16 +17,24 @@ + + size_t ZSTD_noCompressLiterals (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + ++/* ZSTD_compressRleLiteralsBlock() : ++ * Conditions : ++ * - All bytes in @src are identical ++ * - dstCapacity >= 4 */ + size_t ZSTD_compressRleLiteralsBlock (void* dst, size_t dstCapacity, const void* src, size_t srcSize); + +-/* If suspectUncompressible then some sampling checks will be run to potentially skip huffman coding */ +-size_t ZSTD_compressLiterals (ZSTD_hufCTables_t const* prevHuf, +- ZSTD_hufCTables_t* nextHuf, +- ZSTD_strategy strategy, int disableLiteralCompression, +- void* dst, size_t dstCapacity, ++/* ZSTD_compressLiterals(): ++ * @entropyWorkspace: must be aligned on 4-bytes boundaries ++ * @entropyWorkspaceSize : must be >= HUF_WORKSPACE_SIZE ++ * @suspectUncompressible: sampling checks, to potentially skip huffman coding ++ */ ++size_t ZSTD_compressLiterals (void* dst, size_t dstCapacity, + const void* src, size_t srcSize, + void* entropyWorkspace, size_t entropyWorkspaceSize, +- const int bmi2, +- unsigned suspectUncompressible); ++ const ZSTD_hufCTables_t* prevHuf, ++ ZSTD_hufCTables_t* nextHuf, ++ ZSTD_strategy strategy, int disableLiteralCompression, ++ int suspectUncompressible, ++ int bmi2); + + #endif /* ZSTD_COMPRESS_LITERALS_H */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.c b/lib/zstd/compress/zstd_compress_sequences.c +index 21ddc1b37acf..5c028c78d889 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.c ++++ b/lib/zstd/compress/zstd_compress_sequences.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -58,7 +59,7 @@ static unsigned ZSTD_useLowProbCount(size_t const nbSeq) + { + /* Heuristic: This should cover most blocks <= 16K and + * start to fade out after 16K to about 32K depending on +- * comprssibility. ++ * compressibility. + */ + return nbSeq >= 2048; + } +@@ -166,7 +167,7 @@ ZSTD_selectEncodingType( + if (mostFrequent == nbSeq) { + *repeatMode = FSE_repeat_none; + if (isDefaultAllowed && nbSeq <= 2) { +- /* Prefer set_basic over set_rle when there are 2 or less symbols, ++ /* Prefer set_basic over set_rle when there are 2 or fewer symbols, + * since RLE uses 1 byte, but set_basic uses 5-6 bits per symbol. + * If basic encoding isn't possible, always choose RLE. + */ +diff --git a/lib/zstd/compress/zstd_compress_sequences.h b/lib/zstd/compress/zstd_compress_sequences.h +index 7991364c2f71..7fe6f4ff5cf2 100644 +--- a/lib/zstd/compress/zstd_compress_sequences.h ++++ b/lib/zstd/compress/zstd_compress_sequences.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_compress_superblock.c b/lib/zstd/compress/zstd_compress_superblock.c +index 17d836cc84e8..dbacbaf72733 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.c ++++ b/lib/zstd/compress/zstd_compress_superblock.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. 
++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -36,13 +37,14 @@ + * If it is set_compressed, first sub-block's literals section will be Treeless_Literals_Block + * and the following sub-blocks' literals sections will be Treeless_Literals_Block. + * @return : compressed size of literals section of a sub-block +- * Or 0 if it unable to compress. ++ * Or 0 if unable to compress. + * Or error code */ +-static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, +- const ZSTD_hufCTablesMetadata_t* hufMetadata, +- const BYTE* literals, size_t litSize, +- void* dst, size_t dstSize, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, ++ const ZSTD_hufCTablesMetadata_t* hufMetadata, ++ const BYTE* literals, size_t litSize, ++ void* dst, size_t dstSize, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + size_t const header = writeEntropy ? 200 : 0; + size_t const lhSize = 3 + (litSize >= (1 KB - header)) + (litSize >= (16 KB - header)); +@@ -53,8 +55,6 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + symbolEncodingType_e hType = writeEntropy ? hufMetadata->hType : set_repeat; + size_t cLitSize = 0; + +- (void)bmi2; /* TODO bmi2... */ +- + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (litSize=%zu, lhSize=%zu, writeEntropy=%d)", litSize, lhSize, writeEntropy); + + *entropyWritten = 0; +@@ -76,9 +76,9 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + DEBUGLOG(5, "ZSTD_compressSubBlock_literal (hSize=%zu)", hufMetadata->hufDesSize); + } + +- /* TODO bmi2 */ +- { const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable) +- : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable); ++ { int const flags = bmi2 ? HUF_flags_bmi2 : 0; ++ const size_t cSize = singleStream ? HUF_compress1X_usingCTable(op, oend-op, literals, litSize, hufTable, flags) ++ : HUF_compress4X_usingCTable(op, oend-op, literals, litSize, hufTable, flags); + op += cSize; + cLitSize += cSize; + if (cSize == 0 || ERR_isError(cSize)) { +@@ -126,7 +126,11 @@ static size_t ZSTD_compressSubBlock_literal(const HUF_CElt* hufTable, + return op-ostart; + } + +-static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* sequences, size_t nbSeq, size_t litSize, int lastSequence) { ++static size_t ++ZSTD_seqDecompressedSize(seqStore_t const* seqStore, ++ const seqDef* sequences, size_t nbSeq, ++ size_t litSize, int lastSequence) ++{ + const seqDef* const sstart = sequences; + const seqDef* const send = sequences + nbSeq; + const seqDef* sp = sstart; +@@ -156,13 +160,14 @@ static size_t ZSTD_seqDecompressedSize(seqStore_t const* seqStore, const seqDef* + * @return : compressed size of sequences section of a sub-block + * Or 0 if it is unable to compress + * Or error code. 
*/ +-static size_t ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, +- const ZSTD_fseCTablesMetadata_t* fseMetadata, +- const seqDef* sequences, size_t nbSeq, +- const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, +- const ZSTD_CCtx_params* cctxParams, +- void* dst, size_t dstCapacity, +- const int bmi2, int writeEntropy, int* entropyWritten) ++static size_t ++ZSTD_compressSubBlock_sequences(const ZSTD_fseCTables_t* fseTables, ++ const ZSTD_fseCTablesMetadata_t* fseMetadata, ++ const seqDef* sequences, size_t nbSeq, ++ const BYTE* llCode, const BYTE* mlCode, const BYTE* ofCode, ++ const ZSTD_CCtx_params* cctxParams, ++ void* dst, size_t dstCapacity, ++ const int bmi2, int writeEntropy, int* entropyWritten) + { + const int longOffsets = cctxParams->cParams.windowLog > STREAM_ACCUMULATOR_MIN; + BYTE* const ostart = (BYTE*)dst; +@@ -539,7 +544,7 @@ static size_t ZSTD_compressSubBlock_multi(const seqStore_t* seqStorePtr, + repcodes_t rep; + ZSTD_memcpy(&rep, prevCBlock->rep, sizeof(rep)); + for (seq = sstart; seq < sp; ++seq) { +- ZSTD_updateRep(rep.rep, seq->offBase - 1, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); ++ ZSTD_updateRep(rep.rep, seq->offBase, ZSTD_getSequenceLength(seqStorePtr, seq).litLength == 0); + } + ZSTD_memcpy(nextCBlock->rep, &rep, sizeof(rep)); + } +diff --git a/lib/zstd/compress/zstd_compress_superblock.h b/lib/zstd/compress/zstd_compress_superblock.h +index 224ece79546e..826bbc9e029b 100644 +--- a/lib/zstd/compress/zstd_compress_superblock.h ++++ b/lib/zstd/compress/zstd_compress_superblock.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_cwksp.h b/lib/zstd/compress/zstd_cwksp.h +index 349fc923c355..65ea53b62844 100644 +--- a/lib/zstd/compress/zstd_cwksp.h ++++ b/lib/zstd/compress/zstd_cwksp.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,7 +15,9 @@ + /*-************************************* + * Dependencies + ***************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_internal.h" ++#include "../common/portability_macros.h" + + + /*-************************************* +@@ -41,8 +44,9 @@ + ***************************************/ + typedef enum { + ZSTD_cwksp_alloc_objects, +- ZSTD_cwksp_alloc_buffers, +- ZSTD_cwksp_alloc_aligned ++ ZSTD_cwksp_alloc_aligned_init_once, ++ ZSTD_cwksp_alloc_aligned, ++ ZSTD_cwksp_alloc_buffers + } ZSTD_cwksp_alloc_phase_e; + + /* +@@ -95,8 +99,8 @@ typedef enum { + * + * Workspace Layout: + * +- * [ ... workspace ... ] +- * [objects][tables ... ->] free space [<- ... aligned][<- ... buffers] ++ * [ ... workspace ... ] ++ * [objects][tables ->] free space [<- buffers][<- aligned][<- init once] + * + * The various objects that live in the workspace are divided into the + * following categories, and are allocated separately: +@@ -120,9 +124,18 @@ typedef enum { + * uint32_t arrays, all of whose values are between 0 and (nextSrc - base). + * Their sizes depend on the cparams. 
These tables are 64-byte aligned. + * +- * - Aligned: these buffers are used for various purposes that require 4 byte +- * alignment, but don't require any initialization before they're used. These +- * buffers are each aligned to 64 bytes. ++ * - Init once: these buffers require to be initialized at least once before ++ * use. They should be used when we want to skip memory initialization ++ * while not triggering memory checkers (like Valgrind) when reading from ++ * from this memory without writing to it first. ++ * These buffers should be used carefully as they might contain data ++ * from previous compressions. ++ * Buffers are aligned to 64 bytes. ++ * ++ * - Aligned: these buffers don't require any initialization before they're ++ * used. The user of the buffer should make sure they write into a buffer ++ * location before reading from it. ++ * Buffers are aligned to 64 bytes. + * + * - Buffers: these buffers are used for various purposes that don't require + * any alignment or initialization before they're used. This means they can +@@ -134,8 +147,9 @@ typedef enum { + * correctly packed into the workspace buffer. That order is: + * + * 1. Objects +- * 2. Buffers +- * 3. Aligned/Tables ++ * 2. Init once / Tables ++ * 3. Aligned / Tables ++ * 4. Buffers / Tables + * + * Attempts to reserve objects of different types out of order will fail. + */ +@@ -147,6 +161,7 @@ typedef struct { + void* tableEnd; + void* tableValidEnd; + void* allocStart; ++ void* initOnceStart; + + BYTE allocFailed; + int workspaceOversizedDuration; +@@ -159,6 +174,7 @@ typedef struct { + ***************************************/ + + MEM_STATIC size_t ZSTD_cwksp_available_space(ZSTD_cwksp* ws); ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws); + + MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + (void)ws; +@@ -168,6 +184,8 @@ MEM_STATIC void ZSTD_cwksp_assert_internal_consistency(ZSTD_cwksp* ws) { + assert(ws->tableEnd <= ws->allocStart); + assert(ws->tableValidEnd <= ws->allocStart); + assert(ws->allocStart <= ws->workspaceEnd); ++ assert(ws->initOnceStart <= ZSTD_cwksp_initialAllocStart(ws)); ++ assert(ws->workspace <= ws->initOnceStart); + } + + /* +@@ -210,14 +228,10 @@ MEM_STATIC size_t ZSTD_cwksp_aligned_alloc_size(size_t size) { + * for internal purposes (currently only alignment). + */ + MEM_STATIC size_t ZSTD_cwksp_slack_space_required(void) { +- /* For alignment, the wksp will always allocate an additional n_1=[1, 64] bytes +- * to align the beginning of tables section, as well as another n_2=[0, 63] bytes +- * to align the beginning of the aligned section. +- * +- * n_1 + n_2 == 64 bytes if the cwksp is freshly allocated, due to tables and +- * aligneds being sized in multiples of 64 bytes. 
++ /* For alignment, the wksp will always allocate an additional 2*ZSTD_CWKSP_ALIGNMENT_BYTES ++ * bytes to align the beginning of tables section and end of buffers; + */ +- size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES; ++ size_t const slackSpace = ZSTD_CWKSP_ALIGNMENT_BYTES * 2; + return slackSpace; + } + +@@ -230,10 +244,18 @@ MEM_STATIC size_t ZSTD_cwksp_bytes_to_align_ptr(void* ptr, const size_t alignByt + size_t const alignBytesMask = alignBytes - 1; + size_t const bytes = (alignBytes - ((size_t)ptr & (alignBytesMask))) & alignBytesMask; + assert((alignBytes & alignBytesMask) == 0); +- assert(bytes != ZSTD_CWKSP_ALIGNMENT_BYTES); ++ assert(bytes < alignBytes); + return bytes; + } + ++/* ++ * Returns the initial value for allocStart which is used to determine the position from ++ * which we can allocate from the end of the workspace. ++ */ ++MEM_STATIC void* ZSTD_cwksp_initialAllocStart(ZSTD_cwksp* ws) { ++ return (void*)((size_t)ws->workspaceEnd & ~(ZSTD_CWKSP_ALIGNMENT_BYTES-1)); ++} ++ + /* + * Internal function. Do not use directly. + * Reserves the given number of bytes within the aligned/buffer segment of the wksp, +@@ -274,27 +296,16 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + { + assert(phase >= ws->phase); + if (phase > ws->phase) { +- /* Going from allocating objects to allocating buffers */ +- if (ws->phase < ZSTD_cwksp_alloc_buffers && +- phase >= ZSTD_cwksp_alloc_buffers) { ++ /* Going from allocating objects to allocating initOnce / tables */ ++ if (ws->phase < ZSTD_cwksp_alloc_aligned_init_once && ++ phase >= ZSTD_cwksp_alloc_aligned_init_once) { + ws->tableValidEnd = ws->objectEnd; +- } ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + +- /* Going from allocating buffers to allocating aligneds/tables */ +- if (ws->phase < ZSTD_cwksp_alloc_aligned && +- phase >= ZSTD_cwksp_alloc_aligned) { +- { /* Align the start of the "aligned" to 64 bytes. Use [1, 64] bytes. */ +- size_t const bytesToAlign = +- ZSTD_CWKSP_ALIGNMENT_BYTES - ZSTD_cwksp_bytes_to_align_ptr(ws->allocStart, ZSTD_CWKSP_ALIGNMENT_BYTES); +- DEBUGLOG(5, "reserving aligned alignment addtl space: %zu", bytesToAlign); +- ZSTD_STATIC_ASSERT((ZSTD_CWKSP_ALIGNMENT_BYTES & (ZSTD_CWKSP_ALIGNMENT_BYTES - 1)) == 0); /* power of 2 */ +- RETURN_ERROR_IF(!ZSTD_cwksp_reserve_internal_buffer_space(ws, bytesToAlign), +- memory_allocation, "aligned phase - alignment initial allocation failed!"); +- } + { /* Align the start of the tables to 64 bytes. 
Use [0, 63] bytes */ +- void* const alloc = ws->objectEnd; ++ void *const alloc = ws->objectEnd; + size_t const bytesToAlign = ZSTD_cwksp_bytes_to_align_ptr(alloc, ZSTD_CWKSP_ALIGNMENT_BYTES); +- void* const objectEnd = (BYTE*)alloc + bytesToAlign; ++ void *const objectEnd = (BYTE *) alloc + bytesToAlign; + DEBUGLOG(5, "reserving table alignment addtl space: %zu", bytesToAlign); + RETURN_ERROR_IF(objectEnd > ws->workspaceEnd, memory_allocation, + "table phase - alignment initial allocation failed!"); +@@ -302,7 +313,9 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + ws->tableEnd = objectEnd; /* table area starts being empty */ + if (ws->tableValidEnd < ws->tableEnd) { + ws->tableValidEnd = ws->tableEnd; +- } } } ++ } ++ } ++ } + ws->phase = phase; + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -314,7 +327,7 @@ ZSTD_cwksp_internal_advance_phase(ZSTD_cwksp* ws, ZSTD_cwksp_alloc_phase_e phase + */ + MEM_STATIC int ZSTD_cwksp_owns_buffer(const ZSTD_cwksp* ws, const void* ptr) + { +- return (ptr != NULL) && (ws->workspace <= ptr) && (ptr <= ws->workspaceEnd); ++ return (ptr != NULL) && (ws->workspace <= ptr) && (ptr < ws->workspaceEnd); + } + + /* +@@ -343,6 +356,33 @@ MEM_STATIC BYTE* ZSTD_cwksp_reserve_buffer(ZSTD_cwksp* ws, size_t bytes) + return (BYTE*)ZSTD_cwksp_reserve_internal(ws, bytes, ZSTD_cwksp_alloc_buffers); + } + ++/* ++ * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). ++ * This memory has been initialized at least once in the past. ++ * This doesn't mean it has been initialized this time, and it might contain data from previous ++ * operations. ++ * The main usage is for algorithms that might need read access into uninitialized memory. ++ * The algorithm must maintain safety under these conditions and must make sure it doesn't ++ * leak any of the past data (directly or in side channels). ++ */ ++MEM_STATIC void* ZSTD_cwksp_reserve_aligned_init_once(ZSTD_cwksp* ws, size_t bytes) ++{ ++ size_t const alignedBytes = ZSTD_cwksp_align(bytes, ZSTD_CWKSP_ALIGNMENT_BYTES); ++ void* ptr = ZSTD_cwksp_reserve_internal(ws, alignedBytes, ZSTD_cwksp_alloc_aligned_init_once); ++ assert(((size_t)ptr & (ZSTD_CWKSP_ALIGNMENT_BYTES-1))== 0); ++ if(ptr && ptr < ws->initOnceStart) { ++ /* We assume the memory following the current allocation is either: ++ * 1. Not usable as initOnce memory (end of workspace) ++ * 2. Another initOnce buffer that has been allocated before (and so was previously memset) ++ * 3. An ASAN redzone, in which case we don't want to write on it ++ * For these reasons it should be fine to not explicitly zero every byte up to ws->initOnceStart. ++ * Note that we assume here that MSAN and ASAN cannot run in the same time. */ ++ ZSTD_memset(ptr, 0, MIN((size_t)((U8*)ws->initOnceStart - (U8*)ptr), alignedBytes)); ++ ws->initOnceStart = ptr; ++ } ++ return ptr; ++} ++ + /* + * Reserves and returns memory sized on and aligned on ZSTD_CWKSP_ALIGNMENT_BYTES (64 bytes). 
+ */ +@@ -361,13 +401,17 @@ MEM_STATIC void* ZSTD_cwksp_reserve_aligned(ZSTD_cwksp* ws, size_t bytes) + */ + MEM_STATIC void* ZSTD_cwksp_reserve_table(ZSTD_cwksp* ws, size_t bytes) + { +- const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned; ++ const ZSTD_cwksp_alloc_phase_e phase = ZSTD_cwksp_alloc_aligned_init_once; + void* alloc; + void* end; + void* top; + +- if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { +- return NULL; ++ /* We can only start allocating tables after we are done reserving space for objects at the ++ * start of the workspace */ ++ if(ws->phase < phase) { ++ if (ZSTD_isError(ZSTD_cwksp_internal_advance_phase(ws, phase))) { ++ return NULL; ++ } + } + alloc = ws->tableEnd; + end = (BYTE *)alloc + bytes; +@@ -451,7 +495,7 @@ MEM_STATIC void ZSTD_cwksp_clean_tables(ZSTD_cwksp* ws) { + assert(ws->tableValidEnd >= ws->objectEnd); + assert(ws->tableValidEnd <= ws->allocStart); + if (ws->tableValidEnd < ws->tableEnd) { +- ZSTD_memset(ws->tableValidEnd, 0, (BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd); ++ ZSTD_memset(ws->tableValidEnd, 0, (size_t)((BYTE*)ws->tableEnd - (BYTE*)ws->tableValidEnd)); + } + ZSTD_cwksp_mark_tables_clean(ws); + } +@@ -478,10 +522,10 @@ MEM_STATIC void ZSTD_cwksp_clear(ZSTD_cwksp* ws) { + + + ws->tableEnd = ws->objectEnd; +- ws->allocStart = ws->workspaceEnd; ++ ws->allocStart = ZSTD_cwksp_initialAllocStart(ws); + ws->allocFailed = 0; +- if (ws->phase > ZSTD_cwksp_alloc_buffers) { +- ws->phase = ZSTD_cwksp_alloc_buffers; ++ if (ws->phase > ZSTD_cwksp_alloc_aligned_init_once) { ++ ws->phase = ZSTD_cwksp_alloc_aligned_init_once; + } + ZSTD_cwksp_assert_internal_consistency(ws); + } +@@ -498,6 +542,7 @@ MEM_STATIC void ZSTD_cwksp_init(ZSTD_cwksp* ws, void* start, size_t size, ZSTD_c + ws->workspaceEnd = (BYTE*)start + size; + ws->objectEnd = ws->workspace; + ws->tableValidEnd = ws->objectEnd; ++ ws->initOnceStart = ZSTD_cwksp_initialAllocStart(ws); + ws->phase = ZSTD_cwksp_alloc_objects; + ws->isStatic = isStatic; + ZSTD_cwksp_clear(ws); +@@ -550,17 +595,11 @@ MEM_STATIC int ZSTD_cwksp_reserve_failed(const ZSTD_cwksp* ws) { + * Returns if the estimated space needed for a wksp is within an acceptable limit of the + * actual amount of space used. + */ +-MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp* const ws, +- size_t const estimatedSpace, int resizedWorkspace) { +- if (resizedWorkspace) { +- /* Resized/newly allocated wksp should have exact bounds */ +- return ZSTD_cwksp_used(ws) == estimatedSpace; +- } else { +- /* Due to alignment, when reusing a workspace, we can actually consume 63 fewer or more bytes +- * than estimatedSpace. See the comments in zstd_cwksp.h for details. 
+- */ +- return (ZSTD_cwksp_used(ws) >= estimatedSpace - 63) && (ZSTD_cwksp_used(ws) <= estimatedSpace + 63); +- } ++MEM_STATIC int ZSTD_cwksp_estimated_space_within_bounds(const ZSTD_cwksp *const ws, size_t const estimatedSpace) { ++ /* We have an alignment space between objects and tables between tables and buffers, so we can have up to twice ++ * the alignment bytes difference between estimation and actual usage */ ++ return (estimatedSpace - ZSTD_cwksp_slack_space_required()) <= ZSTD_cwksp_used(ws) && ++ ZSTD_cwksp_used(ws) <= estimatedSpace; + } + + +diff --git a/lib/zstd/compress/zstd_double_fast.c b/lib/zstd/compress/zstd_double_fast.c +index 76933dea2624..ab9440a99603 100644 +--- a/lib/zstd/compress/zstd_double_fast.c ++++ b/lib/zstd/compress/zstd_double_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,43 @@ + #include "zstd_compress_internal.h" + #include "zstd_double_fast.h" + ++static void ZSTD_fillDoubleHashTableForCDict(ZSTD_matchState_t* ms, ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashLarge = ms->hashTable; ++ U32 const hBitsL = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ U32* const hashSmall = ms->chainTable; ++ U32 const hBitsS = cParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ /* Always insert every fastHashFillStep position into the hash tables. ++ * Insert the other positions into the large hash table if their entry ++ * is empty. 
++ */ ++ for (; ip + fastHashFillStep - 1 <= iend; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ U32 i; ++ for (i = 0; i < fastHashFillStep; ++i) { ++ size_t const smHashAndTag = ZSTD_hashPtr(ip + i, hBitsS, mls); ++ size_t const lgHashAndTag = ZSTD_hashPtr(ip + i, hBitsL, 8); ++ if (i == 0) { ++ ZSTD_writeTaggedIndex(hashSmall, smHashAndTag, curr + i); ++ } ++ if (i == 0 || hashLarge[lgHashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { ++ ZSTD_writeTaggedIndex(hashLarge, lgHashAndTag, curr + i); ++ } ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ if (dtlm == ZSTD_dtlm_fast) ++ break; ++ } } ++} ++ ++static void ZSTD_fillDoubleHashTableForCCtx(ZSTD_matchState_t* ms, + void const* end, ZSTD_dictTableLoadMethod_e dtlm) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -43,7 +79,19 @@ void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, + /* Only load extra positions for ZSTD_dtlm_full */ + if (dtlm == ZSTD_dtlm_fast) + break; +- } } ++ } } ++} ++ ++void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillDoubleHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillDoubleHashTableForCCtx(ms, end, dtlm); ++ } + } + + +@@ -67,7 +115,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t mLength; + U32 offset; +@@ -100,8 +148,8 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const current = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, current, cParams->windowLog); + U32 const maxRep = current - windowLow; +- if (offset_2 > maxRep) offsetSaved = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) offsetSaved = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + + /* Outer Loop: one iteration per match found and stored */ +@@ -131,7 +179,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + if ((offset_1 > 0) & (MEM_read32(ip+1-offset_1) == MEM_read32(ip+1))) { + mLength = ZSTD_count(ip+1+4, ip+1+4-offset_1, iend) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -175,9 +223,13 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + } while (ip1 <= ilimit); + + _cleanup: ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? 
offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -217,7 +269,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + hashLong[hl1] = (U32)(ip1 - base); + } + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -243,7 +295,7 @@ size_t ZSTD_compressBlock_doubleFast_noDict_generic( + U32 const tmpOff = offset_2; offset_2 = offset_1; offset_1 = tmpOff; /* swap offset_2 <=> offset_1 */ + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = (U32)(ip-base); + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = (U32)(ip-base); +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + ip += rLength; + anchor = ip; + continue; /* faster when present ... (?) */ +@@ -275,7 +327,6 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams; +@@ -286,8 +337,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixLowestIndex - (U32)(dictEnd - dictBase); +- const U32 dictHBitsL = dictCParams->hashLog; +- const U32 dictHBitsS = dictCParams->chainLog; ++ const U32 dictHBitsL = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ const U32 dictHBitsS = dictCParams->chainLog + ZSTD_SHORT_CACHE_TAG_BITS; + const U32 dictAndPrefixLength = (U32)((ip - prefixLowest) + (dictEnd - dictStart)); + + DEBUGLOG(5, "ZSTD_compressBlock_doubleFast_dictMatchState_generic"); +@@ -295,6 +346,13 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + /* if a dictionary is attached, it must be within window range */ + assert(ms->window.dictLimit + (1U << cParams->windowLog) >= endIndex); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ size_t const chainTableBytes = (((size_t)1) << dictCParams->chainLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashLong, hashTableBytes) ++ PREFETCH_AREA(dictHashSmall, chainTableBytes) ++ } ++ + /* init */ + ip += (dictAndPrefixLength == 0); + +@@ -309,8 +367,12 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + U32 offset; + size_t const h2 = ZSTD_hashPtr(ip, hBitsL, 8); + size_t const h = ZSTD_hashPtr(ip, hBitsS, mls); +- size_t const dictHL = ZSTD_hashPtr(ip, dictHBitsL, 8); +- size_t const dictHS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ size_t const dictHashAndTagL = ZSTD_hashPtr(ip, dictHBitsL, 8); ++ size_t const dictHashAndTagS = ZSTD_hashPtr(ip, dictHBitsS, mls); ++ U32 const dictMatchIndexAndTagL = dictHashLong[dictHashAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ U32 const dictMatchIndexAndTagS = dictHashSmall[dictHashAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL = ZSTD_comparePackedTags(dictMatchIndexAndTagL, dictHashAndTagL); ++ int const dictTagsMatchS = ZSTD_comparePackedTags(dictMatchIndexAndTagS, dictHashAndTagS); + U32 const curr = (U32)(ip-base); + U32 const matchIndexL = hashLong[h2]; + U32 matchIndexS = hashSmall[h]; 
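
The hunk above is where the dictMatchState double-fast matchfinder starts consulting the CDict's "short cache" hash tables: it loads a packed entry, compares its low 8 tag bits against an independent tag computed at the current position, and only dereferences the dictionary candidate when the tags agree, avoiding most of the L2/L3 misses described in the short-cache comment earlier in this patch. Below is a minimal self-contained sketch of that packing scheme; the demo_* names are illustrative stand-ins for this note, not identifiers from zstd itself.

    #include <stdint.h>
    #include <stddef.h>

    #define DEMO_TAG_BITS 8u
    #define DEMO_TAG_MASK ((1u << DEMO_TAG_BITS) - 1u)

    /* Pack (index, tag) into one 32-bit hash-table slot, mirroring
     * ZSTD_writeTaggedIndex: the caller passes hashAndTag, where the
     * table slot is hashAndTag >> 8 and the tag is its low 8 bits. */
    static void demo_write_tagged_index(uint32_t *table, size_t hashAndTag, uint32_t index)
    {
        size_t const slot = hashAndTag >> DEMO_TAG_BITS;
        uint32_t const tag = (uint32_t)(hashAndTag & DEMO_TAG_MASK);
        table[slot] = (index << DEMO_TAG_BITS) | tag;
    }

    /* Cheap pre-check, mirroring ZSTD_comparePackedTags: only when the two
     * 8-bit tags agree does the matchfinder pay for loading the candidate
     * match at base + (entry >> DEMO_TAG_BITS). */
    static int demo_tags_match(size_t packedA, size_t packedB)
    {
        return (packedA & DEMO_TAG_MASK) == (packedB & DEMO_TAG_MASK);
    }
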
+@@ -328,7 +390,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + goto _match_stored; + } + +@@ -340,9 +402,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchLong>prefixLowest)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL) { + /* check dictMatchState long match */ +- U32 const dictMatchIndexL = dictHashLong[dictHL]; ++ U32 const dictMatchIndexL = dictMatchIndexAndTagL >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL = dictBase + dictMatchIndexL; + assert(dictMatchL < dictEnd); + +@@ -358,9 +420,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + if (MEM_read32(match) == MEM_read32(ip)) { + goto _search_next_long; + } +- } else { ++ } else if (dictTagsMatchS) { + /* check dictMatchState short match */ +- U32 const dictMatchIndexS = dictHashSmall[dictHS]; ++ U32 const dictMatchIndexS = dictMatchIndexAndTagS >> ZSTD_SHORT_CACHE_TAG_BITS; + match = dictBase + dictMatchIndexS; + matchIndexS = dictMatchIndexS + dictIndexDelta; + +@@ -375,10 +437,11 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + continue; + + _search_next_long: +- + { size_t const hl3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +- size_t const dictHLNext = ZSTD_hashPtr(ip+1, dictHBitsL, 8); ++ size_t const dictHashAndTagL3 = ZSTD_hashPtr(ip+1, dictHBitsL, 8); + U32 const matchIndexL3 = hashLong[hl3]; ++ U32 const dictMatchIndexAndTagL3 = dictHashLong[dictHashAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int const dictTagsMatchL3 = ZSTD_comparePackedTags(dictMatchIndexAndTagL3, dictHashAndTagL3); + const BYTE* matchL3 = base + matchIndexL3; + hashLong[hl3] = curr + 1; + +@@ -391,9 +454,9 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + while (((ip>anchor) & (matchL3>prefixLowest)) && (ip[-1] == matchL3[-1])) { ip--; matchL3--; mLength++; } /* catch up */ + goto _match_found; + } +- } else { ++ } else if (dictTagsMatchL3) { + /* check dict long +1 match */ +- U32 const dictMatchIndexL3 = dictHashLong[dictHLNext]; ++ U32 const dictMatchIndexL3 = dictMatchIndexAndTagL3 >> ZSTD_SHORT_CACHE_TAG_BITS; + const BYTE* dictMatchL3 = dictBase + dictMatchIndexL3; + assert(dictMatchL3 < dictEnd); + if (dictMatchL3 > dictStart && MEM_read64(dictMatchL3) == MEM_read64(ip+1)) { +@@ -419,7 +482,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + offset_2 = offset_1; + offset_1 = offset; + +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + _match_stored: + /* match found */ +@@ -448,7 +511,7 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + const BYTE* const repEnd2 = repIndex2 < prefixLowestIndex ? 
dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixLowest) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +@@ -461,8 +524,8 @@ size_t ZSTD_compressBlock_doubleFast_dictMatchState_generic( + } /* while (ip < ilimit) */ + + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -585,7 +648,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; + mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; + ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); + } else { + if ((matchLongIndex > dictStartIndex) && (MEM_read64(matchLong) == MEM_read64(ip))) { + const BYTE* const matchEnd = matchLongIndex < prefixStartIndex ? dictEnd : iend; +@@ -596,7 +659,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + while (((ip>anchor) & (matchLong>lowMatchPtr)) && (ip[-1] == matchLong[-1])) { ip--; matchLong--; mLength++; } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else if ((matchIndex > dictStartIndex) && (MEM_read32(match) == MEM_read32(ip))) { + size_t const h3 = ZSTD_hashPtr(ip+1, hBitsL, 8); +@@ -621,7 +684,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + } + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); + + } else { + ip += ((ip-anchor) >> kSearchStrength) + 1; +@@ -653,7 +716,7 @@ static size_t ZSTD_compressBlock_doubleFast_extDict_generic( + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; + size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); + hashSmall[ZSTD_hashPtr(ip, hBitsS, mls)] = current2; + hashLong[ZSTD_hashPtr(ip, hBitsL, 8)] = current2; + ip += repLength2; +diff --git a/lib/zstd/compress/zstd_double_fast.h b/lib/zstd/compress/zstd_double_fast.h +index 6822bde65a1d..0204f12e4cf7 100644 +--- a/lib/zstd/compress/zstd_double_fast.h ++++ b/lib/zstd/compress/zstd_double_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" /* ZSTD_CCtx, size_t */ + + void ZSTD_fillDoubleHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_doubleFast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_fast.c b/lib/zstd/compress/zstd_fast.c +index a752e6beab52..3399b39c5dbc 100644 +--- a/lib/zstd/compress/zstd_fast.c ++++ b/lib/zstd/compress/zstd_fast.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -11,8 +12,42 @@ + #include "zstd_compress_internal.h" /* ZSTD_hashPtr, ZSTD_count, ZSTD_storeSeq */ + #include "zstd_fast.h" + ++static void ZSTD_fillHashTableForCDict(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm) ++{ ++ const ZSTD_compressionParameters* const cParams = &ms->cParams; ++ U32* const hashTable = ms->hashTable; ++ U32 const hBits = cParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; ++ U32 const mls = cParams->minMatch; ++ const BYTE* const base = ms->window.base; ++ const BYTE* ip = base + ms->nextToUpdate; ++ const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; ++ const U32 fastHashFillStep = 3; + +-void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ /* Currently, we always use ZSTD_dtlm_full for filling CDict tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_full); ++ ++ /* Always insert every fastHashFillStep position into the hash table. ++ * Insert the other positions if their hash entry is empty. ++ */ ++ for ( ; ip + fastHashFillStep < iend + 2; ip += fastHashFillStep) { ++ U32 const curr = (U32)(ip - base); ++ { size_t const hashAndTag = ZSTD_hashPtr(ip, hBits, mls); ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr); } ++ ++ if (dtlm == ZSTD_dtlm_fast) continue; ++ /* Only load extra positions for ZSTD_dtlm_full */ ++ { U32 p; ++ for (p = 1; p < fastHashFillStep; ++p) { ++ size_t const hashAndTag = ZSTD_hashPtr(ip + p, hBits, mls); ++ if (hashTable[hashAndTag >> ZSTD_SHORT_CACHE_TAG_BITS] == 0) { /* not yet filled */ ++ ZSTD_writeTaggedIndex(hashTable, hashAndTag, curr + p); ++ } } } } ++} ++ ++static void ZSTD_fillHashTableForCCtx(ZSTD_matchState_t* ms, + const void* const end, + ZSTD_dictTableLoadMethod_e dtlm) + { +@@ -25,6 +60,10 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + const BYTE* const iend = ((const BYTE*)end) - HASH_READ_SIZE; + const U32 fastHashFillStep = 3; + ++ /* Currently, we always use ZSTD_dtlm_fast for filling CCtx tables. ++ * Feel free to remove this assert if there's a good reason! */ ++ assert(dtlm == ZSTD_dtlm_fast); ++ + /* Always insert every fastHashFillStep position into the hash table. + * Insert the other positions if their hash entry is empty. 
+ */ +@@ -42,6 +81,18 @@ void ZSTD_fillHashTable(ZSTD_matchState_t* ms, + } } } } + } + ++void ZSTD_fillHashTable(ZSTD_matchState_t* ms, ++ const void* const end, ++ ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp) ++{ ++ if (tfp == ZSTD_tfp_forCDict) { ++ ZSTD_fillHashTableForCDict(ms, end, dtlm); ++ } else { ++ ZSTD_fillHashTableForCCtx(ms, end, dtlm); ++ } ++} ++ + + /* + * If you squint hard enough (and ignore repcodes), the search operation at any +@@ -117,7 +168,7 @@ ZSTD_compressBlock_fast_noDict_generic( + + U32 rep_offset1 = rep[0]; + U32 rep_offset2 = rep[1]; +- U32 offsetSaved = 0; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + size_t hash0; /* hash for ip0 */ + size_t hash1; /* hash for ip1 */ +@@ -141,8 +192,8 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const curr = (U32)(ip0 - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, cParams->windowLog); + U32 const maxRep = curr - windowLow; +- if (rep_offset2 > maxRep) offsetSaved = rep_offset2, rep_offset2 = 0; +- if (rep_offset1 > maxRep) offsetSaved = rep_offset1, rep_offset1 = 0; ++ if (rep_offset2 > maxRep) offsetSaved2 = rep_offset2, rep_offset2 = 0; ++ if (rep_offset1 > maxRep) offsetSaved1 = rep_offset1, rep_offset1 = 0; + } + + /* start each op */ +@@ -180,8 +231,14 @@ ZSTD_compressBlock_fast_noDict_generic( + mLength = ip0[-1] == match0[-1]; + ip0 -= mLength; + match0 -= mLength; +- offcode = STORE_REPCODE_1; ++ offcode = REPCODE1_TO_OFFBASE; + mLength += 4; ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 is before the ++ * repcode (ip2). */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _match; + } + +@@ -195,6 +252,12 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* First write next hash table entry; we've already calculated it. ++ * This write is known to be safe because the ip1 == ip0 + 1, so ++ * we know we will resume searching after ip1 */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ + goto _offset; + } + +@@ -224,6 +287,21 @@ ZSTD_compressBlock_fast_noDict_generic( + /* check match at ip[0] */ + if (MEM_read32(ip0) == mval) { + /* found a match! */ ++ ++ /* first write next hash table entry; we've already calculated it */ ++ if (step <= 4) { ++ /* We need to avoid writing an index into the hash table >= the ++ * position at which we will pick up our searching after we've ++ * taken this match. ++ * ++ * The minimum possible match has length 4, so the earliest ip0 ++ * can be after we take this match will be the current ip0 + 4. ++ * ip1 is ip0 + step - 1. If ip1 is >= ip0 + 4, we can't safely ++ * write this position. ++ */ ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ + goto _offset; + } + +@@ -254,9 +332,24 @@ ZSTD_compressBlock_fast_noDict_generic( + * However, it seems to be a meaningful performance hit to try to search + * them. So let's not. */ + ++ /* When the repcodes are outside of the prefix, we set them to zero before the loop. ++ * When the offsets are still zero, we need to restore them after the block to have a correct ++ * repcode history. If only one offset was invalid, it is easy. The tricky case is when both ++ * offsets were invalid. We need to figure out which offset to refill with. ++ * - If both offsets are zero they are in the same order. ++ * - If both offsets are non-zero, we won't restore the offsets from `offsetSaved[12]`. 
++ * - If only one is zero, we need to decide which offset to restore. ++ * - If rep_offset1 is non-zero, then rep_offset2 must be offsetSaved1. ++ * - It is impossible for rep_offset2 to be non-zero. ++ * ++ * So if rep_offset1 started invalid (offsetSaved1 != 0) and became valid (rep_offset1 != 0), then ++ * set rep[0] = rep_offset1 and rep[1] = offsetSaved1. ++ */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (rep_offset1 != 0)) ? offsetSaved1 : offsetSaved2; ++ + /* save reps for next block */ +- rep[0] = rep_offset1 ? rep_offset1 : offsetSaved; +- rep[1] = rep_offset2 ? rep_offset2 : offsetSaved; ++ rep[0] = rep_offset1 ? rep_offset1 : offsetSaved1; ++ rep[1] = rep_offset2 ? rep_offset2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -267,7 +360,7 @@ ZSTD_compressBlock_fast_noDict_generic( + match0 = base + idx; + rep_offset2 = rep_offset1; + rep_offset1 = (U32)(ip0-match0); +- offcode = STORE_OFFSET(rep_offset1); ++ offcode = OFFSET_TO_OFFBASE(rep_offset1); + mLength = 4; + + /* Count the backwards match length. */ +@@ -287,11 +380,6 @@ ZSTD_compressBlock_fast_noDict_generic( + ip0 += mLength; + anchor = ip0; + +- /* write next hash table entry */ +- if (ip1 < ip0) { +- hashTable[hash1] = (U32)(ip1 - base); +- } +- + /* Fill table and check for immediate repcode. */ + if (ip0 <= ilimit) { + /* Fill Table */ +@@ -306,7 +394,7 @@ ZSTD_compressBlock_fast_noDict_generic( + { U32 const tmpOff = rep_offset2; rep_offset2 = rep_offset1; rep_offset1 = tmpOff; } /* swap rep_offset2 <=> rep_offset1 */ + hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); + ip0 += rLength; +- ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, STORE_REPCODE_1, rLength); ++ ZSTD_storeSeq(seqStore, 0 /*litLen*/, anchor, iend, REPCODE1_TO_OFFBASE, rLength); + anchor = ip0; + continue; /* faster when present (confirmed on gcc-8) ... (?) */ + } } } +@@ -380,14 +468,14 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + U32 const stepSize = cParams->targetLength + !(cParams->targetLength); + const BYTE* const base = ms->window.base; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; ++ const BYTE* ip0 = istart; ++ const BYTE* ip1 = ip0 + stepSize; /* we assert below that stepSize >= 1 */ + const BYTE* anchor = istart; + const U32 prefixStartIndex = ms->window.dictLimit; + const BYTE* const prefixStart = base + prefixStartIndex; + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - HASH_READ_SIZE; + U32 offset_1=rep[0], offset_2=rep[1]; +- U32 offsetSaved = 0; + + const ZSTD_matchState_t* const dms = ms->dictMatchState; + const ZSTD_compressionParameters* const dictCParams = &dms->cParams ; +@@ -397,13 +485,13 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + const BYTE* const dictStart = dictBase + dictStartIndex; + const BYTE* const dictEnd = dms->window.nextSrc; + const U32 dictIndexDelta = prefixStartIndex - (U32)(dictEnd - dictBase); +- const U32 dictAndPrefixLength = (U32)(ip - prefixStart + dictEnd - dictStart); +- const U32 dictHLog = dictCParams->hashLog; ++ const U32 dictAndPrefixLength = (U32)(istart - prefixStart + dictEnd - dictStart); ++ const U32 dictHBits = dictCParams->hashLog + ZSTD_SHORT_CACHE_TAG_BITS; + + /* if a dictionary is still attached, it necessarily means that + * it is within window size. So we just check it. 
*/ + const U32 maxDistance = 1U << cParams->windowLog; +- const U32 endIndex = (U32)((size_t)(ip - base) + srcSize); ++ const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + assert(endIndex - prefixStartIndex <= maxDistance); + (void)maxDistance; (void)endIndex; /* these variables are not used when assert() is disabled */ + +@@ -413,106 +501,155 @@ size_t ZSTD_compressBlock_fast_dictMatchState_generic( + * when translating a dict index into a local index */ + assert(prefixStartIndex >= (U32)(dictEnd - dictBase)); + ++ if (ms->prefetchCDictTables) { ++ size_t const hashTableBytes = (((size_t)1) << dictCParams->hashLog) * sizeof(U32); ++ PREFETCH_AREA(dictHashTable, hashTableBytes) ++ } ++ + /* init */ + DEBUGLOG(5, "ZSTD_compressBlock_fast_dictMatchState_generic"); +- ip += (dictAndPrefixLength == 0); ++ ip0 += (dictAndPrefixLength == 0); + /* dictMatchState repCode checks don't currently handle repCode == 0 + * disabling. */ + assert(offset_1 <= dictAndPrefixLength); + assert(offset_2 <= dictAndPrefixLength); + +- /* Main Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because repcode check at (ip+1) */ ++ /* Outer search loop */ ++ assert(stepSize >= 1); ++ while (ip1 <= ilimit) { /* repcode check at (ip0 + 1) is safe because ip0 < ip1 */ + size_t mLength; +- size_t const h = ZSTD_hashPtr(ip, hlog, mls); +- U32 const curr = (U32)(ip-base); +- U32 const matchIndex = hashTable[h]; +- const BYTE* match = base + matchIndex; +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* repMatch = (repIndex < prefixStartIndex) ? +- dictBase + (repIndex - dictIndexDelta) : +- base + repIndex; +- hashTable[h] = curr; /* update hash table */ +- +- if ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- mLength = ZSTD_count_2segments(ip+1+4, repMatch+4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, mLength); +- } else if ( (matchIndex <= prefixStartIndex) ) { +- size_t const dictHash = ZSTD_hashPtr(ip, dictHLog, mls); +- U32 const dictMatchIndex = dictHashTable[dictHash]; +- const BYTE* dictMatch = dictBase + dictMatchIndex; +- if (dictMatchIndex <= dictStartIndex || +- MEM_read32(dictMatch) != MEM_read32(ip)) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a dict match */ +- U32 const offset = (U32)(curr-dictMatchIndex-dictIndexDelta); +- mLength = ZSTD_count_2segments(ip+4, dictMatch+4, iend, dictEnd, prefixStart) + 4; +- while (((ip>anchor) & (dictMatch>dictStart)) +- && (ip[-1] == dictMatch[-1])) { +- ip--; dictMatch--; mLength++; ++ size_t hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ ++ size_t const dictHashAndTag0 = ZSTD_hashPtr(ip0, dictHBits, mls); ++ U32 dictMatchIndexAndTag = dictHashTable[dictHashAndTag0 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ int dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag0); ++ ++ U32 matchIndex = hashTable[hash0]; ++ U32 curr = (U32)(ip0 - base); ++ size_t step = stepSize; ++ const size_t kStepIncr = 1 << kSearchStrength; ++ const BYTE* nextStep = ip0 + kStepIncr; ++ ++ /* Inner search loop */ ++ while (1) { ++ const BYTE* match = base + matchIndex; ++ const U32 repIndex = curr + 1 - offset_1; ++ const BYTE* repMatch = (repIndex < prefixStartIndex) ? 
++ dictBase + (repIndex - dictIndexDelta) : ++ base + repIndex; ++ const size_t hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ size_t const dictHashAndTag1 = ZSTD_hashPtr(ip1, dictHBits, mls); ++ hashTable[hash0] = curr; /* update hash table */ ++ ++ if (((U32) ((prefixStartIndex - 1) - repIndex) >= ++ 3) /* intentional underflow : ensure repIndex isn't overlapping dict + prefix */ ++ && (MEM_read32(repMatch) == MEM_read32(ip0 + 1))) { ++ const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; ++ mLength = ZSTD_count_2segments(ip0 + 1 + 4, repMatch + 4, iend, repMatchEnd, prefixStart) + 4; ++ ip0++; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, REPCODE1_TO_OFFBASE, mLength); ++ break; ++ } ++ ++ if (dictTagsMatch) { ++ /* Found a possible dict match */ ++ const U32 dictMatchIndex = dictMatchIndexAndTag >> ZSTD_SHORT_CACHE_TAG_BITS; ++ const BYTE* dictMatch = dictBase + dictMatchIndex; ++ if (dictMatchIndex > dictStartIndex && ++ MEM_read32(dictMatch) == MEM_read32(ip0)) { ++ /* To replicate extDict parse behavior, we only use dict matches when the normal matchIndex is invalid */ ++ if (matchIndex <= prefixStartIndex) { ++ U32 const offset = (U32) (curr - dictMatchIndex - dictIndexDelta); ++ mLength = ZSTD_count_2segments(ip0 + 4, dictMatch + 4, iend, dictEnd, prefixStart) + 4; ++ while (((ip0 > anchor) & (dictMatch > dictStart)) ++ && (ip0[-1] == dictMatch[-1])) { ++ ip0--; ++ dictMatch--; ++ mLength++; ++ } /* catch up */ ++ offset_2 = offset_1; ++ offset_1 = offset; ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; ++ } ++ } ++ } ++ ++ if (matchIndex > prefixStartIndex && MEM_read32(match) == MEM_read32(ip0)) { ++ /* found a regular match */ ++ U32 const offset = (U32) (ip0 - match); ++ mLength = ZSTD_count(ip0 + 4, match + 4, iend) + 4; ++ while (((ip0 > anchor) & (match > prefixStart)) ++ && (ip0[-1] == match[-1])) { ++ ip0--; ++ match--; ++ mLength++; + } /* catch up */ + offset_2 = offset_1; + offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); ++ ZSTD_storeSeq(seqStore, (size_t) (ip0 - anchor), anchor, iend, OFFSET_TO_OFFBASE(offset), mLength); ++ break; + } +- } else if (MEM_read32(match) != MEM_read32(ip)) { +- /* it's not a match, and we're not going to check the dictionary */ +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; +- } else { +- /* found a regular match */ +- U32 const offset = (U32)(ip-match); +- mLength = ZSTD_count(ip+4, match+4, iend) + 4; +- while (((ip>anchor) & (match>prefixStart)) +- && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; +- offset_1 = offset; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- } ++ ++ /* Prepare for next iteration */ ++ dictMatchIndexAndTag = dictHashTable[dictHashAndTag1 >> ZSTD_SHORT_CACHE_TAG_BITS]; ++ dictTagsMatch = ZSTD_comparePackedTags(dictMatchIndexAndTag, dictHashAndTag1); ++ matchIndex = hashTable[hash1]; ++ ++ if (ip1 >= nextStep) { ++ step++; ++ nextStep += kStepIncr; ++ } ++ ip0 = ip1; ++ ip1 = ip1 + step; ++ if (ip1 > ilimit) goto _cleanup; ++ ++ curr = (U32)(ip0 - base); ++ hash0 = hash1; ++ } /* end inner search loop */ + + /* match found */ +- ip += mLength; +- anchor = ip; ++ assert(mLength); ++ ip0 += mLength; ++ anchor = ip0; + +- if (ip <= ilimit) { ++ if (ip0 <= ilimit) { + /* Fill Table */ + assert(base+curr+2 > istart); 
/* check base overflow */ + hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; /* here because curr+2 could be > iend-8 */ +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); + + /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); ++ while (ip0 <= ilimit) { ++ U32 const current2 = (U32)(ip0-base); + U32 const repIndex2 = current2 - offset_2; + const BYTE* repMatch2 = repIndex2 < prefixStartIndex ? + dictBase - dictIndexDelta + repIndex2 : + base + repIndex2; + if ( ((U32)((prefixStartIndex-1) - (U32)repIndex2) >= 3 /* intentional overflow */) +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { ++ && (MEM_read32(repMatch2) == MEM_read32(ip0))) { + const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; + U32 tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = current2; ++ ip0 += repLength2; ++ anchor = ip0; + continue; + } + break; + } + } ++ ++ /* Prepare for next iteration */ ++ assert(ip0 == anchor); ++ ip1 = ip0 + stepSize; + } + ++_cleanup: + /* save reps for next block */ +- rep[0] = offset_1 ? offset_1 : offsetSaved; +- rep[1] = offset_2 ? offset_2 : offsetSaved; ++ rep[0] = offset_1; ++ rep[1] = offset_2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -553,11 +690,10 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + U32* const hashTable = ms->hashTable; + U32 const hlog = cParams->hashLog; + /* support stepSize of 0 */ +- U32 const stepSize = cParams->targetLength + !(cParams->targetLength); ++ size_t const stepSize = cParams->targetLength + !(cParams->targetLength) + 1; + const BYTE* const base = ms->window.base; + const BYTE* const dictBase = ms->window.dictBase; + const BYTE* const istart = (const BYTE*)src; +- const BYTE* ip = istart; + const BYTE* anchor = istart; + const U32 endIndex = (U32)((size_t)(istart - base) + srcSize); + const U32 lowLimit = ZSTD_getLowestMatchIndex(ms, endIndex, cParams->windowLog); +@@ -570,6 +706,28 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + const BYTE* const iend = istart + srcSize; + const BYTE* const ilimit = iend - 8; + U32 offset_1=rep[0], offset_2=rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; ++ ++ const BYTE* ip0 = istart; ++ const BYTE* ip1; ++ const BYTE* ip2; ++ const BYTE* ip3; ++ U32 current0; ++ ++ ++ size_t hash0; /* hash for ip0 */ ++ size_t hash1; /* hash for ip1 */ ++ U32 idx; /* match idx for ip0 */ ++ const BYTE* idxBase; /* base pointer for idx */ ++ ++ U32 offcode; ++ const BYTE* match0; ++ size_t mLength; ++ const BYTE* matchEnd = 0; /* initialize to avoid warning, assert != 0 later */ ++ ++ size_t step; ++ const BYTE* nextStep; ++ const size_t kStepIncr = (1 << (kSearchStrength - 1)); + + (void)hasStep; /* not currently specialized on whether it's accelerated */ + +@@ -579,75 +737,202 @@ static size_t ZSTD_compressBlock_fast_extDict_generic( + if (prefixStartIndex == dictStartIndex) + return 
ZSTD_compressBlock_fast(ms, seqStore, rep, src, srcSize); + +- /* Search Loop */ +- while (ip < ilimit) { /* < instead of <=, because (ip+1) */ +- const size_t h = ZSTD_hashPtr(ip, hlog, mls); +- const U32 matchIndex = hashTable[h]; +- const BYTE* const matchBase = matchIndex < prefixStartIndex ? dictBase : base; +- const BYTE* match = matchBase + matchIndex; +- const U32 curr = (U32)(ip-base); +- const U32 repIndex = curr + 1 - offset_1; +- const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; +- const BYTE* const repMatch = repBase + repIndex; +- hashTable[h] = curr; /* update hash table */ +- DEBUGLOG(7, "offset_1 = %u , curr = %u", offset_1, curr); +- +- if ( ( ((U32)((prefixStartIndex-1) - repIndex) >= 3) /* intentional underflow */ +- & (offset_1 <= curr+1 - dictStartIndex) ) /* note: we are searching at curr+1 */ +- && (MEM_read32(repMatch) == MEM_read32(ip+1)) ) { +- const BYTE* const repMatchEnd = repIndex < prefixStartIndex ? dictEnd : iend; +- size_t const rLength = ZSTD_count_2segments(ip+1 +4, repMatch +4, iend, repMatchEnd, prefixStart) + 4; +- ip++; +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_REPCODE_1, rLength); +- ip += rLength; +- anchor = ip; +- } else { +- if ( (matchIndex < dictStartIndex) || +- (MEM_read32(match) != MEM_read32(ip)) ) { +- assert(stepSize >= 1); +- ip += ((ip-anchor) >> kSearchStrength) + stepSize; +- continue; ++ { U32 const curr = (U32)(ip0 - base); ++ U32 const maxRep = curr - dictStartIndex; ++ if (offset_2 >= maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 >= maxRep) offsetSaved1 = offset_1, offset_1 = 0; ++ } ++ ++ /* start each op */ ++_start: /* Requires: ip0 */ ++ ++ step = stepSize; ++ nextStep = ip0 + kStepIncr; ++ ++ /* calculate positions, ip0 - anchor == 0, so we skip step calc */ ++ ip1 = ip0 + 1; ++ ip2 = ip0 + step; ++ ip3 = ip2 + 1; ++ ++ if (ip3 >= ilimit) { ++ goto _cleanup; ++ } ++ ++ hash0 = ZSTD_hashPtr(ip0, hlog, mls); ++ hash1 = ZSTD_hashPtr(ip1, hlog, mls); ++ ++ idx = hashTable[hash0]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ do { ++ { /* load repcode match for ip[2] */ ++ U32 const current2 = (U32)(ip2 - base); ++ U32 const repIndex = current2 - offset_1; ++ const BYTE* const repBase = repIndex < prefixStartIndex ? dictBase : base; ++ U32 rval; ++ if ( ((U32)(prefixStartIndex - repIndex) >= 4) /* intentional underflow */ ++ & (offset_1 > 0) ) { ++ rval = MEM_read32(repBase + repIndex); ++ } else { ++ rval = MEM_read32(ip2) ^ 1; /* guaranteed to not match. */ + } +- { const BYTE* const matchEnd = matchIndex < prefixStartIndex ? dictEnd : iend; +- const BYTE* const lowMatchPtr = matchIndex < prefixStartIndex ? dictStart : prefixStart; +- U32 const offset = curr - matchIndex; +- size_t mLength = ZSTD_count_2segments(ip+4, match+4, iend, matchEnd, prefixStart) + 4; +- while (((ip>anchor) & (match>lowMatchPtr)) && (ip[-1] == match[-1])) { ip--; match--; mLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = offset; /* update offset history */ +- ZSTD_storeSeq(seqStore, (size_t)(ip-anchor), anchor, iend, STORE_OFFSET(offset), mLength); +- ip += mLength; +- anchor = ip; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ /* check repcode at ip[2] */ ++ if (MEM_read32(ip2) == rval) { ++ ip0 = ip2; ++ match0 = repBase + repIndex; ++ matchEnd = repIndex < prefixStartIndex ? 
dictEnd : iend; ++ assert((match0 != prefixStart) & (match0 != dictStart)); ++ mLength = ip0[-1] == match0[-1]; ++ ip0 -= mLength; ++ match0 -= mLength; ++ offcode = REPCODE1_TO_OFFBASE; ++ mLength += 4; ++ goto _match; + } } + +- if (ip <= ilimit) { +- /* Fill Table */ +- hashTable[ZSTD_hashPtr(base+curr+2, hlog, mls)] = curr+2; +- hashTable[ZSTD_hashPtr(ip-2, hlog, mls)] = (U32)(ip-2-base); +- /* check immediate repcode */ +- while (ip <= ilimit) { +- U32 const current2 = (U32)(ip-base); +- U32 const repIndex2 = current2 - offset_2; +- const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; +- if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 <= curr - dictStartIndex)) /* intentional overflow */ +- && (MEM_read32(repMatch2) == MEM_read32(ip)) ) { +- const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; +- size_t const repLength2 = ZSTD_count_2segments(ip+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; +- { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, STORE_REPCODE_1, repLength2); +- hashTable[ZSTD_hashPtr(ip, hlog, mls)] = current2; +- ip += repLength2; +- anchor = ip; +- continue; +- } +- break; +- } } } ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip3; ++ ++ /* write back hash table entry */ ++ current0 = (U32)(ip0 - base); ++ hashTable[hash0] = current0; ++ ++ { /* load match for ip[0] */ ++ U32 const mval = idx >= dictStartIndex ? ++ MEM_read32(idxBase + idx) : ++ MEM_read32(ip0) ^ 1; /* guaranteed not to match */ ++ ++ /* check match at ip[0] */ ++ if (MEM_read32(ip0) == mval) { ++ /* found a match! */ ++ goto _offset; ++ } } ++ ++ /* lookup ip[1] */ ++ idx = hashTable[hash1]; ++ idxBase = idx < prefixStartIndex ? dictBase : base; ++ ++ /* hash ip[2] */ ++ hash0 = hash1; ++ hash1 = ZSTD_hashPtr(ip2, hlog, mls); ++ ++ /* advance to next positions */ ++ ip0 = ip1; ++ ip1 = ip2; ++ ip2 = ip0 + step; ++ ip3 = ip1 + step; ++ ++ /* calculate step */ ++ if (ip2 >= nextStep) { ++ step++; ++ PREFETCH_L1(ip1 + 64); ++ PREFETCH_L1(ip1 + 128); ++ nextStep += kStepIncr; ++ } ++ } while (ip3 < ilimit); ++ ++_cleanup: ++ /* Note that there are probably still a couple positions we could search. ++ * However, it seems to be a meaningful performance hit to try to search ++ * them. So let's not. */ ++ ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; + + /* save reps for next block */ +- rep[0] = offset_1; +- rep[1] = offset_2; ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); ++ ++_offset: /* Requires: ip0, idx, idxBase */ ++ ++ /* Compute the offset code. 
*/ ++ { U32 const offset = current0 - idx; ++ const BYTE* const lowMatchPtr = idx < prefixStartIndex ? dictStart : prefixStart; ++ matchEnd = idx < prefixStartIndex ? dictEnd : iend; ++ match0 = idxBase + idx; ++ offset_2 = offset_1; ++ offset_1 = offset; ++ offcode = OFFSET_TO_OFFBASE(offset); ++ mLength = 4; ++ ++ /* Count the backwards match length. */ ++ while (((ip0>anchor) & (match0>lowMatchPtr)) && (ip0[-1] == match0[-1])) { ++ ip0--; ++ match0--; ++ mLength++; ++ } } ++ ++_match: /* Requires: ip0, match0, offcode, matchEnd */ ++ ++ /* Count the forward length. */ ++ assert(matchEnd != 0); ++ mLength += ZSTD_count_2segments(ip0 + mLength, match0 + mLength, iend, matchEnd, prefixStart); ++ ++ ZSTD_storeSeq(seqStore, (size_t)(ip0 - anchor), anchor, iend, offcode, mLength); ++ ++ ip0 += mLength; ++ anchor = ip0; ++ ++ /* write next hash table entry */ ++ if (ip1 < ip0) { ++ hashTable[hash1] = (U32)(ip1 - base); ++ } ++ ++ /* Fill table and check for immediate repcode. */ ++ if (ip0 <= ilimit) { ++ /* Fill Table */ ++ assert(base+current0+2 > istart); /* check base overflow */ ++ hashTable[ZSTD_hashPtr(base+current0+2, hlog, mls)] = current0+2; /* here because current+2 could be > iend-8 */ ++ hashTable[ZSTD_hashPtr(ip0-2, hlog, mls)] = (U32)(ip0-2-base); ++ ++ while (ip0 <= ilimit) { ++ U32 const repIndex2 = (U32)(ip0-base) - offset_2; ++ const BYTE* const repMatch2 = repIndex2 < prefixStartIndex ? dictBase + repIndex2 : base + repIndex2; ++ if ( (((U32)((prefixStartIndex-1) - repIndex2) >= 3) & (offset_2 > 0)) /* intentional underflow */ ++ && (MEM_read32(repMatch2) == MEM_read32(ip0)) ) { ++ const BYTE* const repEnd2 = repIndex2 < prefixStartIndex ? dictEnd : iend; ++ size_t const repLength2 = ZSTD_count_2segments(ip0+4, repMatch2+4, iend, repEnd2, prefixStart) + 4; ++ { U32 const tmpOffset = offset_2; offset_2 = offset_1; offset_1 = tmpOffset; } /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0 /*litlen*/, anchor, iend, REPCODE1_TO_OFFBASE, repLength2); ++ hashTable[ZSTD_hashPtr(ip0, hlog, mls)] = (U32)(ip0-base); ++ ip0 += repLength2; ++ anchor = ip0; ++ continue; ++ } ++ break; ++ } } ++ ++ goto _start; + } + + ZSTD_GEN_FAST_FN(extDict, 4, 0) +@@ -660,6 +945,7 @@ size_t ZSTD_compressBlock_fast_extDict( + void const* src, size_t srcSize) + { + U32 const mls = ms->cParams.minMatch; ++ assert(ms->dictMatchState == NULL); + switch(mls) + { + default: /* includes case 3 */ +diff --git a/lib/zstd/compress/zstd_fast.h b/lib/zstd/compress/zstd_fast.h +index fddc2f532d21..e64d9e1b2d39 100644 +--- a/lib/zstd/compress/zstd_fast.h ++++ b/lib/zstd/compress/zstd_fast.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,8 @@ + #include "zstd_compress_internal.h" + + void ZSTD_fillHashTable(ZSTD_matchState_t* ms, +- void const* end, ZSTD_dictTableLoadMethod_e dtlm); ++ void const* end, ZSTD_dictTableLoadMethod_e dtlm, ++ ZSTD_tableFillPurpose_e tfp); + size_t ZSTD_compressBlock_fast( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +diff --git a/lib/zstd/compress/zstd_lazy.c b/lib/zstd/compress/zstd_lazy.c +index 0298a01a7504..f6b4978ceba7 100644 +--- a/lib/zstd/compress/zstd_lazy.c ++++ b/lib/zstd/compress/zstd_lazy.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -10,6 +11,9 @@ + + #include "zstd_compress_internal.h" + #include "zstd_lazy.h" ++#include "../common/bits.h" /* ZSTD_countTrailingZeros64 */ ++ ++#define kLazySkippingStep 8 + + + /*-************************************* +@@ -197,8 +201,8 @@ ZSTD_DUBT_findBetterDictMatch ( + U32 matchIndex = dictMatchIndex + dictIndexDelta; + if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) { + DEBUGLOG(9, "ZSTD_DUBT_findBetterDictMatch(%u) : found better match length %u -> %u and offsetCode %u -> %u (dictMatchIndex %u, matchIndex %u)", +- curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, STORE_OFFSET(curr - matchIndex), dictMatchIndex, matchIndex); +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ curr, (U32)bestLength, (U32)matchLength, (U32)*offsetPtr, OFFSET_TO_OFFBASE(curr - matchIndex), dictMatchIndex, matchIndex); ++ bestLength = matchLength, *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + } + if (ip+matchLength == iend) { /* reached end of input : ip[matchLength] is not valid, no way to know if it's larger or smaller than match */ + break; /* drop, to guarantee consistency (miss a little bit of compression) */ +@@ -218,7 +222,7 @@ ZSTD_DUBT_findBetterDictMatch ( + } + + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offsetPtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBetterDictMatch(%u) : found match of length %u and offsetCode %u (pos %u)", + curr, (U32)bestLength, (U32)*offsetPtr, mIndex); + } +@@ -230,7 +234,7 @@ ZSTD_DUBT_findBetterDictMatch ( + static size_t + ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iend, +- size_t* offsetPtr, ++ size_t* offBasePtr, + U32 const mls, + const ZSTD_dictMode_e dictMode) + { +@@ -327,8 +331,8 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (matchLength > bestLength) { + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; +- if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr-matchIndex+1) - ZSTD_highbit32((U32)offsetPtr[0]+1)) ) +- bestLength = matchLength, *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ if ( (4*(int)(matchLength-bestLength)) > (int)(ZSTD_highbit32(curr - matchIndex + 1) - ZSTD_highbit32((U32)*offBasePtr)) ) ++ bestLength = matchLength, *offBasePtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+matchLength == iend) { /* equal : no way to know if inf or sup */ + if (dictMode 
== ZSTD_dictMatchState) { + nbCompares = 0; /* in addition to avoiding checking any +@@ -361,16 +365,16 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + if (dictMode == ZSTD_dictMatchState && nbCompares) { + bestLength = ZSTD_DUBT_findBetterDictMatch( + ms, ip, iend, +- offsetPtr, bestLength, nbCompares, ++ offBasePtr, bestLength, nbCompares, + mls, dictMode); + } + + assert(matchEndIdx > curr+8); /* ensure nextToUpdate is increased */ + ms->nextToUpdate = matchEndIdx - 8; /* skip repetitive patterns */ + if (bestLength >= MINMATCH) { +- U32 const mIndex = curr - (U32)STORED_OFFSET(*offsetPtr); (void)mIndex; ++ U32 const mIndex = curr - (U32)OFFBASE_TO_OFFSET(*offBasePtr); (void)mIndex; + DEBUGLOG(8, "ZSTD_DUBT_findBestMatch(%u) : found match of length %u and offsetCode %u (pos %u)", +- curr, (U32)bestLength, (U32)*offsetPtr, mIndex); ++ curr, (U32)bestLength, (U32)*offBasePtr, mIndex); + } + return bestLength; + } +@@ -381,14 +385,14 @@ ZSTD_DUBT_findBestMatch(ZSTD_matchState_t* ms, + FORCE_INLINE_TEMPLATE size_t + ZSTD_BtFindBestMatch( ZSTD_matchState_t* ms, + const BYTE* const ip, const BYTE* const iLimit, +- size_t* offsetPtr, ++ size_t* offBasePtr, + const U32 mls /* template */, + const ZSTD_dictMode_e dictMode) + { + DEBUGLOG(7, "ZSTD_BtFindBestMatch"); + if (ip < ms->window.base + ms->nextToUpdate) return 0; /* skipped area */ + ZSTD_updateDUBT(ms, ip, iLimit, mls); +- return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offsetPtr, mls, dictMode); ++ return ZSTD_DUBT_findBestMatch(ms, ip, iLimit, offBasePtr, mls, dictMode); + } + + /* ********************************* +@@ -561,7 +565,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) { + /* best possible, avoids read overflow on next attempt */ + return ml; +@@ -598,7 +602,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + ddsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + ddsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -617,7 +621,7 @@ size_t ZSTD_dedicatedDictSearch_lazy_search(size_t* offsetPtr, size_t ml, U32 nb + FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + ZSTD_matchState_t* ms, + const ZSTD_compressionParameters* const cParams, +- const BYTE* ip, U32 const mls) ++ const BYTE* ip, U32 const mls, U32 const lazySkipping) + { + U32* const hashTable = ms->hashTable; + const U32 hashLog = cParams->hashLog; +@@ -632,6 +636,9 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + NEXT_IN_CHAIN(idx, chainMask) = hashTable[h]; + hashTable[h] = idx; + idx++; ++ /* Stop inserting every position when in the lazy skipping mode. 
*/ ++ if (lazySkipping) ++ break; + } + + ms->nextToUpdate = target; +@@ -640,7 +647,7 @@ FORCE_INLINE_TEMPLATE U32 ZSTD_insertAndFindFirstIndex_internal( + + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip) { + const ZSTD_compressionParameters* const cParams = &ms->cParams; +- return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch); ++ return ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, ms->cParams.minMatch, /* lazySkipping*/ 0); + } + + /* inlining is important to hardwire a hot branch (template emulation) */ +@@ -684,14 +691,15 @@ size_t ZSTD_HcFindBestMatch( + } + + /* HC4 match finder */ +- matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls); ++ matchIndex = ZSTD_insertAndFindFirstIndex_internal(ms, cParams, ip, mls, ms->lazySkipping); + + for ( ; (matchIndex>=lowLimit) & (nbAttempts>0) ; nbAttempts--) { + size_t currentMl=0; + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -703,7 +711,7 @@ size_t ZSTD_HcFindBestMatch( + /* save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -739,7 +747,7 @@ size_t ZSTD_HcFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + +@@ -756,8 +764,6 @@ size_t ZSTD_HcFindBestMatch( + * (SIMD) Row-based matchfinder + ***********************************/ + /* Constants for row-based hash */ +-#define ZSTD_ROW_HASH_TAG_OFFSET 16 /* byte offset of hashes in the match state's tagTable from the beginning of a row */ +-#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ + #define ZSTD_ROW_HASH_TAG_MASK ((1u << ZSTD_ROW_HASH_TAG_BITS) - 1) + #define ZSTD_ROW_HASH_MAX_ENTRIES 64 /* absolute maximum number of entries per row, for all configurations */ + +@@ -769,64 +775,19 @@ typedef U64 ZSTD_VecMask; /* Clarifies when we are interacting with a U64 repr + * Starting from the LSB, returns the idx of the next non-zero bit. + * Basically counting the nb of trailing zeroes. 
+ */ +-static U32 ZSTD_VecMask_next(ZSTD_VecMask val) { +- assert(val != 0); +-# if (defined(__GNUC__) && ((__GNUC__ > 3) || ((__GNUC__ == 3) && (__GNUC_MINOR__ >= 4)))) +- if (sizeof(size_t) == 4) { +- U32 mostSignificantWord = (U32)(val >> 32); +- U32 leastSignificantWord = (U32)val; +- if (leastSignificantWord == 0) { +- return 32 + (U32)__builtin_ctz(mostSignificantWord); +- } else { +- return (U32)__builtin_ctz(leastSignificantWord); +- } +- } else { +- return (U32)__builtin_ctzll(val); +- } +-# else +- /* Software ctz version: http://aggregate.org/MAGIC/#Trailing%20Zero%20Count +- * and: https://stackoverflow.com/questions/2709430/count-number-of-bits-in-a-64-bit-long-big-integer +- */ +- val = ~val & (val - 1ULL); /* Lowest set bit mask */ +- val = val - ((val >> 1) & 0x5555555555555555); +- val = (val & 0x3333333333333333ULL) + ((val >> 2) & 0x3333333333333333ULL); +- return (U32)((((val + (val >> 4)) & 0xF0F0F0F0F0F0F0FULL) * 0x101010101010101ULL) >> 56); +-# endif +-} +- +-/* ZSTD_rotateRight_*(): +- * Rotates a bitfield to the right by "count" bits. +- * https://en.wikipedia.org/w/index.php?title=Circular_shift&oldid=991635599#Implementing_circular_shifts +- */ +-FORCE_INLINE_TEMPLATE +-U64 ZSTD_rotateRight_U64(U64 const value, U32 count) { +- assert(count < 64); +- count &= 0x3F; /* for fickle pattern recognition */ +- return (value >> count) | (U64)(value << ((0U - count) & 0x3F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_rotateRight_U32(U32 const value, U32 count) { +- assert(count < 32); +- count &= 0x1F; /* for fickle pattern recognition */ +- return (value >> count) | (U32)(value << ((0U - count) & 0x1F)); +-} +- +-FORCE_INLINE_TEMPLATE +-U16 ZSTD_rotateRight_U16(U16 const value, U32 count) { +- assert(count < 16); +- count &= 0x0F; /* for fickle pattern recognition */ +- return (value >> count) | (U16)(value << ((0U - count) & 0x0F)); ++MEM_STATIC U32 ZSTD_VecMask_next(ZSTD_VecMask val) { ++ return ZSTD_countTrailingZeros64(val); + } + + /* ZSTD_row_nextIndex(): + * Returns the next index to insert at within a tagTable row, and updates the "head" +- * value to reflect the update. Essentially cycles backwards from [0, {entries per row}) ++ * value to reflect the update. Essentially cycles backwards from [1, {entries per row}) + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextIndex(BYTE* const tagRow, U32 const rowMask) { +- U32 const next = (*tagRow - 1) & rowMask; +- *tagRow = (BYTE)next; +- return next; ++ U32 next = (*tagRow-1) & rowMask; ++ next += (next == 0) ? rowMask : 0; /* skip first position */ ++ *tagRow = (BYTE)next; ++ return next; + } + + /* ZSTD_isAligned(): +@@ -840,7 +801,7 @@ MEM_STATIC int ZSTD_isAligned(void const* ptr, size_t align) { + /* ZSTD_row_prefetch(): + * Performs prefetching for the hashTable and tagTable at a given row. + */ +-FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, U16 const* tagTable, U32 const relRow, U32 const rowLog) { ++FORCE_INLINE_TEMPLATE void ZSTD_row_prefetch(U32 const* hashTable, BYTE const* tagTable, U32 const relRow, U32 const rowLog) { + PREFETCH_L1(hashTable + relRow); + if (rowLog >= 5) { + PREFETCH_L1(hashTable + relRow + 16); +@@ -864,13 +825,13 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + U32 idx, const BYTE* const iLimit) + { + U32 const* const hashTable = ms->hashTable; +- U16 const* const tagTable = ms->tagTable; ++ BYTE const* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + U32 const maxElemsToPrefetch = (base + idx) > iLimit ? 
0 : (U32)(iLimit - (base + idx) + 1); + U32 const lim = idx + MIN(ZSTD_ROW_HASH_CACHE_SIZE, maxElemsToPrefetch); + + for (; idx < lim; ++idx) { +- U32 const hash = (U32)ZSTD_hashPtr(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = (U32)ZSTD_hashPtrSalted(base + idx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const row = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + ms->hashCache[idx & ZSTD_ROW_HASH_CACHE_MASK] = hash; +@@ -886,11 +847,12 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_fillHashCache(ZSTD_matchState_t* ms, const B + * base + idx + ZSTD_ROW_HASH_CACHE_SIZE. Also prefetches the appropriate rows from hashTable and tagTable. + */ + FORCE_INLINE_TEMPLATE U32 ZSTD_row_nextCachedHash(U32* cache, U32 const* hashTable, +- U16 const* tagTable, BYTE const* base, ++ BYTE const* tagTable, BYTE const* base, + U32 idx, U32 const hashLog, +- U32 const rowLog, U32 const mls) ++ U32 const rowLog, U32 const mls, ++ U64 const hashSalt) + { +- U32 const newHash = (U32)ZSTD_hashPtr(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const newHash = (U32)ZSTD_hashPtrSalted(base+idx+ZSTD_ROW_HASH_CACHE_SIZE, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); + U32 const row = (newHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + ZSTD_row_prefetch(hashTable, tagTable, row, rowLog); + { U32 const hash = cache[idx & ZSTD_ROW_HASH_CACHE_MASK]; +@@ -908,22 +870,21 @@ FORCE_INLINE_TEMPLATE void ZSTD_row_update_internalImpl(ZSTD_matchState_t* ms, + U32 const rowMask, U32 const useCache) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32 const hashLog = ms->rowHashLog; + const BYTE* const base = ms->window.base; + + DEBUGLOG(6, "ZSTD_row_update_internalImpl(): updateStartIdx=%u, updateEndIdx=%u", updateStartIdx, updateEndIdx); + for (; updateStartIdx < updateEndIdx; ++updateStartIdx) { +- U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls) +- : (U32)ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls); ++ U32 const hash = useCache ? ZSTD_row_nextCachedHash(ms->hashCache, hashTable, tagTable, base, updateStartIdx, hashLog, rowLog, mls, ms->hashSalt) ++ : (U32)ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32* const row = hashTable + relRow; +- BYTE* tagRow = (BYTE*)(tagTable + relRow); /* Though tagTable is laid out as a table of U16, each tag is only 1 byte. 
+- Explicit cast allows us to get exact desired position within each row */ ++ BYTE* tagRow = tagTable + relRow; + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); + +- assert(hash == ZSTD_hashPtr(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls)); +- ((BYTE*)tagRow)[pos + ZSTD_ROW_HASH_TAG_OFFSET] = hash & ZSTD_ROW_HASH_TAG_MASK; ++ assert(hash == ZSTD_hashPtrSalted(base + updateStartIdx, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, ms->hashSalt)); ++ tagRow[pos] = hash & ZSTD_ROW_HASH_TAG_MASK; + row[pos] = updateStartIdx; + } + } +@@ -971,7 +932,35 @@ void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip) { + const U32 mls = MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */); + + DEBUGLOG(5, "ZSTD_row_update(), rowLog=%u", rowLog); +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* dont use cache */); ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 0 /* don't use cache */); ++} ++ ++/* Returns the mask width of bits group of which will be set to 1. Given not all ++ * architectures have easy movemask instruction, this helps to iterate over ++ * groups of bits easier and faster. ++ */ ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_row_matchMaskGroupWidth(const U32 rowEntries) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ (void)rowEntries; ++#if defined(ZSTD_ARCH_ARM_NEON) ++ /* NEON path only works for little endian */ ++ if (!MEM_isLittleEndian()) { ++ return 1; ++ } ++ if (rowEntries == 16) { ++ return 4; ++ } ++ if (rowEntries == 32) { ++ return 2; ++ } ++ if (rowEntries == 64) { ++ return 1; ++ } ++#endif ++ return 1; + } + + #if defined(ZSTD_ARCH_X86_SSE2) +@@ -994,71 +983,82 @@ ZSTD_row_getSSEMask(int nbChunks, const BYTE* const src, const BYTE tag, const U + } + #endif + +-/* Returns a ZSTD_VecMask (U32) that has the nth bit set to 1 if the newly-computed "tag" matches +- * the hash at the nth position in a row of the tagTable. +- * Each row is a circular buffer beginning at the value of "head". So we must rotate the "matches" bitfield +- * to match up with the actual layout of the entries within the hashTable */ ++#if defined(ZSTD_ARCH_ARM_NEON) ++FORCE_INLINE_TEMPLATE ZSTD_VecMask ++ZSTD_row_getNEONMask(const U32 rowEntries, const BYTE* const src, const BYTE tag, const U32 headGrouped) ++{ ++ assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); ++ if (rowEntries == 16) { ++ /* vshrn_n_u16 shifts by 4 every u16 and narrows to 8 lower bits. ++ * After that groups of 4 bits represent the equalMask. We lower ++ * all bits except the highest in these groups by doing AND with ++ * 0x88 = 0b10001000. ++ */ ++ const uint8x16_t chunk = vld1q_u8(src); ++ const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); ++ const uint8x8_t res = vshrn_n_u16(equalMask, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x8888888888888888ull; ++ } else if (rowEntries == 32) { ++ /* Same idea as with rowEntries == 16 but doing AND with ++ * 0x55 = 0b01010101. 
++ */ ++ const uint16x8x2_t chunk = vld2q_u16((const uint16_t*)(const void*)src); ++ const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); ++ const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x8_t t0 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk0, dup)), 6); ++ const uint8x8_t t1 = vshrn_n_u16(vreinterpretq_u16_u8(vceqq_u8(chunk1, dup)), 6); ++ const uint8x8_t res = vsli_n_u8(t0, t1, 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(res), 0) ; ++ return ZSTD_rotateRight_U64(matches, headGrouped) & 0x5555555555555555ull; ++ } else { /* rowEntries == 64 */ ++ const uint8x16x4_t chunk = vld4q_u8(src); ++ const uint8x16_t dup = vdupq_n_u8(tag); ++ const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); ++ const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); ++ const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); ++ const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); ++ ++ const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); ++ const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); ++ const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); ++ const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); ++ const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); ++ const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); ++ return ZSTD_rotateRight_U64(matches, headGrouped); ++ } ++} ++#endif ++ ++/* Returns a ZSTD_VecMask (U64) that has the nth group (determined by ++ * ZSTD_row_matchMaskGroupWidth) of bits set to 1 if the newly-computed "tag" ++ * matches the hash at the nth position in a row of the tagTable. ++ * Each row is a circular buffer beginning at the value of "headGrouped". So we ++ * must rotate the "matches" bitfield to match up with the actual layout of the ++ * entries within the hashTable */ + FORCE_INLINE_TEMPLATE ZSTD_VecMask +-ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, const U32 rowEntries) ++ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 headGrouped, const U32 rowEntries) + { +- const BYTE* const src = tagRow + ZSTD_ROW_HASH_TAG_OFFSET; ++ const BYTE* const src = tagRow; + assert((rowEntries == 16) || (rowEntries == 32) || rowEntries == 64); + assert(rowEntries <= ZSTD_ROW_HASH_MAX_ENTRIES); ++ assert(ZSTD_row_matchMaskGroupWidth(rowEntries) * rowEntries <= sizeof(ZSTD_VecMask) * 8); + + #if defined(ZSTD_ARCH_X86_SSE2) + +- return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, head); ++ return ZSTD_row_getSSEMask(rowEntries / 16, src, tag, headGrouped); + + #else /* SW or NEON-LE */ + + # if defined(ZSTD_ARCH_ARM_NEON) + /* This NEON path only works for little endian - otherwise use SWAR below */ + if (MEM_isLittleEndian()) { +- if (rowEntries == 16) { +- const uint8x16_t chunk = vld1q_u8(src); +- const uint16x8_t equalMask = vreinterpretq_u16_u8(vceqq_u8(chunk, vdupq_n_u8(tag))); +- const uint16x8_t t0 = vshlq_n_u16(equalMask, 7); +- const uint32x4_t t1 = vreinterpretq_u32_u16(vsriq_n_u16(t0, t0, 14)); +- const uint64x2_t t2 = vreinterpretq_u64_u32(vshrq_n_u32(t1, 14)); +- const uint8x16_t t3 = vreinterpretq_u8_u64(vsraq_n_u64(t2, t2, 28)); +- const U16 hi = (U16)vgetq_lane_u8(t3, 8); +- const U16 lo = (U16)vgetq_lane_u8(t3, 0); +- return ZSTD_rotateRight_U16((hi << 8) | lo, head); +- } else if (rowEntries == 32) { +- const uint16x8x2_t chunk = vld2q_u16((const U16*)(const void*)src); +- const uint8x16_t chunk0 = vreinterpretq_u8_u16(chunk.val[0]); +- const uint8x16_t chunk1 = vreinterpretq_u8_u16(chunk.val[1]); +- const uint8x16_t equalMask0 = 
vceqq_u8(chunk0, vdupq_n_u8(tag)); +- const uint8x16_t equalMask1 = vceqq_u8(chunk1, vdupq_n_u8(tag)); +- const int8x8_t pack0 = vqmovn_s16(vreinterpretq_s16_u8(equalMask0)); +- const int8x8_t pack1 = vqmovn_s16(vreinterpretq_s16_u8(equalMask1)); +- const uint8x8_t t0 = vreinterpret_u8_s8(pack0); +- const uint8x8_t t1 = vreinterpret_u8_s8(pack1); +- const uint8x8_t t2 = vsri_n_u8(t1, t0, 2); +- const uint8x8x2_t t3 = vuzp_u8(t2, t0); +- const uint8x8_t t4 = vsri_n_u8(t3.val[1], t3.val[0], 4); +- const U32 matches = vget_lane_u32(vreinterpret_u32_u8(t4), 0); +- return ZSTD_rotateRight_U32(matches, head); +- } else { /* rowEntries == 64 */ +- const uint8x16x4_t chunk = vld4q_u8(src); +- const uint8x16_t dup = vdupq_n_u8(tag); +- const uint8x16_t cmp0 = vceqq_u8(chunk.val[0], dup); +- const uint8x16_t cmp1 = vceqq_u8(chunk.val[1], dup); +- const uint8x16_t cmp2 = vceqq_u8(chunk.val[2], dup); +- const uint8x16_t cmp3 = vceqq_u8(chunk.val[3], dup); +- +- const uint8x16_t t0 = vsriq_n_u8(cmp1, cmp0, 1); +- const uint8x16_t t1 = vsriq_n_u8(cmp3, cmp2, 1); +- const uint8x16_t t2 = vsriq_n_u8(t1, t0, 2); +- const uint8x16_t t3 = vsriq_n_u8(t2, t2, 4); +- const uint8x8_t t4 = vshrn_n_u16(vreinterpretq_u16_u8(t3), 4); +- const U64 matches = vget_lane_u64(vreinterpret_u64_u8(t4), 0); +- return ZSTD_rotateRight_U64(matches, head); +- } ++ return ZSTD_row_getNEONMask(rowEntries, src, tag, headGrouped); + } + # endif /* ZSTD_ARCH_ARM_NEON */ + /* SWAR */ +- { const size_t chunkSize = sizeof(size_t); ++ { const int chunkSize = sizeof(size_t); + const size_t shiftAmount = ((chunkSize * 8) - chunkSize); + const size_t xFF = ~((size_t)0); + const size_t x01 = xFF / 0xFF; +@@ -1091,11 +1091,11 @@ ZSTD_row_getMatchMask(const BYTE* const tagRow, const BYTE tag, const U32 head, + } + matches = ~matches; + if (rowEntries == 16) { +- return ZSTD_rotateRight_U16((U16)matches, head); ++ return ZSTD_rotateRight_U16((U16)matches, headGrouped); + } else if (rowEntries == 32) { +- return ZSTD_rotateRight_U32((U32)matches, head); ++ return ZSTD_rotateRight_U32((U32)matches, headGrouped); + } else { +- return ZSTD_rotateRight_U64((U64)matches, head); ++ return ZSTD_rotateRight_U64((U64)matches, headGrouped); + } + } + #endif +@@ -1125,7 +1125,7 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowLog) + { + U32* const hashTable = ms->hashTable; +- U16* const tagTable = ms->tagTable; ++ BYTE* const tagTable = ms->tagTable; + U32* const hashCache = ms->hashCache; + const U32 hashLog = ms->rowHashLog; + const ZSTD_compressionParameters* const cParams = &ms->cParams; +@@ -1143,8 +1143,11 @@ size_t ZSTD_RowFindBestMatch( + const U32 rowEntries = (1U << rowLog); + const U32 rowMask = rowEntries - 1; + const U32 cappedSearchLog = MIN(cParams->searchLog, rowLog); /* nb of searches is capped at nb entries per row */ ++ const U32 groupWidth = ZSTD_row_matchMaskGroupWidth(rowEntries); ++ const U64 hashSalt = ms->hashSalt; + U32 nbAttempts = 1U << cappedSearchLog; + size_t ml=4-1; ++ U32 hash; + + /* DMS/DDS variables that may be referenced laster */ + const ZSTD_matchState_t* const dms = ms->dictMatchState; +@@ -1168,7 +1171,7 @@ size_t ZSTD_RowFindBestMatch( + if (dictMode == ZSTD_dictMatchState) { + /* Prefetch DMS rows */ + U32* const dmsHashTable = dms->hashTable; +- U16* const dmsTagTable = dms->tagTable; ++ BYTE* const dmsTagTable = dms->tagTable; + U32 const dmsHash = (U32)ZSTD_hashPtr(ip, dms->rowHashLog + ZSTD_ROW_HASH_TAG_BITS, mls); + U32 const dmsRelRow = (dmsHash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + dmsTag = dmsHash & 
ZSTD_ROW_HASH_TAG_MASK; +@@ -1178,23 +1181,34 @@ size_t ZSTD_RowFindBestMatch( + } + + /* Update the hashTable and tagTable up to (but not including) ip */ +- ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ if (!ms->lazySkipping) { ++ ZSTD_row_update_internal(ms, ip, mls, rowLog, rowMask, 1 /* useCache */); ++ hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls, hashSalt); ++ } else { ++ /* Stop inserting every position when in the lazy skipping mode. ++ * The hash cache is also not kept up to date in this mode. ++ */ ++ hash = (U32)ZSTD_hashPtrSalted(ip, hashLog + ZSTD_ROW_HASH_TAG_BITS, mls, hashSalt); ++ ms->nextToUpdate = curr; ++ } ++ ms->hashSaltEntropy += hash; /* collect salt entropy */ ++ + { /* Get the hash for ip, compute the appropriate row */ +- U32 const hash = ZSTD_row_nextCachedHash(hashCache, hashTable, tagTable, base, curr, hashLog, rowLog, mls); + U32 const relRow = (hash >> ZSTD_ROW_HASH_TAG_BITS) << rowLog; + U32 const tag = hash & ZSTD_ROW_HASH_TAG_MASK; + U32* const row = hashTable + relRow; + BYTE* tagRow = (BYTE*)(tagTable + relRow); +- U32 const head = *tagRow & rowMask; ++ U32 const headGrouped = (*tagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(tagRow, (BYTE)tag, headGrouped, rowEntries); + + /* Cycle through the matches and prefetch */ +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = row[matchPos]; ++ if(matchPos == 0) continue; + assert(numMatches < rowEntries); + if (matchIndex < lowLimit) + break; +@@ -1204,13 +1218,14 @@ size_t ZSTD_RowFindBestMatch( + PREFETCH_L1(dictBase + matchIndex); + } + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Speed opt: insert current byte into hashtable too. This allows us to avoid one iteration of the loop + in ZSTD_row_update_internal() at the next search. 
*/ + { + U32 const pos = ZSTD_row_nextIndex(tagRow, rowMask); +- tagRow[pos + ZSTD_ROW_HASH_TAG_OFFSET] = (BYTE)tag; ++ tagRow[pos] = (BYTE)tag; + row[pos] = ms->nextToUpdate++; + } + +@@ -1224,7 +1239,8 @@ size_t ZSTD_RowFindBestMatch( + if ((dictMode != ZSTD_extDict) || matchIndex >= dictLimit) { + const BYTE* const match = base + matchIndex; + assert(matchIndex >= dictLimit); /* ensures this is true if dictMode != ZSTD_extDict */ +- if (match[ml] == ip[ml]) /* potentially better */ ++ /* read 4B starting from (match + ml + 1 - sizeof(U32)) */ ++ if (MEM_read32(match + ml - 3) == MEM_read32(ip + ml - 3)) /* potentially better */ + currentMl = ZSTD_count(ip, match, iLimit); + } else { + const BYTE* const match = dictBase + matchIndex; +@@ -1236,7 +1252,7 @@ size_t ZSTD_RowFindBestMatch( + /* Save best solution */ + if (currentMl > ml) { + ml = currentMl; +- *offsetPtr = STORE_OFFSET(curr - matchIndex); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - matchIndex); + if (ip+currentMl == iLimit) break; /* best possible, avoids read overflow on next attempt */ + } + } +@@ -1254,19 +1270,21 @@ size_t ZSTD_RowFindBestMatch( + const U32 dmsSize = (U32)(dmsEnd - dmsBase); + const U32 dmsIndexDelta = dictLimit - dmsSize; + +- { U32 const head = *dmsTagRow & rowMask; ++ { U32 const headGrouped = (*dmsTagRow & rowMask) * groupWidth; + U32 matchBuffer[ZSTD_ROW_HASH_MAX_ENTRIES]; + size_t numMatches = 0; + size_t currMatch = 0; +- ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, head, rowEntries); ++ ZSTD_VecMask matches = ZSTD_row_getMatchMask(dmsTagRow, (BYTE)dmsTag, headGrouped, rowEntries); + +- for (; (matches > 0) && (nbAttempts > 0); --nbAttempts, matches &= (matches - 1)) { +- U32 const matchPos = (head + ZSTD_VecMask_next(matches)) & rowMask; ++ for (; (matches > 0) && (nbAttempts > 0); matches &= (matches - 1)) { ++ U32 const matchPos = ((headGrouped + ZSTD_VecMask_next(matches)) / groupWidth) & rowMask; + U32 const matchIndex = dmsRow[matchPos]; ++ if(matchPos == 0) continue; + if (matchIndex < dmsLowestIndex) + break; + PREFETCH_L1(dmsBase + matchIndex); + matchBuffer[numMatches++] = matchIndex; ++ --nbAttempts; + } + + /* Return the longest match */ +@@ -1285,7 +1303,7 @@ size_t ZSTD_RowFindBestMatch( + if (currentMl > ml) { + ml = currentMl; + assert(curr > matchIndex + dmsIndexDelta); +- *offsetPtr = STORE_OFFSET(curr - (matchIndex + dmsIndexDelta)); ++ *offsetPtr = OFFSET_TO_OFFBASE(curr - (matchIndex + dmsIndexDelta)); + if (ip+currentMl == iLimit) break; + } + } +@@ -1491,7 +1509,8 @@ ZSTD_compressBlock_lazy_generic( + const U32 mls = BOUNDED(4, ms->cParams.minMatch, 6); + const U32 rowLog = BOUNDED(4, ms->cParams.searchLog, 6); + +- U32 offset_1 = rep[0], offset_2 = rep[1], savedOffset=0; ++ U32 offset_1 = rep[0], offset_2 = rep[1]; ++ U32 offsetSaved1 = 0, offsetSaved2 = 0; + + const int isDMS = dictMode == ZSTD_dictMatchState; + const int isDDS = dictMode == ZSTD_dedicatedDictSearch; +@@ -1512,8 +1531,8 @@ ZSTD_compressBlock_lazy_generic( + U32 const curr = (U32)(ip - base); + U32 const windowLow = ZSTD_getLowestPrefixIndex(ms, curr, ms->cParams.windowLog); + U32 const maxRep = curr - windowLow; +- if (offset_2 > maxRep) savedOffset = offset_2, offset_2 = 0; +- if (offset_1 > maxRep) savedOffset = offset_1, offset_1 = 0; ++ if (offset_2 > maxRep) offsetSaved2 = offset_2, offset_2 = 0; ++ if (offset_1 > maxRep) offsetSaved1 = offset_1, offset_1 = 0; + } + if (isDxS) { + /* dictMatchState repCode checks don't currently handle repCode == 0 +@@ -1522,10 +1541,11 @@ 
ZSTD_compressBlock_lazy_generic( + assert(offset_2 <= dictAndPrefixLength); + } + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1537,7 +1557,7 @@ ZSTD_compressBlock_lazy_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + DEBUGLOG(7, "search baseline (depth 0)"); + +@@ -1562,14 +1582,23 @@ ZSTD_compressBlock_lazy_generic( + } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, dictMode); ++ { size_t offbaseFound = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offbaseFound, mls, rowLog, searchMethod, dictMode); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = offbaseFound; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */; ++ ip += step; ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. ++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1579,12 +1608,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 1"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1596,17 +1625,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1615,12 +1644,12 @@ ZSTD_compressBlock_lazy_generic( + DEBUGLOG(7, "search depth 2"); + ip ++; + if ( (dictMode == ZSTD_noDict) +- && (offcode) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { ++ && (offBase) && ((offset_1>0) & (MEM_read32(ip) == MEM_read32(ip - offset_1)))) { + size_t const mlRep = ZSTD_count(ip+4, ip+4-offset_1, iend) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + if (isDxS) { + const U32 repIndex = (U32)(ip - base) - offset_1; +@@ -1632,17 +1661,17 @@ ZSTD_compressBlock_lazy_generic( + const BYTE* repMatchEnd = repIndex < prefixLowestIndex ? 
dictEnd : iend; + size_t const mlRep = ZSTD_count_2segments(ip+4, repMatch+4, iend, repMatchEnd, prefixLowest) + 4; + int const gain2 = (int)(mlRep * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((mlRep >= 4) && (gain2 > gain1)) +- matchLength = mlRep, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = mlRep, offBase = REPCODE1_TO_OFFBASE, start = ip; + } + } +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, dictMode); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate=999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, dictMode); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ +@@ -1653,26 +1682,33 @@ ZSTD_compressBlock_lazy_generic( + * notably if `value` is unsigned, resulting in a large positive `-value`. + */ + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { ++ if (OFFBASE_IS_OFFSET(offBase)) { + if (dictMode == ZSTD_noDict) { +- while ( ((start > anchor) & (start - STORED_OFFSET(offcode) > prefixLowest)) +- && (start[-1] == (start-STORED_OFFSET(offcode))[-1]) ) /* only search for offset within prefix */ ++ while ( ((start > anchor) & (start - OFFBASE_TO_OFFSET(offBase) > prefixLowest)) ++ && (start[-1] == (start-OFFBASE_TO_OFFSET(offBase))[-1]) ) /* only search for offset within prefix */ + { start--; matchLength++; } + } + if (isDxS) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < prefixLowestIndex) ? dictBase + matchIndex - dictIndexDelta : base + matchIndex; + const BYTE* const mStart = (matchIndex < prefixLowestIndex) ? dictLowest : prefixLowest; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ + } +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + if (isDxS) { +@@ -1686,8 +1722,8 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(repMatch) == MEM_read32(ip)) ) { + const BYTE* const repEnd2 = repIndex < prefixLowestIndex ? 
dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd2, prefixLowest) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset_2 <=> offset_1 */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset_2 <=> offset_1 */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; +@@ -1701,16 +1737,20 @@ ZSTD_compressBlock_lazy_generic( + && (MEM_read32(ip) == MEM_read32(ip - offset_2)) ) { + /* store sequence */ + matchLength = ZSTD_count(ip+4, ip+4-offset_2, iend) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap repcodes */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap repcodes */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) */ + } } } + +- /* Save reps for next block */ +- rep[0] = offset_1 ? offset_1 : savedOffset; +- rep[1] = offset_2 ? offset_2 : savedOffset; ++ /* If offset_1 started invalid (offsetSaved1 != 0) and became valid (offset_1 != 0), ++ * rotate saved offsets. See comment in ZSTD_compressBlock_fast_noDict for more context. */ ++ offsetSaved2 = ((offsetSaved1 != 0) && (offset_1 != 0)) ? offsetSaved1 : offsetSaved2; ++ ++ /* save reps for next block */ ++ rep[0] = offset_1 ? offset_1 : offsetSaved1; ++ rep[1] = offset_2 ? offset_2 : offsetSaved2; + + /* Return the last literals size */ + return (size_t)(iend - anchor); +@@ -1886,12 +1926,13 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + + DEBUGLOG(5, "ZSTD_compressBlock_lazy_extDict_generic (searchFunc=%u)", (U32)searchMethod); + ++ /* Reset the lazy skipping state */ ++ ms->lazySkipping = 0; ++ + /* init */ + ip += (ip == prefixStart); + if (searchMethod == search_rowHash) { +- ZSTD_row_fillHashCache(ms, base, rowLog, +- MIN(ms->cParams.minMatch, 6 /* mls caps out at 6 */), +- ms->nextToUpdate, ilimit); ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); + } + + /* Match Loop */ +@@ -1903,7 +1944,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + #endif + while (ip < ilimit) { + size_t matchLength=0; +- size_t offcode=STORE_REPCODE_1; ++ size_t offBase = REPCODE1_TO_OFFBASE; + const BYTE* start=ip+1; + U32 curr = (U32)(ip-base); + +@@ -1922,14 +1963,23 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + } } + + /* first search (depth 0) */ +- { size_t offsetFound = 999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offsetFound, mls, rowLog, searchMethod, ZSTD_extDict); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); + if (ml2 > matchLength) +- matchLength = ml2, start = ip, offcode=offsetFound; ++ matchLength = ml2, start = ip, offBase = ofbCandidate; + } + + if (matchLength < 4) { +- ip += ((ip-anchor) >> kSearchStrength) + 1; /* jump faster over incompressible sections */ ++ size_t const step = ((size_t)(ip-anchor) >> kSearchStrength); ++ ip += step + 1; /* jump faster over incompressible sections */ ++ /* Enter the lazy skipping mode once we are skipping more than 8 bytes at a time. 
++ * In this mode we stop inserting every position into our tables, and only insert ++ * positions that we search, which is one in step positions. ++ * The exact cutoff is flexible, I've just chosen a number that is reasonably high, ++ * so we minimize the compression ratio loss in "normal" scenarios. This mode gets ++ * triggered once we've gone 2KB without finding any matches. ++ */ ++ ms->lazySkipping = step > kLazySkippingStep; + continue; + } + +@@ -1939,7 +1989,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1951,18 +2001,18 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 3); +- int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*3 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 1 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 4); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 4); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; /* search a better one */ + } } + +@@ -1971,7 +2021,7 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + ip ++; + curr++; + /* check repCode */ +- if (offcode) { ++ if (offBase) { + const U32 windowLow = ZSTD_getLowestMatchIndex(ms, curr, windowLog); + const U32 repIndex = (U32)(curr - offset_1); + const BYTE* const repBase = repIndex < dictLimit ? dictBase : base; +@@ -1983,38 +2033,45 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + const BYTE* const repEnd = repIndex < dictLimit ? 
dictEnd : iend; + size_t const repLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; + int const gain2 = (int)(repLength * 4); +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 1); ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 1); + if ((repLength >= 4) && (gain2 > gain1)) +- matchLength = repLength, offcode = STORE_REPCODE_1, start = ip; ++ matchLength = repLength, offBase = REPCODE1_TO_OFFBASE, start = ip; + } } + + /* search match, depth 2 */ +- { size_t offset2=999999999; +- size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &offset2, mls, rowLog, searchMethod, ZSTD_extDict); +- int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offset2))); /* raw approx */ +- int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)STORED_TO_OFFBASE(offcode)) + 7); ++ { size_t ofbCandidate = 999999999; ++ size_t const ml2 = ZSTD_searchMax(ms, ip, iend, &ofbCandidate, mls, rowLog, searchMethod, ZSTD_extDict); ++ int const gain2 = (int)(ml2*4 - ZSTD_highbit32((U32)ofbCandidate)); /* raw approx */ ++ int const gain1 = (int)(matchLength*4 - ZSTD_highbit32((U32)offBase) + 7); + if ((ml2 >= 4) && (gain2 > gain1)) { +- matchLength = ml2, offcode = offset2, start = ip; ++ matchLength = ml2, offBase = ofbCandidate, start = ip; + continue; + } } } + break; /* nothing found : store previous solution */ + } + + /* catch up */ +- if (STORED_IS_OFFSET(offcode)) { +- U32 const matchIndex = (U32)((size_t)(start-base) - STORED_OFFSET(offcode)); ++ if (OFFBASE_IS_OFFSET(offBase)) { ++ U32 const matchIndex = (U32)((size_t)(start-base) - OFFBASE_TO_OFFSET(offBase)); + const BYTE* match = (matchIndex < dictLimit) ? dictBase + matchIndex : base + matchIndex; + const BYTE* const mStart = (matchIndex < dictLimit) ? dictStart : prefixStart; + while ((start>anchor) && (match>mStart) && (start[-1] == match[-1])) { start--; match--; matchLength++; } /* catch up */ +- offset_2 = offset_1; offset_1 = (U32)STORED_OFFSET(offcode); ++ offset_2 = offset_1; offset_1 = (U32)OFFBASE_TO_OFFSET(offBase); + } + + /* store sequence */ + _storeSequence: + { size_t const litLength = (size_t)(start - anchor); +- ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offcode, matchLength); ++ ZSTD_storeSeq(seqStore, litLength, anchor, iend, (U32)offBase, matchLength); + anchor = ip = start + matchLength; + } ++ if (ms->lazySkipping) { ++ /* We've found a match, disable lazy skipping mode, and refill the hash cache. */ ++ if (searchMethod == search_rowHash) { ++ ZSTD_row_fillHashCache(ms, base, rowLog, mls, ms->nextToUpdate, ilimit); ++ } ++ ms->lazySkipping = 0; ++ } + + /* check immediate repcode */ + while (ip <= ilimit) { +@@ -2029,8 +2086,8 @@ size_t ZSTD_compressBlock_lazy_extDict_generic( + /* repcode detected we should take it */ + const BYTE* const repEnd = repIndex < dictLimit ? dictEnd : iend; + matchLength = ZSTD_count_2segments(ip+4, repMatch+4, iend, repEnd, prefixStart) + 4; +- offcode = offset_2; offset_2 = offset_1; offset_1 = (U32)offcode; /* swap offset history */ +- ZSTD_storeSeq(seqStore, 0, anchor, iend, STORE_REPCODE_1, matchLength); ++ offBase = offset_2; offset_2 = offset_1; offset_1 = (U32)offBase; /* swap offset history */ ++ ZSTD_storeSeq(seqStore, 0, anchor, iend, REPCODE1_TO_OFFBASE, matchLength); + ip += matchLength; + anchor = ip; + continue; /* faster when present ... (?) 
*/ +@@ -2096,7 +2153,6 @@ size_t ZSTD_compressBlock_lazy_extDict_row( + size_t ZSTD_compressBlock_lazy2_extDict_row( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize) +- + { + return ZSTD_compressBlock_lazy_extDict_generic(ms, seqStore, rep, src, srcSize, search_rowHash, 2); + } +diff --git a/lib/zstd/compress/zstd_lazy.h b/lib/zstd/compress/zstd_lazy.h +index e5bdf4df8dde..9505bed93c03 100644 +--- a/lib/zstd/compress/zstd_lazy.h ++++ b/lib/zstd/compress/zstd_lazy.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -22,6 +23,8 @@ + */ + #define ZSTD_LAZY_DDSS_BUCKET_LOG 2 + ++#define ZSTD_ROW_HASH_TAG_BITS 8 /* nb bits to use for the tag */ ++ + U32 ZSTD_insertAndFindFirstIndex(ZSTD_matchState_t* ms, const BYTE* ip); + void ZSTD_row_update(ZSTD_matchState_t* const ms, const BYTE* ip); + +@@ -113,7 +116,7 @@ size_t ZSTD_compressBlock_lazy2_extDict_row( + size_t ZSTD_compressBlock_btlazy2_extDict( + ZSTD_matchState_t* ms, seqStore_t* seqStore, U32 rep[ZSTD_REP_NUM], + void const* src, size_t srcSize); +- ++ + + + #endif /* ZSTD_LAZY_H */ +diff --git a/lib/zstd/compress/zstd_ldm.c b/lib/zstd/compress/zstd_ldm.c +index dd86fc83e7dd..b7da76b0db7c 100644 +--- a/lib/zstd/compress/zstd_ldm.c ++++ b/lib/zstd/compress/zstd_ldm.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -242,11 +243,11 @@ static size_t ZSTD_ldm_fillFastTables(ZSTD_matchState_t* ms, + switch(ms->cParams.strategy) + { + case ZSTD_fast: +- ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_dfast: +- ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast); ++ ZSTD_fillDoubleHashTable(ms, iend, ZSTD_dtlm_fast, ZSTD_tfp_forCCtx); + break; + + case ZSTD_greedy: +@@ -549,7 +550,7 @@ size_t ZSTD_ldm_generateSequences( + * the window through early invalidation. + * TODO: * Test the chunk size. + * * Try invalidation after the sequence generation and test the +- * the offset against maxDist directly. ++ * offset against maxDist directly. + * + * NOTE: Because of dictionaries + sequence splitting we MUST make sure + * that any offset used is valid at the END of the sequence, since it may +@@ -711,7 +712,7 @@ size_t ZSTD_ldm_blockCompress(rawSeqStore_t* rawSeqStore, + rep[0] = sequence.offset; + /* Store the sequence */ + ZSTD_storeSeq(seqStore, newLitLength, ip - newLitLength, iend, +- STORE_OFFSET(sequence.offset), ++ OFFSET_TO_OFFBASE(sequence.offset), + sequence.matchLength); + ip += sequence.matchLength; + } +diff --git a/lib/zstd/compress/zstd_ldm.h b/lib/zstd/compress/zstd_ldm.h +index fbc6a5e88fd7..c540731abde7 100644 +--- a/lib/zstd/compress/zstd_ldm.h ++++ b/lib/zstd/compress/zstd_ldm.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
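Editor's note on the rename running through the lazy and LDM hunks above: STORE_OFFSET/STORE_REPCODE become OFFSET_TO_OFFBASE/REPCODE_TO_OFFBASE. The macros themselves live in zstd_compress_internal.h and are not shown in this patch; the standalone sketch below illustrates the assumed "offBase" convention (repcodes occupy 1..ZSTD_REP_NUM, real offsets are shifted up by ZSTD_REP_NUM), using lower-case helper names that are purely illustrative.

/* Editorial sketch (not part of the patch): the "offBase" sumtype handed to
 * ZSTD_storeSeq() folds repcodes and real offsets into one integer.
 * Assumed convention, mirroring zstd_compress_internal.h:
 *   repcode r in 1..3       -> offBase = r
 *   real offset o (o > 0)   -> offBase = o + ZSTD_REP_NUM
 */
#include <assert.h>
#include <stdio.h>

#define ZSTD_REP_NUM 3

static unsigned offset_to_offbase(unsigned offset)   { assert(offset > 0); return offset + ZSTD_REP_NUM; }
static unsigned repcode_to_offbase(unsigned repcode) { assert(repcode >= 1 && repcode <= ZSTD_REP_NUM); return repcode; }
static int      offbase_is_offset(unsigned offbase)  { return offbase > ZSTD_REP_NUM; }
static unsigned offbase_to_offset(unsigned offbase)  { assert(offbase_is_offset(offbase)); return offbase - ZSTD_REP_NUM; }

int main(void)
{
    unsigned const offBaseRep1  = repcode_to_offbase(1);   /* analogous to REPCODE1_TO_OFFBASE */
    unsigned const offBaseMatch = offset_to_offbase(1024); /* analogous to OFFSET_TO_OFFBASE(1024) */
    printf("rep1 -> %u, offset 1024 -> %u, decoded back -> %u\n",
           offBaseRep1, offBaseMatch, offbase_to_offset(offBaseMatch));
    return 0;
}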
+ * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_ldm_geartab.h b/lib/zstd/compress/zstd_ldm_geartab.h +index 647f865be290..cfccfc46f6f7 100644 +--- a/lib/zstd/compress/zstd_ldm_geartab.h ++++ b/lib/zstd/compress/zstd_ldm_geartab.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/compress/zstd_opt.c b/lib/zstd/compress/zstd_opt.c +index fd82acfda62f..1e41cb04f482 100644 +--- a/lib/zstd/compress/zstd_opt.c ++++ b/lib/zstd/compress/zstd_opt.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Przemyslaw Skibinski, Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -16,7 +17,7 @@ + #define ZSTD_LITFREQ_ADD 2 /* scaling factor for litFreq, so that frequencies adapt faster to new stats */ + #define ZSTD_MAX_PRICE (1<<30) + +-#define ZSTD_PREDEF_THRESHOLD 1024 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ ++#define ZSTD_PREDEF_THRESHOLD 8 /* if srcSize < ZSTD_PREDEF_THRESHOLD, symbols' cost is assumed static, directly determined by pre-defined distributions */ + + + /*-************************************* +@@ -26,27 +27,35 @@ + #if 0 /* approximation at bit level (for tests) */ + # define BITCOST_ACCURACY 0 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat, opt) ((void)opt, ZSTD_bitWeight(stat)) ++# define WEIGHT(stat, opt) ((void)(opt), ZSTD_bitWeight(stat)) + #elif 0 /* fractional bit accuracy (for tests) */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) ((void)opt, ZSTD_fracWeight(stat)) ++# define WEIGHT(stat,opt) ((void)(opt), ZSTD_fracWeight(stat)) + #else /* opt==approx, ultra==accurate */ + # define BITCOST_ACCURACY 8 + # define BITCOST_MULTIPLIER (1 << BITCOST_ACCURACY) +-# define WEIGHT(stat,opt) (opt ? ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) ++# define WEIGHT(stat,opt) ((opt) ? 
ZSTD_fracWeight(stat) : ZSTD_bitWeight(stat)) + #endif + ++/* ZSTD_bitWeight() : ++ * provide estimated "cost" of a stat in full bits only */ + MEM_STATIC U32 ZSTD_bitWeight(U32 stat) + { + return (ZSTD_highbit32(stat+1) * BITCOST_MULTIPLIER); + } + ++/* ZSTD_fracWeight() : ++ * provide fractional-bit "cost" of a stat, ++ * using linear interpolation approximation */ + MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + { + U32 const stat = rawStat + 1; + U32 const hb = ZSTD_highbit32(stat); + U32 const BWeight = hb * BITCOST_MULTIPLIER; ++ /* Fweight was meant for "Fractional weight" ++ * but it's effectively a value between 1 and 2 ++ * using fixed point arithmetic */ + U32 const FWeight = (stat << BITCOST_ACCURACY) >> hb; + U32 const weight = BWeight + FWeight; + assert(hb + BITCOST_ACCURACY < 31); +@@ -57,7 +66,7 @@ MEM_STATIC U32 ZSTD_fracWeight(U32 rawStat) + /* debugging function, + * @return price in bytes as fractional value + * for debug messages only */ +-MEM_STATIC double ZSTD_fCost(U32 price) ++MEM_STATIC double ZSTD_fCost(int price) + { + return (double)price / (BITCOST_MULTIPLIER*8); + } +@@ -88,20 +97,26 @@ static U32 sum_u32(const unsigned table[], size_t nbElts) + return total; + } + +-static U32 ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift) ++typedef enum { base_0possible=0, base_1guaranteed=1 } base_directive_e; ++ ++static U32 ++ZSTD_downscaleStats(unsigned* table, U32 lastEltIndex, U32 shift, base_directive_e base1) + { + U32 s, sum=0; +- DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", (unsigned)lastEltIndex+1, (unsigned)shift); ++ DEBUGLOG(5, "ZSTD_downscaleStats (nbElts=%u, shift=%u)", ++ (unsigned)lastEltIndex+1, (unsigned)shift ); + assert(shift < 30); + for (s=0; s<lastEltIndex+1; s++) { +- table[s] = 1 + (table[s] >> shift); +- sum += table[s]; ++ unsigned const base = base1 ?
1 : (table[s]>0); ++ unsigned const newStat = base + (table[s] >> shift); ++ sum += newStat; ++ table[s] = newStat; + } + return sum; + } + + /* ZSTD_scaleStats() : +- * reduce all elements in table is sum too large ++ * reduce all elt frequencies in table if sum too large + * return the resulting sum of elements */ + static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + { +@@ -110,7 +125,7 @@ static U32 ZSTD_scaleStats(unsigned* table, U32 lastEltIndex, U32 logTarget) + DEBUGLOG(5, "ZSTD_scaleStats (nbElts=%u, target=%u)", (unsigned)lastEltIndex+1, (unsigned)logTarget); + assert(logTarget < 30); + if (factor <= 1) return prevsum; +- return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor)); ++ return ZSTD_downscaleStats(table, lastEltIndex, ZSTD_highbit32(factor), base_1guaranteed); + } + + /* ZSTD_rescaleFreqs() : +@@ -129,18 +144,22 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + DEBUGLOG(5, "ZSTD_rescaleFreqs (srcSize=%u)", (unsigned)srcSize); + optPtr->priceType = zop_dynamic; + +- if (optPtr->litLengthSum == 0) { /* first block : init */ +- if (srcSize <= ZSTD_PREDEF_THRESHOLD) { /* heuristic */ +- DEBUGLOG(5, "(srcSize <= ZSTD_PREDEF_THRESHOLD) => zop_predef"); ++ if (optPtr->litLengthSum == 0) { /* no literals stats collected -> first block assumed -> init */ ++ ++ /* heuristic: use pre-defined stats for too small inputs */ ++ if (srcSize <= ZSTD_PREDEF_THRESHOLD) { ++ DEBUGLOG(5, "srcSize <= %i : use predefined stats", ZSTD_PREDEF_THRESHOLD); + optPtr->priceType = zop_predef; + } + + assert(optPtr->symbolCosts != NULL); + if (optPtr->symbolCosts->huf.repeatMode == HUF_repeat_valid) { +- /* huffman table presumed generated by dictionary */ ++ ++ /* huffman stats covering the full value set : table presumed generated by dictionary */ + optPtr->priceType = zop_dynamic; + + if (compressedLiterals) { ++ /* generate literals statistics from huffman table */ + unsigned lit; + assert(optPtr->litFreq != NULL); + optPtr->litSum = 0; +@@ -188,13 +207,14 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum += optPtr->offCodeFreq[of]; + } } + +- } else { /* not a dictionary */ ++ } else { /* first block, no dictionary */ + + assert(optPtr->litFreq != NULL); + if (compressedLiterals) { ++ /* base initial cost of literals on direct frequency within src */ + unsigned lit = MaxLit; + HIST_count_simple(optPtr->litFreq, &lit, src, srcSize); /* use raw first block to init statistics */ +- optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8); ++ optPtr->litSum = ZSTD_downscaleStats(optPtr->litFreq, MaxLit, 8, base_0possible); + } + + { unsigned const baseLLfreqs[MaxLL+1] = { +@@ -224,10 +244,9 @@ ZSTD_rescaleFreqs(optState_t* const optPtr, + optPtr->offCodeSum = sum_u32(baseOFCfreqs, MaxOff+1); + } + +- + } + +- } else { /* new block : re-use previous statistics, scaled down */ ++ } else { /* new block : scale down accumulated statistics */ + + if (compressedLiterals) + optPtr->litSum = ZSTD_scaleStats(optPtr->litFreq, MaxLit, 12); +@@ -255,11 +274,14 @@ static U32 ZSTD_rawLiteralsCost(const BYTE* const literals, U32 const litLength, + return (litLength*6) * BITCOST_MULTIPLIER; /* 6 bit per literal - no statistic used */ + + /* dynamic statistics */ +- { U32 price = litLength * optPtr->litSumBasePrice; ++ { U32 price = optPtr->litSumBasePrice * litLength; ++ U32 const litPriceMax = optPtr->litSumBasePrice - BITCOST_MULTIPLIER; + U32 u; ++ assert(optPtr->litSumBasePrice >= BITCOST_MULTIPLIER); + for (u=0; u < litLength; u++) { +- 
assert(WEIGHT(optPtr->litFreq[literals[u]], optLevel) <= optPtr->litSumBasePrice); /* literal cost should never be negative */ +- price -= WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ U32 litPrice = WEIGHT(optPtr->litFreq[literals[u]], optLevel); ++ if (UNLIKELY(litPrice > litPriceMax)) litPrice = litPriceMax; ++ price -= litPrice; + } + return price; + } +@@ -272,10 +294,11 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + assert(litLength <= ZSTD_BLOCKSIZE_MAX); + if (optPtr->priceType == zop_predef) + return WEIGHT(litLength, optLevel); +- /* We can't compute the litLength price for sizes >= ZSTD_BLOCKSIZE_MAX +- * because it isn't representable in the zstd format. So instead just +- * call it 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. In this case the block +- * would be all literals. ++ ++ /* ZSTD_LLcode() can't compute litLength price for sizes >= ZSTD_BLOCKSIZE_MAX ++ * because it isn't representable in the zstd format. ++ * So instead just pretend it would cost 1 bit more than ZSTD_BLOCKSIZE_MAX - 1. ++ * In such a case, the block would be all literals. + */ + if (litLength == ZSTD_BLOCKSIZE_MAX) + return BITCOST_MULTIPLIER + ZSTD_litLengthPrice(ZSTD_BLOCKSIZE_MAX - 1, optPtr, optLevel); +@@ -289,24 +312,25 @@ static U32 ZSTD_litLengthPrice(U32 const litLength, const optState_t* const optP + } + + /* ZSTD_getMatchPrice() : +- * Provides the cost of the match part (offset + matchLength) of a sequence ++ * Provides the cost of the match part (offset + matchLength) of a sequence. + * Must be combined with ZSTD_fullLiteralsCost() to get the full cost of a sequence. +- * @offcode : expects a scale where 0,1,2 are repcodes 1-3, and 3+ are real_offsets+2 ++ * @offBase : sumtype, representing an offset or a repcode, and using numeric representation of ZSTD_storeSeq() + * @optLevel: when <2, favors small offset for decompression speed (improved cache efficiency) + */ + FORCE_INLINE_TEMPLATE U32 +-ZSTD_getMatchPrice(U32 const offcode, ++ZSTD_getMatchPrice(U32 const offBase, + U32 const matchLength, + const optState_t* const optPtr, + int const optLevel) + { + U32 price; +- U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offcode)); ++ U32 const offCode = ZSTD_highbit32(offBase); + U32 const mlBase = matchLength - MINMATCH; + assert(matchLength >= MINMATCH); + +- if (optPtr->priceType == zop_predef) /* fixed scheme, do not use statistics */ +- return WEIGHT(mlBase, optLevel) + ((16 + offCode) * BITCOST_MULTIPLIER); ++ if (optPtr->priceType == zop_predef) /* fixed scheme, does not use statistics */ ++ return WEIGHT(mlBase, optLevel) ++ + ((16 + offCode) * BITCOST_MULTIPLIER); /* emulated offset cost */ + + /* dynamic statistics */ + price = (offCode * BITCOST_MULTIPLIER) + (optPtr->offCodeSumBasePrice - WEIGHT(optPtr->offCodeFreq[offCode], optLevel)); +@@ -325,10 +349,10 @@ ZSTD_getMatchPrice(U32 const offcode, + } + + /* ZSTD_updateStats() : +- * assumption : literals + litLengtn <= iend */ ++ * assumption : literals + litLength <= iend */ + static void ZSTD_updateStats(optState_t* const optPtr, + U32 litLength, const BYTE* literals, +- U32 offsetCode, U32 matchLength) ++ U32 offBase, U32 matchLength) + { + /* literals */ + if (ZSTD_compressedLiterals(optPtr)) { +@@ -344,8 +368,8 @@ static void ZSTD_updateStats(optState_t* const optPtr, + optPtr->litLengthSum++; + } + +- /* offset code : expected to follow storeSeq() numeric representation */ +- { U32 const offCode = ZSTD_highbit32(STORED_TO_OFFBASE(offsetCode)); ++ /* offset code : follows 
storeSeq() numeric representation */ ++ { U32 const offCode = ZSTD_highbit32(offBase); + assert(offCode <= MaxOff); + optPtr->offCodeFreq[offCode]++; + optPtr->offCodeSum++; +@@ -552,16 +576,17 @@ void ZSTD_updateTree(ZSTD_matchState_t* ms, const BYTE* ip, const BYTE* iend) { + ZSTD_updateTree_internal(ms, ip, iend, ms->cParams.minMatch, ZSTD_noDict); + } + +-FORCE_INLINE_TEMPLATE +-U32 ZSTD_insertBtAndGetAllMatches ( +- ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ +- ZSTD_matchState_t* ms, +- U32* nextToUpdate3, +- const BYTE* const ip, const BYTE* const iLimit, const ZSTD_dictMode_e dictMode, +- const U32 rep[ZSTD_REP_NUM], +- U32 const ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ +- const U32 lengthToBeat, +- U32 const mls /* template */) ++FORCE_INLINE_TEMPLATE U32 ++ZSTD_insertBtAndGetAllMatches ( ++ ZSTD_match_t* matches, /* store result (found matches) in this table (presumed large enough) */ ++ ZSTD_matchState_t* ms, ++ U32* nextToUpdate3, ++ const BYTE* const ip, const BYTE* const iLimit, ++ const ZSTD_dictMode_e dictMode, ++ const U32 rep[ZSTD_REP_NUM], ++ const U32 ll0, /* tells if associated literal length is 0 or not. This value must be 0 or 1 */ ++ const U32 lengthToBeat, ++ const U32 mls /* template */) + { + const ZSTD_compressionParameters* const cParams = &ms->cParams; + U32 const sufficient_len = MIN(cParams->targetLength, ZSTD_OPT_NUM -1); +@@ -644,7 +669,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + DEBUGLOG(8, "found repCode %u (ll0:%u, offset:%u) of length %u", + repCode, ll0, repOffset, repLen); + bestLength = repLen; +- matches[mnum].off = STORE_REPCODE(repCode - ll0 + 1); /* expect value between 1 and 3 */ ++ matches[mnum].off = REPCODE_TO_OFFBASE(repCode - ll0 + 1); /* expect value between 1 and 3 */ + matches[mnum].len = (U32)repLen; + mnum++; + if ( (repLen > sufficient_len) +@@ -673,7 +698,7 @@ U32 ZSTD_insertBtAndGetAllMatches ( + bestLength = mlen; + assert(curr > matchIndex3); + assert(mnum==0); /* no prior solution */ +- matches[0].off = STORE_OFFSET(curr - matchIndex3); ++ matches[0].off = OFFSET_TO_OFFBASE(curr - matchIndex3); + matches[0].len = (U32)mlen; + mnum = 1; + if ( (mlen > sufficient_len) | +@@ -706,13 +731,13 @@ U32 ZSTD_insertBtAndGetAllMatches ( + } + + if (matchLength > bestLength) { +- DEBUGLOG(8, "found match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + assert(matchEndIdx > matchIndex); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; + bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -754,12 +779,12 @@ U32 ZSTD_insertBtAndGetAllMatches ( + + if (matchLength > bestLength) { + matchIndex = dictMatchIndex + dmsIndexDelta; +- DEBUGLOG(8, "found dms match of length %u at distance %u (offCode=%u)", +- (U32)matchLength, curr - matchIndex, STORE_OFFSET(curr - matchIndex)); ++ DEBUGLOG(8, "found dms match of length %u at distance %u (offBase=%u)", ++ (U32)matchLength, curr - matchIndex, OFFSET_TO_OFFBASE(curr - matchIndex)); + if (matchLength > matchEndIdx - matchIndex) + matchEndIdx = matchIndex + (U32)matchLength; 
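The pricing hunks above now feed the raw offBase straight into ZSTD_highbit32() to obtain the offset code, and express every price in 1/256ths of a bit (BITCOST_ACCURACY = 8). A minimal sketch of that arithmetic, re-implementing the two weight helpers from the zstd_opt.c hunk earlier in this patch with a portable highbit stand-in, so it is an illustration rather than the library code:

/* Editorial sketch: fractional bit-cost as used by the optimal parser.
 * BITCOST_ACCURACY = 8, so prices are expressed in 1/256th of a bit. */
#include <stdio.h>

#define BITCOST_ACCURACY   8
#define BITCOST_MULTIPLIER (1u << BITCOST_ACCURACY)

static unsigned highbit32(unsigned v)        /* portable stand-in for ZSTD_highbit32 */
{
    unsigned hb = 0;
    while (v >>= 1) hb++;
    return hb;
}

static unsigned bitWeight(unsigned stat)     /* cost rounded to whole bits */
{
    return highbit32(stat + 1) * BITCOST_MULTIPLIER;
}

static unsigned fracWeight(unsigned rawStat) /* cost with a linearly interpolated fractional part */
{
    unsigned const stat    = rawStat + 1;
    unsigned const hb      = highbit32(stat);
    unsigned const BWeight = hb * BITCOST_MULTIPLIER;
    unsigned const FWeight = (stat << BITCOST_ACCURACY) >> hb; /* value in [256, 512) */
    return BWeight + FWeight;
}

int main(void)
{
    /* For a frequency of 1000: hb = 9, BWeight = 2304, FWeight = 500 -> 2804 (~10.95 "bits") */
    printf("bitWeight(1000)  = %u\n", bitWeight(1000));
    printf("fracWeight(1000) = %u\n", fracWeight(1000));
    return 0;
}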
+ bestLength = matchLength; +- matches[mnum].off = STORE_OFFSET(curr - matchIndex); ++ matches[mnum].off = OFFSET_TO_OFFBASE(curr - matchIndex); + matches[mnum].len = (U32)matchLength; + mnum++; + if ( (matchLength > ZSTD_OPT_NUM) +@@ -960,7 +985,7 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + const ZSTD_optLdm_t* optLdm, U32 currPosInBlock) + { + U32 const posDiff = currPosInBlock - optLdm->startPosInBlock; +- /* Note: ZSTD_match_t actually contains offCode and matchLength (before subtracting MINMATCH) */ ++ /* Note: ZSTD_match_t actually contains offBase and matchLength (before subtracting MINMATCH) */ + U32 const candidateMatchLength = optLdm->endPosInBlock - optLdm->startPosInBlock - posDiff; + + /* Ensure that current block position is not outside of the match */ +@@ -971,11 +996,11 @@ static void ZSTD_optLdm_maybeAddMatch(ZSTD_match_t* matches, U32* nbMatches, + } + + if (*nbMatches == 0 || ((candidateMatchLength > matches[*nbMatches-1].len) && *nbMatches < ZSTD_OPT_NUM)) { +- U32 const candidateOffCode = STORE_OFFSET(optLdm->offset); +- DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offCode: %u matchLength %u) at block position=%u", +- candidateOffCode, candidateMatchLength, currPosInBlock); ++ U32 const candidateOffBase = OFFSET_TO_OFFBASE(optLdm->offset); ++ DEBUGLOG(6, "ZSTD_optLdm_maybeAddMatch(): Adding ldm candidate match (offBase: %u matchLength %u) at block position=%u", ++ candidateOffBase, candidateMatchLength, currPosInBlock); + matches[*nbMatches].len = candidateMatchLength; +- matches[*nbMatches].off = candidateOffCode; ++ matches[*nbMatches].off = candidateOffBase; + (*nbMatches)++; + } + } +@@ -1062,6 +1087,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + ZSTD_optimal_t lastSequence; + ZSTD_optLdm_t optLdm; + ++ ZSTD_memset(&lastSequence, 0, sizeof(ZSTD_optimal_t)); ++ + optLdm.seqStore = ms->ldmSeqStore ? 
*ms->ldmSeqStore : kNullRawSeqStore; + optLdm.endPosInBlock = optLdm.startPosInBlock = optLdm.offset = 0; + ZSTD_opt_getNextMatchAndUpdateSeqStore(&optLdm, (U32)(ip-istart), (U32)(iend-ip)); +@@ -1098,14 +1125,14 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + + /* large match -> immediate encoding */ + { U32 const maxML = matches[nbMatches-1].len; +- U32 const maxOffcode = matches[nbMatches-1].off; +- DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffCode=%u at cPos=%u => start new series", +- nbMatches, maxML, maxOffcode, (U32)(ip-prefixStart)); ++ U32 const maxOffBase = matches[nbMatches-1].off; ++ DEBUGLOG(6, "found %u matches of maxLength=%u and maxOffBase=%u at cPos=%u => start new series", ++ nbMatches, maxML, maxOffBase, (U32)(ip-prefixStart)); + + if (maxML > sufficient_len) { + lastSequence.litlen = litlen; + lastSequence.mlen = maxML; +- lastSequence.off = maxOffcode; ++ lastSequence.off = maxOffBase; + DEBUGLOG(6, "large match (%u>%u), immediate encoding", + maxML, sufficient_len); + cur = 0; +@@ -1122,15 +1149,15 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + opt[pos].price = ZSTD_MAX_PRICE; /* mlen, litlen and price will be fixed during forward scanning */ + } + for (matchNb = 0; matchNb < nbMatches; matchNb++) { +- U32 const offcode = matches[matchNb].off; ++ U32 const offBase = matches[matchNb].off; + U32 const end = matches[matchNb].len; + for ( ; pos <= end ; pos++ ) { +- U32 const matchPrice = ZSTD_getMatchPrice(offcode, pos, optStatePtr, optLevel); ++ U32 const matchPrice = ZSTD_getMatchPrice(offBase, pos, optStatePtr, optLevel); + U32 const sequencePrice = literalsPrice + matchPrice; + DEBUGLOG(7, "rPos:%u => set initial price : %.2f", +- pos, ZSTD_fCost(sequencePrice)); ++ pos, ZSTD_fCost((int)sequencePrice)); + opt[pos].mlen = pos; +- opt[pos].off = offcode; ++ opt[pos].off = offBase; + opt[pos].litlen = litlen; + opt[pos].price = (int)sequencePrice; + } } +@@ -1230,7 +1257,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + U32 const startML = (matchNb>0) ? matches[matchNb-1].len+1 : minMatch; + U32 mlen; + +- DEBUGLOG(7, "testing match %u => offCode=%4u, mlen=%2u, llen=%2u", ++ DEBUGLOG(7, "testing match %u => offBase=%4u, mlen=%2u, llen=%2u", + matchNb, matches[matchNb].off, lastML, litlen); + + for (mlen = lastML; mlen >= startML; mlen--) { /* scan downward */ +@@ -1296,7 +1323,7 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + for (storePos=storeStart; storePos <= storeEnd; storePos++) { + U32 const llen = opt[storePos].litlen; + U32 const mlen = opt[storePos].mlen; +- U32 const offCode = opt[storePos].off; ++ U32 const offBase = opt[storePos].off; + U32 const advance = llen + mlen; + DEBUGLOG(6, "considering seq starting at %zi, llen=%u, mlen=%u", + anchor - istart, (unsigned)llen, (unsigned)mlen); +@@ -1308,8 +1335,8 @@ ZSTD_compressBlock_opt_generic(ZSTD_matchState_t* ms, + } + + assert(anchor + llen <= iend); +- ZSTD_updateStats(optStatePtr, llen, anchor, offCode, mlen); +- ZSTD_storeSeq(seqStore, llen, anchor, iend, offCode, mlen); ++ ZSTD_updateStats(optStatePtr, llen, anchor, offBase, mlen); ++ ZSTD_storeSeq(seqStore, llen, anchor, iend, offBase, mlen); + anchor += advance; + ip = anchor; + } } +@@ -1349,7 +1376,7 @@ size_t ZSTD_compressBlock_btopt( + /* ZSTD_initStats_ultra(): + * make a first compression pass, just to seed stats with more accurate starting values. + * only works on first block, with no dictionary and no ldm. +- * this function cannot error, hence its contract must be respected. 
++ * this function cannot error out, its narrow contract must be respected. + */ + static void + ZSTD_initStats_ultra(ZSTD_matchState_t* ms, +@@ -1368,7 +1395,7 @@ ZSTD_initStats_ultra(ZSTD_matchState_t* ms, + + ZSTD_compressBlock_opt2(ms, seqStore, tmpRep, src, srcSize, ZSTD_noDict); /* generate stats into ms->opt*/ + +- /* invalidate first scan from history */ ++ /* invalidate first scan from history, only keep entropy stats */ + ZSTD_resetSeqStore(seqStore); + ms->window.base -= srcSize; + ms->window.dictLimit += (U32)srcSize; +@@ -1392,20 +1419,20 @@ size_t ZSTD_compressBlock_btultra2( + U32 const curr = (U32)((const BYTE*)src - ms->window.base); + DEBUGLOG(5, "ZSTD_compressBlock_btultra2 (srcSize=%zu)", srcSize); + +- /* 2-pass strategy: ++ /* 2-passes strategy: + * this strategy makes a first pass over first block to collect statistics +- * and seed next round's statistics with it. +- * After 1st pass, function forgets everything, and starts a new block. ++ * in order to seed next round's statistics with it. ++ * After 1st pass, function forgets history, and starts a new block. + * Consequently, this can only work if no data has been previously loaded in tables, + * aka, no dictionary, no prefix, no ldm preprocessing. + * The compression ratio gain is generally small (~0.5% on first block), +- * the cost is 2x cpu time on first block. */ ++ ** the cost is 2x cpu time on first block. */ + assert(srcSize <= ZSTD_BLOCKSIZE_MAX); + if ( (ms->opt.litLengthSum==0) /* first block */ + && (seqStore->sequences == seqStore->sequencesStart) /* no ldm */ + && (ms->window.dictLimit == ms->window.lowLimit) /* no dictionary */ +- && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ +- && (srcSize > ZSTD_PREDEF_THRESHOLD) ++ && (curr == ms->window.dictLimit) /* start of frame, nothing already loaded nor skipped */ ++ && (srcSize > ZSTD_PREDEF_THRESHOLD) /* input large enough to not employ default stats */ + ) { + ZSTD_initStats_ultra(ms, seqStore, rep, src, srcSize); + } +diff --git a/lib/zstd/compress/zstd_opt.h b/lib/zstd/compress/zstd_opt.h +index 22b862858ba7..faa73ff4b03d 100644 +--- a/lib/zstd/compress/zstd_opt.h ++++ b/lib/zstd/compress/zstd_opt.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/huf_decompress.c b/lib/zstd/decompress/huf_decompress.c +index 60958afebc41..db670d71fdab 100644 +--- a/lib/zstd/decompress/huf_decompress.c ++++ b/lib/zstd/decompress/huf_decompress.c +@@ -1,7 +1,8 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* ****************************************************************** + * huff0 huffman decoder, + * part of Finite State Entropy library +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. 
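The ZSTD_compressBlock_btultra2() hunk above only pays for the statistics-seeding pass when the block is genuinely the first one and large enough to benefit. Below is a condensed sketch of that gate, with the four conditions copied from the hunk; the struct and helper names are hypothetical.

/* Editorial sketch of the btultra2 first-pass gate (names are hypothetical).
 * All conditions come from the ZSTD_compressBlock_btultra2() hunk above. */
#include <stddef.h>

#define ZSTD_PREDEF_THRESHOLD 8   /* lowered from 1024 by this patch */

struct btultra2_state {           /* minimal stand-in for the fields the gate reads */
    unsigned litLengthSum;        /* ms->opt.litLengthSum */
    int      seqStoreEmpty;       /* seqStore->sequences == seqStore->sequencesStart */
    unsigned dictLimit, lowLimit; /* ms->window.dictLimit / lowLimit */
    unsigned curr;                /* (U32)(src - ms->window.base) */
};

static int should_run_seeding_pass(const struct btultra2_state* s, size_t srcSize)
{
    return (s->litLengthSum == 0)             /* first block: no stats collected yet   */
        && (s->seqStoreEmpty)                 /* no LDM sequences queued               */
        && (s->dictLimit == s->lowLimit)      /* no dictionary loaded                  */
        && (s->curr == s->dictLimit)          /* start of frame, nothing skipped       */
        && (srcSize > ZSTD_PREDEF_THRESHOLD); /* large enough to beat predefined stats */
}

int main(void)
{
    struct btultra2_state const firstBlock = { 0, 1, 0, 0, 0 };
    return should_run_seeding_pass(&firstBlock, 131072) ? 0 : 1;
}

When the gate passes, ZSTD_initStats_ultra() compresses the block once into a throw-away sequence store, keeps only the entropy statistics, and rewinds the window so the real pass starts clean, which is the ~2x CPU cost the comment mentions.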
+ * + * You can contact the author at : + * - FSE+HUF source repository : https://github.com/Cyan4973/FiniteStateEntropy +@@ -19,10 +20,10 @@ + #include "../common/compiler.h" + #include "../common/bitstream.h" /* BIT_* */ + #include "../common/fse.h" /* to compress headers */ +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/error_private.h" + #include "../common/zstd_internal.h" ++#include "../common/bits.h" /* ZSTD_highbit32, ZSTD_countTrailingZeros64 */ + + /* ************************************************************** + * Constants +@@ -34,6 +35,12 @@ + * Macros + ****************************************************************/ + ++#ifdef HUF_DISABLE_FAST_DECODE ++# define HUF_ENABLE_FAST_DECODE 0 ++#else ++# define HUF_ENABLE_FAST_DECODE 1 ++#endif ++ + /* These two optional macros force the use one way or another of the two + * Huffman decompression implementations. You can't force in both directions + * at the same time. +@@ -43,27 +50,25 @@ + #error "Cannot force the use of the X1 and X2 decoders at the same time!" + #endif + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 && DYNAMIC_BMI2 +-# define HUF_ASM_X86_64_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE ++/* When DYNAMIC_BMI2 is enabled, fast decoders are only called when bmi2 is ++ * supported at runtime, so we can add the BMI2 target attribute. ++ * When it is disabled, we will still get BMI2 if it is enabled statically. ++ */ ++#if DYNAMIC_BMI2 ++# define HUF_FAST_BMI2_ATTRS BMI2_TARGET_ATTRIBUTE + #else +-# define HUF_ASM_X86_64_BMI2_ATTRS ++# define HUF_FAST_BMI2_ATTRS + #endif + + #define HUF_EXTERN_C + #define HUF_ASM_DECL HUF_EXTERN_C + +-#if DYNAMIC_BMI2 || (ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) ++#if DYNAMIC_BMI2 + # define HUF_NEED_BMI2_FUNCTION 1 + #else + # define HUF_NEED_BMI2_FUNCTION 0 + #endif + +-#if !(ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__)) +-# define HUF_NEED_DEFAULT_FUNCTION 1 +-#else +-# define HUF_NEED_DEFAULT_FUNCTION 0 +-#endif +- + /* ************************************************************** + * Error Management + ****************************************************************/ +@@ -80,6 +85,11 @@ + /* ************************************************************** + * BMI2 Variant Wrappers + ****************************************************************/ ++typedef size_t (*HUF_DecompressUsingDTableFn)(void *dst, size_t dstSize, ++ const void *cSrc, ++ size_t cSrcSize, ++ const HUF_DTable *DTable); ++ + #if DYNAMIC_BMI2 + + #define HUF_DGEN(fn) \ +@@ -101,9 +111,9 @@ + } \ + \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- if (bmi2) { \ ++ if (flags & HUF_flags_bmi2) { \ + return fn##_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); \ + } \ + return fn##_default(dst, dstSize, cSrc, cSrcSize, DTable); \ +@@ -113,9 +123,9 @@ + + #define HUF_DGEN(fn) \ + static size_t fn(void* dst, size_t dstSize, void const* cSrc, \ +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) \ ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) \ + { \ +- (void)bmi2; \ ++ (void)flags; \ + return fn##_body(dst, dstSize, cSrc, cSrcSize, DTable); \ + } + +@@ -134,15 +144,28 @@ static DTableDesc HUF_getDTableDesc(const HUF_DTable* table) + return dtd; + } + +-#if ZSTD_ENABLE_ASM_X86_64_BMI2 +- +-static size_t HUF_initDStream(BYTE const* ip) { ++static size_t HUF_initFastDStream(BYTE const* ip) { + BYTE const lastByte = ip[7]; +- size_t const 
bitsConsumed = lastByte ? 8 - BIT_highbit32(lastByte) : 0; ++ size_t const bitsConsumed = lastByte ? 8 - ZSTD_highbit32(lastByte) : 0; + size_t const value = MEM_readLEST(ip) | 1; + assert(bitsConsumed <= 8); ++ assert(sizeof(size_t) == 8); + return value << bitsConsumed; + } ++ ++ ++/* ++ * The input/output arguments to the Huffman fast decoding loop: ++ * ++ * ip [in/out] - The input pointers, must be updated to reflect what is consumed. ++ * op [in/out] - The output pointers, must be updated to reflect what is written. ++ * bits [in/out] - The bitstream containers, must be updated to reflect the current state. ++ * dt [in] - The decoding table. ++ * ilimit [in] - The input limit, stop when any input pointer is below ilimit. ++ * oend [in] - The end of the output stream. op[3] must not cross oend. ++ * iend [in] - The end of each input stream. ip[i] may cross iend[i], ++ * as long as it is above ilimit, but that indicates corruption. ++ */ + typedef struct { + BYTE const* ip[4]; + BYTE* op[4]; +@@ -151,15 +174,17 @@ typedef struct { + BYTE const* ilimit; + BYTE* oend; + BYTE const* iend[4]; +-} HUF_DecompressAsmArgs; ++} HUF_DecompressFastArgs; ++ ++typedef void (*HUF_DecompressFastLoopFn)(HUF_DecompressFastArgs*); + + /* +- * Initializes args for the asm decoding loop. +- * @returns 0 on success +- * 1 if the fallback implementation should be used. ++ * Initializes args for the fast decoding loop. ++ * @returns 1 on success ++ * 0 if the fallback implementation should be used. + * Or an error code on failure. + */ +-static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) ++static size_t HUF_DecompressFastArgs_init(HUF_DecompressFastArgs* args, void* dst, size_t dstSize, void const* src, size_t srcSize, const HUF_DTable* DTable) + { + void const* dt = DTable + 1; + U32 const dtLog = HUF_getDTableDesc(DTable).tableLog; +@@ -168,9 +193,11 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + BYTE* const oend = (BYTE*)dst + dstSize; + +- /* The following condition is false on x32 platform, +- * but HUF_asm is not compatible with this ABI */ +- if (!(MEM_isLittleEndian() && !MEM_32bits())) return 1; ++ /* The fast decoding loop assumes 64-bit little-endian. ++ * This condition is false on x32. ++ */ ++ if (!MEM_isLittleEndian() || MEM_32bits()) ++ return 0; + + /* strict minimum : jump table + 1 byte per stream */ + if (srcSize < 10) +@@ -181,7 +208,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * On small inputs we don't have enough data to trigger the fast loop, so use the old decoder. + */ + if (dtLog != HUF_DECODER_FAST_TABLELOG) +- return 1; ++ return 0; + + /* Read the jump table. */ + { +@@ -195,13 +222,13 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->iend[2] = args->iend[1] + length2; + args->iend[3] = args->iend[2] + length3; + +- /* HUF_initDStream() requires this, and this small of an input ++ /* HUF_initFastDStream() requires this, and this small of an input + * won't benefit from the ASM loop anyways. + * length1 must be >= 16 so that ip[0] >= ilimit before the loop + * starts. + */ + if (length1 < 16 || length2 < 8 || length3 < 8 || length4 < 8) +- return 1; ++ return 0; + if (length4 > srcSize) return ERROR(corruption_detected); /* overflow */ + } + /* ip[] contains the position that is currently loaded into bits[]. 
*/ +@@ -218,7 +245,7 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + + /* No point to call the ASM loop for tiny outputs. */ + if (args->op[3] >= oend) +- return 1; ++ return 0; + + /* bits[] is the bit container. + * It is read from the MSB down to the LSB. +@@ -227,10 +254,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + * set, so that CountTrailingZeros(bits[]) can be used + * to count how many bits we've consumed. + */ +- args->bits[0] = HUF_initDStream(args->ip[0]); +- args->bits[1] = HUF_initDStream(args->ip[1]); +- args->bits[2] = HUF_initDStream(args->ip[2]); +- args->bits[3] = HUF_initDStream(args->ip[3]); ++ args->bits[0] = HUF_initFastDStream(args->ip[0]); ++ args->bits[1] = HUF_initFastDStream(args->ip[1]); ++ args->bits[2] = HUF_initFastDStream(args->ip[2]); ++ args->bits[3] = HUF_initFastDStream(args->ip[3]); + + /* If ip[] >= ilimit, it is guaranteed to be safe to + * reload bits[]. It may be beyond its section, but is +@@ -241,10 +268,10 @@ static size_t HUF_DecompressAsmArgs_init(HUF_DecompressAsmArgs* args, void* dst, + args->oend = oend; + args->dt = dt; + +- return 0; ++ return 1; + } + +-static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs const* args, int stream, BYTE* segmentEnd) ++static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressFastArgs const* args, int stream, BYTE* segmentEnd) + { + /* Validate that we haven't overwritten. */ + if (args->op[stream] > segmentEnd) +@@ -258,15 +285,33 @@ static size_t HUF_initRemainingDStream(BIT_DStream_t* bit, HUF_DecompressAsmArgs + return ERROR(corruption_detected); + + /* Construct the BIT_DStream_t. */ +- bit->bitContainer = MEM_readLE64(args->ip[stream]); +- bit->bitsConsumed = ZSTD_countTrailingZeros((size_t)args->bits[stream]); ++ assert(sizeof(size_t) == 8); ++ bit->bitContainer = MEM_readLEST(args->ip[stream]); ++ bit->bitsConsumed = ZSTD_countTrailingZeros64(args->bits[stream]); + bit->start = (const char*)args->iend[0]; + bit->limitPtr = bit->start + sizeof(size_t); + bit->ptr = (const char*)args->ip[stream]; + + return 0; + } +-#endif ++ ++/* Calls X(N) for each stream 0, 1, 2, 3. */ ++#define HUF_4X_FOR_EACH_STREAM(X) \ ++ { \ ++ X(0) \ ++ X(1) \ ++ X(2) \ ++ X(3) \ ++ } ++ ++/* Calls X(N, var) for each stream 0, 1, 2, 3. 
*/ ++#define HUF_4X_FOR_EACH_STREAM_WITH_VAR(X, var) \ ++ { \ ++ X(0, (var)) \ ++ X(1, (var)) \ ++ X(2, (var)) \ ++ X(3, (var)) \ ++ } + + + #ifndef HUF_FORCE_DECOMPRESS_X2 +@@ -283,10 +328,11 @@ typedef struct { BYTE nbBits; BYTE byte; } HUF_DEltX1; /* single-symbol decodi + static U64 HUF_DEltX1_set4(BYTE symbol, BYTE nbBits) { + U64 D4; + if (MEM_isLittleEndian()) { +- D4 = (symbol << 8) + nbBits; ++ D4 = (U64)((symbol << 8) + nbBits); + } else { +- D4 = symbol + (nbBits << 8); ++ D4 = (U64)(symbol + (nbBits << 8)); + } ++ assert(D4 < (1U << 16)); + D4 *= 0x0001000100010001ULL; + return D4; + } +@@ -329,13 +375,7 @@ typedef struct { + BYTE huffWeight[HUF_SYMBOLVALUE_MAX + 1]; + } HUF_ReadDTableX1_Workspace; + +- +-size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX1_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_readDTableX1_wksp(HUF_DTable* DTable, const void* src, size_t srcSize, void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog = 0; + U32 nbSymbols = 0; +@@ -350,7 +390,7 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + DEBUG_STATIC_ASSERT(sizeof(DTableDesc) == sizeof(HUF_DTable)); + /* ZSTD_memset(huffWeight, 0, sizeof(huffWeight)); */ /* is not necessary, even though some analyzer complain ... */ + +- iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->huffWeight, HUF_SYMBOLVALUE_MAX + 1, wksp->rankVal, &nbSymbols, &tableLog, src, srcSize, wksp->statsWksp, sizeof(wksp->statsWksp), flags); + if (HUF_isError(iSize)) return iSize; + + +@@ -377,9 +417,8 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * rankStart[0] is not filled because there are no entries in the table for + * weight 0. + */ +- { +- int n; +- int nextRankStart = 0; ++ { int n; ++ U32 nextRankStart = 0; + int const unroll = 4; + int const nLimit = (int)nbSymbols - unroll + 1; + for (n=0; n<(int)tableLog+1; n++) { +@@ -406,10 +445,9 @@ size_t HUF_readDTableX1_wksp_bmi2(HUF_DTable* DTable, const void* src, size_t sr + * We can switch based on the length to a different inner loop which is + * optimized for that particular case. 
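HUF_DEltX1_set4() in the hunk above builds one 16-bit single-symbol entry (nbBits in the low byte of the value on little-endian hosts, the decoded byte above it) and replicates it into all four 16-bit lanes by multiplying with 0x0001000100010001. A self-contained sketch of that trick, assuming a 64-bit little-endian host as the fast paths in this file do:

/* Editorial sketch: packing 4 copies of a 16-bit X1 decode entry into a U64,
 * as HUF_DEltX1_set4() does on little-endian hosts. */
#include <stdint.h>
#include <stdio.h>

static uint64_t delt_x1_set4_le(uint8_t symbol, uint8_t nbBits)
{
    uint64_t d4 = (uint64_t)((symbol << 8) + nbBits); /* one 16-bit entry: byte | nbBits */
    d4 *= 0x0001000100010001ULL;                      /* replicate into all four lanes */
    return d4;
}

int main(void)
{
    /* symbol 'A' (0x41) coded on 5 bits -> each 16-bit lane holds 0x4105 */
    printf("%016llx\n", (unsigned long long)delt_x1_set4_le(0x41, 5));
    return 0;
}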
+ */ +- { +- U32 w; +- int symbol=wksp->rankVal[0]; +- int rankStart=0; ++ { U32 w; ++ int symbol = wksp->rankVal[0]; ++ int rankStart = 0; + for (w=1; wrankVal[w]; + int const length = (1 << w) >> 1; +@@ -519,7 +557,7 @@ HUF_decodeStreamX1(BYTE* p, BIT_DStream_t* const bitDPtr, BYTE* const pEnd, cons + while (p < pEnd) + HUF_DECODE_SYMBOLX1_0(p, bitDPtr); + +- return pEnd-pStart; ++ return (size_t)(pEnd-pStart); + } + + FORCE_INLINE_TEMPLATE size_t +@@ -545,6 +583,10 @@ HUF_decompress1X1_usingDTable_internal_body( + return dstSize; + } + ++/* HUF_decompress4X1_usingDTable_internal_body(): ++ * Conditions : ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X1_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -588,6 +630,7 @@ HUF_decompress4X1_usingDTable_internal_body( + + if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ + if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -650,38 +693,156 @@ size_t HUF_decompress4X1_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X1_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X1_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X1_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X1_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ U16 const* const dtable = (U16 const*)args->dt; ++ BYTE* const oend = args->oend; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local variables */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= (stream == 3 ? oend : op[stream + 1])); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each iteration produces 5 output symbols per stream */ ++ size_t const oiters = (size_t)(oend - op[3]) / 5; ++ /* Each iteration consumes up to 11 bits * 5 = 55 bits < 7 bytes ++ * per stream. ++ */ ++ size_t const iiters = (size_t)(ip[0] - ilimit) / 7; ++ /* We can safely run iters iterations before running bounds checks */ ++ size_t const iters = MIN(oiters, iiters); ++ size_t const symbols = iters * 5; ++ ++ /* We can simply check that op[3] < olimit, instead of checking all ++ * of our bounds, since we can't hit the other bounds until we've run ++ * iters iterations, which only happens when op[3] == olimit. ++ */ ++ olimit = op[3] + symbols; ++ ++ /* Exit fast decoding loop once we get close to the end. 
*/ ++ if (op[3] + 20 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X1_DECODE_SYMBOL(_stream, _symbol) \ ++ { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ int const entry = (int)dtable[index]; \ ++ bits[(_stream)] <<= (entry & 0x3F); \ ++ op[(_stream)][(_symbol)] = (BYTE)((entry >> 8) & 0xFF); \ ++ } ++ ++#define HUF_4X1_RELOAD_STREAM(_stream) \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ op[(_stream)] += 5; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols in each of the 4 streams */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 1) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 2) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 3) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X1_DECODE_SYMBOL, 4) ++ ++ /* Reload each of the 4 the bitstreams */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X1_RELOAD_STREAM) ++ } while (op[3] < olimit); ++ ++#undef HUF_4X1_DECODE_SYMBOL ++#undef HUF_4X1_RELOAD_STREAM ++ } ++ ++_out: + +-static HUF_ASM_X86_64_BMI2_ATTRS ++ /* Save the final values of each of the state variables back to args. */ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++/* ++ * @returns @p dstSize on success (>= 6) ++ * 0 if the fallback implementation should be used ++ * An error if an error occurred ++ */ ++static HUF_FAST_BMI2_ATTRS + size_t +-HUF_decompress4X1_usingDTable_internal_bmi2_asm( ++HUF_decompress4X1_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) + { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; +- { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); +- FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ HUF_DecompressFastArgs args; ++ { size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ FORWARD_IF_ERROR(ret, "Failed to init fast loop args"); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X1_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* Our loop guarantees that ip[] >= ilimit and that we haven't + * overwritten any op[]. +@@ -694,8 +855,7 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + (void)iend; + + /* finish bit streams one by one. 
*/ +- { +- size_t const segmentSize = (dstSize+3) / 4; ++ { size_t const segmentSize = (dstSize+3) / 4; + BYTE* segmentEnd = (BYTE*)dst; + int i; + for (i = 0; i < 4; ++i) { +@@ -712,97 +872,59 @@ HUF_decompress4X1_usingDTable_internal_bmi2_asm( + } + + /* decoded size */ ++ assert(dstSize != 0); + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ +- +-typedef size_t (*HUF_decompress_usingDTable_t)(void *dst, size_t dstSize, +- const void *cSrc, +- size_t cSrcSize, +- const HUF_DTable *DTable); + + HUF_DGEN(HUF_decompress1X1_usingDTable_internal) + + static size_t HUF_decompress4X1_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X1_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X1_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X1_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X1_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X1_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X1_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X1_usingDTable_internal_fast_asm_loop; ++ } + #endif +-} +- +- +-size_t HUF_decompress1X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp(DCtx, cSrc, cSrcSize, workSpace, wkspSize); +- if (HUF_isError(hSize)) return hSize; +- if (hSize >= cSrcSize) return ERROR(srcSize_wrong); +- ip += hSize; cSrcSize -= hSize; +- +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); +-} +- +- +-size_t HUF_decompress4X1_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 0) return ERROR(GENERIC); +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X1_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + +-static size_t HUF_decompress4X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, 
++static size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); +-} +- +-size_t HUF_decompress4X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, 0); ++ return HUF_decompress4X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +- + #endif /* HUF_FORCE_DECOMPRESS_X2 */ + + +@@ -985,7 +1107,7 @@ static void HUF_fillDTableX2Level2(HUF_DEltX2* DTable, U32 targetLog, const U32 + + static void HUF_fillDTableX2(HUF_DEltX2* DTable, const U32 targetLog, + const sortedSymbol_t* sortedList, +- const U32* rankStart, rankValCol_t *rankValOrigin, const U32 maxWeight, ++ const U32* rankStart, rankValCol_t* rankValOrigin, const U32 maxWeight, + const U32 nbBitsBaseline) + { + U32* const rankVal = rankValOrigin[0]; +@@ -1040,14 +1162,7 @@ typedef struct { + + size_t HUF_readDTableX2_wksp(HUF_DTable* DTable, + const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_readDTableX2_wksp_bmi2(DTable, src, srcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +-size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, +- const void* src, size_t srcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + U32 tableLog, maxW, nbSymbols; + DTableDesc dtd = HUF_getDTableDesc(DTable); +@@ -1069,7 +1184,7 @@ size_t HUF_readDTableX2_wksp_bmi2(HUF_DTable* DTable, + if (maxTableLog > HUF_TABLELOG_MAX) return ERROR(tableLog_tooLarge); + /* ZSTD_memset(weightList, 0, sizeof(weightList)); */ /* is not necessary, even though some analyzer complain ... 
*/ + +- iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), bmi2); ++ iSize = HUF_readStats_wksp(wksp->weightList, HUF_SYMBOLVALUE_MAX + 1, wksp->rankStats, &nbSymbols, &tableLog, src, srcSize, wksp->calleeWksp, sizeof(wksp->calleeWksp), flags); + if (HUF_isError(iSize)) return iSize; + + /* check result */ +@@ -1240,6 +1355,11 @@ HUF_decompress1X2_usingDTable_internal_body( + /* decoded size */ + return dstSize; + } ++ ++/* HUF_decompress4X2_usingDTable_internal_body(): ++ * Conditions: ++ * @dstSize >= 6 ++ */ + FORCE_INLINE_TEMPLATE size_t + HUF_decompress4X2_usingDTable_internal_body( + void* dst, size_t dstSize, +@@ -1280,8 +1400,9 @@ HUF_decompress4X2_usingDTable_internal_body( + DTableDesc const dtd = HUF_getDTableDesc(DTable); + U32 const dtLog = dtd.tableLog; + +- if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ +- if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (length4 > cSrcSize) return ERROR(corruption_detected); /* overflow */ ++ if (opStart4 > oend) return ERROR(corruption_detected); /* overflow */ ++ if (dstSize < 6) return ERROR(corruption_detected); /* stream 4-split doesn't work */ + CHECK_F( BIT_initDStream(&bitD1, istart1, length1) ); + CHECK_F( BIT_initDStream(&bitD2, istart2, length2) ); + CHECK_F( BIT_initDStream(&bitD3, istart3, length3) ); +@@ -1366,36 +1487,178 @@ size_t HUF_decompress4X2_usingDTable_internal_bmi2(void* dst, size_t dstSize, vo + } + #endif + +-#if HUF_NEED_DEFAULT_FUNCTION + static + size_t HUF_decompress4X2_usingDTable_internal_default(void* dst, size_t dstSize, void const* cSrc, + size_t cSrcSize, HUF_DTable const* DTable) { + return HUF_decompress4X2_usingDTable_internal_body(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 + +-HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(HUF_DecompressAsmArgs* args) ZSTDLIB_HIDDEN; ++HUF_ASM_DECL void HUF_decompress4X2_usingDTable_internal_fast_asm_loop(HUF_DecompressFastArgs* args) ZSTDLIB_HIDDEN; ++ ++#endif ++ ++static HUF_FAST_BMI2_ATTRS ++void HUF_decompress4X2_usingDTable_internal_fast_c_loop(HUF_DecompressFastArgs* args) ++{ ++ U64 bits[4]; ++ BYTE const* ip[4]; ++ BYTE* op[4]; ++ BYTE* oend[4]; ++ HUF_DEltX2 const* const dtable = (HUF_DEltX2 const*)args->dt; ++ BYTE const* const ilimit = args->ilimit; ++ ++ /* Copy the arguments to local registers. */ ++ ZSTD_memcpy(&bits, &args->bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&ip), &args->ip, sizeof(ip)); ++ ZSTD_memcpy(&op, &args->op, sizeof(op)); ++ ++ oend[0] = op[1]; ++ oend[1] = op[2]; ++ oend[2] = op[3]; ++ oend[3] = args->oend; ++ ++ assert(MEM_isLittleEndian()); ++ assert(!MEM_32bits()); ++ ++ for (;;) { ++ BYTE* olimit; ++ int stream; ++ ++ /* Assert loop preconditions */ ++#ifndef NDEBUG ++ for (stream = 0; stream < 4; ++stream) { ++ assert(op[stream] <= oend[stream]); ++ assert(ip[stream] >= ilimit); ++ } ++#endif ++ /* Compute olimit */ ++ { ++ /* Each loop does 5 table lookups for each of the 4 streams. ++ * Each table lookup consumes up to 11 bits of input, and produces ++ * up to 2 bytes of output. ++ */ ++ /* We can consume up to 7 bytes of input per iteration per stream. ++ * We also know that each input pointer is >= ip[0]. So we can run ++ * iters loops before running out of input. ++ */ ++ size_t iters = (size_t)(ip[0] - ilimit) / 7; ++ /* Each iteration can produce up to 10 bytes of output per stream. 
++ * Each output stream my advance at different rates. So take the ++ * minimum number of safe iterations among all the output streams. ++ */ ++ for (stream = 0; stream < 4; ++stream) { ++ size_t const oiters = (size_t)(oend[stream] - op[stream]) / 10; ++ iters = MIN(iters, oiters); ++ } ++ ++ /* Each iteration produces at least 5 output symbols. So until ++ * op[3] crosses olimit, we know we haven't executed iters ++ * iterations yet. This saves us maintaining an iters counter, ++ * at the expense of computing the remaining # of iterations ++ * more frequently. ++ */ ++ olimit = op[3] + (iters * 5); ++ ++ /* Exit the fast decoding loop if we are too close to the end. */ ++ if (op[3] + 10 > olimit) ++ break; ++ ++ /* Exit the decoding loop if any input pointer has crossed the ++ * previous one. This indicates corruption, and a precondition ++ * to our loop is that ip[i] >= ip[0]. ++ */ ++ for (stream = 1; stream < 4; ++stream) { ++ if (ip[stream] < ip[stream - 1]) ++ goto _out; ++ } ++ } ++ ++#ifndef NDEBUG ++ for (stream = 1; stream < 4; ++stream) { ++ assert(ip[stream] >= ip[stream - 1]); ++ } ++#endif ++ ++#define HUF_4X2_DECODE_SYMBOL(_stream, _decode3) \ ++ if ((_decode3) || (_stream) != 3) { \ ++ int const index = (int)(bits[(_stream)] >> 53); \ ++ HUF_DEltX2 const entry = dtable[index]; \ ++ MEM_write16(op[(_stream)], entry.sequence); \ ++ bits[(_stream)] <<= (entry.nbBits) & 0x3F; \ ++ op[(_stream)] += (entry.length); \ ++ } ++ ++#define HUF_4X2_RELOAD_STREAM(_stream) \ ++ { \ ++ HUF_4X2_DECODE_SYMBOL(3, 1) \ ++ { \ ++ int const ctz = ZSTD_countTrailingZeros64(bits[(_stream)]); \ ++ int const nbBits = ctz & 7; \ ++ int const nbBytes = ctz >> 3; \ ++ ip[(_stream)] -= nbBytes; \ ++ bits[(_stream)] = MEM_read64(ip[(_stream)]) | 1; \ ++ bits[(_stream)] <<= nbBits; \ ++ } \ ++ } ++ ++ /* Manually unroll the loop because compilers don't consistently ++ * unroll the inner loops, which destroys performance. ++ */ ++ do { ++ /* Decode 5 symbols from each of the first 3 streams. ++ * The final stream will be decoded during the reload phase ++ * to reduce register pressure. ++ */ ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ HUF_4X_FOR_EACH_STREAM_WITH_VAR(HUF_4X2_DECODE_SYMBOL, 0) ++ ++ /* Decode one symbol from the final stream */ ++ HUF_4X2_DECODE_SYMBOL(3, 1) ++ ++ /* Decode 4 symbols from the final stream & reload bitstreams. ++ * The final stream is reloaded last, meaning that all 5 symbols ++ * are decoded from the final stream before it is reloaded. ++ */ ++ HUF_4X_FOR_EACH_STREAM(HUF_4X2_RELOAD_STREAM) ++ } while (op[3] < olimit); ++ } + +-static HUF_ASM_X86_64_BMI2_ATTRS size_t +-HUF_decompress4X2_usingDTable_internal_bmi2_asm( ++#undef HUF_4X2_DECODE_SYMBOL ++#undef HUF_4X2_RELOAD_STREAM ++ ++_out: ++ ++ /* Save the final values of each of the state variables back to args. 
*/ ++ ZSTD_memcpy(&args->bits, &bits, sizeof(bits)); ++ ZSTD_memcpy((void*)(&args->ip), &ip, sizeof(ip)); ++ ZSTD_memcpy(&args->op, &op, sizeof(op)); ++} ++ ++ ++static HUF_FAST_BMI2_ATTRS size_t ++HUF_decompress4X2_usingDTable_internal_fast( + void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) { ++ const HUF_DTable* DTable, ++ HUF_DecompressFastLoopFn loopFn) { + void const* dt = DTable + 1; + const BYTE* const iend = (const BYTE*)cSrc + 6; + BYTE* const oend = (BYTE*)dst + dstSize; +- HUF_DecompressAsmArgs args; ++ HUF_DecompressFastArgs args; + { +- size_t const ret = HUF_DecompressAsmArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); ++ size_t const ret = HUF_DecompressFastArgs_init(&args, dst, dstSize, cSrc, cSrcSize, DTable); + FORWARD_IF_ERROR(ret, "Failed to init asm args"); +- if (ret != 0) +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (ret == 0) ++ return 0; + } + + assert(args.ip[0] >= args.ilimit); +- HUF_decompress4X2_usingDTable_internal_bmi2_asm_loop(&args); ++ loopFn(&args); + + /* note : op4 already verified within main loop */ + assert(args.ip[0] >= iend); +@@ -1426,91 +1689,72 @@ HUF_decompress4X2_usingDTable_internal_bmi2_asm( + /* decoded size */ + return dstSize; + } +-#endif /* ZSTD_ENABLE_ASM_X86_64_BMI2 */ + + static size_t HUF_decompress4X2_usingDTable_internal(void* dst, size_t dstSize, void const* cSrc, +- size_t cSrcSize, HUF_DTable const* DTable, int bmi2) ++ size_t cSrcSize, HUF_DTable const* DTable, int flags) + { ++ HUF_DecompressUsingDTableFn fallbackFn = HUF_decompress4X2_usingDTable_internal_default; ++ HUF_DecompressFastLoopFn loopFn = HUF_decompress4X2_usingDTable_internal_fast_c_loop; ++ + #if DYNAMIC_BMI2 +- if (bmi2) { ++ if (flags & HUF_flags_bmi2) { ++ fallbackFn = HUF_decompress4X2_usingDTable_internal_bmi2; + # if ZSTD_ENABLE_ASM_X86_64_BMI2 +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-# else +- return HUF_decompress4X2_usingDTable_internal_bmi2(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + # endif ++ } else { ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } +-#else +- (void)bmi2; + #endif + + #if ZSTD_ENABLE_ASM_X86_64_BMI2 && defined(__BMI2__) +- return HUF_decompress4X2_usingDTable_internal_bmi2_asm(dst, dstSize, cSrc, cSrcSize, DTable); +-#else +- return HUF_decompress4X2_usingDTable_internal_default(dst, dstSize, cSrc, cSrcSize, DTable); ++ if (!(flags & HUF_flags_disableAsm)) { ++ loopFn = HUF_decompress4X2_usingDTable_internal_fast_asm_loop; ++ } + #endif ++ ++ if (HUF_ENABLE_FAST_DECODE && !(flags & HUF_flags_disableFast)) { ++ size_t const ret = HUF_decompress4X2_usingDTable_internal_fast(dst, dstSize, cSrc, cSrcSize, DTable, loopFn); ++ if (ret != 0) ++ return ret; ++ } ++ return fallbackFn(dst, dstSize, cSrc, cSrcSize, DTable); + } + + HUF_DGEN(HUF_decompress1X2_usingDTable_internal) + +-size_t HUF_decompress1X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- + size_t HUF_decompress1X2_DCtx_wksp(HUF_DTable* DCtx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, 
size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t const hSize = HUF_readDTableX2_wksp(DCtx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, /* bmi2 */ 0); ++ return HUF_decompress1X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, DCtx, flags); + } + +- +-size_t HUF_decompress4X2_usingDTable( +- void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc dtd = HUF_getDTableDesc(DTable); +- if (dtd.tableType != 1) return ERROR(GENERIC); +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-} +- +-static size_t HUF_decompress4X2_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, ++static size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize, int bmi2) ++ void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + + size_t hSize = HUF_readDTableX2_wksp(dctx, cSrc, cSrcSize, +- workSpace, wkspSize); ++ workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + +-size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, +- const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) +-{ +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, /* bmi2 */ 0); +-} +- +- + #endif /* HUF_FORCE_DECOMPRESS_X1 */ + + +@@ -1518,44 +1762,6 @@ size_t HUF_decompress4X2_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + /* Universal decompression selectors */ + /* ***********************************/ + +-size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? 
HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- +-size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, +- const void* cSrc, size_t cSrcSize, +- const HUF_DTable* DTable) +-{ +- DTableDesc const dtd = HUF_getDTableDesc(DTable); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)dtd; +- assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)dtd; +- assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#else +- return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, /* bmi2 */ 0); +-#endif +-} +- + + #if !defined(HUF_FORCE_DECOMPRESS_X1) && !defined(HUF_FORCE_DECOMPRESS_X2) + typedef struct { U32 tableTime; U32 decode256Time; } algo_time_t; +@@ -1610,36 +1816,9 @@ U32 HUF_selectDecoder (size_t dstSize, size_t cSrcSize) + #endif + } + +- +-size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, +- size_t dstSize, const void* cSrc, +- size_t cSrcSize, void* workSpace, +- size_t wkspSize) +-{ +- /* validation checks */ +- if (dstSize == 0) return ERROR(dstSize_tooSmall); +- if (cSrcSize == 0) return ERROR(corruption_detected); +- +- { U32 const algoNb = HUF_selectDecoder(dstSize, cSrcSize); +-#if defined(HUF_FORCE_DECOMPRESS_X1) +- (void)algoNb; +- assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#elif defined(HUF_FORCE_DECOMPRESS_X2) +- (void)algoNb; +- assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#else +- return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): +- HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize); +-#endif +- } +-} +- + size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + const void* cSrc, size_t cSrcSize, +- void* workSpace, size_t wkspSize) ++ void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1652,71 +1831,71 @@ size_t HUF_decompress1X_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, + (void)algoNb; + assert(algoNb == 0); + return HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); + return HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #else + return algoNb ? 
HUF_decompress1X2_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize): ++ cSrcSize, workSpace, wkspSize, flags): + HUF_decompress1X1_DCtx_wksp(dctx, dst, dstSize, cSrc, +- cSrcSize, workSpace, wkspSize); ++ cSrcSize, workSpace, wkspSize, flags); + #endif + } + } + + +-size_t HUF_decompress1X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress1X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress1X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress1X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + + #ifndef HUF_FORCE_DECOMPRESS_X2 +-size_t HUF_decompress1X1_DCtx_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress1X1_DCtx_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + const BYTE* ip = (const BYTE*) cSrc; + +- size_t const hSize = HUF_readDTableX1_wksp_bmi2(dctx, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ size_t const hSize = HUF_readDTableX1_wksp(dctx, cSrc, cSrcSize, workSpace, wkspSize, flags); + if (HUF_isError(hSize)) return hSize; + if (hSize >= cSrcSize) return ERROR(srcSize_wrong); + ip += hSize; cSrcSize -= hSize; + +- return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, bmi2); ++ return HUF_decompress1X1_usingDTable_internal(dst, dstSize, ip, cSrcSize, dctx, flags); + } + #endif + +-size_t HUF_decompress4X_usingDTable_bmi2(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int bmi2) ++size_t HUF_decompress4X_usingDTable(void* dst, size_t maxDstSize, const void* cSrc, size_t cSrcSize, const HUF_DTable* DTable, int flags) + { + DTableDesc const dtd = HUF_getDTableDesc(DTable); + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)dtd; + assert(dtd.tableType == 0); +- return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)dtd; + assert(dtd.tableType == 1); +- return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #else +- return dtd.tableType ? 
HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2) : +- HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, bmi2); ++ return dtd.tableType ? HUF_decompress4X2_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags) : ++ HUF_decompress4X1_usingDTable_internal(dst, maxDstSize, cSrc, cSrcSize, DTable, flags); + #endif + } + +-size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int bmi2) ++size_t HUF_decompress4X_hufOnly_wksp(HUF_DTable* dctx, void* dst, size_t dstSize, const void* cSrc, size_t cSrcSize, void* workSpace, size_t wkspSize, int flags) + { + /* validation checks */ + if (dstSize == 0) return ERROR(dstSize_tooSmall); +@@ -1726,15 +1905,14 @@ size_t HUF_decompress4X_hufOnly_wksp_bmi2(HUF_DTable* dctx, void* dst, size_t ds + #if defined(HUF_FORCE_DECOMPRESS_X1) + (void)algoNb; + assert(algoNb == 0); +- return HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #elif defined(HUF_FORCE_DECOMPRESS_X2) + (void)algoNb; + assert(algoNb == 1); +- return HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #else +- return algoNb ? HUF_decompress4X2_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2) : +- HUF_decompress4X1_DCtx_wksp_bmi2(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, bmi2); ++ return algoNb ? HUF_decompress4X2_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags) : ++ HUF_decompress4X1_DCtx_wksp(dctx, dst, dstSize, cSrc, cSrcSize, workSpace, wkspSize, flags); + #endif + } + } +- +diff --git a/lib/zstd/decompress/zstd_ddict.c b/lib/zstd/decompress/zstd_ddict.c +index dbbc7919de53..30ef65e1ab5c 100644 +--- a/lib/zstd/decompress/zstd_ddict.c ++++ b/lib/zstd/decompress/zstd_ddict.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -14,12 +15,12 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/cpu.h" /* bmi2 */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "zstd_decompress_internal.h" + #include "zstd_ddict.h" +@@ -131,7 +132,7 @@ static size_t ZSTD_initDDict_internal(ZSTD_DDict* ddict, + ZSTD_memcpy(internalBuffer, dict, dictSize); + } + ddict->dictSize = dictSize; +- ddict->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ ddict->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + + /* parse dictionary content */ + FORWARD_IF_ERROR( ZSTD_loadEntropy_intoDDict(ddict, dictContentType) , ""); +@@ -237,5 +238,5 @@ size_t ZSTD_sizeof_DDict(const ZSTD_DDict* ddict) + unsigned ZSTD_getDictID_fromDDict(const ZSTD_DDict* ddict) + { + if (ddict==NULL) return 0; +- return ZSTD_getDictID_fromDict(ddict->dictContent, ddict->dictSize); ++ return ddict->dictID; + } +diff --git a/lib/zstd/decompress/zstd_ddict.h b/lib/zstd/decompress/zstd_ddict.h +index 8c1a79d666f8..de459a0dacd1 100644 +--- a/lib/zstd/decompress/zstd_ddict.h ++++ b/lib/zstd/decompress/zstd_ddict.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/decompress/zstd_decompress.c b/lib/zstd/decompress/zstd_decompress.c +index 6b3177c94711..03dbdf39109f 100644 +--- a/lib/zstd/decompress/zstd_decompress.c ++++ b/lib/zstd/decompress/zstd_decompress.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -52,17 +53,18 @@ + /*-******************************************************* + * Dependencies + *********************************************************/ ++#include "../common/allocations.h" /* ZSTD_customMalloc, ZSTD_customCalloc, ZSTD_customFree */ + #include "../common/zstd_deps.h" /* ZSTD_memcpy, ZSTD_memmove, ZSTD_memset */ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include /* xxh64_reset, xxh64_update, xxh64_digest, XXH64 */ + #include "../common/zstd_internal.h" /* blockProperties_t */ + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" /* ZSTD_decompressBlock_internal */ ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + + +@@ -72,11 +74,11 @@ + *************************************/ + + #define DDICT_HASHSET_MAX_LOAD_FACTOR_COUNT_MULT 4 +-#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. +- * Currently, that means a 0.75 load factor. +- * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded +- * the load factor of the ddict hash set. +- */ ++#define DDICT_HASHSET_MAX_LOAD_FACTOR_SIZE_MULT 3 /* These two constants represent SIZE_MULT/COUNT_MULT load factor without using a float. ++ * Currently, that means a 0.75 load factor. ++ * So, if count * COUNT_MULT / size * SIZE_MULT != 0, then we've exceeded ++ * the load factor of the ddict hash set. ++ */ + + #define DDICT_HASHSET_TABLE_BASE_SIZE 64 + #define DDICT_HASHSET_RESIZE_FACTOR 2 +@@ -237,6 +239,7 @@ static void ZSTD_DCtx_resetParameters(ZSTD_DCtx* dctx) + dctx->outBufferMode = ZSTD_bm_buffered; + dctx->forceIgnoreChecksum = ZSTD_d_validateChecksum; + dctx->refMultipleDDicts = ZSTD_rmd_refSingleDDict; ++ dctx->disableHufAsm = 0; + } + + static void ZSTD_initDCtx_internal(ZSTD_DCtx* dctx) +@@ -421,16 +424,40 @@ size_t ZSTD_frameHeaderSize(const void* src, size_t srcSize) + * note : only works for formats ZSTD_f_zstd1 and ZSTD_f_zstd1_magicless + * @return : 0, `zfhPtr` is correctly filled, + * >0, `srcSize` is too small, value is wanted `srcSize` amount, +- * or an error code, which can be tested using ZSTD_isError() */ ++** or an error code, which can be tested using ZSTD_isError() */ + size_t ZSTD_getFrameHeader_advanced(ZSTD_frameHeader* zfhPtr, const void* src, size_t srcSize, ZSTD_format_e format) + { + const BYTE* ip = (const BYTE*)src; + size_t const minInputSize = ZSTD_startingInputLength(format); + +- ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzer do not understand that zfhPtr is only going to be read only if return value is zero, since they are 2 different signals */ +- if (srcSize < minInputSize) return minInputSize; +- RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter"); ++ DEBUGLOG(5, "ZSTD_getFrameHeader_advanced: minInputSize = %zu, srcSize = %zu", minInputSize, srcSize); ++ ++ if (srcSize > 0) { ++ /* note : technically could be considered an assert(), since it's an invalid entry */ ++ RETURN_ERROR_IF(src==NULL, GENERIC, "invalid parameter : src==NULL, but srcSize>0"); ++ } ++ if (srcSize < minInputSize) { ++ if (srcSize > 0 && format != ZSTD_f_zstd1_magicless) { ++ /* when receiving less than @minInputSize bytes, ++ * control these 
bytes at least correspond to a supported magic number ++ * in order to error out early if they don't. ++ **/ ++ size_t const toCopy = MIN(4, srcSize); ++ unsigned char hbuf[4]; MEM_writeLE32(hbuf, ZSTD_MAGICNUMBER); ++ assert(src != NULL); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ( MEM_readLE32(hbuf) != ZSTD_MAGICNUMBER ) { ++ /* not a zstd frame : let's check if it's a skippable frame */ ++ MEM_writeLE32(hbuf, ZSTD_MAGIC_SKIPPABLE_START); ++ ZSTD_memcpy(hbuf, src, toCopy); ++ if ((MEM_readLE32(hbuf) & ZSTD_MAGIC_SKIPPABLE_MASK) != ZSTD_MAGIC_SKIPPABLE_START) { ++ RETURN_ERROR(prefix_unknown, ++ "first bytes don't correspond to any supported magic number"); ++ } } } ++ return minInputSize; ++ } + ++ ZSTD_memset(zfhPtr, 0, sizeof(*zfhPtr)); /* not strictly necessary, but static analyzers may not understand that zfhPtr will be read only if return value is zero, since they are 2 different signals */ + if ( (format != ZSTD_f_zstd1_magicless) + && (MEM_readLE32(src) != ZSTD_MAGICNUMBER) ) { + if ((MEM_readLE32(src) & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { +@@ -540,49 +567,52 @@ static size_t readSkippableFrameSize(void const* src, size_t srcSize) + sizeU32 = MEM_readLE32((BYTE const*)src + ZSTD_FRAMEIDSIZE); + RETURN_ERROR_IF((U32)(sizeU32 + ZSTD_SKIPPABLEHEADERSIZE) < sizeU32, + frameParameter_unsupported, ""); +- { +- size_t const skippableSize = skippableHeaderSize + sizeU32; ++ { size_t const skippableSize = skippableHeaderSize + sizeU32; + RETURN_ERROR_IF(skippableSize > srcSize, srcSize_wrong, ""); + return skippableSize; + } + } + + /*! ZSTD_readSkippableFrame() : +- * Retrieves a zstd skippable frame containing data given by src, and writes it to dst buffer. ++ * Retrieves content of a skippable frame, and writes it to dst buffer. + * + * The parameter magicVariant will receive the magicVariant that was supplied when the frame was written, + * i.e. magicNumber - ZSTD_MAGIC_SKIPPABLE_START. This can be NULL if the caller is not interested + * in the magicVariant. + * +- * Returns an error if destination buffer is not large enough, or if the frame is not skippable. ++ * Returns an error if destination buffer is not large enough, or if this is not a valid skippable frame. + * + * @return : number of bytes written or a ZSTD error. 
+ */ +-ZSTDLIB_API size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, unsigned* magicVariant, +- const void* src, size_t srcSize) ++size_t ZSTD_readSkippableFrame(void* dst, size_t dstCapacity, ++ unsigned* magicVariant, /* optional, can be NULL */ ++ const void* src, size_t srcSize) + { +- U32 const magicNumber = MEM_readLE32(src); +- size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); +- size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; +- +- /* check input validity */ +- RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); +- RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); +- RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ RETURN_ERROR_IF(srcSize < ZSTD_SKIPPABLEHEADERSIZE, srcSize_wrong, ""); + +- /* deliver payload */ +- if (skippableContentSize > 0 && dst != NULL) +- ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); +- if (magicVariant != NULL) +- *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; +- return skippableContentSize; ++ { U32 const magicNumber = MEM_readLE32(src); ++ size_t skippableFrameSize = readSkippableFrameSize(src, srcSize); ++ size_t skippableContentSize = skippableFrameSize - ZSTD_SKIPPABLEHEADERSIZE; ++ ++ /* check input validity */ ++ RETURN_ERROR_IF(!ZSTD_isSkippableFrame(src, srcSize), frameParameter_unsupported, ""); ++ RETURN_ERROR_IF(skippableFrameSize < ZSTD_SKIPPABLEHEADERSIZE || skippableFrameSize > srcSize, srcSize_wrong, ""); ++ RETURN_ERROR_IF(skippableContentSize > dstCapacity, dstSize_tooSmall, ""); ++ ++ /* deliver payload */ ++ if (skippableContentSize > 0 && dst != NULL) ++ ZSTD_memcpy(dst, (const BYTE *)src + ZSTD_SKIPPABLEHEADERSIZE, skippableContentSize); ++ if (magicVariant != NULL) ++ *magicVariant = magicNumber - ZSTD_MAGIC_SKIPPABLE_START; ++ return skippableContentSize; ++ } + } + + /* ZSTD_findDecompressedSize() : +- * compatible with legacy mode + * `srcSize` must be the exact length of some number of ZSTD compressed and/or + * skippable frames +- * @return : decompressed size of the frames contained */ ++ * note: compatible with legacy mode ++ * @return : decompressed size of the frames contained */ + unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + { + unsigned long long totalDstSize = 0; +@@ -592,9 +622,7 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- if (ZSTD_isError(skippableSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(skippableSize)) return ZSTD_CONTENTSIZE_ERROR; + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; +@@ -602,17 +630,17 @@ unsigned long long ZSTD_findDecompressedSize(const void* src, size_t srcSize) + continue; + } + +- { unsigned long long const ret = ZSTD_getFrameContentSize(src, srcSize); +- if (ret >= ZSTD_CONTENTSIZE_ERROR) return ret; ++ { unsigned long long const fcs = ZSTD_getFrameContentSize(src, srcSize); ++ if (fcs >= ZSTD_CONTENTSIZE_ERROR) return fcs; + +- /* check for overflow */ +- if (totalDstSize + ret < totalDstSize) return ZSTD_CONTENTSIZE_ERROR; +- totalDstSize += ret; ++ if (totalDstSize + fcs < totalDstSize) ++ return ZSTD_CONTENTSIZE_ERROR; /* check for overflow */ ++ totalDstSize += fcs; + } ++ /* skip 
to next frame */ + { size_t const frameSrcSize = ZSTD_findFrameCompressedSize(src, srcSize); +- if (ZSTD_isError(frameSrcSize)) { +- return ZSTD_CONTENTSIZE_ERROR; +- } ++ if (ZSTD_isError(frameSrcSize)) return ZSTD_CONTENTSIZE_ERROR; ++ assert(frameSrcSize <= srcSize); + + src = (const BYTE *)src + frameSrcSize; + srcSize -= frameSrcSize; +@@ -730,10 +758,11 @@ static ZSTD_frameSizeInfo ZSTD_findFrameSizeInfo(const void* src, size_t srcSize + ip += 4; + } + ++ frameSizeInfo.nbBlocks = nbBlocks; + frameSizeInfo.compressedSize = (size_t)(ip - ipstart); + frameSizeInfo.decompressedBound = (zfh.frameContentSize != ZSTD_CONTENTSIZE_UNKNOWN) + ? zfh.frameContentSize +- : nbBlocks * zfh.blockSizeMax; ++ : (unsigned long long)nbBlocks * zfh.blockSizeMax; + return frameSizeInfo; + } + } +@@ -773,6 +802,48 @@ unsigned long long ZSTD_decompressBound(const void* src, size_t srcSize) + return bound; + } + ++size_t ZSTD_decompressionMargin(void const* src, size_t srcSize) ++{ ++ size_t margin = 0; ++ unsigned maxBlockSize = 0; ++ ++ /* Iterate over each frame */ ++ while (srcSize > 0) { ++ ZSTD_frameSizeInfo const frameSizeInfo = ZSTD_findFrameSizeInfo(src, srcSize); ++ size_t const compressedSize = frameSizeInfo.compressedSize; ++ unsigned long long const decompressedBound = frameSizeInfo.decompressedBound; ++ ZSTD_frameHeader zfh; ++ ++ FORWARD_IF_ERROR(ZSTD_getFrameHeader(&zfh, src, srcSize), ""); ++ if (ZSTD_isError(compressedSize) || decompressedBound == ZSTD_CONTENTSIZE_ERROR) ++ return ERROR(corruption_detected); ++ ++ if (zfh.frameType == ZSTD_frame) { ++ /* Add the frame header to our margin */ ++ margin += zfh.headerSize; ++ /* Add the checksum to our margin */ ++ margin += zfh.checksumFlag ? 4 : 0; ++ /* Add 3 bytes per block */ ++ margin += 3 * frameSizeInfo.nbBlocks; ++ ++ /* Compute the max block size */ ++ maxBlockSize = MAX(maxBlockSize, zfh.blockSizeMax); ++ } else { ++ assert(zfh.frameType == ZSTD_skippableFrame); ++ /* Add the entire skippable frame size to our margin. */ ++ margin += compressedSize; ++ } ++ ++ assert(srcSize >= compressedSize); ++ src = (const BYTE*)src + compressedSize; ++ srcSize -= compressedSize; ++ } ++ ++ /* Add the max block size back to the margin. 
*/ ++ margin += maxBlockSize; ++ ++ return margin; ++} + + /*-************************************************************* + * Frame decoding +@@ -930,6 +1001,7 @@ static size_t ZSTD_decompressFrame(ZSTD_DCtx* dctx, + } + ZSTD_DCtx_trace_end(dctx, (U64)(op-ostart), (U64)(ip-istart), /* streaming */ 0); + /* Allow caller to get size read */ ++ DEBUGLOG(4, "ZSTD_decompressFrame: decompressed frame of size %zi, consuming %zi bytes of input", op-ostart, ip - (const BYTE*)*srcPtr); + *srcPtr = ip; + *srcSizePtr = remainingSrcSize; + return (size_t)(op-ostart); +@@ -955,17 +1027,18 @@ static size_t ZSTD_decompressMultiFrame(ZSTD_DCtx* dctx, + while (srcSize >= ZSTD_startingInputLength(dctx->format)) { + + +- { U32 const magicNumber = MEM_readLE32(src); +- DEBUGLOG(4, "reading magic number %08X (expecting %08X)", +- (unsigned)magicNumber, ZSTD_MAGICNUMBER); ++ if (srcSize >= 4) { ++ U32 const magicNumber = MEM_readLE32(src); ++ DEBUGLOG(5, "reading magic number %08X", (unsigned)magicNumber); + if ((magicNumber & ZSTD_MAGIC_SKIPPABLE_MASK) == ZSTD_MAGIC_SKIPPABLE_START) { ++ /* skippable frame detected : skip it */ + size_t const skippableSize = readSkippableFrameSize(src, srcSize); +- FORWARD_IF_ERROR(skippableSize, "readSkippableFrameSize failed"); ++ FORWARD_IF_ERROR(skippableSize, "invalid skippable frame"); + assert(skippableSize <= srcSize); + + src = (const BYTE *)src + skippableSize; + srcSize -= skippableSize; +- continue; ++ continue; /* check next frame */ + } } + + if (ddict) { +@@ -1061,8 +1134,8 @@ size_t ZSTD_decompress(void* dst, size_t dstCapacity, const void* src, size_t sr + size_t ZSTD_nextSrcSizeToDecompress(ZSTD_DCtx* dctx) { return dctx->expected; } + + /* +- * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, +- * we allow taking a partial block as the input. Currently only raw uncompressed blocks can ++ * Similar to ZSTD_nextSrcSizeToDecompress(), but when a block input can be streamed, we ++ * allow taking a partial block as the input. Currently only raw uncompressed blocks can + * be streamed. 
+ * + * For blocks that can be streamed, this allows us to reduce the latency until we produce +@@ -1262,7 +1335,7 @@ size_t ZSTD_decompressContinue(ZSTD_DCtx* dctx, void* dst, size_t dstCapacity, c + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } + } + +@@ -1303,11 +1376,11 @@ ZSTD_loadDEntropy(ZSTD_entropyDTables_t* entropy, + /* in minimal huffman, we always use X1 variants */ + size_t const hSize = HUF_readDTableX1_wksp(entropy->hufTable, + dictPtr, dictEnd - dictPtr, +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #else + size_t const hSize = HUF_readDTableX2_wksp(entropy->hufTable, + dictPtr, (size_t)(dictEnd - dictPtr), +- workspace, workspaceSize); ++ workspace, workspaceSize, /* flags */ 0); + #endif + RETURN_ERROR_IF(HUF_isError(hSize), dictionary_corrupted, ""); + dictPtr += hSize; +@@ -1403,7 +1476,7 @@ size_t ZSTD_decompressBegin(ZSTD_DCtx* dctx) + dctx->prefixStart = NULL; + dctx->virtualStart = NULL; + dctx->dictEnd = NULL; +- dctx->entropy.hufTable[0] = (HUF_DTable)((HufLog)*0x1000001); /* cover both little and big endian */ ++ dctx->entropy.hufTable[0] = (HUF_DTable)((ZSTD_HUFFDTABLE_CAPACITY_LOG)*0x1000001); /* cover both little and big endian */ + dctx->litEntropy = dctx->fseEntropy = 0; + dctx->dictID = 0; + dctx->bType = bt_reserved; +@@ -1465,7 +1538,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * This could for one of the following reasons : + * - The frame does not require a dictionary (most common case). + * - The frame was built with dictID intentionally removed. +- * Needed dictionary is a hidden information. ++ * Needed dictionary is a hidden piece of information. + * Note : this use case also happens when using a non-conformant dictionary. + * - `srcSize` is too small, and as a result, frame header could not be decoded. + * Note : possible if `srcSize < ZSTD_FRAMEHEADERSIZE_MAX`. +@@ -1474,7 +1547,7 @@ unsigned ZSTD_getDictID_fromDict(const void* dict, size_t dictSize) + * ZSTD_getFrameHeader(), which will provide a more precise error code. 
*/ + unsigned ZSTD_getDictID_fromFrame(const void* src, size_t srcSize) + { +- ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0 }; ++ ZSTD_frameHeader zfp = { 0, 0, 0, ZSTD_frame, 0, 0, 0, 0, 0 }; + size_t const hError = ZSTD_getFrameHeader(&zfp, src, srcSize); + if (ZSTD_isError(hError)) return 0; + return zfp.dictID; +@@ -1581,7 +1654,9 @@ size_t ZSTD_initDStream_usingDict(ZSTD_DStream* zds, const void* dict, size_t di + size_t ZSTD_initDStream(ZSTD_DStream* zds) + { + DEBUGLOG(4, "ZSTD_initDStream"); +- return ZSTD_initDStream_usingDDict(zds, NULL); ++ FORWARD_IF_ERROR(ZSTD_DCtx_reset(zds, ZSTD_reset_session_only), ""); ++ FORWARD_IF_ERROR(ZSTD_DCtx_refDDict(zds, NULL), ""); ++ return ZSTD_startingInputLength(zds->format); + } + + /* ZSTD_initDStream_usingDDict() : +@@ -1589,6 +1664,7 @@ size_t ZSTD_initDStream(ZSTD_DStream* zds) + * this function cannot fail */ + size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + { ++ DEBUGLOG(4, "ZSTD_initDStream_usingDDict"); + FORWARD_IF_ERROR( ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only) , ""); + FORWARD_IF_ERROR( ZSTD_DCtx_refDDict(dctx, ddict) , ""); + return ZSTD_startingInputLength(dctx->format); +@@ -1599,6 +1675,7 @@ size_t ZSTD_initDStream_usingDDict(ZSTD_DStream* dctx, const ZSTD_DDict* ddict) + * this function cannot fail */ + size_t ZSTD_resetDStream(ZSTD_DStream* dctx) + { ++ DEBUGLOG(4, "ZSTD_resetDStream"); + FORWARD_IF_ERROR(ZSTD_DCtx_reset(dctx, ZSTD_reset_session_only), ""); + return ZSTD_startingInputLength(dctx->format); + } +@@ -1670,6 +1747,11 @@ ZSTD_bounds ZSTD_dParam_getBounds(ZSTD_dParameter dParam) + bounds.lowerBound = (int)ZSTD_rmd_refSingleDDict; + bounds.upperBound = (int)ZSTD_rmd_refMultipleDDicts; + return bounds; ++ case ZSTD_d_disableHuffmanAssembly: ++ bounds.lowerBound = 0; ++ bounds.upperBound = 1; ++ return bounds; ++ + default:; + } + bounds.error = ERROR(parameter_unsupported); +@@ -1710,6 +1792,9 @@ size_t ZSTD_DCtx_getParameter(ZSTD_DCtx* dctx, ZSTD_dParameter param, int* value + case ZSTD_d_refMultipleDDicts: + *value = (int)dctx->refMultipleDDicts; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ *value = (int)dctx->disableHufAsm; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1743,6 +1828,10 @@ size_t ZSTD_DCtx_setParameter(ZSTD_DCtx* dctx, ZSTD_dParameter dParam, int value + } + dctx->refMultipleDDicts = (ZSTD_refMultipleDDicts_e)value; + return 0; ++ case ZSTD_d_disableHuffmanAssembly: ++ CHECK_DBOUNDS(ZSTD_d_disableHuffmanAssembly, value); ++ dctx->disableHufAsm = value != 0; ++ return 0; + default:; + } + RETURN_ERROR(parameter_unsupported, ""); +@@ -1918,7 +2007,6 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if (zds->refMultipleDDicts && zds->ddictSet) { + ZSTD_DCtx_selectFrameDDict(zds); + } +- DEBUGLOG(5, "header size : %u", (U32)hSize); + if (ZSTD_isError(hSize)) { + return hSize; /* error */ + } +@@ -1932,6 +2020,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + zds->lhSize += remainingInput; + } + input->pos = input->size; ++ /* check first few bytes */ ++ FORWARD_IF_ERROR( ++ ZSTD_getFrameHeader_advanced(&zds->fParams, zds->headerBuffer, zds->lhSize, zds->format), ++ "First few bytes detected incorrect" ); ++ /* return hint input size */ + return (MAX((size_t)ZSTD_FRAMEHEADERSIZE_MIN(zds->format), hSize) - zds->lhSize) + ZSTD_blockHeaderSize; /* remaining header bytes + next block header */ + } + assert(ip != NULL); +@@ -1949,8 +2042,9 @@ 
size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + size_t const decompressedSize = ZSTD_decompress_usingDDict(zds, op, (size_t)(oend-op), istart, cSize, ZSTD_getDDict(zds)); + if (ZSTD_isError(decompressedSize)) return decompressedSize; + DEBUGLOG(4, "shortcut to single-pass ZSTD_decompress_usingDDict()") ++ assert(istart != NULL); + ip = istart + cSize; +- op += decompressedSize; ++ op = op ? op + decompressedSize : op; /* can occur if frameContentSize = 0 (empty frame) */ + zds->expected = 0; + zds->streamStage = zdss_init; + someMoreWork = 0; +@@ -2034,6 +2128,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + } + if ((size_t)(iend-ip) >= neededInSize) { /* decode directly from src */ + FORWARD_IF_ERROR(ZSTD_decompressContinueStream(zds, &op, oend, ip, neededInSize), ""); ++ assert(ip != NULL); + ip += neededInSize; + /* Function modifies the stage so we must break */ + break; +@@ -2048,7 +2143,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + int const isSkipFrame = ZSTD_isSkipFrame(zds); + size_t loadedSize; + /* At this point we shouldn't be decompressing a block that we can stream. */ +- assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, iend - ip)); ++ assert(neededInSize == ZSTD_nextSrcSizeToDecompressWithInputSize(zds, (size_t)(iend - ip))); + if (isSkipFrame) { + loadedSize = MIN(toLoad, (size_t)(iend-ip)); + } else { +@@ -2057,8 +2152,11 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + "should never happen"); + loadedSize = ZSTD_limitCopy(zds->inBuff + zds->inPos, toLoad, ip, (size_t)(iend-ip)); + } +- ip += loadedSize; +- zds->inPos += loadedSize; ++ if (loadedSize != 0) { ++ /* ip may be NULL */ ++ ip += loadedSize; ++ zds->inPos += loadedSize; ++ } + if (loadedSize < toLoad) { someMoreWork = 0; break; } /* not enough input, wait for more */ + + /* decode loaded input */ +@@ -2068,14 +2166,17 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + break; + } + case zdss_flush: +- { size_t const toFlushSize = zds->outEnd - zds->outStart; ++ { ++ size_t const toFlushSize = zds->outEnd - zds->outStart; + size_t const flushedSize = ZSTD_limitCopy(op, (size_t)(oend-op), zds->outBuff + zds->outStart, toFlushSize); +- op += flushedSize; ++ ++ op = op ? 
op + flushedSize : op; ++ + zds->outStart += flushedSize; + if (flushedSize == toFlushSize) { /* flush completed */ + zds->streamStage = zdss_read; + if ( (zds->outBuffSize < zds->fParams.frameContentSize) +- && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { ++ && (zds->outStart + zds->fParams.blockSizeMax > zds->outBuffSize) ) { + DEBUGLOG(5, "restart filling outBuff from beginning (left:%i, needed:%u)", + (int)(zds->outBuffSize - zds->outStart), + (U32)zds->fParams.blockSizeMax); +@@ -2089,7 +2190,7 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + + default: + assert(0); /* impossible */ +- RETURN_ERROR(GENERIC, "impossible to reach"); /* some compiler require default to do something */ ++ RETURN_ERROR(GENERIC, "impossible to reach"); /* some compilers require default to do something */ + } } + + /* result */ +@@ -2102,8 +2203,8 @@ size_t ZSTD_decompressStream(ZSTD_DStream* zds, ZSTD_outBuffer* output, ZSTD_inB + if ((ip==istart) && (op==ostart)) { /* no forward progress */ + zds->noForwardProgress ++; + if (zds->noForwardProgress >= ZSTD_NO_FORWARD_PROGRESS_MAX) { +- RETURN_ERROR_IF(op==oend, dstSize_tooSmall, ""); +- RETURN_ERROR_IF(ip==iend, srcSize_wrong, ""); ++ RETURN_ERROR_IF(op==oend, noForwardProgress_destFull, ""); ++ RETURN_ERROR_IF(ip==iend, noForwardProgress_inputEmpty, ""); + assert(0); + } + } else { +@@ -2140,11 +2241,17 @@ size_t ZSTD_decompressStream_simpleArgs ( + void* dst, size_t dstCapacity, size_t* dstPos, + const void* src, size_t srcSize, size_t* srcPos) + { +- ZSTD_outBuffer output = { dst, dstCapacity, *dstPos }; +- ZSTD_inBuffer input = { src, srcSize, *srcPos }; +- /* ZSTD_compress_generic() will check validity of dstPos and srcPos */ +- size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); +- *dstPos = output.pos; +- *srcPos = input.pos; +- return cErr; ++ ZSTD_outBuffer output; ++ ZSTD_inBuffer input; ++ output.dst = dst; ++ output.size = dstCapacity; ++ output.pos = *dstPos; ++ input.src = src; ++ input.size = srcSize; ++ input.pos = *srcPos; ++ { size_t const cErr = ZSTD_decompressStream(dctx, &output, &input); ++ *dstPos = output.pos; ++ *srcPos = input.pos; ++ return cErr; ++ } + } +diff --git a/lib/zstd/decompress/zstd_decompress_block.c b/lib/zstd/decompress/zstd_decompress_block.c +index c1913b8e7c89..9f5577e5bc19 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.c ++++ b/lib/zstd/decompress/zstd_decompress_block.c +@@ -1,5 +1,6 @@ ++// SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -20,12 +21,12 @@ + #include "../common/mem.h" /* low level memory routines */ + #define FSE_STATIC_LINKING_ONLY + #include "../common/fse.h" +-#define HUF_STATIC_LINKING_ONLY + #include "../common/huf.h" + #include "../common/zstd_internal.h" + #include "zstd_decompress_internal.h" /* ZSTD_DCtx */ + #include "zstd_ddict.h" /* ZSTD_DDictDictContent */ + #include "zstd_decompress_block.h" ++#include "../common/bits.h" /* ZSTD_highbit32 */ + + /*_******************************************************* + * Macros +@@ -89,7 +90,7 @@ static void ZSTD_allocateLiteralsBuffer(ZSTD_DCtx* dctx, void* const dst, const + dctx->litBufferEnd = dctx->litBuffer + litSize - ZSTD_LITBUFFEREXTRASIZE; + } + else { +- /* initially this will be stored entirely in dst during huffman decoding, it will partially shifted to litExtraBuffer after */ ++ /* initially this will be stored entirely in dst during huffman decoding, it will partially be shifted to litExtraBuffer after */ + dctx->litBuffer = (BYTE*)dst + expectedWriteSize - litSize; + dctx->litBufferEnd = (BYTE*)dst + expectedWriteSize; + } +@@ -134,13 +135,16 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + ZSTD_FALLTHROUGH; + + case set_compressed: +- RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need up to 5 for case 3"); ++ RETURN_ERROR_IF(srcSize < 5, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need up to 5 for case 3"); + { size_t lhSize, litSize, litCSize; + U32 singleStream=0; + U32 const lhlCode = (istart[0] >> 2) & 3; + U32 const lhc = MEM_readLE32(istart); + size_t hufSuccess; + size_t expectedWriteSize = MIN(ZSTD_BLOCKSIZE_MAX, dstCapacity); ++ int const flags = 0 ++ | (ZSTD_DCtx_get_bmi2(dctx) ? HUF_flags_bmi2 : 0) ++ | (dctx->disableHufAsm ? 
HUF_flags_disableAsm : 0); + switch(lhlCode) + { + case 0: case 1: default: /* note : default is impossible, since lhlCode into [0..3] */ +@@ -165,6 +169,10 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); + RETURN_ERROR_IF(litSize > ZSTD_BLOCKSIZE_MAX, corruption_detected, ""); ++ if (!singleStream) ++ RETURN_ERROR_IF(litSize < MIN_LITERALS_FOR_4_STREAMS, literals_headerWrong, ++ "Not enough literals (%zu) for the 4-streams mode (min %u)", ++ litSize, MIN_LITERALS_FOR_4_STREAMS); + RETURN_ERROR_IF(litCSize + lhSize > srcSize, corruption_detected, ""); + RETURN_ERROR_IF(expectedWriteSize < litSize , dstSize_tooSmall, ""); + ZSTD_allocateLiteralsBuffer(dctx, dst, dstCapacity, litSize, streaming, expectedWriteSize, 0); +@@ -176,13 +184,14 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + + if (litEncType==set_repeat) { + if (singleStream) { +- hufSuccess = HUF_decompress1X_usingDTable_bmi2( ++ hufSuccess = HUF_decompress1X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } else { +- hufSuccess = HUF_decompress4X_usingDTable_bmi2( ++ assert(litSize >= MIN_LITERALS_FOR_4_STREAMS); ++ hufSuccess = HUF_decompress4X_usingDTable( + dctx->litBuffer, litSize, istart+lhSize, litCSize, +- dctx->HUFptr, ZSTD_DCtx_get_bmi2(dctx)); ++ dctx->HUFptr, flags); + } + } else { + if (singleStream) { +@@ -190,18 +199,18 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + hufSuccess = HUF_decompress1X_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace)); ++ sizeof(dctx->workspace), flags); + #else +- hufSuccess = HUF_decompress1X1_DCtx_wksp_bmi2( ++ hufSuccess = HUF_decompress1X1_DCtx_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + #endif + } else { +- hufSuccess = HUF_decompress4X_hufOnly_wksp_bmi2( ++ hufSuccess = HUF_decompress4X_hufOnly_wksp( + dctx->entropy.hufTable, dctx->litBuffer, litSize, + istart+lhSize, litCSize, dctx->workspace, +- sizeof(dctx->workspace), ZSTD_DCtx_get_bmi2(dctx)); ++ sizeof(dctx->workspace), flags); + } + } + if (dctx->litBufferLocation == ZSTD_split) +@@ -237,6 +246,7 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize = 3"); + litSize = MEM_readLE24(istart) >> 4; + break; + } +@@ -279,12 +289,13 @@ size_t ZSTD_decodeLiteralsBlock(ZSTD_DCtx* dctx, + break; + case 1: + lhSize = 2; ++ RETURN_ERROR_IF(srcSize<3, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 3"); + litSize = MEM_readLE16(istart) >> 4; + break; + case 3: + lhSize = 3; ++ RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 2; here we need lhSize+1 = 4"); + litSize = MEM_readLE24(istart) >> 4; +- RETURN_ERROR_IF(srcSize<4, corruption_detected, "srcSize >= MIN_CBLOCK_SIZE == 3; here we need lhSize+1 = 4"); + break; + } + RETURN_ERROR_IF(litSize > 0 && dst == NULL, dstSize_tooSmall, "NULL not handled"); +@@ -506,14 +517,15 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i = 8; i < n; i += 8) { + MEM_write64(spread + pos + i, sv); + } +- pos += n; ++ assert(n>=0); ++ pos += (size_t)n; + } + } + /* Now we spread 
those positions across the table. +- * The benefit of doing it in two stages is that we avoid the the ++ * The benefit of doing it in two stages is that we avoid the + * variable size inner loop, which caused lots of branch misses. + * Now we can run through all the positions without any branch misses. +- * We unroll the loop twice, since that is what emperically worked best. ++ * We unroll the loop twice, since that is what empirically worked best. + */ + { + size_t position = 0; +@@ -540,7 +552,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (i=0; i highThreshold) position = (position + step) & tableMask; /* lowprob area */ ++ while (UNLIKELY(position > highThreshold)) position = (position + step) & tableMask; /* lowprob area */ + } } + assert(position == 0); /* position must reach all cells once, otherwise normalizedCounter is incorrect */ + } +@@ -551,7 +563,7 @@ void ZSTD_buildFSETable_body(ZSTD_seqSymbol* dt, + for (u=0; ustateLL.table + seqState->stateLL.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(mlDInfo, seqState->stateML.table + seqState->stateML.state, sizeof(ZSTD_seqSymbol)); ++ ZSTD_memcpy(ofDInfo, seqState->stateOffb.table + seqState->stateOffb.state, sizeof(ZSTD_seqSymbol)); ++#else + const ZSTD_seqSymbol* const llDInfo = seqState->stateLL.table + seqState->stateLL.state; + const ZSTD_seqSymbol* const mlDInfo = seqState->stateML.table + seqState->stateML.state; + const ZSTD_seqSymbol* const ofDInfo = seqState->stateOffb.table + seqState->stateOffb.state; ++#endif + seq.matchLength = mlDInfo->baseValue; + seq.litLength = llDInfo->baseValue; + { U32 const ofBase = ofDInfo->baseValue; +@@ -1186,28 +1221,31 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + U32 const llnbBits = llDInfo->nbBits; + U32 const mlnbBits = mlDInfo->nbBits; + U32 const ofnbBits = ofDInfo->nbBits; ++ ++ assert(llBits <= MaxLLBits); ++ assert(mlBits <= MaxMLBits); ++ assert(ofBits <= MaxOff); + /* + * As gcc has better branch and block analyzers, sometimes it is only +- * valuable to mark likelyness for clang, it gives around 3-4% of ++ * valuable to mark likeliness for clang, it gives around 3-4% of + * performance. + */ + + /* sequence */ + { size_t offset; +- #if defined(__clang__) +- if (LIKELY(ofBits > 1)) { +- #else + if (ofBits > 1) { +- #endif + ZSTD_STATIC_ASSERT(ZSTD_lo_isLongOffset == 1); + ZSTD_STATIC_ASSERT(LONG_OFFSETS_MAX_EXTRA_BITS_32 == 5); +- assert(ofBits <= MaxOff); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 > LONG_OFFSETS_MAX_EXTRA_BITS_32); ++ ZSTD_STATIC_ASSERT(STREAM_ACCUMULATOR_MIN_32 - LONG_OFFSETS_MAX_EXTRA_BITS_32 >= MaxMLBits); + if (MEM_32bits() && longOffsets && (ofBits >= STREAM_ACCUMULATOR_MIN_32)) { +- U32 const extraBits = ofBits - MIN(ofBits, 32 - seqState->DStream.bitsConsumed); ++ /* Always read extra bits, this keeps the logic simple, ++ * avoids branches, and avoids accidentally reading 0 bits. 
++ */ ++ U32 const extraBits = LONG_OFFSETS_MAX_EXTRA_BITS_32; + offset = ofBase + (BIT_readBitsFast(&seqState->DStream, ofBits - extraBits) << extraBits); + BIT_reloadDStream(&seqState->DStream); +- if (extraBits) offset += BIT_readBitsFast(&seqState->DStream, extraBits); +- assert(extraBits <= LONG_OFFSETS_MAX_EXTRA_BITS_32); /* to avoid another reload */ ++ offset += BIT_readBitsFast(&seqState->DStream, extraBits); + } else { + offset = ofBase + BIT_readBitsFast(&seqState->DStream, ofBits/*>0*/); /* <= (ZSTD_WINDOWLOG_MAX-1) bits */ + if (MEM_32bits()) BIT_reloadDStream(&seqState->DStream); +@@ -1232,11 +1270,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + seq.offset = offset; + } + +- #if defined(__clang__) +- if (UNLIKELY(mlBits > 0)) +- #else + if (mlBits > 0) +- #endif + seq.matchLength += BIT_readBitsFast(&seqState->DStream, mlBits/*>0*/); + + if (MEM_32bits() && (mlBits+llBits >= STREAM_ACCUMULATOR_MIN_32-LONG_OFFSETS_MAX_EXTRA_BITS_32)) +@@ -1246,11 +1280,7 @@ ZSTD_decodeSequence(seqState_t* seqState, const ZSTD_longOffset_e longOffsets) + /* Ensure there are enough bits to read the rest of data in 64-bit mode. */ + ZSTD_STATIC_ASSERT(16+LLFSELog+MLFSELog+OffFSELog < STREAM_ACCUMULATOR_MIN_64); + +- #if defined(__clang__) +- if (UNLIKELY(llBits > 0)) +- #else + if (llBits > 0) +- #endif + seq.litLength += BIT_readBitsFast(&seqState->DStream, llBits/*>0*/); + + if (MEM_32bits()) +@@ -1552,7 +1582,7 @@ ZSTD_decompressSequences_body(ZSTD_DCtx* dctx, + const BYTE* const prefixStart = (const BYTE*)(dctx->prefixStart); + const BYTE* const vBase = (const BYTE*)(dctx->virtualStart); + const BYTE* const dictEnd = (const BYTE*)(dctx->dictEnd); +- DEBUGLOG(5, "ZSTD_decompressSequences_body"); ++ DEBUGLOG(5, "ZSTD_decompressSequences_body: nbSeq = %d", nbSeq); + (void)frame; + + /* Regen sequences */ +@@ -1945,34 +1975,79 @@ ZSTD_decompressSequencesLong(ZSTD_DCtx* dctx, + #endif /* ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT */ + + ++/* ++ * @returns The total size of the history referenceable by zstd, including ++ * both the prefix and the extDict. At @p op any offset larger than this ++ * is invalid. ++ */ ++static size_t ZSTD_totalHistorySize(BYTE* op, BYTE const* virtualStart) ++{ ++ return (size_t)(op - virtualStart); ++} ++ ++typedef struct { ++ unsigned longOffsetShare; ++ unsigned maxNbAdditionalBits; ++} ZSTD_OffsetInfo; + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +-/* ZSTD_getLongOffsetsShare() : ++/* ZSTD_getOffsetInfo() : + * condition : offTable must be valid + * @return : "share" of long offsets (arbitrarily defined as > (1<<23)) +- * compared to maximum possible of (1< 22) total += 1; ++ ZSTD_OffsetInfo info = {0, 0}; ++ /* If nbSeq == 0, then the offTable is uninitialized, but we have ++ * no sequences, so both values should be 0. 
++ */ ++ if (nbSeq != 0) { ++ const void* ptr = offTable; ++ U32 const tableLog = ((const ZSTD_seqSymbol_header*)ptr)[0].tableLog; ++ const ZSTD_seqSymbol* table = offTable + 1; ++ U32 const max = 1 << tableLog; ++ U32 u; ++ DEBUGLOG(5, "ZSTD_getLongOffsetsShare: (tableLog=%u)", tableLog); ++ ++ assert(max <= (1 << OffFSELog)); /* max not too large */ ++ for (u=0; u 22) info.longOffsetShare += 1; ++ } ++ ++ assert(tableLog <= OffFSELog); ++ info.longOffsetShare <<= (OffFSELog - tableLog); /* scale to OffFSELog */ + } + +- assert(tableLog <= OffFSELog); +- total <<= (OffFSELog - tableLog); /* scale to OffFSELog */ ++ return info; ++} + +- return total; ++/* ++ * @returns The maximum offset we can decode in one read of our bitstream, without ++ * reloading more bits in the middle of the offset bits read. Any offsets larger ++ * than this must use the long offset decoder. ++ */ ++static size_t ZSTD_maxShortOffset(void) ++{ ++ if (MEM_64bits()) { ++ /* We can decode any offset without reloading bits. ++ * This might change if the max window size grows. ++ */ ++ ZSTD_STATIC_ASSERT(ZSTD_WINDOWLOG_MAX <= 31); ++ return (size_t)-1; ++ } else { ++ /* The maximum offBase is (1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1. ++ * This offBase would require STREAM_ACCUMULATOR_MIN extra bits. ++ * Then we have to subtract ZSTD_REP_NUM to get the maximum possible offset. ++ */ ++ size_t const maxOffbase = ((size_t)1 << (STREAM_ACCUMULATOR_MIN + 1)) - 1; ++ size_t const maxOffset = maxOffbase - ZSTD_REP_NUM; ++ assert(ZSTD_highbit32((U32)maxOffbase) == STREAM_ACCUMULATOR_MIN); ++ return maxOffset; ++ } + } +-#endif + + size_t + ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, +@@ -1980,20 +2055,21 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + const void* src, size_t srcSize, const int frame, const streaming_operation streaming) + { /* blockType == blockCompressed */ + const BYTE* ip = (const BYTE*)src; +- /* isLongOffset must be true if there are long offsets. +- * Offsets are long if they are larger than 2^STREAM_ACCUMULATOR_MIN. +- * We don't expect that to be the case in 64-bit mode. +- * In block mode, window size is not known, so we have to be conservative. +- * (note: but it could be evaluated from current-lowLimit) +- */ +- ZSTD_longOffset_e const isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (!frame || (dctx->fParams.windowSize > (1ULL << STREAM_ACCUMULATOR_MIN)))); + DEBUGLOG(5, "ZSTD_decompressBlock_internal (size : %u)", (U32)srcSize); + +- RETURN_ERROR_IF(srcSize >= ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); ++ /* Note : the wording of the specification ++ * allows compressed block to be sized exactly ZSTD_BLOCKSIZE_MAX. ++ * This generally does not happen, as it makes little sense, ++ * since an uncompressed block would feature same size and have no decompression cost. ++ * Also, note that decoder from reference libzstd before < v1.5.4 ++ * would consider this edge case as an error. 
++ * As a consequence, avoid generating compressed blocks of size ZSTD_BLOCKSIZE_MAX ++ * for broader compatibility with the deployed ecosystem of zstd decoders */ ++ RETURN_ERROR_IF(srcSize > ZSTD_BLOCKSIZE_MAX, srcSize_wrong, ""); + + /* Decode literals section */ + { size_t const litCSize = ZSTD_decodeLiteralsBlock(dctx, src, srcSize, dst, dstCapacity, streaming); +- DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : %u", (U32)litCSize); ++ DEBUGLOG(5, "ZSTD_decodeLiteralsBlock : cSize=%u, nbLiterals=%zu", (U32)litCSize, dctx->litSize); + if (ZSTD_isError(litCSize)) return litCSize; + ip += litCSize; + srcSize -= litCSize; +@@ -2001,6 +2077,23 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + + /* Build Decoding Tables */ + { ++ /* Compute the maximum block size, which must also work when !frame and fParams are unset. ++ * Additionally, take the min with dstCapacity to ensure that the totalHistorySize fits in a size_t. ++ */ ++ size_t const blockSizeMax = MIN(dstCapacity, (frame ? dctx->fParams.blockSizeMax : ZSTD_BLOCKSIZE_MAX)); ++ size_t const totalHistorySize = ZSTD_totalHistorySize((BYTE*)dst + blockSizeMax, (BYTE const*)dctx->virtualStart); ++ /* isLongOffset must be true if there are long offsets. ++ * Offsets are long if they are larger than ZSTD_maxShortOffset(). ++ * We don't expect that to be the case in 64-bit mode. ++ * ++ * We check here to see if our history is large enough to allow long offsets. ++ * If it isn't, then we can't possible have (valid) long offsets. If the offset ++ * is invalid, then it is okay to read it incorrectly. ++ * ++ * If isLongOffsets is true, then we will later check our decoding table to see ++ * if it is even possible to generate long offsets. ++ */ ++ ZSTD_longOffset_e isLongOffset = (ZSTD_longOffset_e)(MEM_32bits() && (totalHistorySize > ZSTD_maxShortOffset())); + /* These macros control at build-time which decompressor implementation + * we use. If neither is defined, we do some inspection and dispatch at + * runtime. +@@ -2008,6 +2101,11 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) + int usePrefetchDecoder = dctx->ddictIsCold; ++#else ++ /* Set to 1 to avoid computing offset info if we don't need to. ++ * Otherwise this value is ignored. ++ */ ++ int usePrefetchDecoder = 1; + #endif + int nbSeq; + size_t const seqHSize = ZSTD_decodeSeqHeaders(dctx, &nbSeq, ip, srcSize); +@@ -2015,28 +2113,42 @@ ZSTD_decompressBlock_internal(ZSTD_DCtx* dctx, + ip += seqHSize; + srcSize -= seqHSize; + +- RETURN_ERROR_IF(dst == NULL && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF((dst == NULL || dstCapacity == 0) && nbSeq > 0, dstSize_tooSmall, "NULL not handled"); ++ RETURN_ERROR_IF(MEM_64bits() && sizeof(size_t) == sizeof(void*) && (size_t)(-1) - (size_t)dst < (size_t)(1 << 20), dstSize_tooSmall, ++ "invalid dst"); + +-#if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ +- !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if ( !usePrefetchDecoder +- && (!frame || (dctx->fParams.windowSize > (1<<24))) +- && (nbSeq>ADVANCED_SEQS) ) { /* could probably use a larger nbSeq limit */ +- U32 const shareLongOffsets = ZSTD_getLongOffsetsShare(dctx->OFTptr); +- U32 const minShare = MEM_64bits() ? 
7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ +- usePrefetchDecoder = (shareLongOffsets >= minShare); ++ /* If we could potentially have long offsets, or we might want to use the prefetch decoder, ++ * compute information about the share of long offsets, and the maximum nbAdditionalBits. ++ * NOTE: could probably use a larger nbSeq limit ++ */ ++ if (isLongOffset || (!usePrefetchDecoder && (totalHistorySize > (1u << 24)) && (nbSeq > 8))) { ++ ZSTD_OffsetInfo const info = ZSTD_getOffsetInfo(dctx->OFTptr, nbSeq); ++ if (isLongOffset && info.maxNbAdditionalBits <= STREAM_ACCUMULATOR_MIN) { ++ /* If isLongOffset, but the maximum number of additional bits that we see in our table is small ++ * enough, then we know it is impossible to have too long an offset in this block, so we can ++ * use the regular offset decoder. ++ */ ++ isLongOffset = ZSTD_lo_isRegularOffset; ++ } ++ if (!usePrefetchDecoder) { ++ U32 const minShare = MEM_64bits() ? 7 : 20; /* heuristic values, correspond to 2.73% and 7.81% */ ++ usePrefetchDecoder = (info.longOffsetShare >= minShare); ++ } + } +-#endif + + dctx->ddictIsCold = 0; + + #if !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT) && \ + !defined(ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG) +- if (usePrefetchDecoder) ++ if (usePrefetchDecoder) { ++#else ++ (void)usePrefetchDecoder; ++ { + #endif + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_SHORT + return ZSTD_decompressSequencesLong(dctx, dst, dstCapacity, ip, srcSize, nbSeq, isLongOffset, frame); + #endif ++ } + + #ifndef ZSTD_FORCE_DECOMPRESS_SEQUENCES_LONG + /* else */ +@@ -2060,9 +2172,9 @@ void ZSTD_checkContinuity(ZSTD_DCtx* dctx, const void* dst, size_t dstSize) + } + + +-size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, +- void* dst, size_t dstCapacity, +- const void* src, size_t srcSize) ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) + { + size_t dSize; + ZSTD_checkContinuity(dctx, dst, dstCapacity); +@@ -2070,3 +2182,12 @@ size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, + dctx->previousDstEnd = (char*)dst + dSize; + return dSize; + } ++ ++ ++/* NOTE: Must just wrap ZSTD_decompressBlock_deprecated() */ ++size_t ZSTD_decompressBlock(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize) ++{ ++ return ZSTD_decompressBlock_deprecated(dctx, dst, dstCapacity, src, srcSize); ++} +diff --git a/lib/zstd/decompress/zstd_decompress_block.h b/lib/zstd/decompress/zstd_decompress_block.h +index 3d2d57a5d25a..5888e6cc788b 100644 +--- a/lib/zstd/decompress/zstd_decompress_block.h ++++ b/lib/zstd/decompress/zstd_decompress_block.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -64,5 +65,10 @@ void ZSTD_buildFSETable(ZSTD_seqSymbol* dt, + unsigned tableLog, void* wksp, size_t wkspSize, + int bmi2); + ++/* Internal definition of ZSTD_decompressBlock() to avoid deprecation warnings. 
*/ ++size_t ZSTD_decompressBlock_deprecated(ZSTD_DCtx* dctx, ++ void* dst, size_t dstCapacity, ++ const void* src, size_t srcSize); ++ + + #endif /* ZSTD_DEC_BLOCK_H */ +diff --git a/lib/zstd/decompress/zstd_decompress_internal.h b/lib/zstd/decompress/zstd_decompress_internal.h +index 98102edb6a83..32f79fb2873d 100644 +--- a/lib/zstd/decompress/zstd_decompress_internal.h ++++ b/lib/zstd/decompress/zstd_decompress_internal.h +@@ -1,5 +1,6 @@ ++/* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Yann Collet, Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -75,12 +76,13 @@ static UNUSED_ATTR const U32 ML_base[MaxML+1] = { + + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE (sizeof(S16) * (MaxSeq + 1) + (1u << MaxFSELog) + sizeof(U64)) + #define ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32 ((ZSTD_BUILD_FSE_TABLE_WKSP_SIZE + sizeof(U32) - 1) / sizeof(U32)) ++#define ZSTD_HUFFDTABLE_CAPACITY_LOG 12 + + typedef struct { + ZSTD_seqSymbol LLTable[SEQSYMBOL_TABLE_SIZE(LLFSELog)]; /* Note : Space reserved for FSE Tables */ + ZSTD_seqSymbol OFTable[SEQSYMBOL_TABLE_SIZE(OffFSELog)]; /* is also used as temporary workspace while building hufTable during DDict creation */ + ZSTD_seqSymbol MLTable[SEQSYMBOL_TABLE_SIZE(MLFSELog)]; /* and therefore must be at least HUF_DECOMPRESS_WORKSPACE_SIZE large */ +- HUF_DTable hufTable[HUF_DTABLE_SIZE(HufLog)]; /* can accommodate HUF_decompress4X */ ++ HUF_DTable hufTable[HUF_DTABLE_SIZE(ZSTD_HUFFDTABLE_CAPACITY_LOG)]; /* can accommodate HUF_decompress4X */ + U32 rep[ZSTD_REP_NUM]; + U32 workspace[ZSTD_BUILD_FSE_TABLE_WKSP_SIZE_U32]; + } ZSTD_entropyDTables_t; +@@ -164,6 +166,7 @@ struct ZSTD_DCtx_s + ZSTD_dictUses_e dictUses; + ZSTD_DDictHashSet* ddictSet; /* Hash set for multiple ddicts */ + ZSTD_refMultipleDDicts_e refMultipleDDicts; /* User specified: if == 1, will allow references to multiple DDicts. Default == 0 (disabled) */ ++ int disableHufAsm; + + /* streaming */ + ZSTD_dStreamStage streamStage; +diff --git a/lib/zstd/decompress_sources.h b/lib/zstd/decompress_sources.h +index a06ca187aab5..8a47eb2a4514 100644 +--- a/lib/zstd/decompress_sources.h ++++ b/lib/zstd/decompress_sources.h +@@ -1,6 +1,6 @@ + /* SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause */ + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_common_module.c b/lib/zstd/zstd_common_module.c +index 22686e367e6f..466828e35752 100644 +--- a/lib/zstd/zstd_common_module.c ++++ b/lib/zstd/zstd_common_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. 
+ * + * This source code is licensed under both the BSD-style license (found in the +@@ -24,9 +24,6 @@ EXPORT_SYMBOL_GPL(HUF_readStats_wksp); + EXPORT_SYMBOL_GPL(ZSTD_isError); + EXPORT_SYMBOL_GPL(ZSTD_getErrorName); + EXPORT_SYMBOL_GPL(ZSTD_getErrorCode); +-EXPORT_SYMBOL_GPL(ZSTD_customMalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customCalloc); +-EXPORT_SYMBOL_GPL(ZSTD_customFree); + + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("Zstd Common"); +diff --git a/lib/zstd/zstd_compress_module.c b/lib/zstd/zstd_compress_module.c +index 04e1b5c01d9b..8ecf43226af2 100644 +--- a/lib/zstd/zstd_compress_module.c ++++ b/lib/zstd/zstd_compress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +diff --git a/lib/zstd/zstd_decompress_module.c b/lib/zstd/zstd_decompress_module.c +index f4ed952ed485..7d31518e9d5a 100644 +--- a/lib/zstd/zstd_decompress_module.c ++++ b/lib/zstd/zstd_decompress_module.c +@@ -1,6 +1,6 @@ + // SPDX-License-Identifier: GPL-2.0+ OR BSD-3-Clause + /* +- * Copyright (c) Facebook, Inc. ++ * Copyright (c) Meta Platforms, Inc. and affiliates. + * All rights reserved. + * + * This source code is licensed under both the BSD-style license (found in the +@@ -77,7 +77,7 @@ EXPORT_SYMBOL(zstd_init_dstream); + + size_t zstd_reset_dstream(zstd_dstream *dstream) + { +- return ZSTD_resetDStream(dstream); ++ return ZSTD_DCtx_reset(dstream, ZSTD_reset_session_only); + } + EXPORT_SYMBOL(zstd_reset_dstream); + +-- +2.46.0.rc1 + diff --git a/sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch b/sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch deleted file mode 100644 index 4094dce..0000000 --- a/sys-kernel/gentoo-sources-6.6/0010-sched-ext.patch +++ /dev/null @@ -1,19747 +0,0 @@ -From 1d5eefab83823197c1de81da58ba61bef161635b Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Thu, 7 Dec 2023 20:43:19 +0100 -Subject: [PATCH] sched-ext - -Signed-off-by: Piotr Gorski ---- - Documentation/scheduler/index.rst | 1 + - Documentation/scheduler/sched-ext.rst | 229 + - MAINTAINERS | 3 + - Makefile | 8 +- - drivers/tty/sysrq.c | 1 + - include/asm-generic/vmlinux.lds.h | 1 + - include/linux/cgroup-defs.h | 8 + - include/linux/cgroup.h | 5 +- - include/linux/sched.h | 5 + - include/linux/sched/ext.h | 716 +++ - include/linux/sched/task.h | 3 +- - include/uapi/linux/sched.h | 1 + - init/Kconfig | 5 + - init/init_task.c | 12 + - kernel/Kconfig.preempt | 24 +- - kernel/bpf/bpf_struct_ops_types.h | 4 + - kernel/cgroup/cgroup.c | 97 +- - kernel/fork.c | 17 +- - kernel/sched/build_policy.c | 5 + - kernel/sched/core.c | 316 +- - kernel/sched/deadline.c | 4 +- - kernel/sched/debug.c | 6 + - kernel/sched/ext.c | 4497 +++++++++++++++++ - kernel/sched/ext.h | 266 + - kernel/sched/fair.c | 9 +- - kernel/sched/idle.c | 2 + - kernel/sched/rt.c | 4 +- - kernel/sched/sched.h | 117 +- - kernel/sched/topology.c | 4 +- - lib/dump_stack.c | 1 + - tools/Makefile | 10 +- - tools/sched_ext/.gitignore | 10 + - tools/sched_ext/Kconfig | 9 + - tools/sched_ext/Makefile | 301 ++ - tools/sched_ext/README.md | 403 ++ - tools/sched_ext/gnu/stubs.h | 1 + - tools/sched_ext/ravg.bpf.h | 42 + - tools/sched_ext/ravg_impl.bpf.h | 358 ++ - tools/sched_ext/ravg_read.rs.h | 82 + - tools/sched_ext/scx_central.bpf.c | 346 ++ - tools/sched_ext/scx_central.c | 123 + - tools/sched_ext/scx_common.bpf.h | 244 + - 
tools/sched_ext/scx_common.h | 59 + - tools/sched_ext/scx_flatcg.bpf.c | 912 ++++ - tools/sched_ext/scx_flatcg.c | 221 + - tools/sched_ext/scx_flatcg.h | 49 + - tools/sched_ext/scx_layered/.gitignore | 3 + - tools/sched_ext/scx_layered/Cargo.toml | 30 + - tools/sched_ext/scx_layered/build.rs | 77 + - tools/sched_ext/scx_layered/rustfmt.toml | 8 + - .../scx_layered/src/bpf/layered.bpf.c | 974 ++++ - tools/sched_ext/scx_layered/src/bpf/layered.h | 100 + - .../sched_ext/scx_layered/src/bpf/util.bpf.c | 68 + - .../sched_ext/scx_layered/src/layered_sys.rs | 10 + - tools/sched_ext/scx_layered/src/main.rs | 1641 ++++++ - tools/sched_ext/scx_nest.bpf.c | 681 +++ - tools/sched_ext/scx_nest.c | 227 + - tools/sched_ext/scx_nest.h | 18 + - tools/sched_ext/scx_nest_stats_table.h | 19 + - tools/sched_ext/scx_pair.bpf.c | 626 +++ - tools/sched_ext/scx_pair.c | 168 + - tools/sched_ext/scx_pair.h | 9 + - tools/sched_ext/scx_qmap.bpf.c | 401 ++ - tools/sched_ext/scx_qmap.c | 105 + - tools/sched_ext/scx_rusty/.gitignore | 3 + - tools/sched_ext/scx_rusty/Cargo.toml | 28 + - tools/sched_ext/scx_rusty/build.rs | 72 + - tools/sched_ext/scx_rusty/rustfmt.toml | 8 + - tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c | 1153 +++++ - tools/sched_ext/scx_rusty/src/bpf/rusty.h | 97 + - tools/sched_ext/scx_rusty/src/main.rs | 1265 +++++ - tools/sched_ext/scx_rusty/src/rusty_sys.rs | 10 + - tools/sched_ext/scx_simple.bpf.c | 143 + - tools/sched_ext/scx_simple.c | 99 + - tools/sched_ext/scx_userland.bpf.c | 262 + - tools/sched_ext/scx_userland.c | 366 ++ - tools/sched_ext/scx_userland.h | 19 + - tools/sched_ext/user_exit_info.h | 50 + - 78 files changed, 18176 insertions(+), 105 deletions(-) - create mode 100644 Documentation/scheduler/sched-ext.rst - create mode 100644 include/linux/sched/ext.h - create mode 100644 kernel/sched/ext.c - create mode 100644 kernel/sched/ext.h - create mode 100644 tools/sched_ext/.gitignore - create mode 100644 tools/sched_ext/Kconfig - create mode 100644 tools/sched_ext/Makefile - create mode 100644 tools/sched_ext/README.md - create mode 100644 tools/sched_ext/gnu/stubs.h - create mode 100644 tools/sched_ext/ravg.bpf.h - create mode 100644 tools/sched_ext/ravg_impl.bpf.h - create mode 100644 tools/sched_ext/ravg_read.rs.h - create mode 100644 tools/sched_ext/scx_central.bpf.c - create mode 100644 tools/sched_ext/scx_central.c - create mode 100644 tools/sched_ext/scx_common.bpf.h - create mode 100644 tools/sched_ext/scx_common.h - create mode 100644 tools/sched_ext/scx_flatcg.bpf.c - create mode 100644 tools/sched_ext/scx_flatcg.c - create mode 100644 tools/sched_ext/scx_flatcg.h - create mode 100644 tools/sched_ext/scx_layered/.gitignore - create mode 100644 tools/sched_ext/scx_layered/Cargo.toml - create mode 100644 tools/sched_ext/scx_layered/build.rs - create mode 100644 tools/sched_ext/scx_layered/rustfmt.toml - create mode 100644 tools/sched_ext/scx_layered/src/bpf/layered.bpf.c - create mode 100644 tools/sched_ext/scx_layered/src/bpf/layered.h - create mode 100644 tools/sched_ext/scx_layered/src/bpf/util.bpf.c - create mode 100644 tools/sched_ext/scx_layered/src/layered_sys.rs - create mode 100644 tools/sched_ext/scx_layered/src/main.rs - create mode 100644 tools/sched_ext/scx_nest.bpf.c - create mode 100644 tools/sched_ext/scx_nest.c - create mode 100644 tools/sched_ext/scx_nest.h - create mode 100644 tools/sched_ext/scx_nest_stats_table.h - create mode 100644 tools/sched_ext/scx_pair.bpf.c - create mode 100644 tools/sched_ext/scx_pair.c - create mode 100644 tools/sched_ext/scx_pair.h 
- create mode 100644 tools/sched_ext/scx_qmap.bpf.c - create mode 100644 tools/sched_ext/scx_qmap.c - create mode 100644 tools/sched_ext/scx_rusty/.gitignore - create mode 100644 tools/sched_ext/scx_rusty/Cargo.toml - create mode 100644 tools/sched_ext/scx_rusty/build.rs - create mode 100644 tools/sched_ext/scx_rusty/rustfmt.toml - create mode 100644 tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c - create mode 100644 tools/sched_ext/scx_rusty/src/bpf/rusty.h - create mode 100644 tools/sched_ext/scx_rusty/src/main.rs - create mode 100644 tools/sched_ext/scx_rusty/src/rusty_sys.rs - create mode 100644 tools/sched_ext/scx_simple.bpf.c - create mode 100644 tools/sched_ext/scx_simple.c - create mode 100644 tools/sched_ext/scx_userland.bpf.c - create mode 100644 tools/sched_ext/scx_userland.c - create mode 100644 tools/sched_ext/scx_userland.h - create mode 100644 tools/sched_ext/user_exit_info.h - -diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst -index 317074722..0b650bb55 100644 ---- a/Documentation/scheduler/index.rst -+++ b/Documentation/scheduler/index.rst -@@ -19,6 +19,7 @@ Scheduler - sched-nice-design - sched-rt-group - sched-stats -+ sched-ext - sched-debug - - text_files -diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst -new file mode 100644 -index 000000000..25ddb535c ---- /dev/null -+++ b/Documentation/scheduler/sched-ext.rst -@@ -0,0 +1,229 @@ -+========================== -+Extensible Scheduler Class -+========================== -+ -+sched_ext is a scheduler class whose behavior can be defined by a set of BPF -+programs - the BPF scheduler. -+ -+* sched_ext exports a full scheduling interface so that any scheduling -+ algorithm can be implemented on top. -+ -+* The BPF scheduler can group CPUs however it sees fit and schedule them -+ together, as tasks aren't tied to specific CPUs at the time of wakeup. -+ -+* The BPF scheduler can be turned on and off dynamically anytime. -+ -+* The system integrity is maintained no matter what the BPF scheduler does. -+ The default scheduling behavior is restored anytime an error is detected, -+ a runnable task stalls, or on invoking the SysRq key sequence -+ :kbd:`SysRq-S`. -+ -+Switching to and from sched_ext -+=============================== -+ -+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and -+``tools/sched_ext`` contains the example schedulers. -+ -+sched_ext is used only when the BPF scheduler is loaded and running. -+ -+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be -+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is -+loaded. On load, such tasks will be switched to and scheduled by sched_ext. -+ -+The BPF scheduler can choose to schedule all normal and lower class tasks by -+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this -+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and -+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, -+this mode can be selected with the ``-a`` option. -+ -+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or -+detection of any internal error including stalled runnable tasks aborts the -+BPF scheduler and reverts all tasks back to CFS. -+ -+.. 
code-block:: none -+ -+ # make -j16 -C tools/sched_ext -+ # tools/sched_ext/scx_simple -+ local=0 global=3 -+ local=5 global=24 -+ local=9 global=44 -+ local=13 global=56 -+ local=17 global=72 -+ ^CEXIT: BPF scheduler unregistered -+ -+If ``CONFIG_SCHED_DEBUG`` is set, the current status of the BPF scheduler -+and whether a given task is on sched_ext can be determined as follows: -+ -+.. code-block:: none -+ -+ # cat /sys/kernel/debug/sched/ext -+ ops : simple -+ enabled : 1 -+ switching_all : 1 -+ switched_all : 1 -+ enable_state : enabled -+ -+ # grep ext /proc/self/sched -+ ext.enabled : 1 -+ -+The Basics -+========== -+ -+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF -+programs that implement ``struct sched_ext_ops``. The only mandatory field -+is ``ops.name`` which must be a valid BPF object name. All operations are -+optional. The following modified excerpt is from -+``tools/sched/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. -+ -+.. code-block:: c -+ -+ s32 BPF_STRUCT_OPS(simple_init) -+ { -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+ } -+ -+ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+ { -+ if (enq_flags & SCX_ENQ_LOCAL) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ else -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ -+ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+ { -+ exit_type = ei->type; -+ } -+ -+ SEC(".struct_ops") -+ struct sched_ext_ops simple_ops = { -+ .enqueue = (void *)simple_enqueue, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+ }; -+ -+Dispatch Queues -+--------------- -+ -+To match the impedance between the scheduler core and the BPF scheduler, -+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a -+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), -+and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage -+an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and -+``scx_bpf_destroy_dsq()``. -+ -+A CPU always executes a task from its local DSQ. A task is "dispatched" to a -+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's -+local DSQ. -+ -+When a CPU is looking for the next task to run, if the local DSQ is not -+empty, the first task is picked. Otherwise, the CPU tries to consume the -+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` -+is invoked. -+ -+Scheduling Cycle -+---------------- -+ -+The following briefly shows how a waking task is scheduled and executed. -+ -+1. When a task is waking up, ``ops.select_cpu()`` is the first operation -+ invoked. This serves two purposes. First, CPU selection optimization -+ hint. Second, waking up the selected CPU if idle. -+ -+ The CPU selected by ``ops.select_cpu()`` is an optimization hint and not -+ binding. The actual decision is made at the last step of scheduling. -+ However, there is a small performance gain if the CPU -+ ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. -+ -+ A side-effect of selecting a CPU is waking it up from idle. While a BPF -+ scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, -+ using ``ops.select_cpu()`` judiciously can be simpler and more efficient. -+ -+ Note that the scheduler core will ignore an invalid CPU selection, for -+ example, if it's outside the allowed cpumask of the task. -+ -+2. 
Once the target CPU is selected, ``ops.enqueue()`` is invoked. It can -+ make one of the following decisions: -+ -+ * Immediately dispatch the task to either the global or local DSQ by -+ calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or -+ ``SCX_DSQ_LOCAL``, respectively. -+ -+ * Immediately dispatch the task to a custom DSQ by calling -+ ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. -+ -+ * Queue the task on the BPF side. -+ -+3. When a CPU is ready to schedule, it first looks at its local DSQ. If -+ empty, it then looks at the global DSQ. If there still isn't a task to -+ run, ``ops.dispatch()`` is invoked which can use the following two -+ functions to populate the local DSQ. -+ -+ * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can -+ be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, -+ ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` -+ currently can't be called with BPF locks held, this is being worked on -+ and will be supported. ``scx_bpf_dispatch()`` schedules dispatching -+ rather than performing them immediately. There can be up to -+ ``ops.dispatch_max_batch`` pending tasks. -+ -+ * ``scx_bpf_consume()`` tranfers a task from the specified non-local DSQ -+ to the dispatching DSQ. This function cannot be called with any BPF -+ locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks -+ before trying to consume the specified DSQ. -+ -+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, -+ the CPU runs the first one. If empty, the following steps are taken: -+ -+ * Try to consume the global DSQ. If successful, run the task. -+ -+ * If ``ops.dispatch()`` has dispatched any tasks, retry #3. -+ -+ * If the previous task is an SCX task and still runnable, keep executing -+ it (see ``SCX_OPS_ENQ_LAST``). -+ -+ * Go idle. -+ -+Note that the BPF scheduler can always choose to dispatch tasks immediately -+in ``ops.enqueue()`` as illustrated in the above simple example. If only the -+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as -+a task is never queued on the BPF scheduler and both the local and global -+DSQs are consumed automatically. -+ -+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -+``scx_bpf_dispatch_vtime()`` for the priority queue. See the function -+documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for more -+information. -+ -+Where to Look -+============= -+ -+* ``include/linux/sched/ext.h`` defines the core data structures, ops table -+ and constants. -+ -+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. -+ The functions prefixed with ``scx_bpf_`` can be called from the BPF -+ scheduler. -+ -+* ``tools/sched_ext/`` hosts example BPF scheduler implementations. -+ -+ * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a -+ custom DSQ. -+ -+ * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five -+ levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. -+ -+ABI Instability -+=============== -+ -+The APIs provided by sched_ext to BPF schedulers programs have no stability -+guarantees. This includes the ops table callbacks and constants defined in -+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in -+``kernel/sched/ext.c``. -+ -+While we will attempt to provide a relatively stable API surface when -+possible, they are subject to change without warning between kernel -+versions. 
-diff --git a/MAINTAINERS b/MAINTAINERS -index dd5de540e..286abb83c 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -19080,6 +19080,8 @@ R: Ben Segall (CONFIG_CFS_BANDWIDTH) - R: Mel Gorman (CONFIG_NUMA_BALANCING) - R: Daniel Bristot de Oliveira (SCHED_DEADLINE) - R: Valentin Schneider (TOPOLOGY) -+R: Tejun Heo (SCHED_EXT) -+R: David Vernet (SCHED_EXT) - L: linux-kernel@vger.kernel.org - S: Maintained - T: git git://git.kernel.org/pub/scm/linux/kernel/git/tip/tip.git sched/core -@@ -19088,6 +19090,7 @@ F: include/linux/sched.h - F: include/linux/wait.h - F: include/uapi/linux/sched.h - F: kernel/sched/ -+F: tools/sched_ext/ - - SCSI LIBSAS SUBSYSTEM - R: John Garry -diff --git a/Makefile b/Makefile -index cbe63ba91..8f2fc39a0 100644 ---- a/Makefile -+++ b/Makefile -@@ -1341,6 +1341,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) - $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean - endif - -+tools-clean-targets := sched_ext -+PHONY += $(tools-clean-targets) -+$(tools-clean-targets): -+ $(Q)$(MAKE) -sC tools $@_clean -+tools_clean: $(tools-clean-targets) -+ - # Clear a bunch of variables before executing the submake - ifeq ($(quiet),silent_) - tools_silent=s -@@ -1510,7 +1516,7 @@ PHONY += $(mrproper-dirs) mrproper - $(mrproper-dirs): - $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) - --mrproper: clean $(mrproper-dirs) -+mrproper: clean $(mrproper-dirs) tools_clean - $(call cmd,rmfiles) - @find . $(RCS_FIND_IGNORE) \ - \( -name '*.rmeta' \) \ -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index 6b4a28bcf..6ec15c131 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -520,6 +520,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { - NULL, /* P */ - NULL, /* Q */ - NULL, /* R */ -+ /* S: May be registered by sched_ext for resetting */ - NULL, /* S */ - NULL, /* T */ - NULL, /* U */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 67d8dd2f1..575322902 100644 ---- a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -131,6 +131,7 @@ - *(__dl_sched_class) \ - *(__rt_sched_class) \ - *(__fair_sched_class) \ -+ *(__ext_sched_class) \ - *(__idle_sched_class) \ - __sched_class_lowest = .; - -diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h -index 265da00a1..6194d7c13 100644 ---- a/include/linux/cgroup-defs.h -+++ b/include/linux/cgroup-defs.h -@@ -127,12 +127,18 @@ enum { - CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ - CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - -+ CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ -+ - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ - __CFTYPE_ADDED = (1 << 18), - }; - -+enum cfile_flags { -+ CFILE_HIDDEN = (1 << 0), /* file instance hidden */ -+}; -+ - /* - * cgroup_file is the handle for a file instance created in a cgroup which - * is used, for example, to generate file changed notifications. 
This can -@@ -140,7 +146,9 @@ enum { - */ - struct cgroup_file { - /* do not access any fields from outside cgroup core */ -+ struct cftype *cft; - struct kernfs_node *kn; -+ unsigned int flags; - unsigned long notified_at; - struct timer_list notify_timer; - }; -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index b307013b9..08b54094b 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -29,8 +29,6 @@ - - struct kernel_clone_args; - --#ifdef CONFIG_CGROUPS -- - /* - * All weight knobs on the default hierarchy should use the following min, - * default and max values. The default value is the logarithmic center of -@@ -40,6 +38,8 @@ struct kernel_clone_args; - #define CGROUP_WEIGHT_DFL 100 - #define CGROUP_WEIGHT_MAX 10000 - -+#ifdef CONFIG_CGROUPS -+ - /* walk only threadgroup leaders */ - #define CSS_TASK_ITER_PROCS (1U << 0) - /* walk all threaded css_sets in the domain */ -@@ -115,6 +115,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); - int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_rm_cftypes(struct cftype *cfts); -+void cgroup_show_cftype(struct cftype *cft, bool show); - void cgroup_file_notify(struct cgroup_file *cfile); - void cgroup_file_show(struct cgroup_file *cfile, bool show); - -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 77f01ac38..f81ff964c 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -71,6 +71,8 @@ struct task_delay_info; - struct task_group; - struct user_event_mm; - -+#include -+ - /* - * Task state bitmask. NOTE! These bits are also - * encoded in fs/proc/array.c: get_task_state(). -@@ -794,6 +796,9 @@ struct task_struct { - struct sched_entity se; - struct sched_rt_entity rt; - struct sched_dl_entity dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct sched_ext_entity scx; -+#endif - const struct sched_class *sched_class; - - #ifdef CONFIG_SCHED_CORE -diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -new file mode 100644 -index 000000000..b20a7620b ---- /dev/null -+++ b/include/linux/sched/ext.h -@@ -0,0 +1,716 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef _LINUX_SCHED_EXT_H -+#define _LINUX_SCHED_EXT_H -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+#include -+#include -+ -+struct cgroup; -+ -+enum scx_consts { -+ SCX_OPS_NAME_LEN = 128, -+ SCX_EXIT_REASON_LEN = 128, -+ SCX_EXIT_BT_LEN = 64, -+ SCX_EXIT_MSG_LEN = 1024, -+ -+ SCX_SLICE_DFL = 20 * NSEC_PER_MSEC, -+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ -+}; -+ -+/* -+ * DSQ (dispatch queue) IDs are 64bit of the format: -+ * -+ * Bits: [63] [62 .. 0] -+ * [ B] [ ID ] -+ * -+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs -+ * ID: 63 bit ID -+ * -+ * Built-in IDs: -+ * -+ * Bits: [63] [62] [61..32] [31 .. 0] -+ * [ 1] [ L] [ R ] [ V ] -+ * -+ * 1: 1 for built-in DSQs. -+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others -+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. 
-+ */ -+enum scx_dsq_id_flags { -+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, -+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, -+ -+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, -+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, -+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, -+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, -+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, -+}; -+ -+enum scx_exit_kind { -+ SCX_EXIT_NONE, -+ SCX_EXIT_DONE, -+ -+ SCX_EXIT_UNREG = 64, /* BPF unregistration */ -+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ -+ -+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ -+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ -+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -+}; -+ -+/* -+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is -+ * being disabled. -+ */ -+struct scx_exit_info { -+ /* %SCX_EXIT_* - broad category of the exit reason */ -+ enum scx_exit_kind kind; -+ /* textual representation of the above */ -+ char reason[SCX_EXIT_REASON_LEN]; -+ /* number of entries in the backtrace */ -+ u32 bt_len; -+ /* backtrace if exiting due to an error */ -+ unsigned long bt[SCX_EXIT_BT_LEN]; -+ /* extra message */ -+ char msg[SCX_EXIT_MSG_LEN]; -+}; -+ -+/* sched_ext_ops.flags */ -+enum scx_ops_flags { -+ /* -+ * Keep built-in idle tracking even if ops.update_idle() is implemented. -+ */ -+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, -+ -+ /* -+ * By default, if there are no other task to run on the CPU, ext core -+ * keeps running the current task even after its slice expires. If this -+ * flag is specified, such tasks are passed to ops.enqueue() with -+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. -+ */ -+ SCX_OPS_ENQ_LAST = 1LLU << 1, -+ -+ /* -+ * An exiting task may schedule after PF_EXITING is set. In such cases, -+ * bpf_task_from_pid() may not be able to find the task and if the BPF -+ * scheduler depends on pid lookup for dispatching, the task will be -+ * lost leading to various issues including RCU grace period stalls. -+ * -+ * To mask this problem, by default, unhashed tasks are automatically -+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't -+ * depend on pid lookups and wants to handle these tasks directly, the -+ * following flag can be used. -+ */ -+ SCX_OPS_ENQ_EXITING = 1LLU << 2, -+ -+ /* -+ * CPU cgroup knob enable flags -+ */ -+ SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ -+ -+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | -+ SCX_OPS_ENQ_LAST | -+ SCX_OPS_ENQ_EXITING | -+ SCX_OPS_CGROUP_KNOB_WEIGHT, -+}; -+ -+/* argument container for ops.enable() and friends */ -+struct scx_enable_args { -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /* the cgroup the task is joining */ -+ struct cgroup *cgroup; -+#endif -+}; -+ -+/* argument container for ops->cgroup_init() */ -+struct scx_cgroup_init_args { -+ /* the weight of the cgroup [1..10000] */ -+ u32 weight; -+}; -+ -+enum scx_cpu_preempt_reason { -+ /* next task is being scheduled by &sched_class_rt */ -+ SCX_CPU_PREEMPT_RT, -+ /* next task is being scheduled by &sched_class_dl */ -+ SCX_CPU_PREEMPT_DL, -+ /* next task is being scheduled by &sched_class_stop */ -+ SCX_CPU_PREEMPT_STOP, -+ /* unknown reason for SCX being preempted */ -+ SCX_CPU_PREEMPT_UNKNOWN, -+}; -+ -+/* -+ * Argument container for ops->cpu_acquire(). Currently empty, but may be -+ * expanded in the future. 
-+ */ -+struct scx_cpu_acquire_args {}; -+ -+/* argument container for ops->cpu_release() */ -+struct scx_cpu_release_args { -+ /* the reason the CPU was preempted */ -+ enum scx_cpu_preempt_reason reason; -+ -+ /* the task that's going to be scheduled on the CPU */ -+ struct task_struct *task; -+}; -+ -+/** -+ * struct sched_ext_ops - Operation table for BPF scheduler implementation -+ * -+ * Userland can implement an arbitrary scheduling policy by implementing and -+ * loading operations in this table. -+ */ -+struct sched_ext_ops { -+ /** -+ * select_cpu - Pick the target CPU for a task which is being woken up -+ * @p: task being woken up -+ * @prev_cpu: the cpu @p was on before sleeping -+ * @wake_flags: SCX_WAKE_* -+ * -+ * Decision made here isn't final. @p may be moved to any CPU while it -+ * is getting dispatched for execution later. However, as @p is not on -+ * the rq at this point, getting the eventual execution CPU right here -+ * saves a small bit of overhead down the line. -+ * -+ * If an idle CPU is returned, the CPU is kicked and will try to -+ * dispatch. While an explicit custom mechanism can be added, -+ * select_cpu() serves as the default way to wake up idle CPUs. -+ */ -+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); -+ -+ /** -+ * enqueue - Enqueue a task on the BPF scheduler -+ * @p: task being enqueued -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() -+ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf -+ * scheduler owns @p and if it fails to dispatch @p, the task will -+ * stall. -+ */ -+ void (*enqueue)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * dequeue - Remove a task from the BPF scheduler -+ * @p: task being dequeued -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * Remove @p from the BPF scheduler. This is usually called to isolate -+ * the task while updating its scheduling properties (e.g. priority). -+ * -+ * The ext core keeps track of whether the BPF side owns a given task or -+ * not and can gracefully ignore spurious dispatches from BPF side, -+ * which makes it safe to not implement this method. However, depending -+ * on the scheduling logic, this can lead to confusing behaviors - e.g. -+ * scheduling position not being updated across a priority change. -+ */ -+ void (*dequeue)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs -+ * @cpu: CPU to dispatch tasks for -+ * @prev: previous task being switched out -+ * -+ * Called when a CPU's local dsq is empty. The operation should dispatch -+ * one or more tasks from the BPF scheduler into the DSQs using -+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using -+ * scx_bpf_consume(). -+ * -+ * The maximum number of times scx_bpf_dispatch() can be called without -+ * an intervening scx_bpf_consume() is specified by -+ * ops.dispatch_max_batch. See the comments on top of the two functions -+ * for more details. -+ * -+ * When not %NULL, @prev is an SCX task with its slice depleted. If -+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in -+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after -+ * ops.dispatch() returns. To keep executing @prev, return without -+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. 
-+ */ -+ void (*dispatch)(s32 cpu, struct task_struct *prev); -+ -+ /** -+ * runnable - A task is becoming runnable on its associated CPU -+ * @p: task becoming runnable -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * This and the following three functions can be used to track a task's -+ * execution state transitions. A task becomes ->runnable() on a CPU, -+ * and then goes through one or more ->running() and ->stopping() pairs -+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's -+ * done running on the CPU. -+ * -+ * @p is becoming runnable on the CPU because it's -+ * -+ * - waking up (%SCX_ENQ_WAKEUP) -+ * - being moved from another CPU -+ * - being restored after temporarily taken off the queue for an -+ * attribute change. -+ * -+ * This and ->enqueue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be followed by ->enqueue() -+ * e.g. when @p is being dispatched to a remote CPU. Likewise, a task -+ * may be ->enqueue()'d without being preceded by this operation e.g. -+ * after exhausting its slice. -+ */ -+ void (*runnable)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * running - A task is starting to run on its associated CPU -+ * @p: task starting to run -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ */ -+ void (*running)(struct task_struct *p); -+ -+ /** -+ * stopping - A task is stopping execution -+ * @p: task stopping to run -+ * @runnable: is task @p still runnable? -+ * -+ * See ->runnable() for explanation on the task state notifiers. If -+ * !@runnable, ->quiescent() will be invoked after this operation -+ * returns. -+ */ -+ void (*stopping)(struct task_struct *p, bool runnable); -+ -+ /** -+ * quiescent - A task is becoming not runnable on its associated CPU -+ * @p: task becoming not runnable -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ * -+ * @p is becoming quiescent on the CPU because it's -+ * -+ * - sleeping (%SCX_DEQ_SLEEP) -+ * - being moved to another CPU -+ * - being temporarily taken off the queue for an attribute change -+ * (%SCX_DEQ_SAVE) -+ * -+ * This and ->dequeue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be preceded by ->dequeue() -+ * e.g. when @p is being dispatched to a remote CPU. -+ */ -+ void (*quiescent)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * yield - Yield CPU -+ * @from: yielding task -+ * @to: optional yield target task -+ * -+ * If @to is NULL, @from is yielding the CPU to other runnable tasks. -+ * The BPF scheduler should ensure that other available tasks are -+ * dispatched before the yielding task. Return value is ignored in this -+ * case. -+ * -+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf -+ * scheduler can implement the request, return %true; otherwise, %false. -+ */ -+ bool (*yield)(struct task_struct *from, struct task_struct *to); -+ -+ /** -+ * core_sched_before - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Used by core-sched to determine the ordering between two tasks. See -+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on -+ * core-sched. -+ * -+ * Both @a and @b are runnable and may or may not currently be queued on -+ * the BPF scheduler. Should return %true if @a should run before @b. -+ * %false if there's no required ordering or @b should run before @a. 
-+ *
-+ * If not specified, the default is ordering them according to when they
-+ * became runnable.
-+ */
-+ bool (*core_sched_before)(struct task_struct *a,struct task_struct *b);
-+
-+ /**
-+ * set_weight - Set task weight
-+ * @p: task to set weight for
-+ * @weight: new weight [1..10000]
-+ *
-+ * Update @p's weight to @weight.
-+ */
-+ void (*set_weight)(struct task_struct *p, u32 weight);
-+
-+ /**
-+ * set_cpumask - Set CPU affinity
-+ * @p: task to set CPU affinity for
-+ * @cpumask: cpumask of cpus that @p can run on
-+ *
-+ * Update @p's CPU affinity to @cpumask.
-+ */
-+ void (*set_cpumask)(struct task_struct *p,
-+ const struct cpumask *cpumask);
-+
-+ /**
-+ * update_idle - Update the idle state of a CPU
-+ * @cpu: CPU to update the idle state for
-+ * @idle: whether entering or exiting the idle state
-+ *
-+ * This operation is called when @rq's CPU goes or leaves the idle
-+ * state. By default, implementing this operation disables the built-in
-+ * idle CPU tracking and the following helpers become unavailable:
-+ *
-+ * - scx_bpf_select_cpu_dfl()
-+ * - scx_bpf_test_and_clear_cpu_idle()
-+ * - scx_bpf_pick_idle_cpu()
-+ *
-+ * The user also must implement ops.select_cpu() as the default
-+ * implementation relies on scx_bpf_select_cpu_dfl().
-+ *
-+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle
-+ * tracking.
-+ */
-+ void (*update_idle)(s32 cpu, bool idle);
-+
-+ /**
-+ * cpu_acquire - A CPU is becoming available to the BPF scheduler
-+ * @cpu: The CPU being acquired by the BPF scheduler.
-+ * @args: Acquire arguments, see the struct definition.
-+ *
-+ * A CPU that was previously released from the BPF scheduler is now once
-+ * again under its control.
-+ */
-+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args);
-+
-+ /**
-+ * cpu_release - A CPU is taken away from the BPF scheduler
-+ * @cpu: The CPU being released by the BPF scheduler.
-+ * @args: Release arguments, see the struct definition.
-+ *
-+ * The specified CPU is no longer under the control of the BPF
-+ * scheduler. This could be because it was preempted by a higher
-+ * priority sched_class, though there may be other reasons as well. The
-+ * caller should consult @args->reason to determine the cause.
-+ */
-+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args);
-+
-+ /**
-+ * cpu_online - A CPU became online
-+ * @cpu: CPU which just came up
-+ *
-+ * @cpu just came online. @cpu doesn't call ops.enqueue() or run tasks
-+ * associated with other CPUs beforehand.
-+ */
-+ void (*cpu_online)(s32 cpu);
-+
-+ /**
-+ * cpu_offline - A CPU is going offline
-+ * @cpu: CPU which is going offline
-+ *
-+ * @cpu is going offline. @cpu doesn't call ops.enqueue() or run tasks
-+ * associated with other CPUs afterwards.
-+ */
-+ void (*cpu_offline)(s32 cpu);
-+
-+ /**
-+ * prep_enable - Prepare to enable BPF scheduling for a task
-+ * @p: task to prepare BPF scheduling for
-+ * @args: enable arguments, see the struct definition
-+ *
-+ * Either we're loading a BPF scheduler or a new task is being forked.
-+ * Prepare BPF scheduling for @p. This operation may block and can be
-+ * used for allocations.
-+ *
-+ * Return 0 for success, -errno for failure. An error return while
-+ * loading will abort loading of the BPF scheduler. During a fork, will
-+ * abort the specific fork.
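Because implementing ops.update_idle() disables the built-in idle tracking unless %SCX_OPS_KEEP_BUILTIN_IDLE is set, the cheapest useful implementation just observes transitions. A minimal sketch, assuming the scheduler also sets that flag in ops.flags so the idle helpers listed above stay available:

	u64 nr_idle_entries;	/* readable from userspace via the BPF skeleton */

	void BPF_STRUCT_OPS(sketch_update_idle, s32 cpu, bool idle)
	{
		if (idle)
			__sync_fetch_and_add(&nr_idle_entries, 1);
	}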
-+ */
-+ s32 (*prep_enable)(struct task_struct *p, struct scx_enable_args *args);
-+
-+ /**
-+ * enable - Enable BPF scheduling for a task
-+ * @p: task to enable BPF scheduling for
-+ * @args: enable arguments, see the struct definition
-+ *
-+ * Enable @p for BPF scheduling. @p is now in the cgroup specified for
-+ * the preceding prep_enable() and will start running soon.
-+ */
-+ void (*enable)(struct task_struct *p, struct scx_enable_args *args);
-+
-+ /**
-+ * cancel_enable - Cancel prep_enable()
-+ * @p: task being canceled
-+ * @args: enable arguments, see the struct definition
-+ *
-+ * @p was prep_enable()'d but failed before reaching enable(). Undo the
-+ * preparation.
-+ */
-+ void (*cancel_enable)(struct task_struct *p,
-+ struct scx_enable_args *args);
-+
-+ /**
-+ * disable - Disable BPF scheduling for a task
-+ * @p: task to disable BPF scheduling for
-+ *
-+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded.
-+ * Disable BPF scheduling for @p.
-+ */
-+ void (*disable)(struct task_struct *p);
-+
-+#ifdef CONFIG_EXT_GROUP_SCHED
-+ /**
-+ * cgroup_init - Initialize a cgroup
-+ * @cgrp: cgroup being initialized
-+ * @args: init arguments, see the struct definition
-+ *
-+ * Either the BPF scheduler is being loaded or @cgrp created, initialize
-+ * @cgrp for sched_ext. This operation may block.
-+ *
-+ * Return 0 for success, -errno for failure. An error return while
-+ * loading will abort loading of the BPF scheduler. During cgroup
-+ * creation, it will abort the specific cgroup creation.
-+ */
-+ s32 (*cgroup_init)(struct cgroup *cgrp,
-+ struct scx_cgroup_init_args *args);
-+
-+ /**
-+ * cgroup_exit - Exit a cgroup
-+ * @cgrp: cgroup being exited
-+ *
-+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit
-+ * @cgrp for sched_ext. This operation may block.
-+ */
-+ void (*cgroup_exit)(struct cgroup *cgrp);
-+
-+ /**
-+ * cgroup_prep_move - Prepare a task to be moved to a different cgroup
-+ * @p: task being moved
-+ * @from: cgroup @p is being moved from
-+ * @to: cgroup @p is being moved to
-+ *
-+ * Prepare @p for move from cgroup @from to @to. This operation may
-+ * block and can be used for allocations.
-+ *
-+ * Return 0 for success, -errno for failure. An error return aborts the
-+ * migration.
-+ */
-+ s32 (*cgroup_prep_move)(struct task_struct *p,
-+ struct cgroup *from, struct cgroup *to);
-+
-+ /**
-+ * cgroup_move - Commit cgroup move
-+ * @p: task being moved
-+ * @from: cgroup @p is being moved from
-+ * @to: cgroup @p is being moved to
-+ *
-+ * Commit the move. @p is dequeued during this operation.
-+ */
-+ void (*cgroup_move)(struct task_struct *p,
-+ struct cgroup *from, struct cgroup *to);
-+
-+ /**
-+ * cgroup_cancel_move - Cancel cgroup move
-+ * @p: task whose cgroup move is being canceled
-+ * @from: cgroup @p was being moved from
-+ * @to: cgroup @p was being moved to
-+ *
-+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move().
-+ * Undo the preparation.
-+ */
-+ void (*cgroup_cancel_move)(struct task_struct *p,
-+ struct cgroup *from, struct cgroup *to);
-+
-+ /**
-+ * cgroup_set_weight - A cgroup's weight is being changed
-+ * @cgrp: cgroup whose weight is being updated
-+ * @weight: new weight [1..10000]
-+ *
-+ * Update @cgrp's weight to @weight.
-+ */
-+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight);
-+#endif /* CONFIG_EXT_GROUP_SCHED */
-+
-+ /*
-+ * All online ops must come before ops.init().
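ops.prep_enable() being allowed to block makes it the natural place to allocate per-task state. A minimal sketch using a BPF task-storage map; the map, struct and callback names are illustrative and not part of this patch:

	struct task_ctx {
		u64 nr_enqueues;
	};

	struct {
		__uint(type, BPF_MAP_TYPE_TASK_STORAGE);
		__uint(map_flags, BPF_F_NO_PREALLOC);
		__type(key, int);
		__type(value, struct task_ctx);
	} task_ctxs SEC(".maps");

	s32 BPF_STRUCT_OPS(sketch_prep_enable, struct task_struct *p,
			   struct scx_enable_args *args)
	{
		/* an error here aborts the fork or the scheduler load */
		if (!bpf_task_storage_get(&task_ctxs, p, NULL,
					  BPF_LOCAL_STORAGE_GET_F_CREATE))
			return -ENOMEM;
		return 0;
	}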
-+ */ -+ -+ /** -+ * init - Initialize the BPF scheduler -+ */ -+ s32 (*init)(void); -+ -+ /** -+ * exit - Clean up after the BPF scheduler -+ * @info: Exit info -+ */ -+ void (*exit)(struct scx_exit_info *info); -+ -+ /** -+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch -+ */ -+ u32 dispatch_max_batch; -+ -+ /** -+ * flags - %SCX_OPS_* flags -+ */ -+ u64 flags; -+ -+ /** -+ * timeout_ms - The maximum amount of time, in milliseconds, that a -+ * runnable task should be able to wait before being scheduled. The -+ * maximum timeout may not exceed the default timeout of 30 seconds. -+ * -+ * Defaults to the maximum allowed timeout value of 30 seconds. -+ */ -+ u32 timeout_ms; -+ -+ /** -+ * name - BPF scheduler's name -+ * -+ * Must be a non-zero valid BPF object name including only isalnum(), -+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the -+ * BPF scheduler is enabled. -+ */ -+ char name[SCX_OPS_NAME_LEN]; -+}; -+ -+/* -+ * Dispatch queue (dsq) is a simple FIFO which is used to buffer between the -+ * scheduler core and the BPF scheduler. See the documentation for more details. -+ */ -+struct scx_dispatch_q { -+ raw_spinlock_t lock; -+ struct list_head fifo; /* processed in dispatching order */ -+ struct rb_root_cached priq; /* processed in p->scx.dsq_vtime order */ -+ u32 nr; -+ u64 id; -+ struct rhash_head hash_node; -+ struct llist_node free_node; -+ struct rcu_head rcu; -+}; -+ -+/* scx_entity.flags */ -+enum scx_ent_flags { -+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ -+ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ -+ SCX_TASK_ENQ_LOCAL = 1 << 2, /* used by scx_select_cpu_dfl() to set SCX_ENQ_LOCAL */ -+ -+ SCX_TASK_OPS_PREPPED = 1 << 8, /* prepared for BPF scheduler enable */ -+ SCX_TASK_OPS_ENABLED = 1 << 9, /* task has BPF scheduler enabled */ -+ -+ SCX_TASK_WATCHDOG_RESET = 1 << 16, /* task watchdog counter should be reset */ -+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 17, /* last dequeue was for SLEEP */ -+ -+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* scx_entity.dsq_flags */ -+enum scx_ent_dsq_flags { -+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ -+}; -+ -+/* -+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from -+ * everywhere and the following bits track which kfunc sets are currently -+ * allowed for %current. This simple per-task tracking works because SCX ops -+ * nest in a limited way. BPF will likely implement a way to allow and disallow -+ * kfuncs depending on the calling context which will replace this manual -+ * mechanism. See scx_kf_allow(). 
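Tying the callbacks together, a BPF scheduler publishes one instance of this table as a struct_ops map. A minimal sketch in the style of the example schedulers, assuming libbpf's SEC() macro and the callback sketches shown earlier; the batch and timeout values simply respect the limits documented above:

	SEC(".struct_ops")
	struct sched_ext_ops sketch_ops = {
		.enqueue		= (void *)sketch_enqueue,
		.dispatch		= (void *)sketch_dispatch,
		.prep_enable		= (void *)sketch_prep_enable,
		.dispatch_max_batch	= 32,
		.timeout_ms		= 5000,	/* must stay <= 30000 */
		.name			= "sketch",
	};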
-+ */
-+enum scx_kf_mask {
-+ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */
-+ /* all non-sleepables may be nested inside INIT and SLEEPABLE */
-+ SCX_KF_INIT = 1 << 0, /* running ops.init() */
-+ SCX_KF_SLEEPABLE = 1 << 1, /* other sleepable init operations */
-+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */
-+ SCX_KF_CPU_RELEASE = 1 << 2, /* ops.cpu_release() */
-+ /* ops.dequeue (in REST) may be nested inside DISPATCH */
-+ SCX_KF_DISPATCH = 1 << 3, /* ops.dispatch() */
-+ SCX_KF_ENQUEUE = 1 << 4, /* ops.enqueue() */
-+ SCX_KF_REST = 1 << 5, /* other rq-locked operations */
-+
-+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH |
-+ SCX_KF_ENQUEUE | SCX_KF_REST,
-+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_REST,
-+};
-+
-+/*
-+ * The following is embedded in task_struct and contains all fields necessary
-+ * for a task to be scheduled by SCX.
-+ */
-+struct sched_ext_entity {
-+ struct scx_dispatch_q *dsq;
-+ struct {
-+ struct list_head fifo; /* dispatch order */
-+ struct rb_node priq; /* p->scx.dsq_vtime order */
-+ } dsq_node;
-+ struct list_head watchdog_node;
-+ u32 flags; /* protected by rq lock */
-+ u32 dsq_flags; /* protected by dsq lock */
-+ u32 weight;
-+ s32 sticky_cpu;
-+ s32 holding_cpu;
-+ u32 kf_mask; /* see scx_kf_mask above */
-+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */
-+ atomic_long_t ops_state;
-+ unsigned long runnable_at;
-+#ifdef CONFIG_SCHED_CORE
-+ u64 core_sched_at; /* see scx_prio_less() */
-+#endif
-+
-+ /* BPF scheduler modifiable fields */
-+
-+ /*
-+ * Runtime budget in nsecs. This is usually set through
-+ * scx_bpf_dispatch() but can also be modified directly by the BPF
-+ * scheduler. Automatically decreased by SCX as the task executes. On
-+ * depletion, a scheduling event is triggered.
-+ *
-+ * This value is cleared to zero if the task is preempted by
-+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the
-+ * task ran. Use p->se.sum_exec_runtime instead.
-+ */
-+ u64 slice;
-+
-+ /*
-+ * Used to order tasks when dispatching to the vtime-ordered priority
-+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime()
-+ * but can also be modified directly by the BPF scheduler. Modifying it
-+ * while a task is queued on a dsq may mangle the ordering and is not
-+ * recommended.
-+ */
-+ u64 dsq_vtime;
-+
-+ /*
-+ * If set, reject future sched_setscheduler(2) calls updating the policy
-+ * to %SCHED_EXT with -%EACCES.
-+ *
-+ * If set from ops.prep_enable() and the task's policy is already
-+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded
-+ * or by inheriting the parent's policy during fork, the task's policy is
-+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of such
-+ * events is reported through /sys/kernel/debug/sched_ext::nr_rejected.
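The two BPF-modifiable fields above are what weight-aware policies manipulate. A minimal sketch of vtime-ordered enqueueing, assuming an illustrative SHARED_DSQ id and a global vtime_now clock advanced elsewhere (e.g. in ops.stopping()); scx_bpf_dispatch_vtime() is the helper the dsq_vtime comment refers to:

	static u64 vtime_now;	/* advanced as tasks stop running; omitted here */

	static inline bool vtime_before(u64 a, u64 b)
	{
		return (s64)(a - b) < 0;
	}

	void BPF_STRUCT_OPS(sketch_vtime_enqueue, struct task_struct *p, u64 enq_flags)
	{
		u64 vtime = p->scx.dsq_vtime;

		/* cap the credit a long sleeper can accumulate to one default slice */
		if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL))
			vtime = vtime_now - SCX_SLICE_DFL;

		scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, enq_flags);
	}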
-+ */ -+ bool disallow; /* reject switching into SCX */ -+ -+ /* cold fields */ -+ struct list_head tasks_node; -+#ifdef CONFIG_EXT_GROUP_SCHED -+ struct cgroup *cgrp_moving_from; -+#endif -+}; -+ -+void sched_ext_free(struct task_struct *p); -+void print_scx_info(const char *log_lvl, struct task_struct *p); -+ -+#else /* !CONFIG_SCHED_CLASS_EXT */ -+ -+static inline void sched_ext_free(struct task_struct *p) {} -+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+#endif /* _LINUX_SCHED_EXT_H */ -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index a23af225c..03d35e3ed 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -61,7 +61,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); - extern void init_idle(struct task_struct *idle, int cpu); - - extern int sched_fork(unsigned long clone_flags, struct task_struct *p); --extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern void sched_cancel_fork(struct task_struct *p); - extern void sched_post_fork(struct task_struct *p); - extern void sched_dead(struct task_struct *p); - -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ce..359a14cc7 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -118,6 +118,7 @@ struct clone_args { - /* SCHED_ISO: reserved but not implemented yet */ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 -+#define SCHED_EXT 7 - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index 6d35728b9..6a247f11c 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1012,6 +1012,11 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. - -+config EXT_GROUP_SCHED -+ bool -+ depends on SCHED_CLASS_EXT && CGROUP_SCHED -+ default y -+ - endif #CGROUP_SCHED - - config SCHED_MM_CID -diff --git a/init/init_task.c b/init/init_task.c -index ff6c4b9bf..7ea89ccd0 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -6,6 +6,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -101,6 +102,17 @@ struct task_struct init_task - #endif - #ifdef CONFIG_CGROUP_SCHED - .sched_task_group = &root_task_group, -+#endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+ .scx = { -+ .dsq_node.fifo = LIST_HEAD_INIT(init_task.scx.dsq_node.fifo), -+ .watchdog_node = LIST_HEAD_INIT(init_task.scx.watchdog_node), -+ .sticky_cpu = -1, -+ .holding_cpu = -1, -+ .ops_state = ATOMIC_INIT(0), -+ .runnable_at = INITIAL_JIFFIES, -+ .slice = SCX_SLICE_DFL, -+ }, - #endif - .ptraced = LIST_HEAD_INIT(init_task.ptraced), - .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a..bae49b743 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -133,4 +133,26 @@ config SCHED_CORE - which is the likely usage by Linux distributions, there should - be no measurable impact on performance. 
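From userspace, the new SCHED_EXT policy value added to the uapi header above is all that is needed to opt a task in. A minimal sketch using glibc's sched_setscheduler() wrapper; the fallback define only mirrors the constant introduced by this patch:

	#include <sched.h>
	#include <stdio.h>

	#ifndef SCHED_EXT
	#define SCHED_EXT 7		/* matches the uapi addition above */
	#endif

	int main(void)
	{
		struct sched_param param = { .sched_priority = 0 };

		/* rejected with EACCES if the loaded scheduler sets p->scx.disallow */
		if (sched_setscheduler(0, SCHED_EXT, &param))
			perror("sched_setscheduler");
		return 0;
	}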
- -- -+config SCHED_CLASS_EXT -+ bool "Extensible Scheduling Class" -+ depends on BPF_SYSCALL && BPF_JIT -+ help -+ This option enables a new scheduler class sched_ext (SCX), which -+ allows scheduling policies to be implemented as BPF programs to -+ achieve the following: -+ -+ - Ease of experimentation and exploration: Enabling rapid -+ iteration of new scheduling policies. -+ - Customization: Building application-specific schedulers which -+ implement policies that are not applicable to general-purpose -+ schedulers. -+ - Rapid scheduler deployments: Non-disruptive swap outs of -+ scheduling policies in production environments. -+ -+ sched_ext leverages BPF’s struct_ops feature to define a structure -+ which exports function callbacks and flags to BPF programs that -+ wish to implement scheduling policies. The struct_ops structure -+ exported by sched_ext is struct sched_ext_ops, and is conceptually -+ similar to struct sched_class. -+ -+ See Documentation/scheduler/sched-ext.rst for more details. -diff --git a/kernel/bpf/bpf_struct_ops_types.h b/kernel/bpf/bpf_struct_ops_types.h -index 5678a9ddf..3618769d8 100644 ---- a/kernel/bpf/bpf_struct_ops_types.h -+++ b/kernel/bpf/bpf_struct_ops_types.h -@@ -9,4 +9,8 @@ BPF_STRUCT_OPS_TYPE(bpf_dummy_ops) - #include - BPF_STRUCT_OPS_TYPE(tcp_congestion_ops) - #endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+#include -+BPF_STRUCT_OPS_TYPE(sched_ext_ops) -+#endif - #endif -diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index 518725b57..f426f4be7 100644 ---- a/kernel/cgroup/cgroup.c -+++ b/kernel/cgroup/cgroup.c -@@ -4196,10 +4196,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - return ret; - } - -+ kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); -+ - if (cft->file_offset) { - struct cgroup_file *cfile = (void *)css + cft->file_offset; - - timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); -+ cfile->cft = cft; - - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = kn; -@@ -4475,6 +4478,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); - } - -+static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) -+{ -+ struct kernfs_node *kn; -+ -+ spin_lock_irq(&cgroup_file_kn_lock); -+ kn = cfile->kn; -+ kernfs_get(kn); -+ spin_unlock_irq(&cgroup_file_kn_lock); -+ -+ return kn; -+} -+ -+static bool cfile_visible(struct cgroup_file *cfile) -+{ -+ return !(cfile->cft->flags & CFTYPE_HIDDEN) && -+ !(cfile->flags & CFILE_HIDDEN); -+} -+ - /** - * cgroup_file_show - show or hide a hidden cgroup file - * @cfile: target cgroup_file obtained by setting cftype->file_offset -@@ -4484,15 +4505,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) - { - struct kernfs_node *kn; - -- spin_lock_irq(&cgroup_file_kn_lock); -- kn = cfile->kn; -- kernfs_get(kn); -- spin_unlock_irq(&cgroup_file_kn_lock); -+ mutex_lock(&cgroup_mutex); - -- if (kn) -- kernfs_show(kn, show); -+ if (show) -+ cfile->flags &= ~CFILE_HIDDEN; -+ else -+ cfile->flags |= CFILE_HIDDEN; - -- kernfs_put(kn); -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ -+ mutex_unlock(&cgroup_mutex); - } - - /** -@@ -5510,6 +5536,63 @@ static void offline_css(struct cgroup_subsys_state *css) - wake_up_all(&css->cgroup->offline_waitq); - } - -+/** -+ * cgroup_show_cftype - show or hide a cgroup file type -+ * @cft: cftype to show or hide -+ * @show: whether to show or hide -+ * -+ * Sets %CFTYPE_HIDDEN and shows/hides the 
matching files according to @show. -+ * @cft may or may not be added at the time of this call. After hiding, it's -+ * guaranteed that there are no in-flight operations on the hidden files. -+ */ -+void cgroup_show_cftype(struct cftype *cft, bool show) -+{ -+ struct cgroup_subsys *ss = cft->ss; -+ struct cgroup *root = ss ? &ss->root->cgrp : &cgrp_dfl_root.cgrp; -+ struct cgroup_subsys_state *css; -+ -+ mutex_lock(&cgroup_mutex); -+ -+ if (show) -+ cft->flags &= ~CFTYPE_HIDDEN; -+ else -+ cft->flags |= CFTYPE_HIDDEN; -+ -+ if (!(cft->flags & __CFTYPE_ADDED)) -+ goto out_unlock; -+ -+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { -+ struct cgroup *cgrp = css->cgroup; -+ struct kernfs_node *kn; -+ -+ if (!(css->flags & CSS_VISIBLE)) -+ continue; -+ -+ if (cft->file_offset) { -+ struct cgroup_file *cfile = -+ (void *)css + cft->file_offset; -+ -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ } else { -+ char buf[CGROUP_FILE_NAME_MAX]; -+ -+ kn = kernfs_find_and_get(cgrp->kn, -+ cgroup_file_name(cgrp, cft, buf)); -+ if (kn) { -+ kernfs_show(kn, show); -+ kernfs_put(kn); -+ } -+ } -+ } -+ -+out_unlock: -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * css_create - create a cgroup_subsys_state - * @cgrp: the cgroup new css will be associated with -diff --git a/kernel/fork.c b/kernel/fork.c -index 177ce7438..141fceb3b 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -970,6 +971,7 @@ void __put_task_struct(struct task_struct *tsk) - WARN_ON(refcount_read(&tsk->usage)); - WARN_ON(tsk == current); - -+ sched_ext_free(tsk); - io_uring_free(tsk); - cgroup_free(tsk); - task_numa_free(tsk, true); -@@ -2474,7 +2476,7 @@ __latent_entropy struct task_struct *copy_process( - - retval = perf_event_init_task(p, clone_flags); - if (retval) -- goto bad_fork_cleanup_policy; -+ goto bad_fork_sched_cancel_fork; - retval = audit_alloc(p); - if (retval) - goto bad_fork_cleanup_perf; -@@ -2606,7 +2608,9 @@ __latent_entropy struct task_struct *copy_process( - * cgroup specific, it unconditionally needs to place the task on a - * runqueue. - */ -- sched_cgroup_fork(p, args); -+ retval = sched_cgroup_fork(p, args); -+ if (retval) -+ goto bad_fork_cancel_cgroup; - - /* - * From this point on we must avoid any synchronous user-space -@@ -2652,13 +2656,13 @@ __latent_entropy struct task_struct *copy_process( - /* Don't start children in a dying pid namespace */ - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* Let kill terminate clone/fork in the middle */ - if (fatal_signal_pending(current)) { - retval = -EINTR; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* No more failure paths after this point. 
*/ -@@ -2734,10 +2738,11 @@ __latent_entropy struct task_struct *copy_process( - - return p; - --bad_fork_cancel_cgroup: -+bad_fork_core_free: - sched_core_free(p); - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); -+bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, args); - bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) { -@@ -2776,6 +2781,8 @@ __latent_entropy struct task_struct *copy_process( - audit_free(p); - bad_fork_cleanup_perf: - perf_event_free_task(p); -+bad_fork_sched_cancel_fork: -+ sched_cancel_fork(p); - bad_fork_cleanup_policy: - lockdep_free_task(p); - #ifdef CONFIG_NUMA -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab37..005025f55 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -28,6 +28,8 @@ - #include - #include - #include -+#include -+#include - - #include - -@@ -52,3 +54,6 @@ - #include "cputime.c" - #include "deadline.c" - -+#ifdef CONFIG_SCHED_CLASS_EXT -+# include "ext.c" -+#endif -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index a854b7183..5f2f52fc7 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -167,7 +167,10 @@ static inline int __task_prio(const struct task_struct *p) - if (p->sched_class == &idle_sched_class) - return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - -- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ -+ if (task_on_scx(p)) -+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ -+ -+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ - } - - /* -@@ -196,6 +199,11 @@ static inline bool prio_less(const struct task_struct *a, - if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ - return cfs_prio_less(a, b, in_fi); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ -+ return scx_prio_less(a, b, in_fi); -+#endif -+ - return false; - } - -@@ -1233,11 +1241,14 @@ bool sched_can_stop_tick(struct rq *rq) - return true; - - /* -- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; -- * if there's more than one we need the tick for involuntary -- * preemption. -+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks -+ * left. For CFS, if there's more than one we need the tick for -+ * involuntary preemption. For SCX, ask. - */ -- if (rq->nr_running > 1) -+ if (!scx_switched_all() && rq->nr_running > 1) -+ return false; -+ -+ if (scx_enabled() && !scx_can_stop_tick(rq)) - return false; - - /* -@@ -1320,8 +1331,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) - * SCHED_OTHER tasks have to update their load when changing their - * weight - */ -- if (update_load && p->sched_class == &fair_sched_class) { -- reweight_task(p, prio); -+ if (update_load && p->sched_class->reweight_task) { -+ p->sched_class->reweight_task(task_rq(p), p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; -@@ -2198,6 +2209,17 @@ inline int task_curr(const struct task_struct *p) - return cpu_curr(task_cpu(p)) == p; - } - -+/* -+ * ->switching_to() is called with the pi_lock and rq_lock held and must not -+ * mess with locking. -+ */ -+void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class) -+{ -+ if (prev_class != p->sched_class && p->sched_class->switching_to) -+ p->sched_class->switching_to(rq, p); -+} -+ - /* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. 
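For reference, with MAX_RT_PRIO = 100, MAX_NICE = 19 and NICE_WIDTH = 40, the squashed core-sched priorities produced by the __task_prio() hunk above order the classes as follows (a lower value is picked first):

	/*
	 * stop:         -2
	 * deadline/rt:  [-1, 99]
	 * fair:        119  (MAX_RT_PRIO + MAX_NICE)
	 * ext:         120  (MAX_RT_PRIO + MAX_NICE + 1)
	 * idle:        140  (MAX_RT_PRIO + NICE_WIDTH)
	 */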
-@@ -2205,9 +2227,9 @@ inline int task_curr(const struct task_struct *p) - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ --static inline void check_class_changed(struct rq *rq, struct task_struct *p, -- const struct sched_class *prev_class, -- int oldprio) -+void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio) - { - if (prev_class != p->sched_class) { - if (prev_class->switched_from) -@@ -3962,6 +3984,15 @@ bool cpus_share_cache(int this_cpu, int that_cpu) - - static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) - { -+ /* -+ * The BPF scheduler may depend on select_task_rq() being invoked during -+ * wakeups. In addition, @p may end up executing on a different CPU -+ * regardless of what happens in the wakeup path making the ttwu_queue -+ * optimization less meaningful. Skip if on SCX. -+ */ -+ if (task_on_scx(p)) -+ return false; -+ - /* - * Do not complicate things with the async wake_list while the CPU is - * in hotplug state. -@@ -4528,6 +4559,21 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->rt.on_rq = 0; - p->rt.on_list = 0; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ p->scx.dsq = NULL; -+ INIT_LIST_HEAD(&p->scx.dsq_node.fifo); -+ RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ INIT_LIST_HEAD(&p->scx.watchdog_node); -+ p->scx.flags = 0; -+ p->scx.weight = 0; -+ p->scx.sticky_cpu = -1; -+ p->scx.holding_cpu = -1; -+ p->scx.kf_mask = 0; -+ atomic64_set(&p->scx.ops_state, 0); -+ p->scx.runnable_at = INITIAL_JIFFIES; -+ p->scx.slice = SCX_SLICE_DFL; -+#endif -+ - #ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); - #endif -@@ -4731,6 +4777,8 @@ late_initcall(sched_core_sysctl_init); - */ - int sched_fork(unsigned long clone_flags, struct task_struct *p) - { -+ int ret; -+ - __sched_fork(clone_flags, p); - /* - * We mark the process as NEW here. 
This guarantees that -@@ -4767,12 +4815,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - p->sched_reset_on_fork = 0; - } - -- if (dl_prio(p->prio)) -- return -EAGAIN; -- else if (rt_prio(p->prio)) -+ scx_pre_fork(p); -+ -+ if (dl_prio(p->prio)) { -+ ret = -EAGAIN; -+ goto out_cancel; -+ } else if (rt_prio(p->prio)) { - p->sched_class = &rt_sched_class; -- else -+#ifdef CONFIG_SCHED_CLASS_EXT -+ } else if (task_should_scx(p)) { -+ p->sched_class = &ext_sched_class; -+#endif -+ } else { - p->sched_class = &fair_sched_class; -+ } - - init_entity_runnable_average(&p->se); - -@@ -4790,9 +4846,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - RB_CLEAR_NODE(&p->pushable_dl_tasks); - #endif - return 0; -+ -+out_cancel: -+ scx_cancel_fork(p); -+ return ret; - } - --void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - { - unsigned long flags; - -@@ -4819,11 +4879,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return scx_fork(p); -+} -+ -+void sched_cancel_fork(struct task_struct *p) -+{ -+ scx_cancel_fork(p); - } - - void sched_post_fork(struct task_struct *p) - { - uclamp_post_fork(p); -+ scx_post_fork(p); - } - - unsigned long to_ratio(u64 period, u64 runtime) -@@ -5668,14 +5736,17 @@ void scheduler_tick(void) - if (sched_feat(LATENCY_WARN) && resched_latency) - resched_latency_warn(cpu, resched_latency); - -+ scx_notify_sched_tick(); - perf_event_task_tick(); - - if (curr->flags & PF_WQ_WORKER) - wq_worker_tick(curr); - - #ifdef CONFIG_SMP -- rq->idle_balance = idle_cpu(cpu); -- trigger_load_balance(rq); -+ if (!scx_switched_all()) { -+ rq->idle_balance = idle_cpu(cpu); -+ trigger_load_balance(rq); -+ } - #endif - } - -@@ -5976,7 +6047,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ -- for_class_range(class, prev->sched_class, &idle_sched_class) { -+ for_balance_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -@@ -5994,6 +6065,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - const struct sched_class *class; - struct task_struct *p; - -+ if (scx_enabled()) -+ goto restart; -+ - /* - * Optimization: we know that if all tasks are in the fair class we can - * call that function directly, but only if the @prev task wasn't of a -@@ -6019,10 +6093,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - restart: - put_prev_task_balance(rq, prev, rf); - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_next_task(rq); -- if (p) -+ if (p) { -+ scx_notify_pick_next_task(rq, p, class); - return p; -+ } - } - - BUG(); /* The idle class should always have a runnable task. 
*/ -@@ -6052,7 +6128,7 @@ static inline struct task_struct *pick_task(struct rq *rq) - const struct sched_class *class; - struct task_struct *p; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_task(rq); - if (p) - return p; -@@ -7021,12 +7097,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag - } - EXPORT_SYMBOL(default_wake_function); - --static void __setscheduler_prio(struct task_struct *p, int prio) -+void __setscheduler_prio(struct task_struct *p, int prio) - { - if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ else if (task_should_scx(p)) -+ p->sched_class = &ext_sched_class; -+#endif - else - p->sched_class = &fair_sched_class; - -@@ -7161,6 +7241,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) - } - - __setscheduler_prio(p, prio); -+ check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); -@@ -7707,6 +7788,10 @@ static int __sched_setscheduler(struct task_struct *p, - goto unlock; - } - -+ retval = scx_check_setscheduler(p, policy); -+ if (retval) -+ goto unlock; -+ - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. -@@ -7809,6 +7894,7 @@ static int __sched_setscheduler(struct task_struct *p, - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); -+ check_class_changing(rq, p, prev_class); - - if (queued) { - /* -@@ -9050,6 +9136,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - break; - } -@@ -9077,6 +9164,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - } - return ret; -@@ -9180,6 +9268,7 @@ void sched_show_task(struct task_struct *p) - - print_worker_info(KERN_INFO, p); - print_stop_info(KERN_INFO, p); -+ print_scx_info(KERN_INFO, p); - show_stack(p, NULL, KERN_INFO); - put_task_stack(p); - } -@@ -9565,7 +9654,7 @@ static inline void balance_hotplug_wait(void) - - #endif /* CONFIG_HOTPLUG_CPU */ - --void set_rq_online(struct rq *rq) -+void set_rq_online(struct rq *rq, enum rq_onoff_reason reason) - { - if (!rq->online) { - const struct sched_class *class; -@@ -9575,12 +9664,12 @@ void set_rq_online(struct rq *rq) - - for_each_class(class) { - if (class->rq_online) -- class->rq_online(rq); -+ class->rq_online(rq, reason); - } - } - } - --void set_rq_offline(struct rq *rq) -+void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->online) { - const struct sched_class *class; -@@ -9588,7 +9677,7 @@ void set_rq_offline(struct rq *rq) - update_rq_clock(rq); - for_each_class(class) { - if (class->rq_offline) -- class->rq_offline(rq); -+ class->rq_offline(rq, reason); - } - - cpumask_clear_cpu(rq->cpu, rq->rd->online); -@@ -9684,7 +9773,7 @@ int sched_cpu_activate(unsigned int cpu) - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- set_rq_online(rq); -+ set_rq_online(rq, RQ_ONOFF_HOTPLUG); - } - rq_unlock_irqrestore(rq, &rf); - -@@ -9728,7 +9817,7 @@ int sched_cpu_deactivate(unsigned int cpu) - rq_lock_irqsave(rq, &rf); - if (rq->rd) { - BUG_ON(!cpumask_test_cpu(cpu, rq->rd->span)); -- set_rq_offline(rq); -+ set_rq_offline(rq, RQ_ONOFF_HOTPLUG); - } - rq_unlock_irqrestore(rq, &rf); - -@@ -9915,11 
+10004,15 @@ void __init sched_init(void) - int i; - - /* Make sure the linker didn't screw up */ -- BUG_ON(&idle_sched_class != &fair_sched_class + 1 || -- &fair_sched_class != &rt_sched_class + 1 || -- &rt_sched_class != &dl_sched_class + 1); - #ifdef CONFIG_SMP -- BUG_ON(&dl_sched_class != &stop_sched_class + 1); -+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -+#endif -+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); -+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); -+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); -+#ifdef CONFIG_SCHED_CLASS_EXT -+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); -+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); - #endif - - wait_bit_init(); -@@ -9943,6 +10036,9 @@ void __init sched_init(void) - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); - #endif /* CONFIG_FAIR_GROUP_SCHED */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; -+#endif /* CONFIG_EXT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -@@ -10090,6 +10186,7 @@ void __init sched_init(void) - balance_push_set(smp_processor_id(), false); - #endif - init_sched_fair_class(); -+ init_sched_ext_class(); - - psi_init(); - -@@ -10398,6 +10495,7 @@ struct task_group *sched_create_group(struct task_group *parent) - if (!alloc_rt_sched_group(tg, parent)) - goto err; - -+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); - alloc_uclamp_sched_group(tg, parent); - - return tg; -@@ -10524,6 +10622,7 @@ void sched_move_task(struct task_struct *tsk) - put_prev_task(rq, tsk); - - sched_change_group(tsk, group); -+ scx_move_task(tsk); - - if (queued) - enqueue_task(rq, tsk, queue_flags); -@@ -10541,11 +10640,6 @@ void sched_move_task(struct task_struct *tsk) - task_rq_unlock(rq, tsk, &rf); - } - --static inline struct task_group *css_tg(struct cgroup_subsys_state *css) --{ -- return css ? 
container_of(css, struct task_group, css) : NULL; --} -- - static struct cgroup_subsys_state * - cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) - { -@@ -10569,6 +10663,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); -+ int ret; -+ -+ ret = scx_tg_online(tg); -+ if (ret) -+ return ret; - - if (parent) - sched_online_group(tg, parent); -@@ -10585,6 +10684,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - return 0; - } - -+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ scx_tg_offline(tg); -+} -+ - static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); -@@ -10602,9 +10708,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) - sched_unregister_group(tg); - } - --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - { -+#ifdef CONFIG_RT_GROUP_SCHED - struct task_struct *task; - struct cgroup_subsys_state *css; - -@@ -10612,7 +10719,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - if (!sched_rt_can_attach(css_tg(css), task)) - return -EINVAL; - } -- return 0; -+#endif -+ return scx_cgroup_can_attach(tset); - } - #endif - -@@ -10623,8 +10731,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) - - cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); -+ -+ scx_cgroup_finish_attach(); - } - -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ scx_cgroup_cancel_attach(tset); -+} -+#endif -+ - #ifdef CONFIG_UCLAMP_TASK_GROUP - static void cpu_util_update_eff(struct cgroup_subsys_state *css) - { -@@ -10806,9 +10923,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) - static int cpu_shares_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 shareval) - { -+ int ret; -+ - if (shareval > scale_load_down(ULONG_MAX)) - shareval = MAX_SHARES; -- return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(shareval)); -+ return ret; - } - - static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, -@@ -11209,7 +11332,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, - } - #endif - --static struct cftype cpu_legacy_files[] = { -+static struct cftype cpu_legacy_cftypes[] = { - #ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", -@@ -11320,38 +11443,44 @@ static int cpu_local_stat_show(struct seq_file *sf, - return 0; - } - -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ -+static unsigned long tg_weight(struct task_group *tg) -+{ - #ifdef CONFIG_FAIR_GROUP_SCHED -+ return scale_load_down(tg->shares); -+#else -+ return sched_weight_from_cgroup(tg->scx_weight); -+#endif -+} -+ - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- struct task_group *tg = css_tg(css); -- u64 weight = scale_load_down(tg->shares); -- -- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+ return sched_weight_to_cgroup(tg_weight(css_tg(css))); - } - - static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -- struct cftype *cft, u64 
weight) -+ struct cftype *cft, u64 cgrp_weight) - { -- /* -- * cgroup weight knobs should use the common MIN, DFL and MAX -- * values which are 1, 100 and 10000 respectively. While it loses -- * a bit of range on both ends, it maps pretty well onto the shares -- * value used by scheduler and the round-trip conversions preserve -- * the original value over the entire range. -- */ -- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ unsigned long weight; -+ int ret; -+ -+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) - return -ERANGE; - -- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ weight = sched_weight_from_cgroup(cgrp_weight); - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), cgrp_weight); -+ return ret; - } - - static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- unsigned long weight = scale_load_down(css_tg(css)->shares); -+ unsigned long weight = tg_weight(css_tg(css)); - int last_delta = INT_MAX; - int prio, delta; - -@@ -11370,7 +11499,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - struct cftype *cft, s64 nice) - { - unsigned long weight; -- int idx; -+ int idx, ret; - - if (nice < MIN_NICE || nice > MAX_NICE) - return -ERANGE; -@@ -11379,7 +11508,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - idx = array_index_nospec(idx, 40); - weight = sched_prio_to_weight[idx]; - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(weight)); -+ return ret; - } - #endif - -@@ -11440,21 +11573,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, - } - #endif - --static struct cftype cpu_files[] = { --#ifdef CONFIG_FAIR_GROUP_SCHED -- { -+struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ [CPU_CFTYPE_WEIGHT] = { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_weight_read_u64, - .write_u64 = cpu_weight_write_u64, - }, -- { -+ [CPU_CFTYPE_WEIGHT_NICE] = { - .name = "weight.nice", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_weight_nice_read_s64, - .write_s64 = cpu_weight_nice_write_s64, - }, -- { -+#endif -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ [CPU_CFTYPE_IDLE] = { - .name = "idle", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_idle_read_s64, -@@ -11462,13 +11597,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH -- { -+ [CPU_CFTYPE_MAX] = { - .name = "max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_max_show, - .write = cpu_max_write, - }, -- { -+ [CPU_CFTYPE_MAX_BURST] = { - .name = "max.burst", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, -@@ -11476,13 +11611,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_UCLAMP_TASK_GROUP -- { -+ [CPU_CFTYPE_UCLAMP_MIN] = { - .name = "uclamp.min", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_min_show, - .write = cpu_uclamp_min_write, - }, -- { -+ [CPU_CFTYPE_UCLAMP_MAX] = { - .name = "uclamp.max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_max_show, -@@ -11495,16 +11630,20 @@ static struct cftype cpu_files[] = { - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = 
cpu_cgroup_css_alloc, - .css_online = cpu_cgroup_css_online, -+ .css_offline = cpu_cgroup_css_offline, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, - .css_local_stat_show = cpu_local_stat_show, --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - .can_attach = cpu_cgroup_can_attach, - #endif - .attach = cpu_cgroup_attach, -- .legacy_cftypes = cpu_legacy_files, -- .dfl_cftypes = cpu_files, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cancel_attach = cpu_cgroup_cancel_attach, -+#endif -+ .legacy_cftypes = cpu_legacy_cftypes, -+ .dfl_cftypes = cpu_cftypes, - .early_init = true, - .threaded = true, - }; -@@ -12104,3 +12243,38 @@ void sched_mm_cid_fork(struct task_struct *t) - t->mm_cid_active = 1; - } - #endif -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ *ctx = (struct sched_enq_and_set_ctx){ -+ .p = p, -+ .queue_flags = queue_flags, -+ .queued = task_on_rq_queued(p), -+ .running = task_current(rq, p), -+ }; -+ -+ update_rq_clock(rq); -+ if (ctx->queued) -+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); -+ if (ctx->running) -+ put_prev_task(rq, p); -+} -+ -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(ctx->p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (ctx->queued) -+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); -+ if (ctx->running) -+ set_next_task(rq, ctx->p); -+} -+#endif /* CONFIG_SCHED_CLASS_EXT */ -diff --git a/kernel/sched/deadline.c b/kernel/sched/deadline.c -index d78f2e876..77e7bc42e 100644 ---- a/kernel/sched/deadline.c -+++ b/kernel/sched/deadline.c -@@ -2512,7 +2512,7 @@ static void set_cpus_allowed_dl(struct task_struct *p, - } - - /* Assumes rq->lock is held */ --static void rq_online_dl(struct rq *rq) -+static void rq_online_dl(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->dl.overloaded) - dl_set_overload(rq); -@@ -2523,7 +2523,7 @@ static void rq_online_dl(struct rq *rq) - } - - /* Assumes rq->lock is held */ --static void rq_offline_dl(struct rq *rq) -+static void rq_offline_dl(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->dl.overloaded) - dl_clear_overload(rq); -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index 4c3d0d9f3..bbc6b8e37 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -374,6 +374,9 @@ static __init int sched_init_debug(void) - - debugfs_create_file("debug", 0444, debugfs_sched, NULL, &sched_debug_fops); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ debugfs_create_file("ext", 0444, debugfs_sched, NULL, &sched_ext_fops); -+#endif - return 0; - } - late_initcall(sched_init_debug); -@@ -1090,6 +1093,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - P(dl.runtime); - P(dl.deadline); - } -+#ifdef CONFIG_SCHED_CLASS_EXT -+ __PS("ext.enabled", task_on_scx(p)); -+#endif - #undef PN_SCHEDSTAT - #undef P_SCHEDSTAT - -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -new file mode 100644 -index 000000000..a4d3d8397 ---- /dev/null -+++ b/kernel/sched/ext.c -@@ -0,0 +1,4497 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) -+ -+enum scx_internal_consts { -+ SCX_NR_ONLINE_OPS = SCX_OP_IDX(init), -+ SCX_DSP_DFL_MAX_BATCH = 32, -+ SCX_DSP_MAX_LOOPS = 32, -+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, -+}; -+ -+enum scx_ops_enable_state { -+ SCX_OPS_PREPPING, -+ SCX_OPS_ENABLING, -+ SCX_OPS_ENABLED, -+ SCX_OPS_DISABLING, -+ SCX_OPS_DISABLED, -+}; -+ -+static const char *scx_ops_enable_state_str[] = { -+ [SCX_OPS_PREPPING] = "prepping", -+ [SCX_OPS_ENABLING] = "enabling", -+ [SCX_OPS_ENABLED] = "enabled", -+ [SCX_OPS_DISABLING] = "disabling", -+ [SCX_OPS_DISABLED] = "disabled", -+}; -+ -+/* -+ * sched_ext_entity->ops_state -+ * -+ * Used to track the task ownership between the SCX core and the BPF scheduler. -+ * State transitions look as follows: -+ * -+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING -+ * ^ | | -+ * | v v -+ * \-------------------------------/ -+ * -+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call -+ * sites for explanations on the conditions being waited upon and why they are -+ * safe. Transitions out of them into NONE or QUEUED must store_release and the -+ * waiters should load_acquire. -+ * -+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether -+ * any given task can be dispatched by the BPF scheduler at all times and thus -+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler -+ * to try to dispatch any task anytime regardless of its state as the SCX core -+ * can safely reject invalid dispatches. -+ */ -+enum scx_ops_state { -+ SCX_OPSS_NONE, /* owned by the SCX core */ -+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ -+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ -+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ -+ -+ /* -+ * QSEQ brands each QUEUED instance so that, when dispatch races -+ * dequeue/requeue, the dispatcher can tell whether it still has a claim -+ * on the task being dispatched. -+ * -+ * As some 32bit archs can't do 64bit store_release/load_acquire, -+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on -+ * 32bit machines. The dispatch race window QSEQ protects is very narrow -+ * and runs with IRQ disabled. 30 bits should be sufficient. -+ */ -+ SCX_OPSS_QSEQ_SHIFT = 2, -+}; -+ -+/* Use macros to ensure that the type is unsigned long for the masks */ -+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) -+ -+/* -+ * During exit, a task may schedule after losing its PIDs. When disabling the -+ * BPF scheduler, we need to be able to iterate tasks in every state to -+ * guarantee system safety. Maintain a dedicated task list which contains every -+ * task between its fork and eventual free. 
-+ */ -+static DEFINE_SPINLOCK(scx_tasks_lock); -+static LIST_HEAD(scx_tasks); -+ -+/* ops enable/disable */ -+static struct kthread_worker *scx_ops_helper; -+static DEFINE_MUTEX(scx_ops_enable_mutex); -+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -+static bool scx_switch_all_req; -+static bool scx_switching_all; -+DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -+ -+static struct sched_ext_ops scx_ops; -+static bool scx_warned_zero_slice; -+ -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -+DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); -+ -+struct static_key_false scx_has_op[SCX_NR_ONLINE_OPS] = -+ { [0 ... SCX_NR_ONLINE_OPS-1] = STATIC_KEY_FALSE_INIT }; -+ -+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -+static struct scx_exit_info scx_exit_info; -+ -+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); -+ -+/* -+ * The maximum amount of time in jiffies that a task may be runnable without -+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger -+ * scx_ops_error(). -+ */ -+unsigned long scx_watchdog_timeout; -+ -+/* -+ * The last time the delayed work was run. This delayed work relies on -+ * ksoftirqd being able to run to service timer interrupts, so it's possible -+ * that this work itself could get wedged. To account for this, we check that -+ * it's not stalled in the timer tick, and trigger an error if it is. -+ */ -+unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; -+ -+static struct delayed_work scx_watchdog_work; -+ -+/* idle tracking */ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_CPUMASK_OFFSTACK -+#define CL_ALIGNED_IF_ONSTACK -+#else -+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -+#endif -+ -+static struct { -+ cpumask_var_t cpu; -+ cpumask_var_t smt; -+} idle_masks CL_ALIGNED_IF_ONSTACK; -+ -+#endif /* CONFIG_SMP */ -+ -+/* for %SCX_KICK_WAIT */ -+static unsigned long __percpu *scx_kick_cpus_pnt_seqs; -+ -+/* -+ * Direct dispatch marker. -+ * -+ * Non-NULL values are used for direct dispatch from enqueue path. A valid -+ * pointer points to the task currently being enqueued. An ERR_PTR value is used -+ * to indicate that direct dispatch has already happened. 
-+ */ -+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -+ -+/* dispatch queues */ -+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; -+ -+static const struct rhashtable_params dsq_hash_params = { -+ .key_len = 8, -+ .key_offset = offsetof(struct scx_dispatch_q, id), -+ .head_offset = offsetof(struct scx_dispatch_q, hash_node), -+}; -+ -+static struct rhashtable dsq_hash; -+static LLIST_HEAD(dsqs_to_free); -+ -+/* dispatch buf */ -+struct scx_dsp_buf_ent { -+ struct task_struct *task; -+ unsigned long qseq; -+ u64 dsq_id; -+ u64 enq_flags; -+}; -+ -+static u32 scx_dsp_max_batch; -+static struct scx_dsp_buf_ent __percpu *scx_dsp_buf; -+ -+struct scx_dsp_ctx { -+ struct rq *rq; -+ struct rq_flags *rf; -+ u32 buf_cursor; -+ u32 nr_tasks; -+}; -+ -+static DEFINE_PER_CPU(struct scx_dsp_ctx, scx_dsp_ctx); -+ -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags); -+void scx_bpf_kick_cpu(s32 cpu, u64 flags); -+ -+struct scx_task_iter { -+ struct sched_ext_entity cursor; -+ struct task_struct *locked; -+ struct rq *rq; -+ struct rq_flags rf; -+}; -+ -+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) -+ -+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -+static u32 higher_bits(u32 flags) -+{ -+ return ~((1 << fls(flags)) - 1); -+} -+ -+/* return the mask with only the highest bit set */ -+static u32 highest_bit(u32 flags) -+{ -+ int bit = fls(flags); -+ return bit ? 1 << (bit - 1) : 0; -+} -+ -+/* -+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX -+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate -+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check -+ * whether it's running from an allowed context. -+ * -+ * @mask is constant, always inline to cull the mask calculations. -+ */ -+static __always_inline void scx_kf_allow(u32 mask) -+{ -+ /* nesting is allowed only in increasing scx_kf_mask order */ -+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, -+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", -+ current->scx.kf_mask, mask); -+ current->scx.kf_mask |= mask; -+} -+ -+static void scx_kf_disallow(u32 mask) -+{ -+ current->scx.kf_mask &= ~mask; -+} -+ -+#define SCX_CALL_OP(mask, op, args...) \ -+do { \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ scx_ops.op(args); \ -+ } \ -+} while (0) -+ -+#define SCX_CALL_OP_RET(mask, op, args...) \ -+({ \ -+ __typeof__(scx_ops.op(args)) __ret; \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ __ret = scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ __ret = scx_ops.op(args); \ -+ } \ -+ __ret; \ -+}) -+ -+/* -+ * Some kfuncs are allowed only on the tasks that are subjects of the -+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such -+ * restrictions, the following SCX_CALL_OP_*() variants should be used when -+ * invoking scx_ops operations that take task arguments. These can only be used -+ * for non-nesting operations due to the way the tasks are tracked. -+ * -+ * kfuncs which can only operate on such tasks can in turn use -+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on -+ * the specific task. -+ */ -+#define SCX_CALL_OP_TASK(mask, op, task, args...) 
\ -+do { \ -+ BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ SCX_CALL_OP(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+} while (0) -+ -+#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task, ##args)) __ret; \ -+ BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ __ret; \ -+}) -+ -+#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ -+ BUILD_BUG_ON(mask & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task0; \ -+ current->scx.kf_tasks[1] = task1; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ current->scx.kf_tasks[1] = NULL; \ -+ __ret; \ -+}) -+ -+/* @mask is constant, always inline to cull unnecessary branches */ -+static __always_inline bool scx_kf_allowed(u32 mask) -+{ -+ if (unlikely(!(current->scx.kf_mask & mask))) { -+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", -+ mask, current->scx.kf_mask); -+ return false; -+ } -+ -+ if (unlikely((mask & (SCX_KF_INIT | SCX_KF_SLEEPABLE)) && -+ in_interrupt())) { -+ scx_ops_error("sleepable kfunc called from non-sleepable context"); -+ return false; -+ } -+ -+ /* -+ * Enforce nesting boundaries. e.g. A kfunc which can be called from -+ * DISPATCH must not be called if we're running DEQUEUE which is nested -+ * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE -+ * boundary thanks to the above in_interrupt() check. -+ */ -+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && -+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { -+ scx_ops_error("cpu_release kfunc called from a nested operation"); -+ return false; -+ } -+ -+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && -+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { -+ scx_ops_error("dispatch kfunc called from a nested operation"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/* see SCX_CALL_OP_TASK() */ -+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, -+ struct task_struct *p) -+{ -+ if (!scx_kf_allowed(__SCX_KF_RQ_LOCKED)) -+ return false; -+ -+ if (unlikely((p != current->scx.kf_tasks[0] && -+ p != current->scx.kf_tasks[1]))) { -+ scx_ops_error("called on a task not being operated on"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/** -+ * scx_task_iter_init - Initialize a task iterator -+ * @iter: iterator to init -+ * -+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, -+ * @iter must eventually be exited with scx_task_iter_exit(). -+ * -+ * scx_tasks_lock may be released between this and the first next() call or -+ * between any two next() calls. If scx_tasks_lock is released between two -+ * next() calls, the caller is responsible for ensuring that the task being -+ * iterated remains accessible either through RCU read lock or obtaining a -+ * reference count. -+ * -+ * All tasks which existed when the iteration started are guaranteed to be -+ * visited as long as they still exist. 
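The locking contract above is easier to see with the intended call pattern spelled out. A kernel-side sketch of how the enable/disable paths are expected to walk every task, using the iterator helpers defined in this file (the loop body is illustrative):

	struct scx_task_iter iter;
	struct task_struct *p;

	spin_lock_irq(&scx_tasks_lock);
	scx_task_iter_init(&iter);
	while ((p = scx_task_iter_next_filtered_locked(&iter))) {
		/* p's rq lock is held here; p->scx may be updated safely */
	}
	scx_task_iter_exit(&iter);
	spin_unlock_irq(&scx_tasks_lock);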
-+ */ -+static void scx_task_iter_init(struct scx_task_iter *iter) -+{ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; -+ list_add(&iter->cursor.tasks_node, &scx_tasks); -+ iter->locked = NULL; -+} -+ -+/** -+ * scx_task_iter_exit - Exit a task iterator -+ * @iter: iterator to exit -+ * -+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. -+ * If the iterator holds a task's rq lock, that rq lock is released. See -+ * scx_task_iter_init() for details. -+ */ -+static void scx_task_iter_exit(struct scx_task_iter *iter) -+{ -+ struct list_head *cursor = &iter->cursor.tasks_node; -+ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ } -+ -+ if (list_empty(cursor)) -+ return; -+ -+ list_del_init(cursor); -+} -+ -+/** -+ * scx_task_iter_next - Next task -+ * @iter: iterator to walk -+ * -+ * Visit the next task. See scx_task_iter_init() for details. -+ */ -+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) -+{ -+ struct list_head *cursor = &iter->cursor.tasks_node; -+ struct sched_ext_entity *pos; -+ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ list_for_each_entry(pos, cursor, tasks_node) { -+ if (&pos->tasks_node == &scx_tasks) -+ return NULL; -+ if (!(pos->flags & SCX_TASK_CURSOR)) { -+ list_move(cursor, &pos->tasks_node); -+ return container_of(pos, struct task_struct, scx); -+ } -+ } -+ -+ /* can't happen, should always terminate at scx_tasks above */ -+ BUG(); -+} -+ -+/** -+ * scx_task_iter_next_filtered - Next non-idle task -+ * @iter: iterator to walk -+ * -+ * Visit the next non-idle task. See scx_task_iter_init() for details. -+ */ -+static struct task_struct * -+scx_task_iter_next_filtered(struct scx_task_iter *iter) -+{ -+ struct task_struct *p; -+ -+ while ((p = scx_task_iter_next(iter))) { -+ /* -+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs -+ * which haven't yet been onlined. Test sched_class directly. -+ */ -+ if (p->sched_class != &idle_sched_class) -+ return p; -+ } -+ return NULL; -+} -+ -+/** -+ * scx_task_iter_next_filtered_locked - Next non-idle task with its rq locked -+ * @iter: iterator to walk -+ * -+ * Visit the next non-idle task with its rq lock held. See scx_task_iter_init() -+ * for details. 
-+ */ -+static struct task_struct * -+scx_task_iter_next_filtered_locked(struct scx_task_iter *iter) -+{ -+ struct task_struct *p; -+ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ } -+ -+ p = scx_task_iter_next_filtered(iter); -+ if (!p) -+ return NULL; -+ -+ iter->rq = task_rq_lock(p, &iter->rf); -+ iter->locked = p; -+ return p; -+} -+ -+static enum scx_ops_enable_state scx_ops_enable_state(void) -+{ -+ return atomic_read(&scx_ops_enable_state_var); -+} -+ -+static enum scx_ops_enable_state -+scx_ops_set_enable_state(enum scx_ops_enable_state to) -+{ -+ return atomic_xchg(&scx_ops_enable_state_var, to); -+} -+ -+static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, -+ enum scx_ops_enable_state from) -+{ -+ int from_v = from; -+ -+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); -+} -+ -+static bool scx_ops_disabling(void) -+{ -+ return unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING); -+} -+ -+/** -+ * wait_ops_state - Busy-wait the specified ops state to end -+ * @p: target task -+ * @opss: state to wait the end of -+ * -+ * Busy-wait for @p to transition out of @opss. This can only be used when the -+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also -+ * has load_acquire semantics to ensure that the caller can see the updates made -+ * in the enqueueing and dispatching paths. -+ */ -+static void wait_ops_state(struct task_struct *p, unsigned long opss) -+{ -+ do { -+ cpu_relax(); -+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); -+} -+ -+/** -+ * ops_cpu_valid - Verify a cpu number -+ * @cpu: cpu number which came from a BPF ops -+ * -+ * @cpu is a cpu number which came from the BPF scheduler and can be any value. -+ * Verify that it is in range and one of the possible cpus. -+ */ -+static bool ops_cpu_valid(s32 cpu) -+{ -+ return likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu)); -+} -+ -+/** -+ * ops_sanitize_err - Sanitize a -errno value -+ * @ops_name: operation to blame on failure -+ * @err: -errno value to sanitize -+ * -+ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return -+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can -+ * cause misbehaviors. For an example, a large negative return from -+ * ops.prep_enable() triggers an oops when passed up the call chain because the -+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is -+ * handled as a pointer. -+ */ -+static int ops_sanitize_err(const char *ops_name, s32 err) -+{ -+ if (err < 0 && err >= -MAX_ERRNO) -+ return err; -+ -+ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); -+ return -EPROTO; -+} -+ -+/** -+ * touch_core_sched - Update timestamp used for core-sched task ordering -+ * @rq: rq to read clock from, must be locked -+ * @p: task to update the timestamp for -+ * -+ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to -+ * implement global or local-DSQ FIFO ordering for core-sched. Should be called -+ * when a task becomes runnable and its turn on the CPU ends (e.g. slice -+ * exhaustion). -+ */ -+static void touch_core_sched(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SCHED_CORE -+ /* -+ * It's okay to update the timestamp spuriously. Use -+ * sched_core_disabled() which is cheaper than enabled(). 
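The range check in ops_sanitize_err() exists because only values in [-MAX_ERRNO, -1] survive the kernel's ERR_PTR()/IS_ERR() encoding; a larger negative number stops looking like an error once it has been packed into a pointer. A standalone sketch of that failure mode, using local stand-ins for ERR_PTR()/IS_ERR() (MAX_ERRNO is 4095 as in the kernel, everything else here is illustrative):

#include <stdio.h>
#include <stdint.h>

#define MAX_ERRNO	4095

/* local stand-ins for the kernel's ERR_PTR()/IS_ERR() */
static void *err_ptr(intptr_t err)	{ return (void *)err; }
static int is_err(const void *ptr)
{
	return (uintptr_t)ptr >= (uintptr_t)-MAX_ERRNO;
}

int main(void)
{
	/* a sane errno survives the round trip and is still detected */
	printf("-EINVAL (-22)  -> is_err=%d\n", is_err(err_ptr(-22)));

	/* a rogue large negative return looks like a valid pointer instead */
	printf("bogus (-70000) -> is_err=%d\n", is_err(err_ptr(-70000)));
	return 0;
}

The second value is exactly the kind of return that ops_sanitize_err() rewrites to -EPROTO before it can be mistaken for a valid pointer further up the call chain.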
-+ */ -+ if (!sched_core_disabled()) -+ p->scx.core_sched_at = rq_clock_task(rq); -+#endif -+} -+ -+/** -+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch -+ * @rq: rq to read clock from, must be locked -+ * @p: task being dispatched -+ * -+ * If the BPF scheduler implements custom core-sched ordering via -+ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO -+ * ordering within each local DSQ. This function is called from dispatch paths -+ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. -+ */ -+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ assert_clock_updated(rq); -+ -+#ifdef CONFIG_SCHED_CORE -+ if (SCX_HAS_OP(core_sched_before)) -+ touch_core_sched(rq, p); -+#endif -+} -+ -+static void update_curr_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ u64 now = rq_clock_task(rq); -+ u64 delta_exec; -+ -+ if (time_before_eq64(now, curr->se.exec_start)) -+ return; -+ -+ delta_exec = now - curr->se.exec_start; -+ curr->se.exec_start = now; -+ curr->se.sum_exec_runtime += delta_exec; -+ account_group_exec_runtime(curr, delta_exec); -+ cgroup_account_cputime(curr, delta_exec); -+ -+ if (curr->scx.slice != SCX_SLICE_INF) { -+ curr->scx.slice -= min(curr->scx.slice, delta_exec); -+ if (!curr->scx.slice) -+ touch_core_sched(rq, curr); -+ } -+} -+ -+static bool scx_dsq_priq_less(struct rb_node *node_a, -+ const struct rb_node *node_b) -+{ -+ const struct task_struct *a = -+ container_of(node_a, struct task_struct, scx.dsq_node.priq); -+ const struct task_struct *b = -+ container_of(node_b, struct task_struct, scx.dsq_node.priq); -+ -+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); -+} -+ -+static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, -+ u64 enq_flags) -+{ -+ bool is_local = dsq->id == SCX_DSQ_LOCAL; -+ -+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.fifo)); -+ WARN_ON_ONCE((p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); -+ -+ if (!is_local) { -+ raw_spin_lock(&dsq->lock); -+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) { -+ scx_ops_error("attempting to dispatch to a destroyed dsq"); -+ /* fall back to the global dsq */ -+ raw_spin_unlock(&dsq->lock); -+ dsq = &scx_dsq_global; -+ raw_spin_lock(&dsq->lock); -+ } -+ } -+ -+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) { -+ p->scx.dsq_flags |= SCX_TASK_DSQ_ON_PRIQ; -+ rb_add_cached(&p->scx.dsq_node.priq, &dsq->priq, -+ scx_dsq_priq_less); -+ } else { -+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) -+ list_add(&p->scx.dsq_node.fifo, &dsq->fifo); -+ else -+ list_add_tail(&p->scx.dsq_node.fifo, &dsq->fifo); -+ } -+ dsq->nr++; -+ p->scx.dsq = dsq; -+ -+ /* -+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to -+ * match waiters' load_acquire. 
-+ */ -+ if (enq_flags & SCX_ENQ_CLEAR_OPSS) -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ if (is_local) { -+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); -+ bool preempt = false; -+ -+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && -+ rq->curr->sched_class == &ext_sched_class) { -+ rq->curr->scx.slice = 0; -+ preempt = true; -+ } -+ -+ if (preempt || sched_class_above(&ext_sched_class, -+ rq->curr->sched_class)) -+ resched_curr(rq); -+ } else { -+ raw_spin_unlock(&dsq->lock); -+ } -+} -+ -+static void task_unlink_from_dsq(struct task_struct *p, -+ struct scx_dispatch_q *dsq) -+{ -+ if (p->scx.dsq_flags & SCX_TASK_DSQ_ON_PRIQ) { -+ rb_erase_cached(&p->scx.dsq_node.priq, &dsq->priq); -+ RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ p->scx.dsq_flags &= ~SCX_TASK_DSQ_ON_PRIQ; -+ } else { -+ list_del_init(&p->scx.dsq_node.fifo); -+ } -+} -+ -+static bool task_linked_on_dsq(struct task_struct *p) -+{ -+ return !list_empty(&p->scx.dsq_node.fifo) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq); -+} -+ -+static void dispatch_dequeue(struct scx_rq *scx_rq, struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq = p->scx.dsq; -+ bool is_local = dsq == &scx_rq->local_dsq; -+ -+ if (!dsq) { -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ /* -+ * When dispatching directly from the BPF scheduler to a local -+ * DSQ, the task isn't associated with any DSQ but -+ * @p->scx.holding_cpu may be set under the protection of -+ * %SCX_OPSS_DISPATCHING. -+ */ -+ if (p->scx.holding_cpu >= 0) -+ p->scx.holding_cpu = -1; -+ return; -+ } -+ -+ if (!is_local) -+ raw_spin_lock(&dsq->lock); -+ -+ /* -+ * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node -+ * can't change underneath us. -+ */ -+ if (p->scx.holding_cpu < 0) { -+ /* @p must still be on @dsq, dequeue */ -+ WARN_ON_ONCE(!task_linked_on_dsq(p)); -+ task_unlink_from_dsq(p, dsq); -+ dsq->nr--; -+ } else { -+ /* -+ * We're racing against dispatch_to_local_dsq() which already -+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the -+ * holding_cpu which tells dispatch_to_local_dsq() that it lost -+ * the race. 
-+ */ -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ p->scx.holding_cpu = -1; -+ } -+ p->scx.dsq = NULL; -+ -+ if (!is_local) -+ raw_spin_unlock(&dsq->lock); -+} -+ -+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -+{ -+ lockdep_assert(rcu_read_lock_any_held()); -+ -+ if (dsq_id == SCX_DSQ_GLOBAL) -+ return &scx_dsq_global; -+ else -+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, -+ dsq_hash_params); -+} -+ -+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, -+ struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ if (dsq_id == SCX_DSQ_LOCAL) -+ return &rq->scx.local_dsq; -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", -+ dsq_id, p->comm, p->pid); -+ return &scx_dsq_global; -+ } -+ -+ return dsq; -+} -+ -+static void direct_dispatch(struct task_struct *ddsp_task, struct task_struct *p, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ /* @p must match the task which is being enqueued */ -+ if (unlikely(p != ddsp_task)) { -+ if (IS_ERR(ddsp_task)) -+ scx_ops_error("%s[%d] already direct-dispatched", -+ p->comm, p->pid); -+ else -+ scx_ops_error("enqueueing %s[%d] but trying to direct-dispatch %s[%d]", -+ ddsp_task->comm, ddsp_task->pid, -+ p->comm, p->pid); -+ return; -+ } -+ -+ /* -+ * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because -+ * dispatching to the local DSQ of a different CPU requires unlocking -+ * the current rq which isn't allowed in the enqueue path. Use -+ * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. -+ */ -+ if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { -+ scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); -+ return; -+ } -+ -+ touch_core_sched_dispatch(task_rq(p), p); -+ -+ dsq = find_dsq_for_dispatch(task_rq(p), dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ -+ /* -+ * Mark that dispatch already happened by spoiling direct_dispatch_task -+ * with a non-NULL value which can never match a valid task pointer. -+ */ -+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); -+} -+ -+static bool test_rq_online(struct rq *rq) -+{ -+#ifdef CONFIG_SMP -+ return rq->online; -+#else -+ return true; -+#endif -+} -+ -+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -+ int sticky_cpu) -+{ -+ struct task_struct **ddsp_taskp; -+ unsigned long qseq; -+ -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ if (p->scx.flags & SCX_TASK_ENQ_LOCAL) { -+ enq_flags |= SCX_ENQ_LOCAL; -+ p->scx.flags &= ~SCX_TASK_ENQ_LOCAL; -+ } -+ -+ /* rq migration */ -+ if (sticky_cpu == cpu_of(rq)) -+ goto local_norefill; -+ -+ /* -+ * If !rq->online, we already told the BPF scheduler that the CPU is -+ * offline. We're just trying to on/offline the CPU. Don't bother the -+ * BPF scheduler. 
-+ */ -+ if (unlikely(!test_rq_online(rq))) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_EXITING */ -+ if (!static_branch_unlikely(&scx_ops_enq_exiting) && -+ unlikely(p->flags & PF_EXITING)) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_LAST */ -+ if (!static_branch_unlikely(&scx_ops_enq_last) && -+ (enq_flags & SCX_ENQ_LAST)) -+ goto local; -+ -+ if (!SCX_HAS_OP(enqueue)) { -+ if (enq_flags & SCX_ENQ_LOCAL) -+ goto local; -+ else -+ goto global; -+ } -+ -+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ -+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; -+ -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); -+ -+ /* -+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or -+ * dequeue may be waiting. The store_release matches their load_acquire. -+ */ -+ if (*ddsp_taskp == p) -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); -+ *ddsp_taskp = NULL; -+ return; -+ -+local: -+ /* -+ * For task-ordering, slice refill must be treated as implying the end -+ * of the current slice. Otherwise, the longer @p stays on the CPU, the -+ * higher priority it becomes from scx_prio_less()'s POV. -+ */ -+ touch_core_sched(rq, p); -+ p->scx.slice = SCX_SLICE_DFL; -+local_norefill: -+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); -+ return; -+ -+global: -+ touch_core_sched(rq, p); /* see the comment in local: */ -+ p->scx.slice = SCX_SLICE_DFL; -+ dispatch_enqueue(&scx_dsq_global, p, enq_flags); -+} -+ -+static bool watchdog_task_watched(const struct task_struct *p) -+{ -+ return !list_empty(&p->scx.watchdog_node); -+} -+ -+static void watchdog_watch_task(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ if (p->scx.flags & SCX_TASK_WATCHDOG_RESET) -+ p->scx.runnable_at = jiffies; -+ p->scx.flags &= ~SCX_TASK_WATCHDOG_RESET; -+ list_add_tail(&p->scx.watchdog_node, &rq->scx.watchdog_list); -+} -+ -+static void watchdog_unwatch_task(struct task_struct *p, bool reset_timeout) -+{ -+ list_del_init(&p->scx.watchdog_node); -+ if (reset_timeout) -+ p->scx.flags |= SCX_TASK_WATCHDOG_RESET; -+} -+ -+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) -+{ -+ int sticky_cpu = p->scx.sticky_cpu; -+ -+ enq_flags |= rq->scx.extra_enq_flags; -+ -+ if (sticky_cpu >= 0) -+ p->scx.sticky_cpu = -1; -+ -+ /* -+ * Restoring a running task will be immediately followed by -+ * set_next_task_scx() which expects the task to not be on the BPF -+ * scheduler as tasks can only start running through local DSQs. Force -+ * direct-dispatch into the local DSQ by setting the sticky_cpu. 
-+ */ -+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) -+ sticky_cpu = cpu_of(rq); -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ WARN_ON_ONCE(!watchdog_task_watched(p)); -+ return; -+ } -+ -+ watchdog_watch_task(rq, p); -+ p->scx.flags |= SCX_TASK_QUEUED; -+ rq->scx.nr_running++; -+ add_nr_running(rq, 1); -+ -+ if (SCX_HAS_OP(runnable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); -+ -+ if (enq_flags & SCX_ENQ_WAKEUP) -+ touch_core_sched(rq, p); -+ -+ do_enqueue_task(rq, p, enq_flags, sticky_cpu); -+} -+ -+static void ops_dequeue(struct task_struct *p, u64 deq_flags) -+{ -+ unsigned long opss; -+ -+ watchdog_unwatch_task(p, false); -+ -+ /* acquire ensures that we see the preceding updates on QUEUED */ -+ opss = atomic_long_read_acquire(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_NONE: -+ break; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * QUEUEING is started and finished while holding @p's rq lock. -+ * As we're holding the rq lock now, we shouldn't see QUEUEING. -+ */ -+ BUG(); -+ case SCX_OPSS_QUEUED: -+ if (SCX_HAS_OP(dequeue)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); -+ -+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_NONE)) -+ break; -+ fallthrough; -+ case SCX_OPSS_DISPATCHING: -+ /* -+ * If @p is being dispatched from the BPF scheduler to a DSQ, -+ * wait for the transfer to complete so that @p doesn't get -+ * added to its DSQ after dequeueing is complete. -+ * -+ * As we're waiting on DISPATCHING with the rq locked, the -+ * dispatching side shouldn't try to lock the rq while -+ * DISPATCHING is set. See dispatch_to_local_dsq(). -+ * -+ * DISPATCHING shouldn't have qseq set and control can reach -+ * here with NONE @opss from the above QUEUED case block. -+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. -+ */ -+ wait_ops_state(p, SCX_OPSS_DISPATCHING); -+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ break; -+ } -+} -+ -+static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) -+{ -+ struct scx_rq *scx_rq = &rq->scx; -+ -+ if (!(p->scx.flags & SCX_TASK_QUEUED)) { -+ WARN_ON_ONCE(watchdog_task_watched(p)); -+ return; -+ } -+ -+ ops_dequeue(p, deq_flags); -+ -+ /* -+ * A currently running task which is going off @rq first gets dequeued -+ * and then stops running. As we want running <-> stopping transitions -+ * to be contained within runnable <-> quiescent transitions, trigger -+ * ->stopping() early here instead of in put_prev_task_scx(). -+ * -+ * @p may go through multiple stopping <-> running transitions between -+ * here and put_prev_task_scx() if task attribute changes occur while -+ * balance_scx() leaves @rq unlocked. However, they don't contain any -+ * information meaningful to the BPF scheduler and can be suppressed by -+ * skipping the callbacks if the task is !QUEUED. 
-+	 */
-+	if (SCX_HAS_OP(stopping) && task_current(rq, p)) {
-+		update_curr_scx(rq);
-+		SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false);
-+	}
-+
-+	if (SCX_HAS_OP(quiescent))
-+		SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags);
-+
-+	if (deq_flags & SCX_DEQ_SLEEP)
-+		p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP;
-+	else
-+		p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP;
-+
-+	p->scx.flags &= ~SCX_TASK_QUEUED;
-+	scx_rq->nr_running--;
-+	sub_nr_running(rq, 1);
-+
-+	dispatch_dequeue(scx_rq, p);
-+}
-+
-+static void yield_task_scx(struct rq *rq)
-+{
-+	struct task_struct *p = rq->curr;
-+
-+	if (SCX_HAS_OP(yield))
-+		SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL);
-+	else
-+		p->scx.slice = 0;
-+}
-+
-+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to)
-+{
-+	struct task_struct *from = rq->curr;
-+
-+	if (SCX_HAS_OP(yield))
-+		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to);
-+	else
-+		return false;
-+}
-+
-+#ifdef CONFIG_SMP
-+/**
-+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ
-+ * @rq: rq to move the task into, currently locked
-+ * @p: task to move
-+ * @enq_flags: %SCX_ENQ_*
-+ *
-+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller
-+ * must:
-+ *
-+ * 1. Start with exclusive access to @p either through its DSQ lock or
-+ *    %SCX_OPSS_DISPATCHING flag.
-+ *
-+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id().
-+ *
-+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't
-+ *    deadlock with dequeue.
-+ *
-+ * 4. Lock @rq and the task_rq from #3.
-+ *
-+ * 5. Call this function.
-+ *
-+ * Returns %true if @p was successfully moved. %false after racing dequeue and
-+ * losing.
-+ */
-+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p,
-+				   u64 enq_flags)
-+{
-+	struct rq *task_rq;
-+
-+	lockdep_assert_rq_held(rq);
-+
-+	/*
-+	 * If dequeue got to @p while we were trying to lock both rq's, it'd
-+	 * have cleared @p->scx.holding_cpu to -1. While other cpus may have
-+	 * updated it to different values afterwards, as this operation can't be
-+	 * preempted or recurse, @p->scx.holding_cpu can never become
-+	 * raw_smp_processor_id() again before we're done. Thus, we can tell
-+	 * whether we lost to dequeue by testing whether @p->scx.holding_cpu is
-+	 * still raw_smp_processor_id().
-+	 *
-+	 * See dispatch_dequeue() for the counterpart.
-+	 */
-+	if (unlikely(p->scx.holding_cpu != raw_smp_processor_id()))
-+		return false;
-+
-+	/* @p->rq couldn't have changed if we're still the holding cpu */
-+	task_rq = task_rq(p);
-+	lockdep_assert_rq_held(task_rq);
-+
-+	WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr));
-+	deactivate_task(task_rq, p, 0);
-+	set_task_cpu(p, cpu_of(rq));
-+	p->scx.sticky_cpu = cpu_of(rq);
-+
-+	/*
-+	 * We want to pass scx-specific enq_flags but activate_task() will
-+	 * truncate the upper 32 bit. As we own @rq, we can pass them through
-+	 * @rq->scx.extra_enq_flags instead.
-+	 */
-+	WARN_ON_ONCE(rq->scx.extra_enq_flags);
-+	rq->scx.extra_enq_flags = enq_flags;
-+	activate_task(rq, p, 0);
-+	rq->scx.extra_enq_flags = 0;
-+
-+	return true;
-+}
-+
-+/**
-+ * dispatch_to_local_dsq_lock - Ensure source and destination rq's are locked
-+ * @rq: current rq which is locked
-+ * @rf: rq_flags to use when unlocking @rq
-+ * @src_rq: rq to move task from
-+ * @dst_rq: rq to move task to
-+ *
-+ * We're holding @rq lock and trying to dispatch a task from @src_rq to
-+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq.
Whether -+ * @rq stays locked isn't important as long as the state is restored after -+ * dispatch_to_local_dsq_unlock(). -+ */ -+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ rq_unpin_lock(rq, rf); -+ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(dst_rq); -+ } else if (rq == src_rq) { -+ double_lock_balance(rq, dst_rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == dst_rq) { -+ double_lock_balance(rq, src_rq); -+ rq_repin_lock(rq, rf); -+ } else { -+ raw_spin_rq_unlock(rq); -+ double_rq_lock(src_rq, dst_rq); -+ } -+} -+ -+/** -+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. -+ */ -+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == src_rq) { -+ double_unlock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_unlock_balance(rq, src_rq); -+ } else { -+ double_rq_unlock(src_rq, dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } -+} -+#endif /* CONFIG_SMP */ -+ -+ -+static bool task_can_run_on_rq(struct task_struct *p, struct rq *rq) -+{ -+ return likely(test_rq_online(rq)) && !is_migration_disabled(p) && -+ cpumask_test_cpu(cpu_of(rq), p->cpus_ptr); -+} -+ -+static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq) -+{ -+ struct scx_rq *scx_rq = &rq->scx; -+ struct task_struct *p; -+ struct rb_node *rb_node; -+ struct rq *task_rq; -+ bool moved = false; -+retry: -+ if (list_empty(&dsq->fifo) && !rb_first_cached(&dsq->priq)) -+ return false; -+ -+ raw_spin_lock(&dsq->lock); -+ -+ list_for_each_entry(p, &dsq->fifo, scx.dsq_node.fifo) { -+ task_rq = task_rq(p); -+ if (rq == task_rq) -+ goto this_rq; -+ if (task_can_run_on_rq(p, rq)) -+ goto remote_rq; -+ } -+ -+ for (rb_node = rb_first_cached(&dsq->priq); rb_node; -+ rb_node = rb_next(rb_node)) { -+ p = container_of(rb_node, struct task_struct, scx.dsq_node.priq); -+ task_rq = task_rq(p); -+ if (rq == task_rq) -+ goto this_rq; -+ if (task_can_run_on_rq(p, rq)) -+ goto remote_rq; -+ } -+ -+ raw_spin_unlock(&dsq->lock); -+ return false; -+ -+this_rq: -+ /* @dsq is locked and @p is on this rq */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ list_add_tail(&p->scx.dsq_node.fifo, &scx_rq->local_dsq.fifo); -+ dsq->nr--; -+ scx_rq->local_dsq.nr++; -+ p->scx.dsq = &scx_rq->local_dsq; -+ raw_spin_unlock(&dsq->lock); -+ return true; -+ -+remote_rq: -+#ifdef CONFIG_SMP -+ /* -+ * @dsq is locked and @p is on a remote rq. @p is currently protected by -+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab -+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the -+ * rq lock or fail, do a little dancing from our side. See -+ * move_task_to_local_dsq(). 
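The holding_cpu handshake used by consume_dispatch_q(), move_task_to_local_dsq() and dispatch_dequeue() reduces to: the puller stamps the task with its own CPU id before dropping the DSQ lock, a racing dequeue clears the stamp back to -1, and the puller only completes the move if the stamp is still its own after reacquiring the locks. A single-threaded sketch of just that claim/lose logic (all names invented, no locking modeled):

#include <stdio.h>

struct fake_task { int holding_cpu; };		/* -1 means unclaimed */

static int try_pull(struct fake_task *p, int my_cpu, int dequeue_intervenes)
{
	p->holding_cpu = my_cpu;	/* claim while the DSQ lock is held */

	if (dequeue_intervenes)		/* dequeue ran while locks were juggled */
		p->holding_cpu = -1;	/* ...and cleared the claim */

	if (p->holding_cpu != my_cpu)	/* re-check after relocking */
		return 0;		/* lost the race, give up */

	p->holding_cpu = -1;		/* move succeeded, drop the claim */
	return 1;
}

int main(void)
{
	struct fake_task t = { .holding_cpu = -1 };

	printf("no race:   moved=%d\n", try_pull(&t, 3, 0));
	printf("with race: moved=%d\n", try_pull(&t, 3, 1));
	return 0;
}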
-+ */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ dsq->nr--; -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ raw_spin_unlock(&dsq->lock); -+ -+ rq_unpin_lock(rq, rf); -+ double_lock_balance(rq, task_rq); -+ rq_repin_lock(rq, rf); -+ -+ moved = move_task_to_local_dsq(rq, p, 0); -+ -+ double_unlock_balance(rq, task_rq); -+#endif /* CONFIG_SMP */ -+ if (likely(moved)) -+ return true; -+ goto retry; -+} -+ -+enum dispatch_to_local_dsq_ret { -+ DTL_DISPATCHED, /* successfully dispatched */ -+ DTL_LOST, /* lost race to dequeue */ -+ DTL_NOT_LOCAL, /* destination is not a local DSQ */ -+ DTL_INVALID, /* invalid local dsq_id */ -+}; -+ -+/** -+ * dispatch_to_local_dsq - Dispatch a task to a local dsq -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @dsq_id: destination dsq ID -+ * @p: task to dispatch -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by -+ * @dsq_id. This function performs all the synchronization dancing needed -+ * because local DSQs are protected with rq locks. -+ * -+ * The caller must have exclusive ownership of @p (e.g. through -+ * %SCX_OPSS_DISPATCHING). -+ */ -+static enum dispatch_to_local_dsq_ret -+dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, -+ struct task_struct *p, u64 enq_flags) -+{ -+ struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq; -+ -+ /* -+ * We're synchronized against dequeue through DISPATCHING. As @p can't -+ * be dequeued, its task_rq and cpus_allowed are stable too. -+ */ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ dst_rq = rq; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu)) { -+ scx_ops_error("invalid cpu %d in SCX_DSQ_LOCAL_ON verdict for %s[%d]", -+ cpu, p->comm, p->pid); -+ return DTL_INVALID; -+ } -+ dst_rq = cpu_rq(cpu); -+ } else { -+ return DTL_NOT_LOCAL; -+ } -+ -+ /* if dispatching to @rq that @p is already on, no lock dancing needed */ -+ if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return DTL_DISPATCHED; -+ } -+ -+#ifdef CONFIG_SMP -+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { -+ struct rq *locked_dst_rq = dst_rq; -+ bool dsp; -+ -+ /* -+ * @p is on a possibly remote @src_rq which we need to lock to -+ * move the task. If dequeue is in progress, it'd be locking -+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq -+ * lock while holding DISPATCHING. -+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can -+ * pretend that we're moving from a DSQ and use the same -+ * mechanism - mark the task under transfer with holding_cpu, -+ * release DISPATCHING and then follow the same protocol. -+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); -+ -+ /* -+ * We don't require the BPF scheduler to avoid dispatching to -+ * offline CPUs mostly for convenience but also because CPUs can -+ * go offline between scx_bpf_dispatch() calls and here. If @p -+ * is destined to an offline CPU, queue it on its current CPU -+ * instead, which should always be safe. As this is an allowed -+ * behavior, don't trigger an ops error. 
-+ */ -+ if (unlikely(!test_rq_online(dst_rq))) -+ dst_rq = src_rq; -+ -+ if (src_rq == dst_rq) { -+ /* -+ * As @p is staying on the same rq, there's no need to -+ * go through the full deactivate/activate cycle. -+ * Optimize by abbreviating the operations in -+ * move_task_to_local_dsq(). -+ */ -+ dsp = p->scx.holding_cpu == raw_smp_processor_id(); -+ if (likely(dsp)) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags); -+ } -+ } else { -+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); -+ } -+ -+ /* if the destination CPU is idle, wake it up */ -+ if (dsp && p->sched_class > dst_rq->curr->sched_class) -+ resched_curr(dst_rq); -+ -+ dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); -+ -+ return dsp ? DTL_DISPATCHED : DTL_LOST; -+ } -+#endif /* CONFIG_SMP */ -+ -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(dst_rq), p->comm, p->pid); -+ return DTL_INVALID; -+} -+ -+/** -+ * finish_dispatch - Asynchronously finish dispatching a task -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @p: task to finish dispatching -+ * @qseq_at_dispatch: qseq when @p started getting dispatched -+ * @dsq_id: destination DSQ ID -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Dispatching to local DSQs may need to wait for queueing to complete or -+ * require rq lock dancing. As we don't wanna do either while inside -+ * ops.dispatch() to avoid locking order inversion, we split dispatching into -+ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the -+ * task and its qseq. Once ops.dispatch() returns, this function is called to -+ * finish up. -+ * -+ * There is no guarantee that @p is still valid for dispatching or even that it -+ * was valid in the first place. Make sure that the task is still owned by the -+ * BPF scheduler and claim the ownership before dispatching. -+ */ -+static void finish_dispatch(struct rq *rq, struct rq_flags *rf, -+ struct task_struct *p, -+ unsigned long qseq_at_dispatch, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long opss; -+ -+ touch_core_sched_dispatch(rq, p); -+retry: -+ /* -+ * No need for _acquire here. @p is accessed only after a successful -+ * try_cmpxchg to DISPATCHING. -+ */ -+ opss = atomic_long_read(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_DISPATCHING: -+ case SCX_OPSS_NONE: -+ /* someone else already got to it */ -+ return; -+ case SCX_OPSS_QUEUED: -+ /* -+ * If qseq doesn't match, @p has gone through at least one -+ * dispatch/dequeue and re-enqueue cycle between -+ * scx_bpf_dispatch() and here and we have no claim on it. -+ */ -+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) -+ return; -+ -+ /* -+ * While we know @p is accessible, we don't yet have a claim on -+ * it - the BPF scheduler is allowed to dispatch tasks -+ * spuriously and there can be a racing dequeue attempt. Let's -+ * claim @p by atomically transitioning it from QUEUED to -+ * DISPATCHING. -+ */ -+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_DISPATCHING))) -+ break; -+ goto retry; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * do_enqueue_task() is in the process of transferring the task -+ * to the BPF scheduler while holding @p's rq lock. As we aren't -+ * holding any kernel or BPF resource that the enqueue path may -+ * depend upon, it's safe to wait. 
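finish_dispatch() only trusts a task it can move from QUEUED to DISPATCHING while the qseq recorded at scx_bpf_dispatch() time still matches; any other observation means the task was requeued or claimed elsewhere. A standalone sketch of that packed state word and the claim attempt, using C11 atomics and invented constants in place of the SCX_OPSS_* definitions (the low-bits-state / high-bits-qseq split is only assumed here):

#include <stdatomic.h>
#include <stdio.h>

/* stand-ins: low two bits hold the state, the remaining bits hold qseq */
enum { OPSS_NONE, OPSS_QUEUEING, OPSS_QUEUED, OPSS_DISPATCHING };
#define OPSS_STATE_MASK	0x3UL
#define OPSS_QSEQ_SHIFT	2

static _Atomic unsigned long ops_state;

/* returns 1 if the caller now owns the task for dispatching */
static int claim_for_dispatch(unsigned long qseq_at_dispatch)
{
	unsigned long old = atomic_load(&ops_state);

	if ((old & OPSS_STATE_MASK) != OPSS_QUEUED ||
	    (old & ~OPSS_STATE_MASK) != qseq_at_dispatch)
		return 0;	/* requeued or already claimed */

	/* QUEUED -> DISPATCHING; fails if anything changed in between */
	return atomic_compare_exchange_strong(&ops_state, &old,
					      (unsigned long)OPSS_DISPATCHING);
}

int main(void)
{
	unsigned long qseq = 7UL << OPSS_QSEQ_SHIFT;

	atomic_store(&ops_state, OPSS_QUEUED | qseq);
	printf("matching qseq: claimed=%d\n", claim_for_dispatch(qseq));

	atomic_store(&ops_state, OPSS_QUEUED | (8UL << OPSS_QSEQ_SHIFT));
	printf("stale qseq:    claimed=%d\n", claim_for_dispatch(qseq));
	return 0;
}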
-+ */ -+ wait_ops_state(p, opss); -+ goto retry; -+ } -+ -+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { -+ case DTL_DISPATCHED: -+ break; -+ case DTL_LOST: -+ break; -+ case DTL_INVALID: -+ dsq_id = SCX_DSQ_GLOBAL; -+ fallthrough; -+ case DTL_NOT_LOCAL: -+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), -+ dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ break; -+ } -+} -+ -+static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); -+ u32 u; -+ -+ for (u = 0; u < dspc->buf_cursor; u++) { -+ struct scx_dsp_buf_ent *ent = &this_cpu_ptr(scx_dsp_buf)[u]; -+ -+ finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, -+ ent->enq_flags); -+ } -+ -+ dspc->nr_tasks += dspc->buf_cursor; -+ dspc->buf_cursor = 0; -+} -+ -+static int balance_one(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf, bool local) -+{ -+ struct scx_rq *scx_rq = &rq->scx; -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); -+ bool prev_on_scx = prev->sched_class == &ext_sched_class; -+ int nr_loops = SCX_DSP_MAX_LOOPS; -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (static_branch_unlikely(&scx_ops_cpu_preempt) && -+ unlikely(rq->scx.cpu_released)) { -+ /* -+ * If the previous sched_class for the current CPU was not SCX, -+ * notify the BPF scheduler that it again has control of the -+ * core. This callback complements ->cpu_release(), which is -+ * emitted in scx_notify_pick_next_task(). -+ */ -+ if (SCX_HAS_OP(cpu_acquire)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), -+ NULL); -+ rq->scx.cpu_released = false; -+ } -+ -+ if (prev_on_scx) { -+ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); -+ update_curr_scx(rq); -+ -+ /* -+ * If @prev is runnable & has slice left, it has priority and -+ * fetching more just increases latency for the fetched tasks. -+ * Tell put_prev_task_scx() to put @prev on local_dsq. If the -+ * BPF scheduler wants to handle this explicitly, it should -+ * implement ->cpu_released(). -+ * -+ * See scx_ops_disable_workfn() for the explanation on the -+ * disabling() test. -+ * -+ * When balancing a remote CPU for core-sched, there won't be a -+ * following put_prev_task_scx() call and we don't own -+ * %SCX_TASK_BAL_KEEP. Instead, pick_task_scx() will test the -+ * same conditions later and pick @rq->curr accordingly. -+ */ -+ if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_ops_disabling()) { -+ if (local) -+ prev->scx.flags |= SCX_TASK_BAL_KEEP; -+ return 1; -+ } -+ } -+ -+ /* if there already are tasks to run, nothing to do */ -+ if (scx_rq->local_dsq.nr) -+ return 1; -+ -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ return 1; -+ -+ if (!SCX_HAS_OP(dispatch)) -+ return 0; -+ -+ dspc->rq = rq; -+ dspc->rf = rf; -+ -+ /* -+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, -+ * the local DSQ might still end up empty after a successful -+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() -+ * produced some tasks, retry. The BPF scheduler may depend on this -+ * looping behavior to simplify its implementation. -+ */ -+ do { -+ dspc->nr_tasks = 0; -+ -+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), -+ prev_on_scx ? 
prev : NULL); -+ -+ flush_dispatch_buf(rq, rf); -+ -+ if (scx_rq->local_dsq.nr) -+ return 1; -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ return 1; -+ -+ /* -+ * ops.dispatch() can trap us in this loop by repeatedly -+ * dispatching ineligible tasks. Break out once in a while to -+ * allow the watchdog to run. As IRQ can't be enabled in -+ * balance(), we want to complete this scheduling cycle and then -+ * start a new one. IOW, we want to call resched_curr() on the -+ * next, most likely idle, task, not the current one. Use -+ * scx_bpf_kick_cpu() for deferred kicking. -+ */ -+ if (unlikely(!--nr_loops)) { -+ scx_bpf_kick_cpu(cpu_of(rq), 0); -+ break; -+ } -+ } while (dspc->nr_tasks); -+ -+ return 0; -+} -+ -+static int balance_scx(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf) -+{ -+ int ret; -+ -+ ret = balance_one(rq, prev, rf, true); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When core-sched is enabled, this ops.balance() call will be followed -+ * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() -+ * on the SMT siblings. Balance the siblings too. -+ */ -+ if (sched_core_enabled(rq)) { -+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); -+ int scpu; -+ -+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { -+ struct rq *srq = cpu_rq(scpu); -+ struct rq_flags srf; -+ struct task_struct *sprev = srq->curr; -+ -+ /* -+ * While core-scheduling, rq lock is shared among -+ * siblings but the debug annotations and rq clock -+ * aren't. Do pinning dance to transfer the ownership. -+ */ -+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); -+ rq_unpin_lock(rq, rf); -+ rq_pin_lock(srq, &srf); -+ -+ update_rq_clock(srq); -+ balance_one(srq, sprev, &srf, false); -+ -+ rq_unpin_lock(srq, &srf); -+ rq_repin_lock(rq, rf); -+ } -+ } -+#endif -+ return ret; -+} -+ -+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) -+{ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ /* -+ * Core-sched might decide to execute @p before it is -+ * dispatched. Call ops_dequeue() to notify the BPF scheduler. -+ */ -+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); -+ dispatch_dequeue(&rq->scx, p); -+ } -+ -+ p->se.exec_start = rq_clock_task(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p); -+ -+ watchdog_unwatch_task(p, true); -+ -+ /* -+ * @p is getting newly scheduled or got kicked after someone updated its -+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). -+ */ -+ if ((p->scx.slice == SCX_SLICE_INF) != -+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { -+ if (p->scx.slice == SCX_SLICE_INF) -+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; -+ else -+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; -+ -+ sched_update_tick_dependency(rq); -+ } -+} -+ -+static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+{ -+#ifndef CONFIG_SMP -+ /* -+ * UP workaround. -+ * -+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch -+ * is performed from its balance operation which isn't called in UP. -+ * Let's work around by calling it from the operations which come right -+ * after. -+ * -+ * 1. If the prev task is on SCX, pick_next_task() calls -+ * .put_prev_task() right after. 
As .put_prev_task() is also called -+ * from other places, we need to distinguish the calls which can be -+ * done by looking at the previous task's state - if still queued or -+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). -+ * This case is handled here. -+ * -+ * 2. If the prev task is not on SCX, the first following call into SCX -+ * will be .pick_next_task(), which is covered by calling -+ * balance_scx() from pick_next_task_scx(). -+ * -+ * Note that we can't merge the first case into the second as -+ * balance_scx() must be called before the previous SCX task goes -+ * through put_prev_task_scx(). -+ * -+ * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. -+ * Pass in %NULL. -+ */ -+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) -+ balance_scx(rq, p, NULL); -+#endif -+ -+ update_curr_scx(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); -+ -+ /* -+ * If we're being called from put_prev_task_balance(), balance_scx() may -+ * have decided that @p should keep running. -+ */ -+ if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ p->scx.flags &= ~SCX_TASK_BAL_KEEP; -+ watchdog_watch_task(rq, p); -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ watchdog_watch_task(rq, p); -+ -+ /* -+ * If @p has slice left and balance_scx() didn't tag it for -+ * keeping, @p is getting preempted by a higher priority -+ * scheduler class or core-sched forcing a different task. Leave -+ * it at the head of the local DSQ. -+ */ -+ if (p->scx.slice && !scx_ops_disabling()) { -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ /* -+ * If we're in the pick_next_task path, balance_scx() should -+ * have already populated the local DSQ if there are any other -+ * available tasks. If empty, tell ops.enqueue() that @p is the -+ * only one available for this cpu. ops.enqueue() should put it -+ * on the local DSQ so that the subsequent pick_next_task_scx() -+ * can find the task unless it wants to trigger a separate -+ * follow-up scheduling event. 
-+		 */
-+		if (list_empty(&rq->scx.local_dsq.fifo))
-+			do_enqueue_task(rq, p, SCX_ENQ_LAST | SCX_ENQ_LOCAL, -1);
-+		else
-+			do_enqueue_task(rq, p, 0, -1);
-+	}
-+}
-+
-+static struct task_struct *first_local_task(struct rq *rq)
-+{
-+	struct rb_node *rb_node;
-+
-+	if (!list_empty(&rq->scx.local_dsq.fifo))
-+		return list_first_entry(&rq->scx.local_dsq.fifo,
-+					struct task_struct, scx.dsq_node.fifo);
-+
-+	rb_node = rb_first_cached(&rq->scx.local_dsq.priq);
-+	if (rb_node)
-+		return container_of(rb_node,
-+				    struct task_struct, scx.dsq_node.priq);
-+
-+	return NULL;
-+}
-+
-+static struct task_struct *pick_next_task_scx(struct rq *rq)
-+{
-+	struct task_struct *p;
-+
-+#ifndef CONFIG_SMP
-+	/* UP workaround - see the comment at the head of put_prev_task_scx() */
-+	if (unlikely(rq->curr->sched_class != &ext_sched_class))
-+		balance_scx(rq, rq->curr, NULL);
-+#endif
-+
-+	p = first_local_task(rq);
-+	if (!p)
-+		return NULL;
-+
-+	if (unlikely(!p->scx.slice)) {
-+		if (!scx_ops_disabling() && !scx_warned_zero_slice) {
-+			printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n",
-+					p->comm, p->pid);
-+			scx_warned_zero_slice = true;
-+		}
-+		p->scx.slice = SCX_SLICE_DFL;
-+	}
-+
-+	set_next_task_scx(rq, p, true);
-+
-+	return p;
-+}
-+
-+#ifdef CONFIG_SCHED_CORE
-+/**
-+ * scx_prio_less - Task ordering for core-sched
-+ * @a: task A
-+ * @b: task B
-+ *
-+ * Core-sched is implemented as an additional scheduling layer on top of the
-+ * usual sched_class'es and needs to find out the expected task ordering. For
-+ * SCX, core-sched calls this function to interrogate the task ordering.
-+ *
-+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used
-+ * to implement the default task ordering. The older the timestamp, the higher
-+ * priority the task - the global FIFO ordering matching the default scheduling
-+ * behavior.
-+ *
-+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to
-+ * implement FIFO ordering within each local DSQ. See pick_task_scx().
-+ */
-+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b,
-+		   bool in_fi)
-+{
-+	/*
-+	 * The const qualifiers are dropped from task_struct pointers when
-+	 * calling ops.core_sched_before(). Accesses are controlled by the
-+	 * verifier.
-+	 */
-+	if (SCX_HAS_OP(core_sched_before) && !scx_ops_disabling())
-+		return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before,
-+					      (struct task_struct *)a,
-+					      (struct task_struct *)b);
-+	else
-+		return time_after64(a->scx.core_sched_at, b->scx.core_sched_at);
-+}
-+
-+/**
-+ * pick_task_scx - Pick a candidate task for core-sched
-+ * @rq: rq to pick the candidate task from
-+ *
-+ * Core-sched calls this function on each SMT sibling to determine the next
-+ * tasks to run on the SMT siblings. balance_one() has been called on all
-+ * siblings and put_prev_task_scx() has been called only for the current CPU.
-+ *
-+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look
-+ * at the first task in the local dsq. @rq->curr has to be considered explicitly
-+ * to mimic %SCX_TASK_BAL_KEEP.
-+ */
-+static struct task_struct *pick_task_scx(struct rq *rq)
-+{
-+	struct task_struct *curr = rq->curr;
-+	struct task_struct *first = first_local_task(rq);
-+
-+	if (curr->scx.flags & SCX_TASK_QUEUED) {
-+		/* is curr the only runnable task? */
-+		if (!first)
-+			return curr;
-+
-+		/*
-+		 * Does curr trump first? We can always go by core_sched_at for
-+		 * this comparison as it represents global FIFO ordering when
-+		 * the default core-sched ordering is used and local-DSQ FIFO
-+		 * ordering otherwise.
-+		 *
-+		 * We can have a task with an earlier timestamp on the DSQ. For
-+		 * example, when a current task is preempted by a sibling
-+		 * picking a different cookie, the task would be requeued at the
-+		 * head of the local DSQ with an earlier timestamp than the
-+		 * core-sched picked next task. Besides, the BPF scheduler may
-+		 * dispatch any tasks to the local DSQ anytime.
-+		 */
-+		if (curr->scx.slice && time_before64(curr->scx.core_sched_at,
-+						     first->scx.core_sched_at))
-+			return curr;
-+	}
-+
-+	return first;	/* this may be %NULL */
-+}
-+#endif /* CONFIG_SCHED_CORE */
-+
-+static enum scx_cpu_preempt_reason
-+preempt_reason_from_class(const struct sched_class *class)
-+{
-+#ifdef CONFIG_SMP
-+	if (class == &stop_sched_class)
-+		return SCX_CPU_PREEMPT_STOP;
-+#endif
-+	if (class == &dl_sched_class)
-+		return SCX_CPU_PREEMPT_DL;
-+	if (class == &rt_sched_class)
-+		return SCX_CPU_PREEMPT_RT;
-+	return SCX_CPU_PREEMPT_UNKNOWN;
-+}
-+
-+void __scx_notify_pick_next_task(struct rq *rq, struct task_struct *task,
-+				 const struct sched_class *active)
-+{
-+	lockdep_assert_rq_held(rq);
-+
-+	/*
-+	 * The callback is conceptually meant to convey that the CPU is no
-+	 * longer under the control of SCX. Therefore, don't invoke the
-+	 * callback if the CPU is staying on SCX, or going idle (in which
-+	 * case the SCX scheduler has actively decided not to schedule any
-+	 * tasks on the CPU).
-+	 */
-+	if (likely(active >= &ext_sched_class))
-+		return;
-+
-+	/*
-+	 * At this point we know that SCX was preempted by a higher priority
-+	 * sched_class, so invoke the ->cpu_release() callback if we have not
-+	 * done so already. We only send the callback once between SCX being
-+	 * preempted, and it regaining control of the CPU.
-+	 *
-+	 * ->cpu_release() complements ->cpu_acquire(), which is emitted the
-+	 * next time that balance_scx() is invoked.
-+	 */
-+	if (!rq->scx.cpu_released) {
-+		if (SCX_HAS_OP(cpu_release)) {
-+			struct scx_cpu_release_args args = {
-+				.reason = preempt_reason_from_class(active),
-+				.task = task,
-+			};
-+
-+			SCX_CALL_OP(SCX_KF_CPU_RELEASE,
-+				    cpu_release, cpu_of(rq), &args);
-+		}
-+		rq->scx.cpu_released = true;
-+	}
-+}
-+
-+#ifdef CONFIG_SMP
-+
-+static bool test_and_clear_cpu_idle(int cpu)
-+{
-+#ifdef CONFIG_SCHED_SMT
-+	/*
-+	 * SMT mask should be cleared whether we can claim @cpu or not. The SMT
-+	 * cluster is not wholly idle either way. This also prevents
-+	 * scx_pick_idle_cpu() from getting caught in an infinite loop.
-+	 */
-+	if (sched_smt_active()) {
-+		const struct cpumask *smt = cpu_smt_mask(cpu);
-+
-+		/*
-+		 * If offline, @cpu is not its own sibling and
-+		 * scx_pick_idle_cpu() can get caught in an infinite loop as
-+		 * @cpu is never cleared from idle_masks.smt. Ensure that @cpu
-+		 * is eventually cleared.
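Both scx_prio_less() and pick_task_scx() above order tasks by core_sched_at with the kernel's wrap-safe 64-bit comparisons, which boil down to a signed comparison of the difference so they keep working across counter wraparound. A small standalone illustration (before64() is a local re-implementation in the spirit of time_before64(), not the kernel macro):

#include <stdio.h>
#include <stdint.h>

/* wrap-safe "a is earlier than b" */
static int before64(uint64_t a, uint64_t b)
{
	return (int64_t)(a - b) < 0;
}

int main(void)
{
	uint64_t near_wrap = UINT64_MAX - 5;
	uint64_t wrapped = near_wrap + 10;	/* wraps around to 4 */

	printf("100 before 200:            %d\n", before64(100, 200));
	printf("pre-wrap before post-wrap: %d\n", before64(near_wrap, wrapped));
	printf("post-wrap before pre-wrap: %d\n", before64(wrapped, near_wrap));
	return 0;
}

A plain unsigned comparison would get the wrapped case backwards, which is why the timestamps can serve as a FIFO key indefinitely.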
-+ */ -+ if (cpumask_intersects(smt, idle_masks.smt)) -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ else if (cpumask_test_cpu(cpu, idle_masks.smt)) -+ __cpumask_clear_cpu(cpu, idle_masks.smt); -+ } -+#endif -+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -+} -+ -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ int cpu; -+ -+retry: -+ if (sched_smt_active()) { -+ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ goto found; -+ -+ if (flags & SCX_PICK_IDLE_CORE) -+ return -EBUSY; -+ } -+ -+ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); -+ if (cpu >= nr_cpu_ids) -+ return -EBUSY; -+ -+found: -+ if (test_and_clear_cpu_idle(cpu)) -+ return cpu; -+ else -+ goto retry; -+} -+ -+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu; -+ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return prev_cpu; -+ } -+ -+ /* -+ * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the -+ * local DSQ of the waker. -+ */ -+ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && -+ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING)) { -+ cpu = smp_processor_id(); -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return cpu; -+ } -+ } -+ -+ if (p->nr_cpus_allowed == 1) -+ return prev_cpu; -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (sched_smt_active()) { -+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && -+ test_and_clear_cpu_idle(prev_cpu)) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return prev_cpu; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return cpu; -+ } -+ } -+ -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return prev_cpu; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) { -+ p->scx.flags |= SCX_TASK_ENQ_LOCAL; -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) -+{ -+ if (SCX_HAS_OP(select_cpu)) { -+ s32 cpu; -+ -+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_REST, select_cpu, p, prev_cpu, -+ wake_flags); -+ if (ops_cpu_valid(cpu)) { -+ return cpu; -+ } else { -+ scx_ops_error("select_cpu returned invalid cpu %d", cpu); -+ return prev_cpu; -+ } -+ } else { -+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags); -+ } -+} -+ -+static void set_cpus_allowed_scx(struct task_struct *p, -+ struct affinity_context *ac) -+{ -+ set_cpus_allowed_common(p, ac); -+ -+ /* -+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily -+ * differ from the configured one in @p->cpus_mask. Always tell the bpf -+ * scheduler the effective one. -+ * -+ * Fine-grained memory write control is enforced by BPF making the const -+ * designation pointless. Cast it away when calling the operation. 
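scx_select_cpu_dfl() above encodes a fixed preference order: the waker's CPU on sync wakeups, a fully idle SMT core (checking prev_cpu first), then any idle CPU (again preferring prev_cpu), and finally prev_cpu as the fallback. A condensed sketch of that ordering for up to 64 CPUs, using plain bitmasks instead of cpumasks and ignoring the per-cpu bookkeeping and locking (all names invented):

#include <stdio.h>
#include <stdint.h>

static int pick_any(uint64_t mask)
{
	return mask ? __builtin_ctzll(mask) : -1;	/* lowest set bit, -1 if empty */
}

static int select_cpu_sketch(int prev_cpu, uint64_t allowed,
			     uint64_t idle_cpus, uint64_t idle_cores)
{
	uint64_t prev = 1ULL << prev_cpu;
	int cpu;

	if (idle_cores & allowed & prev)	/* prev_cpu's whole core is idle */
		return prev_cpu;
	cpu = pick_any(idle_cores & allowed);	/* any fully idle core */
	if (cpu >= 0)
		return cpu;
	if (idle_cpus & allowed & prev)		/* prev_cpu itself is idle */
		return prev_cpu;
	cpu = pick_any(idle_cpus & allowed);	/* any idle CPU */
	if (cpu >= 0)
		return cpu;
	return prev_cpu;			/* nothing idle, stay put */
}

int main(void)
{
	/* CPUs 0-3 allowed; CPUs 2 and 3 idle, only CPU 3's core fully idle */
	printf("picked CPU %d\n", select_cpu_sketch(0, 0xf, 0xc, 0x8));
	return 0;
}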
-+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void reset_idle_masks(void) -+{ -+ /* consider all cpus idle, should converge to the actual state quickly */ -+ cpumask_setall(idle_masks.cpu); -+ cpumask_setall(idle_masks.smt); -+} -+ -+void __scx_update_idle(struct rq *rq, bool idle) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (SCX_HAS_OP(update_idle)) { -+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); -+ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) -+ return; -+ } -+ -+ if (idle) -+ cpumask_set_cpu(cpu, idle_masks.cpu); -+ else -+ cpumask_clear_cpu(cpu, idle_masks.cpu); -+ -+#ifdef CONFIG_SCHED_SMT -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ if (idle) { -+ /* -+ * idle_masks.smt handling is racy but that's fine as -+ * it's only for optimization and self-correcting. -+ */ -+ for_each_cpu(cpu, smt) { -+ if (!cpumask_test_cpu(cpu, idle_masks.cpu)) -+ return; -+ } -+ cpumask_or(idle_masks.smt, idle_masks.smt, smt); -+ } else { -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ } -+ } -+#endif -+} -+ -+static void rq_online_scx(struct rq *rq, enum rq_onoff_reason reason) -+{ -+ if (SCX_HAS_OP(cpu_online) && reason == RQ_ONOFF_HOTPLUG) -+ SCX_CALL_OP(SCX_KF_REST, cpu_online, cpu_of(rq)); -+} -+ -+static void rq_offline_scx(struct rq *rq, enum rq_onoff_reason reason) -+{ -+ if (SCX_HAS_OP(cpu_offline) && reason == RQ_ONOFF_HOTPLUG) -+ SCX_CALL_OP(SCX_KF_REST, cpu_offline, cpu_of(rq)); -+} -+ -+#else /* !CONFIG_SMP */ -+ -+static bool test_and_clear_cpu_idle(int cpu) { return false; } -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -+static void reset_idle_masks(void) {} -+ -+#endif /* CONFIG_SMP */ -+ -+static bool check_rq_for_timeouts(struct rq *rq) -+{ -+ struct task_struct *p; -+ struct rq_flags rf; -+ bool timed_out = false; -+ -+ rq_lock_irqsave(rq, &rf); -+ list_for_each_entry(p, &rq->scx.watchdog_list, scx.watchdog_node) { -+ unsigned long last_runnable = p->scx.runnable_at; -+ -+ if (unlikely(time_after(jiffies, -+ last_runnable + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "%s[%d] failed to run for %u.%03us", -+ p->comm, p->pid, -+ dur_ms / 1000, dur_ms % 1000); -+ timed_out = true; -+ break; -+ } -+ } -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return timed_out; -+} -+ -+static void scx_watchdog_workfn(struct work_struct *work) -+{ -+ int cpu; -+ -+ scx_watchdog_timestamp = jiffies; -+ -+ for_each_online_cpu(cpu) { -+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) -+ break; -+ -+ cond_resched(); -+ } -+ queue_delayed_work(system_unbound_wq, to_delayed_work(work), -+ scx_watchdog_timeout / 2); -+} -+ -+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) -+{ -+ update_curr_scx(rq); -+ -+ /* -+ * While disabling, always resched and refresh core-sched timestamp as -+ * we can't trust the slice management or ops.core_sched_before(). -+ */ -+ if (scx_ops_disabling()) { -+ curr->scx.slice = 0; -+ touch_core_sched(rq, curr); -+ } -+ -+ if (!curr->scx.slice) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static struct cgroup *tg_cgrp(struct task_group *tg) -+{ -+ /* -+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, -+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the -+ * root cgroup. 
-+ */ -+ if (tg && tg->css.cgroup) -+ return tg->css.cgroup; -+ else -+ return &cgrp_dfl_root.cgrp; -+} -+ -+#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) .cgroup = tg_cgrp(tg), -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+#define SCX_ENABLE_ARGS_INIT_CGROUP(tg) -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+static int scx_ops_prepare_task(struct task_struct *p, struct task_group *tg) -+{ -+ int ret; -+ -+ WARN_ON_ONCE(p->scx.flags & SCX_TASK_OPS_PREPPED); -+ -+ p->scx.disallow = false; -+ -+ if (SCX_HAS_OP(prep_enable)) { -+ struct scx_enable_args args = { -+ SCX_ENABLE_ARGS_INIT_CGROUP(tg) -+ }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, prep_enable, p, &args); -+ if (unlikely(ret)) { -+ ret = ops_sanitize_err("prep_enable", ret); -+ return ret; -+ } -+ } -+ -+ if (p->scx.disallow) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ -+ /* -+ * We're either in fork or load path and @p->policy will be -+ * applied right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.prep_enable() sets @p->disallow, @p -+ * can never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); -+ } -+ -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ p->scx.flags |= (SCX_TASK_OPS_PREPPED | SCX_TASK_WATCHDOG_RESET); -+ return 0; -+} -+ -+static void scx_ops_enable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_OPS_PREPPED)); -+ -+ if (SCX_HAS_OP(enable)) { -+ struct scx_enable_args args = { -+ SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) -+ }; -+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p, &args); -+ } -+ p->scx.flags &= ~SCX_TASK_OPS_PREPPED; -+ p->scx.flags |= SCX_TASK_OPS_ENABLED; -+} -+ -+static void scx_ops_disable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ if (p->scx.flags & SCX_TASK_OPS_PREPPED) { -+ if (SCX_HAS_OP(cancel_enable)) { -+ struct scx_enable_args args = { -+ SCX_ENABLE_ARGS_INIT_CGROUP(task_group(p)) -+ }; -+ SCX_CALL_OP(SCX_KF_REST, cancel_enable, p, &args); -+ } -+ p->scx.flags &= ~SCX_TASK_OPS_PREPPED; -+ } else if (p->scx.flags & SCX_TASK_OPS_ENABLED) { -+ if (SCX_HAS_OP(disable)) -+ SCX_CALL_OP(SCX_KF_REST, disable, p); -+ p->scx.flags &= ~SCX_TASK_OPS_ENABLED; -+ } -+} -+ -+static void set_task_scx_weight(struct task_struct *p) -+{ -+ u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; -+ -+ p->scx.weight = sched_weight_to_cgroup(weight); -+} -+ -+/** -+ * refresh_scx_weight - Refresh a task's ext weight -+ * @p: task to refresh ext weight for -+ * -+ * @p->scx.weight carries the task's static priority in cgroup weight scale to -+ * enable easy access from the BPF scheduler. To keep it synchronized with the -+ * current task priority, this function should be called when a new task is -+ * created, priority is changed for a task on sched_ext, and a task is switched -+ * to sched_ext from other classes. -+ */ -+static void refresh_scx_weight(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ set_task_scx_weight(p); -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+void scx_pre_fork(struct task_struct *p) -+{ -+ /* -+ * BPF scheduler enable/disable paths want to be able to iterate and -+ * update all tasks which can become complex when racing forks. As -+ * enable/disable are very cold paths, let's use a percpu_rwsem to -+ * exclude forks. 
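set_task_scx_weight() above maps the task's static priority through the CFS weight table and then rescales it so that BPF schedulers always see weights on the cgroup scale, where nice 0 corresponds to the default cgroup weight of 100. A back-of-the-envelope sketch under the assumption that the rescaling is a simple rounded proportion of weight 1024 to 100; the exact kernel helper may round or clamp differently:

#include <stdio.h>

#define CGROUP_WEIGHT_DFL	100
#define NICE_0_WEIGHT		1024

/* assumed proportional mapping: the nice-0 weight maps to the cgroup default */
static unsigned long weight_to_cgroup(unsigned long weight)
{
	return (weight * CGROUP_WEIGHT_DFL + NICE_0_WEIGHT / 2) / NICE_0_WEIGHT;
}

int main(void)
{
	/* a few entries from the CFS nice->weight table */
	printf("nice -20 (weight 88761) -> %lu\n", weight_to_cgroup(88761));
	printf("nice   0 (weight  1024) -> %lu\n", weight_to_cgroup(1024));
	printf("nice  19 (weight    15) -> %lu\n", weight_to_cgroup(15));
	return 0;
}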
-+ */ -+ percpu_down_read(&scx_fork_rwsem); -+} -+ -+int scx_fork(struct task_struct *p) -+{ -+ percpu_rwsem_assert_held(&scx_fork_rwsem); -+ -+ if (scx_enabled()) -+ return scx_ops_prepare_task(p, task_group(p)); -+ else -+ return 0; -+} -+ -+void scx_post_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ /* -+ * Set the weight manually before calling ops.enable() so that -+ * the scheduler doesn't see a stale value if they inspect the -+ * task struct. We'll invoke ops.set_weight() afterwards, as it -+ * would be odd to receive a callback on the task before we -+ * tell the scheduler that it's been fully enabled. -+ */ -+ set_task_scx_weight(p); -+ scx_ops_enable_task(p); -+ refresh_scx_weight(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ spin_lock_irq(&scx_tasks_lock); -+ list_add_tail(&p->scx.tasks_node, &scx_tasks); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void scx_cancel_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) -+ scx_ops_disable_task(p); -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void sched_ext_free(struct task_struct *p) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&scx_tasks_lock, flags); -+ list_del_init(&p->scx.tasks_node); -+ spin_unlock_irqrestore(&scx_tasks_lock, flags); -+ -+ /* -+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s PREPPED -> -+ * ENABLED transitions can't race us. Disable ops for @p. -+ */ -+ if (p->scx.flags & (SCX_TASK_OPS_PREPPED | SCX_TASK_OPS_ENABLED)) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_disable_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+} -+ -+static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) -+{ -+ refresh_scx_weight(p); -+} -+ -+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) -+{ -+} -+ -+static void switching_to_scx(struct rq *rq, struct task_struct *p) -+{ -+ refresh_scx_weight(p); -+ -+ /* -+ * set_cpus_allowed_scx() is not called while @p is associated with a -+ * different scheduler class. Keep the BPF scheduler up-to-date. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void check_preempt_curr_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} -+static void switched_to_scx(struct rq *rq, struct task_struct *p) {} -+ -+int scx_check_setscheduler(struct task_struct *p, int policy) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* if disallow, reject transitioning into SCX */ -+ if (scx_enabled() && READ_ONCE(p->scx.disallow) && -+ p->policy != policy && policy == SCHED_EXT) -+ return -EACCES; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+bool scx_can_stop_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (scx_ops_disabling()) -+ return false; -+ -+ if (p->sched_class != &ext_sched_class) -+ return true; -+ -+ /* -+ * @rq can dispatch from different DSQs, so we can't tell whether it -+ * needs the tick or not by looking at nr_running. Allow stopping ticks -+ * iff the BPF scheduler indicated so. See set_next_task_scx(). 
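The fork hooks above bracket every fork with a read-side hold of scx_fork_rwsem so that the cold enable/disable paths can take the same lock for writing and walk a stable task list. Reduced to its shape, the bracket looks like the following userspace sketch (a pthread rwlock standing in for the percpu_rwsem; names are illustrative):

#include <pthread.h>

static pthread_rwlock_t fork_lock = PTHREAD_RWLOCK_INITIALIZER;

/* fork side: readers, held across the whole fork */
static void pre_fork(void)    { pthread_rwlock_rdlock(&fork_lock); }
static void post_fork(void)   { pthread_rwlock_unlock(&fork_lock); }
static void cancel_fork(void) { pthread_rwlock_unlock(&fork_lock); }

/* enable/disable side: a writer excludes every in-flight fork */
static void lock_out_forks(void) { pthread_rwlock_wrlock(&fork_lock); }
static void allow_forks(void)    { pthread_rwlock_unlock(&fork_lock); }

The percpu flavor keeps the read side nearly free, which matters because forks are hot while enable/disable is rare.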
-+ */ -+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; -+} -+#endif -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ -+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); -+ -+int scx_tg_online(struct task_group *tg) -+{ -+ int ret = 0; -+ -+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_init)) { -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ tg->css.cgroup, &args); -+ if (!ret) -+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; -+ else -+ ret = ops_sanitize_err("cgroup_init", ret); -+ } else { -+ tg->scx_flags |= SCX_TG_ONLINE; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ret; -+} -+ -+void scx_tg_offline(struct task_group *tg) -+{ -+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); -+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+int scx_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ int ret; -+ -+ /* released in scx_finish/cancel_attach() */ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (!scx_enabled()) -+ return 0; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ struct cgroup *from = tg_cgrp(task_group(p)); -+ -+ if (SCX_HAS_OP(cgroup_prep_move)) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, -+ p, from, css->cgroup); -+ if (ret) -+ goto err; -+ } -+ -+ WARN_ON_ONCE(p->scx.cgrp_moving_from); -+ p->scx.cgrp_moving_from = from; -+ } -+ -+ return 0; -+ -+err: -+ cgroup_taskset_for_each(p, css, tset) { -+ if (!p->scx.cgrp_moving_from) -+ break; -+ if (SCX_HAS_OP(cgroup_cancel_move)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ops_sanitize_err("cgroup_prep_move", ret); -+} -+ -+void scx_move_task(struct task_struct *p) -+{ -+ /* -+ * We're called from sched_move_task() which handles both cgroup and -+ * autogroup moves. Ignore the latter. 
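scx_cgroup_can_attach() above is a prepare/commit/rollback protocol: ops.cgroup_prep_move() runs for every task in the set, and a failure unwinds only the tasks that were already prepared via ops.cgroup_cancel_move(). The unwind shape, stripped of the cgroup specifics (illustrative sketch, not the kernel code):

#include <stddef.h>

struct task { int prepared; };

static int  prep_move(struct task *t)   { t->prepared = 1; return 0; /* or -errno */ }
static void cancel_move(struct task *t) { t->prepared = 0; }

static int can_attach(struct task *tasks, size_t n)
{
	size_t i;
	int ret = 0;

	for (i = 0; i < n; i++) {
		ret = prep_move(&tasks[i]);
		if (ret)
			goto unwind;
	}
	return 0;

unwind:
	/* roll back only the tasks that were actually prepared */
	while (i--)
		cancel_move(&tasks[i]);
	return ret;
}

scx_move_task() then plays the commit role by consuming cgrp_moving_from, while scx_cgroup_cancel_attach() is the same unwind run over the whole set.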
-+ */
-+ if (task_group_is_autogroup(task_group(p)))
-+ return;
-+
-+ if (!scx_enabled())
-+ return;
-+
-+ if (SCX_HAS_OP(cgroup_move)) {
-+ WARN_ON_ONCE(!p->scx.cgrp_moving_from);
-+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p,
-+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p)));
-+ }
-+ p->scx.cgrp_moving_from = NULL;
-+}
-+
-+void scx_cgroup_finish_attach(void)
-+{
-+ percpu_up_read(&scx_cgroup_rwsem);
-+}
-+
-+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset)
-+{
-+ struct cgroup_subsys_state *css;
-+ struct task_struct *p;
-+
-+ if (!scx_enabled())
-+ goto out_unlock;
-+
-+ cgroup_taskset_for_each(p, css, tset) {
-+ if (SCX_HAS_OP(cgroup_cancel_move)) {
-+ WARN_ON_ONCE(!p->scx.cgrp_moving_from);
-+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p,
-+ p->scx.cgrp_moving_from, css->cgroup);
-+ }
-+ p->scx.cgrp_moving_from = NULL;
-+ }
-+out_unlock:
-+ percpu_up_read(&scx_cgroup_rwsem);
-+}
-+
-+void scx_group_set_weight(struct task_group *tg, unsigned long weight)
-+{
-+ percpu_down_read(&scx_cgroup_rwsem);
-+
-+ if (tg->scx_weight != weight) {
-+ if (SCX_HAS_OP(cgroup_set_weight))
-+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight,
-+ tg_cgrp(tg), weight);
-+ tg->scx_weight = weight;
-+ }
-+
-+ percpu_up_read(&scx_cgroup_rwsem);
-+}
-+
-+static void scx_cgroup_lock(void)
-+{
-+ percpu_down_write(&scx_cgroup_rwsem);
-+}
-+
-+static void scx_cgroup_unlock(void)
-+{
-+ percpu_up_write(&scx_cgroup_rwsem);
-+}
-+
-+#else /* CONFIG_EXT_GROUP_SCHED */
-+
-+static inline void scx_cgroup_lock(void) {}
-+static inline void scx_cgroup_unlock(void) {}
-+
-+#endif /* CONFIG_EXT_GROUP_SCHED */
-+
-+/*
-+ * Omitted operations:
-+ *
-+ * - check_preempt_curr: NOOP as it isn't useful in the wakeup path because the
-+ * task isn't tied to the CPU at that point. Preemption is implemented by
-+ * resetting the victim task's slice to 0 and triggering reschedule on the
-+ * target CPU.
-+ *
-+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient.
-+ *
-+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of
-+ * their current sched_class. Call them directly from sched core instead.
-+ *
-+ * - task_woken, switched_from: Unnecessary.
-+ */ -+DEFINE_SCHED_CLASS(ext) = { -+ .enqueue_task = enqueue_task_scx, -+ .dequeue_task = dequeue_task_scx, -+ .yield_task = yield_task_scx, -+ .yield_to_task = yield_to_task_scx, -+ -+ .check_preempt_curr = check_preempt_curr_scx, -+ -+ .pick_next_task = pick_next_task_scx, -+ -+ .put_prev_task = put_prev_task_scx, -+ .set_next_task = set_next_task_scx, -+ -+#ifdef CONFIG_SMP -+ .balance = balance_scx, -+ .select_task_rq = select_task_rq_scx, -+ .set_cpus_allowed = set_cpus_allowed_scx, -+ -+ .rq_online = rq_online_scx, -+ .rq_offline = rq_offline_scx, -+#endif -+ -+#ifdef CONFIG_SCHED_CORE -+ .pick_task = pick_task_scx, -+#endif -+ -+ .task_tick = task_tick_scx, -+ -+ .switching_to = switching_to_scx, -+ .switched_to = switched_to_scx, -+ .reweight_task = reweight_task_scx, -+ .prio_changed = prio_changed_scx, -+ -+ .update_curr = update_curr_scx, -+ -+#ifdef CONFIG_UCLAMP_TASK -+ .uclamp_enabled = 0, -+#endif -+}; -+ -+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) -+{ -+ memset(dsq, 0, sizeof(*dsq)); -+ -+ raw_spin_lock_init(&dsq->lock); -+ INIT_LIST_HEAD(&dsq->fifo); -+ dsq->id = dsq_id; -+} -+ -+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -+{ -+ struct scx_dispatch_q *dsq; -+ int ret; -+ -+ if (dsq_id & SCX_DSQ_FLAG_BUILTIN) -+ return ERR_PTR(-EINVAL); -+ -+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); -+ if (!dsq) -+ return ERR_PTR(-ENOMEM); -+ -+ init_dsq(dsq, dsq_id); -+ -+ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, -+ dsq_hash_params); -+ if (ret) { -+ kfree(dsq); -+ return ERR_PTR(ret); -+ } -+ return dsq; -+} -+ -+static void free_dsq_irq_workfn(struct irq_work *irq_work) -+{ -+ struct llist_node *to_free = llist_del_all(&dsqs_to_free); -+ struct scx_dispatch_q *dsq, *tmp_dsq; -+ -+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) -+ kfree_rcu(dsq, rcu); -+} -+ -+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -+ -+static void destroy_dsq(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ dsq = rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -+ if (!dsq) -+ goto out_unlock_rcu; -+ -+ raw_spin_lock_irqsave(&dsq->lock, flags); -+ -+ if (dsq->nr) { -+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", -+ dsq->id, dsq->nr); -+ goto out_unlock_dsq; -+ } -+ -+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) -+ goto out_unlock_dsq; -+ -+ /* -+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from -+ * queueing more tasks. As this function can be called from anywhere, -+ * freeing is bounced through an irq work to avoid nesting RCU -+ * operations inside scheduler locks. -+ */ -+ dsq->id = SCX_DSQ_INVALID; -+ llist_add(&dsq->free_node, &dsqs_to_free); -+ irq_work_queue(&free_dsq_irq_work); -+ -+out_unlock_dsq: -+ raw_spin_unlock_irqrestore(&dsq->lock, flags); -+out_unlock_rcu: -+ rcu_read_unlock(); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void scx_cgroup_exit(void) -+{ -+ struct cgroup_subsys_state *css; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk -+ * cgroups and exit all the inited ones, all online cgroups are exited. 
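destroy_dsq() above deliberately does not free the DSQ inline: it may be running under scheduler locks, so it only invalidates the ID, pushes the DSQ onto a lock-free llist and lets an irq_work perform the kfree_rcu(). The "mark dead, defer the free" pattern looks roughly like this userspace model built on C11 atomics (illustrative only):

#include <stdatomic.h>
#include <stdlib.h>

struct dsq {
	unsigned long long id;
	struct dsq *free_next;              /* models the llist free_node */
};

static _Atomic(struct dsq *) to_free_head;

/* caller may hold locks that forbid freeing right here */
static void defer_destroy(struct dsq *dsq)
{
	struct dsq *old = atomic_load(&to_free_head);

	dsq->id = ~0ULL;                    /* mark dead: nothing may enqueue to it */
	do {
		dsq->free_next = old;
	} while (!atomic_compare_exchange_weak(&to_free_head, &old, dsq));
	/* the kernel version now queues an irq_work to run the reclaim step */
}

/* runs later from a context where freeing is safe */
static void reclaim(void)
{
	struct dsq *node = atomic_exchange(&to_free_head, NULL);

	while (node) {
		struct dsq *next = node->free_next;

		free(node);                 /* the kernel uses kfree_rcu() instead */
		node = next;
	}
}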
-+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_post(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ -+ if (!(tg->scx_flags & SCX_TG_INITED)) -+ continue; -+ tg->scx_flags &= ~SCX_TG_INITED; -+ -+ if (!scx_ops.cgroup_exit) -+ continue; -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+} -+ -+static int scx_cgroup_init(void) -+{ -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk -+ * cgroups and init, all online cgroups are initialized. -+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_pre(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ if ((tg->scx_flags & -+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) -+ continue; -+ -+ if (!scx_ops.cgroup_init) { -+ tg->scx_flags |= SCX_TG_INITED; -+ continue; -+ } -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ css->cgroup, &args); -+ if (ret) { -+ css_put(css); -+ return ret; -+ } -+ tg->scx_flags |= SCX_TG_INITED; -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+ -+ return 0; -+} -+ -+static void scx_cgroup_config_knobs(void) -+{ -+ static DEFINE_MUTEX(cgintf_mutex); -+ DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; -+ u64 knob_flags; -+ int i; -+ -+ /* -+ * Called from both class switch and ops enable/disable paths, -+ * synchronize internally. -+ */ -+ mutex_lock(&cgintf_mutex); -+ -+ /* if fair is in use, all knobs should be shown */ -+ if (!scx_switched_all()) { -+ bitmap_fill(mask, CPU_CFTYPE_CNT); -+ goto apply; -+ } -+ -+ /* -+ * On ext, only show the supported knobs. Otherwise, show all possible -+ * knobs so that configuration attempts succeed and the states are -+ * remembered while ops is not loaded. -+ */ -+ if (scx_enabled()) -+ knob_flags = scx_ops.flags; -+ else -+ knob_flags = SCX_OPS_ALL_FLAGS; -+ -+ if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { -+ __set_bit(CPU_CFTYPE_WEIGHT, mask); -+ __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); -+ } -+apply: -+ for (i = 0; i < CPU_CFTYPE_CNT; i++) -+ cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); -+ -+ mutex_unlock(&cgintf_mutex); -+} -+ -+#else -+static void scx_cgroup_exit(void) {} -+static int scx_cgroup_init(void) { return 0; } -+static void scx_cgroup_config_knobs(void) {} -+#endif -+ -+/* -+ * Used by sched_fork() and __setscheduler_prio() to pick the matching -+ * sched_class. dl/rt are already handled. 
-+ */ -+bool task_should_scx(struct task_struct *p) -+{ -+ if (!scx_enabled() || scx_ops_disabling()) -+ return false; -+ if (READ_ONCE(scx_switching_all)) -+ return true; -+ return p->policy == SCHED_EXT; -+} -+ -+static void scx_ops_fallback_enqueue(struct task_struct *p, u64 enq_flags) -+{ -+ if (enq_flags & SCX_ENQ_LAST) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ else -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+static void scx_ops_fallback_dispatch(s32 cpu, struct task_struct *prev) {} -+ -+static void scx_ops_disable_workfn(struct kthread_work *work) -+{ -+ struct scx_exit_info *ei = &scx_exit_info; -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ struct rhashtable_iter rht_iter; -+ struct scx_dispatch_q *dsq; -+ const char *reason; -+ int i, cpu, kind; -+ -+ kind = atomic_read(&scx_exit_kind); -+ while (true) { -+ /* -+ * NONE indicates that a new scx_ops has been registered since -+ * disable was scheduled - don't kill the new ops. DONE -+ * indicates that the ops has already been disabled. -+ */ -+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) -+ return; -+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) -+ break; -+ } -+ -+ cancel_delayed_work_sync(&scx_watchdog_work); -+ -+ switch (kind) { -+ case SCX_EXIT_UNREG: -+ reason = "BPF scheduler unregistered"; -+ break; -+ case SCX_EXIT_SYSRQ: -+ reason = "disabled by sysrq-S"; -+ break; -+ case SCX_EXIT_ERROR: -+ reason = "runtime error"; -+ break; -+ case SCX_EXIT_ERROR_BPF: -+ reason = "scx_bpf_error"; -+ break; -+ case SCX_EXIT_ERROR_STALL: -+ reason = "runnable task stall"; -+ break; -+ default: -+ reason = ""; -+ } -+ -+ ei->kind = kind; -+ strlcpy(ei->reason, reason, sizeof(ei->reason)); -+ -+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { -+ case SCX_OPS_DISABLED: -+ pr_warn("sched_ext: ops error detected without ops (%s)\n", -+ scx_exit_info.msg); -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ return; -+ case SCX_OPS_PREPPING: -+ goto forward_progress_guaranteed; -+ case SCX_OPS_DISABLING: -+ /* shouldn't happen but handle it like ENABLING if it does */ -+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); -+ fallthrough; -+ case SCX_OPS_ENABLING: -+ case SCX_OPS_ENABLED: -+ break; -+ } -+ -+ /* -+ * DISABLING is set and ops was either ENABLING or ENABLED indicating -+ * that the ops and static branches are set. -+ * -+ * We must guarantee that all runnable tasks make forward progress -+ * without trusting the BPF scheduler. We can't grab any mutexes or -+ * rwsems as they might be held by tasks that the BPF scheduler is -+ * forgetting to run, which unfortunately also excludes toggling the -+ * static branches. -+ * -+ * Let's work around by overriding a couple ops and modifying behaviors -+ * based on the DISABLING state and then cycling the tasks through -+ * dequeue/enqueue to force global FIFO scheduling. -+ * -+ * a. ops.enqueue() and .dispatch() are overridden for simple global -+ * FIFO scheduling. -+ * -+ * b. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value -+ * can't be trusted. Whenever a tick triggers, the running task is -+ * rotated to the tail of the queue with core_sched_at touched. -+ * -+ * c. pick_next_task() suppresses zero slice warning. -+ * -+ * d. scx_prio_less() reverts to the default core_sched_at order. 
-+ */ -+ scx_ops.enqueue = scx_ops_fallback_enqueue; -+ scx_ops.dispatch = scx_ops_fallback_dispatch; -+ -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered_locked(&sti))) { -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ struct sched_enq_and_set_ctx ctx; -+ -+ /* cycling deq/enq is enough, see above */ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ sched_enq_and_set_task(&ctx); -+ } -+ } -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ /* kick all CPUs to restore ticks */ -+ for_each_possible_cpu(cpu) -+ resched_cpu(cpu); -+ -+forward_progress_guaranteed: -+ /* -+ * Here, every runnable task is guaranteed to make forward progress and -+ * we can safely use blocking synchronization constructs. Actually -+ * disable ops. -+ */ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ static_branch_disable(&__scx_switched_all); -+ WRITE_ONCE(scx_switching_all, false); -+ -+ /* avoid racing against fork and cgroup changes */ -+ cpus_read_lock(); -+ percpu_down_write(&scx_fork_rwsem); -+ scx_cgroup_lock(); -+ -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered_locked(&sti))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ bool alive = READ_ONCE(p->__state) != TASK_DEAD; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ -+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); -+ -+ __setscheduler_prio(p, p->prio); -+ if (alive) -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ if (alive) -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ -+ scx_ops_disable_task(p); -+ } -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ /* no task is on scx, turn off all the switches and flush in-progress calls */ -+ static_branch_disable_cpuslocked(&__scx_ops_enabled); -+ for (i = 0; i < SCX_NR_ONLINE_OPS; i++) -+ static_branch_disable_cpuslocked(&scx_has_op[i]); -+ static_branch_disable_cpuslocked(&scx_ops_enq_last); -+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting); -+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ synchronize_rcu(); -+ -+ scx_cgroup_exit(); -+ -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ cpus_read_unlock(); -+ -+ if (ei->kind >= SCX_EXIT_ERROR) { -+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); -+ -+ if (ei->msg[0] == '\0') -+ printk(KERN_ERR "sched_ext: %s\n", ei->reason); -+ else -+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); -+ -+ stack_trace_print(ei->bt, ei->bt_len, 2); -+ } -+ -+ if (scx_ops.exit) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); -+ -+ memset(&scx_ops, 0, sizeof(scx_ops)); -+ -+ rhashtable_walk_enter(&dsq_hash, &rht_iter); -+ do { -+ rhashtable_walk_start(&rht_iter); -+ -+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) -+ destroy_dsq(dsq->id); -+ -+ rhashtable_walk_stop(&rht_iter); -+ } while (dsq == ERR_PTR(-EAGAIN)); -+ rhashtable_walk_exit(&rht_iter); -+ -+ free_percpu(scx_dsp_buf); -+ scx_dsp_buf = NULL; -+ scx_dsp_max_batch = 0; -+ -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ -+ scx_cgroup_config_knobs(); -+} -+ -+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); -+ -+static void 
schedule_scx_ops_disable_work(void) -+{ -+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper); -+ -+ /* -+ * We may be called spuriously before the first bpf_sched_ext_reg(). If -+ * scx_ops_helper isn't set up yet, there's nothing to do. -+ */ -+ if (helper) -+ kthread_queue_work(helper, &scx_ops_disable_work); -+} -+ -+static void scx_ops_disable(enum scx_exit_kind kind) -+{ -+ int none = SCX_EXIT_NONE; -+ -+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) -+ kind = SCX_EXIT_ERROR; -+ -+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static void scx_ops_error_irq_workfn(struct irq_work *irq_work) -+{ -+ schedule_scx_ops_disable_work(); -+} -+ -+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -+ -+__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, -+ const char *fmt, ...) -+{ -+ struct scx_exit_info *ei = &scx_exit_info; -+ int none = SCX_EXIT_NONE; -+ va_list args; -+ -+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) -+ return; -+ -+ ei->bt_len = stack_trace_save(ei->bt, ARRAY_SIZE(ei->bt), 1); -+ -+ va_start(args, fmt); -+ vscnprintf(ei->msg, ARRAY_SIZE(ei->msg), fmt, args); -+ va_end(args); -+ -+ irq_work_queue(&scx_ops_error_irq_work); -+} -+ -+static struct kthread_worker *scx_create_rt_helper(const char *name) -+{ -+ struct kthread_worker *helper; -+ -+ helper = kthread_create_worker(0, name); -+ if (helper) -+ sched_set_fifo(helper->task); -+ return helper; -+} -+ -+static int scx_ops_enable(struct sched_ext_ops *ops) -+{ -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ int i, ret; -+ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ if (!scx_ops_helper) { -+ WRITE_ONCE(scx_ops_helper, -+ scx_create_rt_helper("sched_ext_ops_helper")); -+ if (!scx_ops_helper) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ } -+ -+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) { -+ ret = -EBUSY; -+ goto err_unlock; -+ } -+ -+ /* -+ * Set scx_ops, transition to PREPPING and clear exit info to arm the -+ * disable path. Failure triggers full disabling from here on. -+ */ -+ scx_ops = *ops; -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != -+ SCX_OPS_DISABLED); -+ -+ memset(&scx_exit_info, 0, sizeof(scx_exit_info)); -+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE); -+ scx_warned_zero_slice = false; -+ -+ atomic_long_set(&scx_nr_rejected, 0); -+ -+ /* -+ * Keep CPUs stable during enable so that the BPF scheduler can track -+ * online CPUs by watching ->on/offline_cpu() after ->init(). -+ */ -+ cpus_read_lock(); -+ -+ scx_switch_all_req = false; -+ if (scx_ops.init) { -+ ret = SCX_CALL_OP_RET(SCX_KF_INIT, init); -+ if (ret) { -+ ret = ops_sanitize_err("init", ret); -+ goto err_disable; -+ } -+ -+ /* -+ * Exit early if ops.init() triggered scx_bpf_error(). Not -+ * strictly necessary as we'll fail transitioning into ENABLING -+ * later but that'd be after calling ops.prep_enable() on all -+ * tasks and with -EBUSY which isn't very intuitive. Let's exit -+ * early with success so that the condition is notified through -+ * ops.exit() like other scx_bpf_error() invocations. 
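scx_ops_disable() and scx_ops_error_kind() above both try to latch scx_exit_kind with a compare-and-swap from SCX_EXIT_NONE, so the first reported reason wins and later reports become no-ops; the disable worker then claims that reason exactly once by swapping in SCX_EXIT_DONE. The same handshake modeled with C11 atomics (illustrative names, not the kernel code):

#include <stdatomic.h>
#include <stdbool.h>

enum exit_kind { EXIT_NONE, EXIT_UNREG, EXIT_ERROR, EXIT_DONE };

static _Atomic int exit_kind = EXIT_NONE;

/* callable from anywhere, any number of times: only the first reason sticks */
static bool record_exit(enum exit_kind kind)
{
	int none = EXIT_NONE;

	return atomic_compare_exchange_strong(&exit_kind, &none, kind);
}

/* the single disable worker: returns the reason it owns, or EXIT_NONE */
static int claim_exit(void)
{
	int kind = atomic_load(&exit_kind);

	while (kind != EXIT_NONE && kind != EXIT_DONE) {
		if (atomic_compare_exchange_strong(&exit_kind, &kind, EXIT_DONE))
			return kind;
	}
	return EXIT_NONE;   /* nothing recorded, or teardown already done */
}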
-+ */ -+ if (atomic_read(&scx_exit_kind) != SCX_EXIT_NONE) -+ goto err_disable; -+ } -+ -+ WARN_ON_ONCE(scx_dsp_buf); -+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; -+ scx_dsp_buf = __alloc_percpu(sizeof(scx_dsp_buf[0]) * scx_dsp_max_batch, -+ __alignof__(scx_dsp_buf[0])); -+ if (!scx_dsp_buf) { -+ ret = -ENOMEM; -+ goto err_disable; -+ } -+ -+ scx_watchdog_timeout = SCX_WATCHDOG_MAX_TIMEOUT; -+ if (ops->timeout_ms) -+ scx_watchdog_timeout = msecs_to_jiffies(ops->timeout_ms); -+ -+ scx_watchdog_timestamp = jiffies; -+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work, -+ scx_watchdog_timeout / 2); -+ -+ /* -+ * Lock out forks, cgroup on/offlining and moves before opening the -+ * floodgate so that they don't wander into the operations prematurely. -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ scx_cgroup_lock(); -+ -+ for (i = 0; i < SCX_NR_ONLINE_OPS; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ if (ops->flags & SCX_OPS_ENQ_LAST) -+ static_branch_enable_cpuslocked(&scx_ops_enq_last); -+ -+ if (ops->flags & SCX_OPS_ENQ_EXITING) -+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting); -+ if (scx_ops.cpu_acquire || scx_ops.cpu_release) -+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); -+ -+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { -+ reset_idle_masks(); -+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); -+ } else { -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ } -+ -+ /* -+ * All cgroups should be initialized before letting in tasks. cgroup -+ * on/offlining and task migrations are already locked out. -+ */ -+ ret = scx_cgroup_init(); -+ if (ret) -+ goto err_disable_unlock; -+ -+ static_branch_enable_cpuslocked(&__scx_ops_enabled); -+ -+ /* -+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem -+ * preventing new tasks from being added. No need to exclude tasks -+ * leaving as sched_ext_free() can handle both prepped and enabled -+ * tasks. Prep all tasks first and then enable them with preemption -+ * disabled. -+ */ -+ spin_lock_irq(&scx_tasks_lock); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered(&sti))) { -+ get_task_struct(p); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ ret = scx_ops_prepare_task(p, task_group(p)); -+ if (ret) { -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ pr_err("sched_ext: ops.prep_enable() failed (%d) for %s[%d] while loading\n", -+ ret, p->comm, p->pid); -+ goto err_disable_unlock; -+ } -+ -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ } -+ scx_task_iter_exit(&sti); -+ -+ /* -+ * All tasks are prepped but are still ops-disabled. Ensure that -+ * %current can't be scheduled out and switch everyone. -+ * preempt_disable() is necessary because we can't guarantee that -+ * %current won't be starved if scheduled out while switching. -+ */ -+ preempt_disable(); -+ -+ /* -+ * From here on, the disable path must assume that tasks have ops -+ * enabled and need to be recovered. -+ */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { -+ preempt_enable(); -+ spin_unlock_irq(&scx_tasks_lock); -+ ret = -EBUSY; -+ goto err_disable_unlock; -+ } -+ -+ /* -+ * We're fully committed and can't fail. The PREPPED -> ENABLED -+ * transitions here are synchronized against sched_ext_free() through -+ * scx_tasks_lock. 
-+ */ -+ WRITE_ONCE(scx_switching_all, scx_switch_all_req); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_filtered_locked(&sti))) { -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, -+ &ctx); -+ -+ scx_ops_enable_task(p); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } else { -+ scx_ops_disable_task(p); -+ } -+ } -+ scx_task_iter_exit(&sti); -+ -+ spin_unlock_irq(&scx_tasks_lock); -+ preempt_enable(); -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { -+ ret = -EBUSY; -+ goto err_disable; -+ } -+ -+ if (scx_switch_all_req) -+ static_branch_enable_cpuslocked(&__scx_switched_all); -+ -+ cpus_read_unlock(); -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ scx_cgroup_config_knobs(); -+ -+ return 0; -+ -+err_unlock: -+ mutex_unlock(&scx_ops_enable_mutex); -+ return ret; -+ -+err_disable_unlock: -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+err_disable: -+ cpus_read_unlock(); -+ mutex_unlock(&scx_ops_enable_mutex); -+ /* must be fully disabled before returning */ -+ scx_ops_disable(SCX_EXIT_ERROR); -+ kthread_flush_work(&scx_ops_disable_work); -+ return ret; -+} -+ -+#ifdef CONFIG_SCHED_DEBUG -+static int scx_debug_show(struct seq_file *m, void *v) -+{ -+ mutex_lock(&scx_ops_enable_mutex); -+ seq_printf(m, "%-30s: %s\n", "ops", scx_ops.name); -+ seq_printf(m, "%-30s: %ld\n", "enabled", scx_enabled()); -+ seq_printf(m, "%-30s: %d\n", "switching_all", -+ READ_ONCE(scx_switching_all)); -+ seq_printf(m, "%-30s: %ld\n", "switched_all", scx_switched_all()); -+ seq_printf(m, "%-30s: %s\n", "enable_state", -+ scx_ops_enable_state_str[scx_ops_enable_state()]); -+ seq_printf(m, "%-30s: %lu\n", "nr_rejected", -+ atomic_long_read(&scx_nr_rejected)); -+ mutex_unlock(&scx_ops_enable_mutex); -+ return 0; -+} -+ -+static int scx_debug_open(struct inode *inode, struct file *file) -+{ -+ return single_open(file, scx_debug_show, NULL); -+} -+ -+const struct file_operations sched_ext_fops = { -+ .open = scx_debug_open, -+ .read = seq_read, -+ .llseek = seq_lseek, -+ .release = single_release, -+}; -+#endif -+ -+/******************************************************************************** -+ * bpf_struct_ops plumbing. 
-+ */ -+#include -+#include -+#include -+ -+extern struct btf *btf_vmlinux; -+static const struct btf_type *task_struct_type; -+ -+static bool bpf_scx_is_valid_access(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) -+ return false; -+ if (type != BPF_READ) -+ return false; -+ if (off % size != 0) -+ return false; -+ -+ return btf_ctx_access(off, size, type, prog, info); -+} -+ -+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, -+ const struct bpf_reg_state *reg, int off, -+ int size) -+{ -+ const struct btf_type *t; -+ -+ t = btf_type_by_id(reg->btf, reg->btf_id); -+ if (t == task_struct_type) { -+ if (off >= offsetof(struct task_struct, scx.slice) && -+ off + size <= offsetofend(struct task_struct, scx.slice)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) && -+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.disallow) && -+ off + size <= offsetofend(struct task_struct, scx.disallow)) -+ return SCALAR_VALUE; -+ } -+ -+ return -EACCES; -+} -+ -+static const struct bpf_func_proto * -+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -+{ -+ switch (func_id) { -+ case BPF_FUNC_task_storage_get: -+ return &bpf_task_storage_get_proto; -+ case BPF_FUNC_task_storage_delete: -+ return &bpf_task_storage_delete_proto; -+ default: -+ return bpf_base_func_proto(func_id); -+ } -+} -+ -+const struct bpf_verifier_ops bpf_scx_verifier_ops = { -+ .get_func_proto = bpf_scx_get_func_proto, -+ .is_valid_access = bpf_scx_is_valid_access, -+ .btf_struct_access = bpf_scx_btf_struct_access, -+}; -+ -+static int bpf_scx_init_member(const struct btf_type *t, -+ const struct btf_member *member, -+ void *kdata, const void *udata) -+{ -+ const struct sched_ext_ops *uops = udata; -+ struct sched_ext_ops *ops = kdata; -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ int ret; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, dispatch_max_batch): -+ if (*(u32 *)(udata + moff) > INT_MAX) -+ return -E2BIG; -+ ops->dispatch_max_batch = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, flags): -+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) -+ return -EINVAL; -+ ops->flags = *(u64 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, name): -+ ret = bpf_obj_name_cpy(ops->name, uops->name, -+ sizeof(ops->name)); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ return -EINVAL; -+ return 1; -+ case offsetof(struct sched_ext_ops, timeout_ms): -+ if (*(u32 *)(udata + moff) > SCX_WATCHDOG_MAX_TIMEOUT) -+ return -E2BIG; -+ ops->timeout_ms = *(u32 *)(udata + moff); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_check_member(const struct btf_type *t, -+ const struct btf_member *member, -+ const struct bpf_prog *prog) -+{ -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, prep_enable): -+#ifdef CONFIG_EXT_GROUP_SCHED -+ case offsetof(struct sched_ext_ops, cgroup_init): -+ case offsetof(struct sched_ext_ops, cgroup_exit): -+ case offsetof(struct sched_ext_ops, cgroup_prep_move): -+#endif -+ case offsetof(struct sched_ext_ops, init): -+ case offsetof(struct sched_ext_ops, exit): -+ break; -+ default: -+ if (prog->aux->sleepable) -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int 
bpf_scx_reg(void *kdata) -+{ -+ return scx_ops_enable(kdata); -+} -+ -+static void bpf_scx_unreg(void *kdata) -+{ -+ scx_ops_disable(SCX_EXIT_UNREG); -+ kthread_flush_work(&scx_ops_disable_work); -+} -+ -+static int bpf_scx_init(struct btf *btf) -+{ -+ u32 type_id; -+ -+ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); -+ if (type_id < 0) -+ return -EINVAL; -+ task_struct_type = btf_type_by_id(btf, type_id); -+ -+ return 0; -+} -+ -+static int bpf_scx_update(void *kdata, void *old_kdata) -+{ -+ /* -+ * sched_ext does not support updating the actively-loaded BPF -+ * scheduler, as registering a BPF scheduler can always fail if the -+ * scheduler returns an error code for e.g. ops.init(), -+ * ops.prep_enable(), etc. Similarly, we can always race with -+ * unregistration happening elsewhere, such as with sysrq. -+ */ -+ return -EOPNOTSUPP; -+} -+ -+static int bpf_scx_validate(void *kdata) -+{ -+ return 0; -+} -+ -+/* "extern" to avoid sparse warning, only used in this file */ -+extern struct bpf_struct_ops bpf_sched_ext_ops; -+ -+struct bpf_struct_ops bpf_sched_ext_ops = { -+ .verifier_ops = &bpf_scx_verifier_ops, -+ .reg = bpf_scx_reg, -+ .unreg = bpf_scx_unreg, -+ .check_member = bpf_scx_check_member, -+ .init_member = bpf_scx_init_member, -+ .init = bpf_scx_init, -+ .update = bpf_scx_update, -+ .validate = bpf_scx_validate, -+ .name = "sched_ext_ops", -+}; -+ -+static void sysrq_handle_sched_ext_reset(u8 key) -+{ -+ if (scx_ops_helper) -+ scx_ops_disable(SCX_EXIT_SYSRQ); -+ else -+ pr_info("sched_ext: BPF scheduler not yet used\n"); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_reset_op = { -+ .handler = sysrq_handle_sched_ext_reset, -+ .help_msg = "reset-sched-ext(S)", -+ .action_msg = "Disable sched_ext and revert all tasks to CFS", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static void kick_cpus_irq_workfn(struct irq_work *irq_work) -+{ -+ struct rq *this_rq = this_rq(); -+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); -+ int this_cpu = cpu_of(this_rq); -+ int cpu; -+ -+ for_each_cpu(cpu, this_rq->scx.cpus_to_kick) { -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ if (cpu_online(cpu) || cpu == this_cpu) { -+ if (cpumask_test_cpu(cpu, this_rq->scx.cpus_to_preempt) && -+ rq->curr->sched_class == &ext_sched_class) -+ rq->curr->scx.slice = 0; -+ pseqs[cpu] = rq->scx.pnt_seq; -+ resched_curr(rq); -+ } else { -+ cpumask_clear_cpu(cpu, this_rq->scx.cpus_to_wait); -+ } -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+ } -+ -+ for_each_cpu_andnot(cpu, this_rq->scx.cpus_to_wait, -+ cpumask_of(this_cpu)) { -+ /* -+ * Pairs with smp_store_release() issued by this CPU in -+ * scx_notify_pick_next_task() on the resched path. -+ * -+ * We busy-wait here to guarantee that no other task can be -+ * scheduled on our core before the target CPU has entered the -+ * resched path. -+ */ -+ while (smp_load_acquire(&cpu_rq(cpu)->scx.pnt_seq) == pseqs[cpu]) -+ cpu_relax(); -+ } -+ -+ cpumask_clear(this_rq->scx.cpus_to_kick); -+ cpumask_clear(this_rq->scx.cpus_to_preempt); -+ cpumask_clear(this_rq->scx.cpus_to_wait); -+} -+ -+/** -+ * print_scx_info - print out sched_ext scheduler state -+ * @log_lvl: the log level to use when printing -+ * @p: target task -+ * -+ * If a sched_ext scheduler is enabled, print the name and state of the -+ * scheduler. If @p is on sched_ext, print further information about the task. 
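bpf_sched_ext_ops above is the registration target for the whole file: attaching a SEC(".struct_ops") map of type struct sched_ext_ops is what ends up in bpf_scx_reg() and therefore scx_ops_enable(). From userspace that attach is a few libbpf calls; the object and map names below are illustrative, and a real loader would normally go through a generated skeleton instead:

#include <bpf/libbpf.h>
#include <stdio.h>

int main(void)
{
	struct bpf_object *obj;
	struct bpf_map *ops_map;
	struct bpf_link *link;

	obj = bpf_object__open_file("minimal_sched.bpf.o", NULL);
	if (!obj || bpf_object__load(obj))
		return 1;

	/* the SEC(".struct_ops") variable of type struct sched_ext_ops */
	ops_map = bpf_object__find_map_by_name(obj, "minimal_ops");
	if (!ops_map)
		return 1;

	/* attaching the struct_ops map invokes bpf_scx_reg() -> scx_ops_enable() */
	link = bpf_map__attach_struct_ops(ops_map);
	if (!link)
		return 1;

	printf("scheduler loaded; press enter to unload\n");
	getchar();

	/* destroying the link unregisters the scheduler (bpf_scx_unreg()) */
	bpf_link__destroy(link);
	bpf_object__close(obj);
	return 0;
}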
-+ * -+ * This function can be safely called on any task as long as the task_struct -+ * itself is accessible. While safe, this function isn't synchronized and may -+ * print out mixups or garbages of limited length. -+ */ -+void print_scx_info(const char *log_lvl, struct task_struct *p) -+{ -+ enum scx_ops_enable_state state = scx_ops_enable_state(); -+ const char *all = READ_ONCE(scx_switching_all) ? "+all" : ""; -+ char runnable_at_buf[22] = "?"; -+ struct sched_class *class; -+ unsigned long runnable_at; -+ -+ if (state == SCX_OPS_DISABLED) -+ return; -+ -+ /* -+ * Carefully check if the task was running on sched_ext, and then -+ * carefully copy the time it's been runnable, and its state. -+ */ -+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || -+ class != &ext_sched_class) { -+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, -+ scx_ops_enable_state_str[state], all); -+ return; -+ } -+ -+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, -+ sizeof(runnable_at))) -+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+lldms", -+ (s64)(runnable_at - jiffies) * (HZ / MSEC_PER_SEC)); -+ -+ /* Print everything onto one line to conserve console spce. */ -+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", -+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, -+ runnable_at_buf); -+} -+ -+void __init init_sched_ext_class(void) -+{ -+ int cpu; -+ u32 v; -+ -+ /* -+ * The following is to prevent the compiler from optimizing out the enum -+ * definitions so that BPF scheduler implementations can use them -+ * through the generated vmlinux.h. -+ */ -+ WRITE_ONCE(v, SCX_WAKE_EXEC | SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | -+ SCX_TG_ONLINE | SCX_KICK_PREEMPT); -+ -+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); -+#ifdef CONFIG_SMP -+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); -+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -+#endif -+ scx_kick_cpus_pnt_seqs = -+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * -+ num_possible_cpus(), -+ __alignof__(scx_kick_cpus_pnt_seqs[0])); -+ BUG_ON(!scx_kick_cpus_pnt_seqs); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); -+ INIT_LIST_HEAD(&rq->scx.watchdog_list); -+ -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); -+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); -+ } -+ -+ register_sysrq_key('S', &sysrq_sched_ext_reset_op); -+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); -+ scx_cgroup_config_knobs(); -+} -+ -+ -+/******************************************************************************** -+ * Helpers that can be called from the BPF scheduler. -+ */ -+#include -+ -+/* Disables missing prototype warnings for kfuncs */ -+__diag_push(); -+__diag_ignore_all("-Wmissing-prototypes", -+ "Global functions as their definitions will be in vmlinux BTF"); -+ -+/** -+ * scx_bpf_switch_all - Switch all tasks into SCX -+ * -+ * Switch all existing and future non-dl/rt tasks to SCX. This can only be -+ * called from ops.init(), and actual switching is performed asynchronously. 
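A BPF scheduler would typically pair scx_bpf_switch_all() with its other one-time setup in ops.init(). The sketch below shows such an init callback; it assumes the BPF-side declarations and the BPF_STRUCT_OPS() convenience macro from the BPF headers shipped with this series, and scx_bpf_error() is likewise assumed to be the printf-style wrapper those headers provide around scx_bpf_error_bstr(), so treat those names as assumptions:

/* illustrative ops.init(): opt every task in and create one custom DSQ */
#define MY_DSQ_ID 0   /* arbitrary non-builtin DSQ id for this sketch */

s32 BPF_STRUCT_OPS(minimal_init)
{
	s32 ret;

	scx_bpf_switch_all();   /* all current and future non-dl/rt tasks use SCX */

	ret = scx_bpf_create_dsq(MY_DSQ_ID, -1 /* NUMA_NO_NODE */);
	if (ret)
		scx_bpf_error("failed to create DSQ %d (%d)", MY_DSQ_ID, ret);

	return ret;
}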
-+ */ -+void scx_bpf_switch_all(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_INIT)) -+ return; -+ -+ scx_switch_all_req = true; -+} -+ -+BTF_SET8_START(scx_kfunc_ids_init) -+BTF_ID_FLAGS(func, scx_bpf_switch_all) -+BTF_SET8_END(scx_kfunc_ids_init) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_init = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_init, -+}; -+ -+/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), -+ * ops.prep_enable(), ops.cgroup_init() and ops.cgroup_prep_move(). -+ */ -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (!scx_kf_allowed(SCX_KF_INIT | SCX_KF_SLEEPABLE)) -+ return -EINVAL; -+ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+BTF_SET8_START(scx_kfunc_ids_sleepable) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_SET8_END(scx_kfunc_ids_sleepable) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_sleepable, -+}; -+ -+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) -+{ -+ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) -+ return false; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ if (unlikely(!p)) { -+ scx_ops_error("called with NULL task"); -+ return false; -+ } -+ -+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { -+ scx_ops_error("invalid enq_flags 0x%llx", enq_flags); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) -+{ -+ struct task_struct *ddsp_task; -+ int idx; -+ -+ ddsp_task = __this_cpu_read(direct_dispatch_task); -+ if (ddsp_task) { -+ direct_dispatch(ddsp_task, p, dsq_id, enq_flags); -+ return; -+ } -+ -+ idx = __this_cpu_read(scx_dsp_ctx.buf_cursor); -+ if (unlikely(idx >= scx_dsp_max_batch)) { -+ scx_ops_error("dispatch buffer overflow"); -+ return; -+ } -+ -+ this_cpu_ptr(scx_dsp_buf)[idx] = (struct scx_dsp_buf_ent){ -+ .task = p, -+ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, -+ .dsq_id = dsq_id, -+ .enq_flags = enq_flags, -+ }; -+ __this_cpu_inc(scx_dsp_ctx.buf_cursor); -+} -+ -+/** -+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe -+ * to call this function spuriously. Can be called from ops.enqueue() and -+ * ops.dispatch(). -+ * -+ * When called from ops.enqueue(), it's for direct dispatch and @p must match -+ * the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be used to target the -+ * local DSQ of a CPU other than the enqueueing one. Use ops.select_cpu() to be -+ * on the target CPU in the first place. -+ * -+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id -+ * and this function can be called upto ops.dispatch_max_batch times to dispatch -+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the -+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. -+ * -+ * This function doesn't have any locking restrictions and may be called under -+ * BPF locks (in the future when BPF introduces more flexible locking). 
-+ * -+ * @p is allowed to run for @slice. The scheduling path is triggered on slice -+ * exhaustion. If zero, the current residual slice is maintained. If -+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with -+ * scx_bpf_kick_cpu() to trigger scheduling. -+ */ -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags); -+} -+ -+/** -+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs -+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. -+ * Tasks queued into the priority queue are ordered by @vtime and always -+ * consumed after the tasks in the FIFO queue. All other aspects are identical -+ * to scx_bpf_dispatch(). -+ * -+ * @vtime ordering is according to time_before64() which considers wrapping. A -+ * numerically larger vtime may indicate an earlier position in the ordering and -+ * vice-versa. -+ */ -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 vtime, u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ p->scx.dsq_vtime = vtime; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); -+} -+ -+BTF_SET8_START(scx_kfunc_ids_enqueue_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) -+BTF_SET8_END(scx_kfunc_ids_enqueue_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_enqueue_dispatch, -+}; -+ -+/** -+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots -+ * -+ * Can only be called from ops.dispatch(). -+ */ -+u32 scx_bpf_dispatch_nr_slots(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return 0; -+ -+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx.buf_cursor); -+} -+ -+/** -+ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ -+ * @dsq_id: DSQ to consume -+ * -+ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it -+ * to the current CPU's local DSQ for execution. Can only be called from -+ * ops.dispatch(). -+ * -+ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before -+ * trying to consume the specified DSQ. It may also grab rq locks and thus can't -+ * be called under any BPF locks. -+ * -+ * Returns %true if a task has been consumed, %false if there isn't any task to -+ * consume. 
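Put together, scx_bpf_dispatch() from ops.enqueue() and scx_bpf_consume() from ops.dispatch() are enough for a minimal global-FIFO scheduler. The sketch below shows that pairing plus the struct_ops table it would be registered through; it assumes the BPF-side declarations and BPF_STRUCT_OPS() macro from the series' BPF headers, and the field values are only examples:

/* minimal global FIFO: every runnable task goes to the shared global DSQ */
void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
{
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}

/* refill this CPU's local DSQ from the global DSQ when it runs dry */
void BPF_STRUCT_OPS(minimal_dispatch, s32 cpu, struct task_struct *prev)
{
	scx_bpf_consume(SCX_DSQ_GLOBAL);
}

SEC(".struct_ops")
struct sched_ext_ops minimal_ops = {
	.enqueue    = (void *)minimal_enqueue,
	.dispatch   = (void *)minimal_dispatch,
	.timeout_ms = 5000,        /* checked against SCX_WATCHDOG_MAX_TIMEOUT */
	.name       = "minimal",   /* copied and validated by bpf_scx_init_member() */
};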
-+ */ -+bool scx_bpf_consume(u64 dsq_id) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(&scx_dsp_ctx); -+ struct scx_dispatch_q *dsq; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq, dspc->rf); -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); -+ return false; -+ } -+ -+ if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { -+ /* -+ * A successfully consumed task can be dequeued before it starts -+ * running while the CPU is trying to migrate other dispatched -+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty -+ * local DSQ. -+ */ -+ dspc->nr_tasks++; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+BTF_SET8_START(scx_kfunc_ids_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -+BTF_ID_FLAGS(func, scx_bpf_consume) -+BTF_SET8_END(scx_kfunc_ids_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_dispatch, -+}; -+ -+/** -+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ -+ * -+ * Iterate over all of the tasks currently enqueued on the local DSQ of the -+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of -+ * processed tasks. Can only be called from ops.cpu_release(). -+ */ -+u32 scx_bpf_reenqueue_local(void) -+{ -+ u32 nr_enqueued, i; -+ struct rq *rq; -+ struct scx_rq *scx_rq; -+ -+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) -+ return 0; -+ -+ rq = cpu_rq(smp_processor_id()); -+ lockdep_assert_rq_held(rq); -+ scx_rq = &rq->scx; -+ -+ /* -+ * Get the number of tasks on the local DSQ before iterating over it to -+ * pull off tasks. The enqueue callback below can signal that it wants -+ * the task to stay on the local DSQ, and we want to prevent the BPF -+ * scheduler from causing us to loop indefinitely. -+ */ -+ nr_enqueued = scx_rq->local_dsq.nr; -+ for (i = 0; i < nr_enqueued; i++) { -+ struct task_struct *p; -+ -+ p = first_local_task(rq); -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != -+ SCX_OPSS_NONE); -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ WARN_ON_ONCE(p->scx.holding_cpu != -1); -+ dispatch_dequeue(scx_rq, p); -+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); -+ } -+ -+ return nr_enqueued; -+} -+ -+BTF_SET8_START(scx_kfunc_ids_cpu_release) -+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) -+BTF_SET8_END(scx_kfunc_ids_cpu_release) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_cpu_release, -+}; -+ -+/** -+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU -+ * @cpu: cpu to kick -+ * @flags: %SCX_KICK_* flags -+ * -+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or -+ * trigger rescheduling on a busy CPU. This can be called from any online -+ * scx_ops operation and the actual kicking is performed asynchronously through -+ * an irq work. -+ */ -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) -+{ -+ struct rq *rq; -+ -+ if (!ops_cpu_valid(cpu)) { -+ scx_ops_error("invalid cpu %d", cpu); -+ return; -+ } -+ -+ preempt_disable(); -+ rq = this_rq(); -+ -+ /* -+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting -+ * rq locks. We can probably be smarter and avoid bouncing if called -+ * from ops which don't hold a rq lock. 
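scx_bpf_reenqueue_local() above exists for the cpu_release path: when a higher-priority sched class takes the CPU away, tasks already parked on its local DSQ would otherwise be stranded there. A typical ops.cpu_release() simply hands them back to ops.enqueue(), as in this sketch (assumes the series' BPF headers; the release-args struct is passed through unused here):

/* the CPU was taken by a higher-priority class (RT, DL, stop task, ...) */
void BPF_STRUCT_OPS(minimal_cpu_release, s32 cpu,
		    struct scx_cpu_release_args *args)
{
	/*
	 * Give every task still sitting on this CPU's local DSQ back to
	 * ops.enqueue() so it can be placed somewhere that will actually run.
	 */
	u32 cnt = scx_bpf_reenqueue_local();

	(void)cnt;   /* a real scheduler might record this in a stat */
}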
-+ */ -+ cpumask_set_cpu(cpu, rq->scx.cpus_to_kick); -+ if (flags & SCX_KICK_PREEMPT) -+ cpumask_set_cpu(cpu, rq->scx.cpus_to_preempt); -+ if (flags & SCX_KICK_WAIT) -+ cpumask_set_cpu(cpu, rq->scx.cpus_to_wait); -+ -+ irq_work_queue(&rq->scx.kick_cpus_irq_work); -+ preempt_enable(); -+} -+ -+/** -+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks -+ * @dsq_id: id of the DSQ -+ * -+ * Return the number of tasks in the DSQ matching @dsq_id. If not found, -+ * -%ENOENT is returned. Can be called from any non-sleepable online scx_ops -+ * operations. -+ */ -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ lockdep_assert(rcu_read_lock_any_held()); -+ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ return this_rq()->scx.local_dsq.nr; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (ops_cpu_valid(cpu)) -+ return cpu_rq(cpu)->scx.local_dsq.nr; -+ } else { -+ dsq = find_non_local_dsq(dsq_id); -+ if (dsq) -+ return dsq->nr; -+ } -+ return -ENOENT; -+} -+ -+/** -+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state -+ * @cpu: cpu to test and clear idle for -+ * -+ * Returns %true if @cpu was idle and its idle state was successfully cleared. -+ * %false otherwise. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return false; -+ } -+ -+ if (ops_cpu_valid(cpu)) -+ return test_and_clear_cpu_idle(cpu); -+ else -+ return false; -+} -+ -+/** -+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu -+ * number on success. -%EBUSY if no matching cpu was found. -+ * -+ * Idle CPU tracking may race against CPU scheduling state transitions. For -+ * example, this function may return -%EBUSY as CPUs are transitioning into the -+ * idle state. If the caller then assumes that there will be dispatch events on -+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs -+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and -+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch -+ * event in the near future. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return -EBUSY; -+ } -+ -+ return scx_pick_idle_cpu(cpus_allowed, flags); -+} -+ -+/** -+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any -+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu -+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is -+ * empty. -+ * -+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not -+ * set, this function can't tell which CPUs are idle and will always pick any -+ * CPU. 
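On the selection side, a scheduler usually tries scx_bpf_test_and_clear_cpu_idle() on the previous CPU first and only then falls back to scx_bpf_pick_idle_cpu(), precisely because of the claim semantics documented above. A sketch of such an ops.select_cpu() (assumes the series' BPF headers; error handling kept minimal):

s32 BPF_STRUCT_OPS(minimal_select_cpu, struct task_struct *p, s32 prev_cpu,
		   u64 wake_flags)
{
	s32 cpu;

	/* cache-friendly first choice: stay put if the previous CPU is idle */
	if (scx_bpf_test_and_clear_cpu_idle(prev_cpu))
		return prev_cpu;

	/* otherwise claim any idle CPU the task is allowed to run on */
	cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0);
	if (cpu >= 0)
		return cpu;

	/* nothing idle: keep the previous CPU and let ops.enqueue() decide */
	return prev_cpu;
}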
-+ */ -+s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ s32 cpu; -+ -+ if (static_branch_likely(&scx_builtin_idle_enabled)) { -+ cpu = scx_pick_idle_cpu(cpus_allowed, flags); -+ if (cpu >= 0) -+ return cpu; -+ } -+ -+ cpu = cpumask_any_distribute(cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ return cpu; -+ else -+ return -EBUSY; -+} -+ -+/** -+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking -+ * per-CPU cpumask. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. -+ */ -+const struct cpumask *scx_bpf_get_idle_cpumask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, -+ * per-physical-core cpumask. Can be used to determine if an entire physical -+ * core is free. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. -+ */ -+const struct cpumask *scx_bpf_get_idle_smtmask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ if (sched_smt_active()) -+ return idle_masks.smt; -+ else -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to -+ * either the percpu, or SMT idle-tracking cpumask. -+ */ -+void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -+{ -+ /* -+ * Empty function body because we aren't actually acquiring or -+ * releasing a reference to a global idle cpumask, which is read-only -+ * in the caller and is never released. The acquire / release semantics -+ * here are just used to make the cpumask is a trusted pointer in the -+ * caller. -+ */ -+} -+ -+struct scx_bpf_error_bstr_bufs { -+ u64 data[MAX_BPRINTF_VARARGS]; -+ char msg[SCX_EXIT_MSG_LEN]; -+}; -+ -+static DEFINE_PER_CPU(struct scx_bpf_error_bstr_bufs, scx_bpf_error_bstr_bufs); -+ -+/** -+ * scx_bpf_error_bstr - Indicate fatal error -+ * @fmt: error message format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops -+ * disabling. 
-+ */ -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data__sz) -+{ -+ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; -+ struct scx_bpf_error_bstr_bufs *bufs; -+ unsigned long flags; -+ int ret; -+ -+ local_irq_save(flags); -+ bufs = this_cpu_ptr(&scx_bpf_error_bstr_bufs); -+ -+ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || -+ (data__sz && !data)) { -+ scx_ops_error("invalid data=%p and data__sz=%u", -+ (void *)data, data__sz); -+ goto out_restore; -+ } -+ -+ ret = copy_from_kernel_nofault(bufs->data, data, data__sz); -+ if (ret) { -+ scx_ops_error("failed to read data fields (%d)", ret); -+ goto out_restore; -+ } -+ -+ ret = bpf_bprintf_prepare(fmt, UINT_MAX, bufs->data, data__sz / 8, -+ &bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("failed to format prepration (%d)", ret); -+ goto out_restore; -+ } -+ -+ ret = bstr_printf(bufs->msg, sizeof(bufs->msg), fmt, -+ bprintf_data.bin_args); -+ bpf_bprintf_cleanup(&bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("scx_ops_error(\"%s\", %p, %u) failed to format", -+ fmt, data, data__sz); -+ goto out_restore; -+ } -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_BPF, "%s", bufs->msg); -+out_restore: -+ local_irq_restore(flags); -+} -+ -+/** -+ * scx_bpf_destroy_dsq - Destroy a custom DSQ -+ * @dsq_id: DSQ to destroy -+ * -+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with -+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is -+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ -+ * which doesn't exist. Can be called from any online scx_ops operations. -+ */ -+void scx_bpf_destroy_dsq(u64 dsq_id) -+{ -+ destroy_dsq(dsq_id); -+} -+ -+/** -+ * scx_bpf_task_running - Is task currently running? -+ * @p: task of interest -+ */ -+bool scx_bpf_task_running(const struct task_struct *p) -+{ -+ return task_rq(p)->curr == p; -+} -+ -+/** -+ * scx_bpf_task_cpu - CPU a task is currently associated with -+ * @p: task of interest -+ */ -+s32 scx_bpf_task_cpu(const struct task_struct *p) -+{ -+ return task_cpu(p); -+} -+ -+/** -+ * scx_bpf_task_cgroup - Return the sched cgroup of a task -+ * @p: task of interest -+ * -+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with -+ * from the scheduler's POV. SCX operations should use this function to -+ * determine @p's current cgroup as, unlike following @p->cgroups, -+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all -+ * rq-locked operations. Can be called on the parameter tasks of rq-locked -+ * operations. The restriction guarantees that @p's rq is locked by the caller. -+ */ -+#ifdef CONFIG_CGROUP_SCHED -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) -+{ -+ struct task_group *tg = p->sched_task_group; -+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp; -+ -+ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) -+ goto out; -+ -+ /* -+ * A task_group may either be a cgroup or an autogroup. In the latter -+ * case, @tg->css.cgroup is %NULL. A task_group can't become the other -+ * kind once created. 
-+ */ -+ if (tg && tg->css.cgroup) -+ cgrp = tg->css.cgroup; -+ else -+ cgrp = &cgrp_dfl_root.cgrp; -+out: -+ cgroup_get(cgrp); -+ return cgrp; -+} -+#endif -+ -+BTF_SET8_START(scx_kfunc_ids_ops_only) -+BTF_ID_FLAGS(func, scx_bpf_kick_cpu) -+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -+BTF_SET8_END(scx_kfunc_ids_ops_only) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_ops_only = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_ops_only, -+}; -+ -+BTF_SET8_START(scx_kfunc_ids_any) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) -+#ifdef CONFIG_CGROUP_SCHED -+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) -+#endif -+BTF_SET8_END(scx_kfunc_ids_any) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_any = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_any, -+}; -+ -+__diag_pop(); -+ -+/* -+ * This can't be done from init_sched_ext_class() as register_btf_kfunc_id_set() -+ * needs most of the system to be up. -+ */ -+static int __init register_ext_kfuncs(void) -+{ -+ int ret; -+ -+ /* -+ * Some kfuncs are context-sensitive and can only be called from -+ * specific SCX ops. They are grouped into BTF sets accordingly. -+ * Unfortunately, BPF currently doesn't have a way of enforcing such -+ * restrictions. Eventually, the verifier should be able to enforce -+ * them. For now, register them the same and make each kfunc explicitly -+ * check using scx_kf_allowed(). -+ */ -+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_init)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_enqueue_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_cpu_release)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_ops_only)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_any)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, -+ &scx_kfunc_set_any))) { -+ pr_err("sched_ext: failed to register kfunc sets (%d)\n", ret); -+ return ret; -+ } -+ -+ return 0; -+} -+__initcall(register_ext_kfuncs); -diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h -new file mode 100644 -index 000000000..27248760f ---- /dev/null -+++ b/kernel/sched/ext.h -@@ -0,0 +1,266 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+enum scx_wake_flags { -+ /* expose select WF_* flags as enums */ -+ SCX_WAKE_EXEC = WF_EXEC, -+ SCX_WAKE_FORK = WF_FORK, -+ SCX_WAKE_TTWU = WF_TTWU, -+ SCX_WAKE_SYNC = WF_SYNC, -+}; -+ -+enum scx_enq_flags { -+ /* expose select ENQUEUE_* flags as enums */ -+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, -+ SCX_ENQ_HEAD = ENQUEUE_HEAD, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * Set the following to trigger preemption when calling -+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the -+ * current task is cleared to zero and the CPU is kicked into the -+ * scheduling path. Implies %SCX_ENQ_HEAD. -+ */ -+ SCX_ENQ_PREEMPT = 1LLU << 32, -+ -+ /* -+ * The task being enqueued was previously enqueued on the current CPU's -+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the -+ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was -+ * invoked in a ->cpu_release() callback, and the task is again -+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the -+ * task will not be scheduled on the CPU until at least the next invocation -+ * of the ->cpu_acquire() callback. -+ */ -+ SCX_ENQ_REENQ = 1LLU << 40, -+ -+ /* -+ * The task being enqueued is the only task available for the cpu. By -+ * default, ext core keeps executing such tasks but when -+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with -+ * %SCX_ENQ_LAST and %SCX_ENQ_LOCAL flags set. -+ * -+ * If the BPF scheduler wants to continue executing the task, -+ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. -+ * If the task gets queued on a different dsq or the BPF side, the BPF -+ * scheduler is responsible for triggering a follow-up scheduling event. -+ * Otherwise, Execution may stall. -+ */ -+ SCX_ENQ_LAST = 1LLU << 41, -+ -+ /* -+ * A hint indicating that it's advisable to enqueue the task on the -+ * local dsq of the currently selected CPU. Currently used by -+ * select_cpu_dfl() and together with %SCX_ENQ_LAST. -+ */ -+ SCX_ENQ_LOCAL = 1LLU << 42, -+ -+ /* high 8 bits are internal */ -+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, -+ -+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, -+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -+}; -+ -+enum scx_deq_flags { -+ /* expose select DEQUEUE_* flags as enums */ -+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * The generic core-sched layer decided to execute the task even though -+ * it hasn't been dispatched yet. Dequeue from the BPF side. 
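The comment on %SCX_ENQ_LAST above implies a specific pattern in ops.enqueue(); a hedged sketch of that pattern follows. scx_bpf_dispatch()'s (task, dsq, slice, flags) argument order and the SCX_DSQ_GLOBAL / SCX_SLICE_DFL constants are assumptions taken from the example tooling rather than from this hunk.

```c
/* Hypothetical ops.enqueue(): when the task is the last runnable one on the
 * CPU (SCX_ENQ_LAST, with SCX_OPS_ENQ_LAST selected), keep it on the local
 * DSQ so execution does not stall; everything else goes to a shared DSQ. */
void BPF_STRUCT_OPS(example_enqueue, struct task_struct *p, u64 enq_flags)
{
	if (enq_flags & SCX_ENQ_LAST) {
		scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags);
		return;
	}
	scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
}
```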
-+ */ -+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -+}; -+ -+enum scx_pick_idle_cpu_flags { -+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ -+}; -+ -+enum scx_kick_flags { -+ SCX_KICK_PREEMPT = 1LLU << 0, /* force scheduling on the CPU */ -+ SCX_KICK_WAIT = 1LLU << 1, /* wait for the CPU to be rescheduled */ -+}; -+ -+enum scx_tg_flags { -+ SCX_TG_ONLINE = 1U << 0, -+ SCX_TG_INITED = 1U << 1, -+}; -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+struct sched_enq_and_set_ctx { -+ struct task_struct *p; -+ int queue_flags; -+ bool queued; -+ bool running; -+}; -+ -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx); -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); -+ -+extern const struct sched_class ext_sched_class; -+extern const struct bpf_verifier_ops bpf_sched_ext_verifier_ops; -+extern const struct file_operations sched_ext_fops; -+extern unsigned long scx_watchdog_timeout; -+extern unsigned long scx_watchdog_timestamp; -+ -+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); -+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) -+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) -+ -+DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+ -+static inline bool task_on_scx(const struct task_struct *p) -+{ -+ return scx_enabled() && p->sched_class == &ext_sched_class; -+} -+ -+bool task_should_scx(struct task_struct *p); -+void scx_pre_fork(struct task_struct *p); -+int scx_fork(struct task_struct *p); -+void scx_post_fork(struct task_struct *p); -+void scx_cancel_fork(struct task_struct *p); -+int scx_check_setscheduler(struct task_struct *p, int policy); -+bool scx_can_stop_tick(struct rq *rq); -+void init_sched_ext_class(void); -+ -+__printf(2, 3) void scx_ops_error_kind(enum scx_exit_kind kind, -+ const char *fmt, ...); -+#define scx_ops_error(fmt, args...) \ -+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) -+ -+void __scx_notify_pick_next_task(struct rq *rq, -+ struct task_struct *p, -+ const struct sched_class *active); -+ -+static inline void scx_notify_pick_next_task(struct rq *rq, -+ struct task_struct *p, -+ const struct sched_class *active) -+{ -+ if (!scx_enabled()) -+ return; -+#ifdef CONFIG_SMP -+ /* -+ * Pairs with the smp_load_acquire() issued by a CPU in -+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a -+ * resched. 
-+ */ -+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -+#endif -+ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) -+ return; -+ __scx_notify_pick_next_task(rq, p, active); -+} -+ -+static inline void scx_notify_sched_tick(void) -+{ -+ unsigned long last_check; -+ -+ if (!scx_enabled()) -+ return; -+ -+ last_check = scx_watchdog_timestamp; -+ if (unlikely(time_after(jiffies, last_check + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "watchdog failed to check in for %u.%03us", -+ dur_ms / 1000, dur_ms % 1000); -+ } -+} -+ -+static inline const struct sched_class *next_active_class(const struct sched_class *class) -+{ -+ class++; -+ if (scx_switched_all() && class == &fair_sched_class) -+ class++; -+ if (!scx_enabled() && class == &ext_sched_class) -+ class++; -+ return class; -+} -+ -+#define for_active_class_range(class, _from, _to) \ -+ for (class = (_from); class != (_to); class = next_active_class(class)) -+ -+#define for_each_active_class(class) \ -+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest) -+ -+/* -+ * SCX requires a balance() call before every pick_next_task() call including -+ * when waking up from idle. -+ */ -+#define for_balance_class_range(class, prev_class, end_class) \ -+ for_active_class_range(class, (prev_class) > &ext_sched_class ? \ -+ &ext_sched_class : (prev_class), (end_class)) -+ -+#ifdef CONFIG_SCHED_CORE -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi); -+#endif -+ -+#else /* CONFIG_SCHED_CLASS_EXT */ -+ -+#define scx_enabled() false -+#define scx_switched_all() false -+ -+static inline bool task_on_scx(const struct task_struct *p) { return false; } -+static inline void scx_pre_fork(struct task_struct *p) {} -+static inline int scx_fork(struct task_struct *p) { return 0; } -+static inline void scx_post_fork(struct task_struct *p) {} -+static inline void scx_cancel_fork(struct task_struct *p) {} -+static inline int scx_check_setscheduler(struct task_struct *p, -+ int policy) { return 0; } -+static inline bool scx_can_stop_tick(struct rq *rq) { return true; } -+static inline void init_sched_ext_class(void) {} -+static inline void scx_notify_pick_next_task(struct rq *rq, -+ const struct task_struct *p, -+ const struct sched_class *active) {} -+static inline void scx_notify_sched_tick(void) {} -+ -+#define for_each_active_class for_each_class -+#define for_balance_class_range for_class_range -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ -+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -+void __scx_update_idle(struct rq *rq, bool idle); -+ -+static inline void scx_update_idle(struct rq *rq, bool idle) -+{ -+ if (scx_enabled()) -+ __scx_update_idle(rq, idle); -+} -+#else -+static inline void scx_update_idle(struct rq *rq, bool idle) {} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+#ifdef CONFIG_EXT_GROUP_SCHED -+int scx_tg_online(struct task_group *tg); -+void scx_tg_offline(struct task_group *tg); -+int scx_cgroup_can_attach(struct cgroup_taskset *tset); -+void scx_move_task(struct task_struct *p); -+void scx_cgroup_finish_attach(void); -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); -+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); -+#else /* CONFIG_EXT_GROUP_SCHED */ -+static inline int scx_tg_online(struct task_group *tg) { return 0; } -+static inline void scx_tg_offline(struct task_group *tg) {} -+static inline int 
scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -+static inline void scx_move_task(struct task_struct *p) {} -+static inline void scx_cgroup_finish_attach(void) {} -+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} -+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+#endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index fa9fff0f9..1ed9d351c 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -3785,7 +3785,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - } - } - --void reweight_task(struct task_struct *p, int prio) -+static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) - { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -8187,7 +8187,7 @@ static void check_preempt_wakeup(struct rq *rq, struct task_struct *p, int wake_ - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ -- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) -+ if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) - return; - - find_matching_se(&se, &pse); -@@ -12325,14 +12325,14 @@ void trigger_load_balance(struct rq *rq) - nohz_balancer_kick(rq); - } - --static void rq_online_fair(struct rq *rq) -+static void rq_online_fair(struct rq *rq, enum rq_onoff_reason reason) - { - update_sysctl(); - - update_runtime_enabled(rq); - } - --static void rq_offline_fair(struct rq *rq) -+static void rq_offline_fair(struct rq *rq, enum rq_onoff_reason reason) - { - update_sysctl(); - -@@ -13024,6 +13024,7 @@ DEFINE_SCHED_CLASS(fair) = { - .task_tick = task_tick_fair, - .task_fork = task_fork_fair, - -+ .reweight_task = reweight_task_fair, - .prio_changed = prio_changed_fair, - .switched_from = switched_from_fair, - .switched_to = switched_to_fair, -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 5007b25c5..b33cefeb4 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -408,11 +408,13 @@ static void check_preempt_curr_idle(struct rq *rq, struct task_struct *p, int fl - - static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) - { -+ scx_update_idle(rq, false); - } - - static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) - { - update_idle_core(rq); -+ scx_update_idle(rq, true); - schedstat_inc(rq->sched_goidle); - } - -diff --git a/kernel/sched/rt.c b/kernel/sched/rt.c -index 904dd8534..449a9f28d 100644 ---- a/kernel/sched/rt.c -+++ b/kernel/sched/rt.c -@@ -2481,7 +2481,7 @@ static void task_woken_rt(struct rq *rq, struct task_struct *p) - } - - /* Assumes rq->lock is held */ --static void rq_online_rt(struct rq *rq) -+static void rq_online_rt(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->rt.overloaded) - rt_set_overload(rq); -@@ -2492,7 +2492,7 @@ static void rq_online_rt(struct rq *rq) - } - - /* Assumes rq->lock is held */ --static void rq_offline_rt(struct rq *rq) -+static void rq_offline_rt(struct rq *rq, enum rq_onoff_reason reason) - { - if (rq->rt.overloaded) - rt_clear_overload(rq); -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 048462724..0b33d0117 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -182,9 +182,19 @@ static inline int idle_policy(int policy) - { - return policy == SCHED_IDLE; - } -+ -+static inline int normal_policy(int policy) -+{ 
-+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (policy == SCHED_EXT) -+ return true; -+#endif -+ return policy == SCHED_NORMAL; -+} -+ - static inline int fair_policy(int policy) - { -- return policy == SCHED_NORMAL || policy == SCHED_BATCH; -+ return normal_policy(policy) || policy == SCHED_BATCH; - } - - static inline int rt_policy(int policy) -@@ -232,6 +242,24 @@ static inline void update_avg(u64 *avg, u64 sample) - #define shr_bound(val, shift) \ - (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) - -+/* -+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are -+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it -+ * maps pretty well onto the shares value used by scheduler and the round-trip -+ * conversions preserve the original value over the entire range. -+ */ -+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) -+{ -+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); -+} -+ -+static inline unsigned long sched_weight_to_cgroup(unsigned long weight) -+{ -+ return clamp_t(unsigned long, -+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), -+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); -+} -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! - * -@@ -390,6 +418,11 @@ struct task_group { - struct rt_bandwidth rt_bandwidth; - #endif - -+#ifdef CONFIG_EXT_GROUP_SCHED -+ u32 scx_flags; /* SCX_TG_* */ -+ u32 scx_weight; -+#endif -+ - struct rcu_head rcu; - struct list_head list; - -@@ -445,6 +478,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) - return walk_tg_tree_from(&root_task_group, down, up, data); - } - -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? 
container_of(css, struct task_group, css) : NULL; -+} -+ - extern int tg_nop(struct task_group *tg, void *data); - - extern void free_fair_sched_group(struct task_group *tg); -@@ -490,6 +528,11 @@ extern void set_task_rq_fair(struct sched_entity *se, - static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } - #endif /* CONFIG_SMP */ -+#else /* CONFIG_FAIR_GROUP_SCHED */ -+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) -+{ -+ return 0; -+} - #endif /* CONFIG_FAIR_GROUP_SCHED */ - - #else /* CONFIG_CGROUP_SCHED */ -@@ -651,6 +694,28 @@ struct cfs_rq { - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+/* scx_rq->flags, protected by the rq lock */ -+enum scx_rq_flags { -+ SCX_RQ_CAN_STOP_TICK = 1 << 0, -+}; -+ -+struct scx_rq { -+ struct scx_dispatch_q local_dsq; -+ struct list_head watchdog_list; -+ unsigned long ops_qseq; -+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */ -+ u32 nr_running; -+ u32 flags; -+ bool cpu_released; -+ cpumask_var_t cpus_to_kick; -+ cpumask_var_t cpus_to_preempt; -+ cpumask_var_t cpus_to_wait; -+ unsigned long pnt_seq; -+ struct irq_work kick_cpus_irq_work; -+}; -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ - static inline int rt_bandwidth_enabled(void) - { - return sysctl_sched_rt_runtime >= 0; -@@ -998,6 +1063,9 @@ struct rq { - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct scx_rq scx; -+#endif - - #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this CPU: */ -@@ -2222,6 +2290,11 @@ extern const u32 sched_prio_to_wmult[40]; - - #define RETRY_TASK ((void *)-1UL) - -+enum rq_onoff_reason { -+ RQ_ONOFF_HOTPLUG, /* CPU is going on/offline */ -+ RQ_ONOFF_TOPOLOGY, /* sched domain topology update */ -+}; -+ - struct affinity_context { - const struct cpumask *new_mask; - struct cpumask *user_mask; -@@ -2258,8 +2331,8 @@ struct sched_class { - - void (*set_cpus_allowed)(struct task_struct *p, struct affinity_context *ctx); - -- void (*rq_online)(struct rq *rq); -- void (*rq_offline)(struct rq *rq); -+ void (*rq_online)(struct rq *rq, enum rq_onoff_reason reason); -+ void (*rq_offline)(struct rq *rq, enum rq_onoff_reason reason); - - struct rq *(*find_lock_rq)(struct task_struct *p, struct rq *rq); - #endif -@@ -2273,8 +2346,11 @@ struct sched_class { - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. 
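The sched_weight_from_cgroup()/sched_weight_to_cgroup() helpers added earlier in this hunk claim that the round-trip conversion preserves cgroup weights across the whole 1..10000 range; that can be spot-checked with a few lines of standalone userspace arithmetic (DIV_ROUND_CLOSEST open-coded for positive values, clamping omitted).

```c
#include <assert.h>

#define CGROUP_WEIGHT_DFL 100UL

static unsigned long from_cgroup(unsigned long w)
{
	return (w * 1024 + CGROUP_WEIGHT_DFL / 2) / CGROUP_WEIGHT_DFL;
}

static unsigned long to_cgroup(unsigned long s)
{
	return (s * CGROUP_WEIGHT_DFL + 512) / 1024;
}

int main(void)
{
	assert(from_cgroup(100) == 1024);		/* default weight -> default share */
	assert(to_cgroup(from_cgroup(1)) == 1);		/* low end survives the round trip */
	assert(to_cgroup(from_cgroup(10000)) == 10000);	/* high end does too */
	return 0;
}
```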
- */ -+ void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); -+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task, -+ int newprio); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - -@@ -2432,7 +2508,7 @@ extern void init_sched_dl_class(void); - extern void init_sched_rt_class(void); - extern void init_sched_fair_class(void); - --extern void reweight_task(struct task_struct *p, int prio); -+extern void __setscheduler_prio(struct task_struct *p, int prio); - - extern void resched_curr(struct rq *rq); - extern void resched_cpu(int cpu); -@@ -2513,6 +2589,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); - extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - -+extern void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class); -+extern void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio); -+ - extern void check_preempt_curr(struct rq *rq, struct task_struct *p, int flags); - - #ifdef CONFIG_PREEMPT_RT -@@ -2794,8 +2876,8 @@ static inline void double_rq_unlock(struct rq *rq1, struct rq *rq2) - raw_spin_rq_unlock(rq1); - } - --extern void set_rq_online (struct rq *rq); --extern void set_rq_offline(struct rq *rq); -+extern void set_rq_online (struct rq *rq, enum rq_onoff_reason reason); -+extern void set_rq_offline(struct rq *rq, enum rq_onoff_reason reason); - extern bool sched_smp_initialized; - - #else /* CONFIG_SMP */ -@@ -3528,4 +3610,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } - extern u64 avg_vruntime(struct cfs_rq *cfs_rq); - extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); - -+#ifdef CONFIG_CGROUP_SCHED -+enum cpu_cftype_id { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ CPU_CFTYPE_WEIGHT, -+ CPU_CFTYPE_WEIGHT_NICE, -+ CPU_CFTYPE_IDLE, -+#endif -+#ifdef CONFIG_CFS_BANDWIDTH -+ CPU_CFTYPE_MAX, -+ CPU_CFTYPE_MAX_BURST, -+#endif -+#ifdef CONFIG_UCLAMP_TASK_GROUP -+ CPU_CFTYPE_UCLAMP_MIN, -+ CPU_CFTYPE_UCLAMP_MAX, -+#endif -+ CPU_CFTYPE_CNT, -+}; -+ -+extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#include "ext.h" -+ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/kernel/sched/topology.c b/kernel/sched/topology.c -index 423d08947..2adf6a0fb 100644 ---- a/kernel/sched/topology.c -+++ b/kernel/sched/topology.c -@@ -495,7 +495,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - old_rd = rq->rd; - - if (cpumask_test_cpu(rq->cpu, old_rd->online)) -- set_rq_offline(rq); -+ set_rq_offline(rq, RQ_ONOFF_TOPOLOGY); - - cpumask_clear_cpu(rq->cpu, old_rd->span); - -@@ -513,7 +513,7 @@ void rq_attach_root(struct rq *rq, struct root_domain *rd) - - cpumask_set_cpu(rq->cpu, rd->span); - if (cpumask_test_cpu(rq->cpu, cpu_active_mask)) -- set_rq_online(rq); -+ set_rq_online(rq, RQ_ONOFF_TOPOLOGY); - - rq_unlock_irqrestore(rq, &rf); - -diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 83471e815..6e667c445 100644 ---- a/lib/dump_stack.c -+++ b/lib/dump_stack.c -@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) - - print_worker_info(log_lvl, current); - print_stop_info(log_lvl, current); -+ 
print_scx_info(log_lvl, current); - } - - /** -diff --git a/tools/Makefile b/tools/Makefile -index 37e9f6804..8021267f7 100644 ---- a/tools/Makefile -+++ b/tools/Makefile -@@ -29,6 +29,7 @@ help: - @echo ' pci - PCI tools' - @echo ' perf - Linux performance measurement and analysis tool' - @echo ' selftests - various kernel selftests' -+ @echo ' sched_ext - sched_ext example schedulers' - @echo ' bootconfig - boot config tool' - @echo ' spi - spi tools' - @echo ' tmon - thermal monitoring and tuning tool' -@@ -92,6 +93,9 @@ perf: FORCE - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= - -+sched_ext: FORCE -+ $(call descend,sched_ext) -+ - selftests: FORCE - $(call descend,testing/$@) - -@@ -185,6 +189,9 @@ perf_clean: - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean - -+sched_ext_clean: -+ $(call descend,sched_ext,clean) -+ - selftests_clean: - $(call descend,testing/$(@:_clean=),clean) - -@@ -214,6 +221,7 @@ clean: acpi_clean cgroup_clean counter_clean cpupower_clean hv_clean firewire_cl - mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ - freefall_clean build_clean libbpf_clean libsubcmd_clean \ - gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ -- intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean -+ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ -+ sched_ext_clean - - .PHONY: FORCE -diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore -new file mode 100644 -index 000000000..215ed36b2 ---- /dev/null -+++ b/tools/sched_ext/.gitignore -@@ -0,0 +1,10 @@ -+scx_simple -+scx_qmap -+scx_central -+scx_pair -+scx_flatcg -+scx_userland -+*.skel.h -+*.subskel.h -+/tools/ -+build/ -diff --git a/tools/sched_ext/Kconfig b/tools/sched_ext/Kconfig -new file mode 100644 -index 000000000..6543fcf19 ---- /dev/null -+++ b/tools/sched_ext/Kconfig -@@ -0,0 +1,9 @@ -+CONFIG_BPF=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_BPF_JIT=y -+CONFIG_DEBUG_INFO_BTF=y -+CONFIG_BPF_JIT_ALWAYS_ON=y -+CONFIG_BPF_JIT_DEFAULT_ON=y -+CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+CONFIG_PAHOLE_HAS_BTF_TAG=y -diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile -new file mode 100644 -index 000000000..7ea754b7d ---- /dev/null -+++ b/tools/sched_ext/Makefile -@@ -0,0 +1,301 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+include ../build/Build.include -+include ../scripts/Makefile.arch -+include ../scripts/Makefile.include -+ -+all: all_targets -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi -+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu -+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl -+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu -+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu -+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu -+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu -+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu -+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu -+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) -+ -+ifeq ($(CROSS_COMPILE),) -+ifeq ($(CLANG_TARGET_FLAGS),) -+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk -+else -+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) -+endif # CLANG_TARGET_FLAGS -+else -+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) -+endif # CROSS_COMPILE -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := $(CROSS_COMPILE)gcc -+endif # LLVM -+ -+CURDIR := $(abspath .) -+TOOLSDIR := $(abspath ..) -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(abspath ../../include/generated) -+GENHDR := $(GENDIR)/autoconf.h -+ -+ifeq ($(O),) -+OUTPUT_DIR := $(CURDIR)/build -+else -+OUTPUT_DIR := $(O)/build -+endif # O -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BINDIR := $(OUTPUT_DIR)/bin -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+ifneq ($(CROSS_COMPILE),) -+HOST_BUILD_DIR := $(OBJ_DIR)/host -+HOST_OUTPUT_DIR := host-tools -+HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include -+else -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+HOST_INCLUDE_DIR := $(INCLUDE_DIR) -+endif -+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a -+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -+DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool -+ -+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ -+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ -+ ../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -+ -+CARGOFLAGS := --release --target-dir $(OUTPUT_DIR) -+ifneq ($(CARGO_OFFLINE),) -+CARGOFLAGS += --offline -+endif -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h scx_common.bpf.h \ -+ user_exit_info.h ravg.bpf.h ravg_impl.bpf.h \ -+ | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call 
msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+SCX_COMMON_DEPS := scx_common.h user_exit_info.h | $(BINDIR) -+ -+################ -+# C schedulers # -+################ -+c-sched-targets = scx_simple scx_qmap scx_central scx_pair scx_flatcg \ -+ scx_userland scx_nest -+ -+$(addprefix $(BINDIR)/,$(c-sched-targets)): \ -+ $(BINDIR)/%: \ -+ $(filter-out %.bpf.c,%.c) \ -+ $(INCLUDE_DIR)/%.skel.h \ -+ $(SCX_COMMON_DEPS) -+ $(eval sched=$(notdir $@)) -+ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o -+ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) -+$(c-sched-targets): %: $(BINDIR)/% -+ -+ -+################### -+# Rust schedulers # -+################### -+rust-sched-targets := scx_rusty scx_layered -+ -+# Separate build target that is available for build systems to use to fetch -+# dependencies in a separate step from building. This allows the scheduler -+# to be compiled without network access. -+# -+# If the regular rust scheduler Make target (e.g. scx_rusty) is invoked without -+# CARGO_OFFLINE=1 (e.g. if building locally), then cargo build will download -+# all of the necessary dependencies, and the deps target can be skipped. -+$(addsuffix _deps,$(rust-sched-targets)): -+ $(eval sched=$(@:_deps=)) -+ $(Q)cargo fetch --manifest-path=$(sched)/Cargo.toml -+ -+$(rust-sched-targets): %: $(INCLUDE_DIR)/vmlinux.h $(SCX_COMMON_DEPS) -+ $(eval export RUSTFLAGS = -C link-args=-lzstd -C link-args=-lz -C link-args=-lelf -L $(BPFOBJ_DIR)) -+ $(eval export SCX_RUST_CLANG = $(CLANG)) -+ $(eval export SCX_RUST_BPF_CFLAGS= $(BPF_CFLAGS)) -+ $(eval sched=$(notdir $@)) -+ $(Q)cargo build --manifest-path=$(sched)/Cargo.toml $(CARGOFLAGS) -+ $(Q)cp $(OUTPUT_DIR)/release/$(sched) $(BINDIR)/$@ -+ -+install: all -+ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ -+ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ -+ -+clean: -+ $(foreach sched,$(rust-sched-targets),cargo clean --manifest-path=$(sched)/Cargo.toml;) -+ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.skel.h *.subskel.h -+ rm -f $(c-sched-targets) -+ -+help: -+ @echo 'Building targets' -+ @echo '================' -+ @echo '' -+ @echo ' all - Compile all schedulers' -+ @echo '' -+ @echo 'Alternatively, you may compile individual schedulers:' -+ @echo '' -+ @printf ' %s\n' $(c-sched-targets) $(rust-sched-targets) -+ @echo '' -+ @echo 'For any scheduler build target, you may specify an alternative' -+ @echo 'build output path with the O= environment variable. For example:' -+ @echo '' -+ @echo ' O=/tmp/sched_ext make all' -+ @echo '' -+ @echo 'will compile all schedulers, and emit the build artifacts to' -+ @echo '/tmp/sched_ext/build.' 
-+ @echo '' -+ @echo '' -+ @echo 'Rust scheduler targets' -+ @echo '======================' -+ @echo '' -+ @printf ' %s\n' $(rust-sched-targets) -+ @printf ' %s_deps\n' $(rust-sched-targets) -+ @echo '' -+ @echo 'For any rust schedulers built with cargo, you can specify' -+ @echo 'CARGO_OFFLINE=1 to ensure the build portion does not access the' -+ @echo 'network (e.g. if the scheduler is being packaged).' -+ @echo '' -+ @echo 'For such use cases, the build workflow will look something like this:' -+ @echo '' -+ @echo ' make scx_rusty_deps' -+ @echo ' CARGO_OFFLINE=1 make scx_rusty' -+ @echo '' -+ @echo 'If network access during build is allowed, you can just make scx_rusty' -+ @echo 'directly without CARGO_OFFLINE, and dependencies will be downloaded' -+ @echo 'during the build step.' -+ @echo '' -+ @echo '' -+ @echo 'Installing targets' -+ @echo '==================' -+ @echo '' -+ @echo ' install - Compile and install all schedulers to /usr/bin.' -+ @echo ' You may specify the DESTDIR= environment variable' -+ @echo ' to indicate a prefix for /usr/bin. For example:' -+ @echo '' -+ @echo ' DESTDIR=/tmp/sched_ext make install' -+ @echo '' -+ @echo ' will build the schedulers in CWD/build, and' -+ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' -+ @echo '' -+ @echo '' -+ @echo 'Cleaning targets' -+ @echo '================' -+ @echo '' -+ @echo ' clean - Remove all generated files, including intermediate' -+ @echo ' rust files for rust schedulers.' -+ -+all_targets: $(c-sched-targets) $(rust-sched-targets) -+ -+.PHONY: all all_targets $(c-sched-targets) $(rust-sched-targets) clean help -+ -+# delete failed targets -+.DELETE_ON_ERROR: -+ -+# keep intermediate (.skel.h, .bpf.o, etc) targets -+.SECONDARY: -diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md -new file mode 100644 -index 000000000..8e7194ada ---- /dev/null -+++ b/tools/sched_ext/README.md -@@ -0,0 +1,403 @@ -+SCHED_EXT EXAMPLE SCHEDULERS -+============================ -+ -+# Introduction -+ -+This directory contains a number of example sched_ext schedulers. These -+schedulers are meant to provide examples of different types of schedulers -+that can be built using sched_ext, and illustrate how various features of -+sched_ext can be used. -+ -+Some of the examples are performant, production-ready schedulers. That is, for -+the correct workload and with the correct tuning, they may be deployed in a -+production environment with acceptable or possibly even improved performance. -+Others are just examples that in practice, would not provide acceptable -+performance (though they could be improved to get there). -+ -+This README will describe these example schedulers, including describing the -+types of workloads or scenarios they're designed to accommodate, and whether or -+not they're production ready. For more details on any of these schedulers, -+please see the header comment in their .bpf.c file. -+ -+ -+# Compiling the examples -+ -+There are a few toolchain dependencies for compiling the example schedulers. -+ -+## Toolchain dependencies -+ -+1. clang >= 16.0.0 -+ -+The schedulers are BPF programs, and therefore must be compiled with clang. gcc -+is actively working on adding a BPF backend compiler as well, but are still -+missing some features such as BTF type tags which are necessary for using -+kptrs. -+ -+2. pahole >= 1.25 -+ -+You may need pahole in order to generate BTF from DWARF. -+ -+3. rust >= 1.70.0 -+ -+Rust schedulers uses features present in the rust toolchain >= 1.70.0. 
You
-+should be able to use the stable build from rustup, but if that doesn't
-+work, try using the rustup nightly build.
-+
-+There are other requirements as well, such as make, but these are the main /
-+non-trivial ones.
-+
-+## Compiling the kernel
-+
-+In order to run a sched_ext scheduler, you'll have to run a kernel compiled
-+with the patches in this repository, and with a minimum set of necessary
-+Kconfig options:
-+
-+```
-+CONFIG_BPF=y
-+CONFIG_SCHED_CLASS_EXT=y
-+CONFIG_BPF_SYSCALL=y
-+CONFIG_BPF_JIT=y
-+CONFIG_DEBUG_INFO_BTF=y
-+```
-+
-+It's also recommended that you include the following Kconfig options:
-+
-+```
-+CONFIG_BPF_JIT_ALWAYS_ON=y
-+CONFIG_BPF_JIT_DEFAULT_ON=y
-+CONFIG_PAHOLE_HAS_SPLIT_BTF=y
-+CONFIG_PAHOLE_HAS_BTF_TAG=y
-+```
-+
-+There is a `Kconfig` file in this directory whose contents you can append to
-+your local `.config` file, as long as there are no conflicts with any existing
-+options in the file.
-+
-+## Getting a vmlinux.h file
-+
-+You may notice that most of the example schedulers include a "vmlinux.h" file.
-+This is a large, auto-generated header file that contains all of the types
-+defined in some vmlinux binary that was compiled with
-+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig
-+options specified above).
-+
-+The header file is created using `bpftool`, by passing it a vmlinux binary
-+compiled with BTF as follows:
-+
-+```bash
-+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h
-+```
-+
-+`bpftool` analyzes all of the BTF encodings in the binary, and produces a
-+header file that can be included by BPF programs to access those types. For
-+example, using vmlinux.h allows a scheduler to access fields defined directly
-+in vmlinux as follows:
-+
-+```c
-+#include "vmlinux.h"
-+// vmlinux.h is also implicitly included by scx_common.bpf.h.
-+#include "scx_common.bpf.h"
-+
-+/*
-+ * vmlinux.h provides definitions for struct task_struct and
-+ * struct scx_enable_args.
-+ */
-+void BPF_STRUCT_OPS(example_enable, struct task_struct *p,
-+		    struct scx_enable_args *args)
-+{
-+	bpf_printk("Task %s enabled in example scheduler", p->comm);
-+}
-+
-+// vmlinux.h provides the definition for struct sched_ext_ops.
-+SEC(".struct_ops.link")
-+struct sched_ext_ops example_ops = {
-+	.enable	= (void *)example_enable,
-+	.name	= "example",
-+};
-+```
-+
-+The scheduler build system will generate this vmlinux.h file as part of the
-+scheduler build pipeline. It looks for a vmlinux file in the following
-+dependency order:
-+
-+1. If the O= environment variable is defined, at `$O/vmlinux`
-+2. If the KBUILD_OUTPUT= environment variable is defined, at
-+   `$KBUILD_OUTPUT/vmlinux`
-+3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're
-+   compiling the schedulers)
-+4. `/sys/kernel/btf/vmlinux`
-+5. `/boot/vmlinux-$(uname -r)`
-+
-+In other words, if you have compiled a kernel in your local repo, its vmlinux
-+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of
-+the kernel you're currently running on. This means that if you're running on a
-+kernel with sched_ext support, you may not need to compile a local kernel at
-+all.
-+
-+### Aside on CO-RE
-+
-+One of the cooler features of BPF is that it supports
-+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run
-+Everywhere).
This feature allows you to reference fields inside of structs with -+types defined internal to the kernel, and not have to recompile if you load the -+BPF program on a different kernel with the field at a different offset. In our -+example above, we print out a task name with `p->comm`. CO-RE would perform -+relocations for that access when the program is loaded to ensure that it's -+referencing the correct offset for the currently running kernel. -+ -+## Compiling the schedulers -+ -+Once you have your toolchain setup, and a vmlinux that can be used to generate -+a full vmlinux.h file, you can compile the schedulers using `make`: -+ -+```bash -+$ make -j($nproc) -+``` -+ -+# Schedulers -+ -+This section lists, in alphabetical order, all of the current example -+schedulers. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_simple -+ -+### Overview -+ -+A simple scheduler that provides an example of a minimal sched_ext -+scheduler. scx_simple can be run in either global weighted vtime mode, or -+FIFO mode. -+ -+### Typical Use Case -+ -+Though very simple, this scheduler should perform reasonably well on -+single-socket CPUs with a uniform L3 cache topology. Note that while running in -+global FIFO mode may work well for some workloads, saturating threads can -+easily drown out inactive ones. -+ -+### Production Ready? -+ -+This scheduler could be used in a production environment, assuming the hardware -+constraints enumerated above, and assuming the workload can accommodate a -+simple scheduling policy. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_qmap -+ -+### Overview -+ -+Another simple, yet slightly more complex scheduler that provides an example of -+a basic weighted FIFO queuing policy. It also provides examples of some common -+useful BPF features, such as sleepable per-task storage allocation in the -+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -+enqueue tasks. It also illustrates how core-sched support could be implemented. -+ -+### Typical Use Case -+ -+Purely used to illustrate sched_ext features. -+ -+### Production Ready? -+ -+No -+ -+-------------------------------------------------------------------------------- -+ -+## scx_central -+ -+### Overview -+ -+A "central" scheduler where scheduling decisions are made from a single CPU. -+This scheduler illustrates how scheduling decisions can be dispatched from a -+single CPU, allowing other cores to run with infinite slices, without timer -+ticks, and without having to incur the overhead of making scheduling decisions. -+ -+### Typical Use Case -+ -+This scheduler could theoretically be useful for any workload that benefits -+from minimizing scheduling overhead and timer ticks. An example of where this -+could be particularly useful is running VMs, where running with infinite slices -+and no timer ticks allows the VM to avoid unnecessary expensive vmexits. -+ -+### Production Ready? -+ -+Not yet. While tasks are run with an infinite slice (SCX_SLICE_INF), they're -+preempted every 20ms in a timer callback. The scheduler also puts the core -+schedling logic inside of the central / scheduling CPU's ops.dispatch() path, -+and does not yet have any kind of priority mechanism. 
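A hedged sketch of the kick described in the scx_central overview above: a worker CPU with nothing to run nudges the central CPU so it makes the next round of decisions. `scx_bpf_kick_cpu()` and `SCX_KICK_PREEMPT` are the kfunc and flag added by this patch; the `central_cpu` variable and the `ops.dispatch()` signature are assumptions for illustration only.

```c
/* Hypothetical fragment of a central-style ops.dispatch(): non-central CPUs
 * do not make decisions themselves, they just kick the central CPU. */
static s32 central_cpu;	/* assumed to be set from userspace at load time */

void BPF_STRUCT_OPS(example_dispatch, s32 cpu, struct task_struct *prev)
{
	if (cpu != central_cpu) {
		scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT);
		return;
	}
	/* the central CPU would pull tasks from the shared queue here */
}
```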
-+ -+-------------------------------------------------------------------------------- -+ -+## scx_pair -+ -+### Overview -+ -+A sibling scheduler which ensures that tasks will only ever be co-located on a -+physical core if they're in the same cgroup. It illustrates how a scheduling -+policy could be implemented to mitigate CPU bugs, such as L1TF, and also shows -+how some useful kfuncs such as `scx_bpf_kick_cpu()` can be utilized. -+ -+### Typical Use Case -+ -+While this scheduler is only meant to be used to illustrate certain sched_ext -+features, with a bit more work (e.g. by adding some form of priority handling -+inside and across cgroups), it could have been used as a way to quickly -+mitigate L1TF before core scheduling was implemented and rolled out. -+ -+### Production Ready? -+ -+No -+ -+-------------------------------------------------------------------------------- -+ -+## scx_flatcg -+ -+### Overview -+ -+A flattened cgroup hierarchy scheduler. This scheduler implements hierarchical -+weight-based cgroup CPU control by flattening the cgroup hierarchy into a -+single layer, by compounding the active weight share at each level. The effect -+of this is a much more performant CPU controller, which does not need to -+descend down cgroup trees in order to properly compute a cgroup's share. -+ -+### Typical Use Case -+ -+This scheduler could be useful for any typical workload requiring a CPU -+controller, but which cannot tolerate the higher overheads of the fair CPU -+controller. -+ -+### Production Ready? -+ -+Yes, though the scheduler (currently) does not adequately accommodate -+thundering herds of cgroups. If, for example, many cgroups which are nested -+behind a low-priority cgroup were to wake up around the same time, they may be -+able to consume more CPU cycles than they are entitled to. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_userland -+ -+### Overview -+ -+A simple weighted vtime scheduler where all scheduling decisions take place in -+user space. This is in contrast to Rusty, where load balancing lives in user -+space, but scheduling decisions are still made in the kernel. -+ -+### Typical Use Case -+ -+There are many advantages to writing schedulers in user space. For example, you -+can use a debugger, you can write the scheduler in Rust, and you can use data -+structures bundled with your favorite library. -+ -+On the other hand, user space scheduling can be hard to get right. You can -+potentially deadlock due to not scheduling a task that's required for the -+scheduler itself to make forward progress (though the sched_ext watchdog will -+protect the system by unloading your scheduler after a timeout if that -+happens). You also have to bootstrap some communication protocol between the -+kernel and user space. -+ -+A more robust solution to this would be building a user space scheduling -+framework that abstracts much of this complexity away from you. -+ -+### Production Ready? -+ -+No. This scheduler uses an ordered list for vtime scheduling, and is stricly -+less performant than just using something like `scx_simple`. It is purely -+meant to illustrate that it's possible to build a user space scheduler on -+top of sched_ext. -+ -+-------------------------------------------------------------------------------- -+ -+## scx_rusty -+ -+### Overview -+ -+A multi-domain, BPF / user space hybrid scheduler. 
The BPF portion of the -+scheduler does a simple round robin in each domain, and the user space portion -+(written in Rust) calculates the load factor of each domain, and informs BPF of -+how tasks should be load balanced accordingly. -+ -+### Typical Use Case -+ -+Rusty is designed to be flexible, and accommodate different architectures and -+workloads. Various load balancing thresholds (e.g. greediness, frequenty, etc), -+as well as how Rusty should partition the system into scheduling domains, can -+be tuned to achieve the optimal configuration for any given system or workload. -+ -+### Production Ready? -+ -+Yes. If tuned correctly, rusty should be performant across various CPU -+architectures and workloads. Rusty by default creates a separate scheduling -+domain per-LLC, so its default configuration may be performant as well. -+ -+That said, you may run into an issue with infeasible weights, where a task with -+a very high weight may cause the scheduler to incorrectly leave cores idle -+because it thinks they're necessary to accommodate the compute for a single -+task. This can also happen in CFS, and should soon be addressed for rusty. -+ -+-------------------------------------------------------------------------------- -+ -+# Troubleshooting -+ -+There are a number of common issues that you may run into when building the -+schedulers. We'll go over some of the common ones here. -+ -+## Build Failures -+ -+### Old version of clang -+ -+``` -+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ ^~~~~~~~~~~~~~~~~~~~ -+1 error generated. -+``` -+ -+This means you built the kernel or the schedulers with an older version of -+clang than what's supported (i.e. older than 16.0.0). To remediate this: -+ -+1. `which clang` to make sure you're using a sufficiently new version of clang. -+ -+2. `make fullclean` in the root path of the repository, and rebuild the kernel -+ and schedulers. -+ -+3. Rebuild the kernel, and then your example schedulers. -+ -+The schedulers are also cleaned if you invoke `make mrproper` in the root -+directory of the tree. -+ -+### Stale kernel build / incomplete vmlinux.h file -+ -+As described above, you'll need a `vmlinux.h` file that was generated from a -+vmlinux built with BTF, and with sched_ext support enabled. If you don't, -+you'll see errors such as the following which indicate that a type being -+referenced in a scheduler is unknown: -+ -+``` -+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' -+ -+const struct scx_exit_info *ei) -+ -+^ -+``` -+ -+In order to resolve this, please follow the steps above in -+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your -+schedulers are using a vmlinux.h file that includes the requisite types. -+ -+## Misc -+ -+### llvm: [OFF] -+ -+You may see the following output when building the schedulers: -+ -+``` -+Auto-detecting system features: -+... clang-bpf-co-re: [ on ] -+... llvm: [ OFF ] -+... libcap: [ on ] -+... libbfd: [ on ] -+``` -+ -+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. 
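To tie the build pipeline described above together, here is a minimal, hypothetical userspace loader for one of the generated skeletons. The `scx_simple__*()` functions follow libbpf's skeleton naming convention and `maps.simple_ops` is an assumed name for the struct_ops map; error handling in the real tools is more thorough.

```c
#include <bpf/libbpf.h>
#include "scx_simple.skel.h"

int main(void)
{
	struct scx_simple *skel = scx_simple__open_and_load();
	struct bpf_link *link;

	if (!skel)
		return 1;

	/* attaching the struct_ops map is what actually enables the scheduler */
	link = bpf_map__attach_struct_ops(skel->maps.simple_ops);
	if (!link) {
		scx_simple__destroy(skel);
		return 1;
	}

	/* ... run until told to stop ... */

	bpf_link__destroy(link);
	scx_simple__destroy(skel);
	return 0;
}
```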
-diff --git a/tools/sched_ext/gnu/stubs.h b/tools/sched_ext/gnu/stubs.h -new file mode 100644 -index 000000000..719225b16 ---- /dev/null -+++ b/tools/sched_ext/gnu/stubs.h -@@ -0,0 +1 @@ -+/* dummy .h to trick /usr/include/features.h to work with 'clang -target bpf' */ -diff --git a/tools/sched_ext/ravg.bpf.h b/tools/sched_ext/ravg.bpf.h -new file mode 100644 -index 000000000..a233d85d0 ---- /dev/null -+++ b/tools/sched_ext/ravg.bpf.h -@@ -0,0 +1,42 @@ -+#ifndef __SCX_RAVG_BPF_H__ -+#define __SCX_RAVG_BPF_H__ -+ -+/* -+ * Running average helpers to be used in BPF progs. Assumes vmlinux.h has -+ * already been included. -+ */ -+enum ravg_consts { -+ RAVG_VAL_BITS = 44, /* input values are 44bit */ -+ RAVG_FRAC_BITS = 20, /* 1048576 is 1.0 */ -+}; -+ -+/* -+ * Running avg mechanism. Accumulates values between 0 and RAVG_MAX_VAL in -+ * arbitrary time intervals. The accumulated values are halved every half_life -+ * with each period starting when the current time % half_life is 0. Zeroing is -+ * enough for initialization. -+ * -+ * See ravg_accumulate() and ravg_read() for more details. -+ */ -+struct ravg_data { -+ /* current value */ -+ u64 val; -+ -+ /* -+ * The timestamp of @val. The latest completed seq #: -+ * -+ * (val_at / half_life) - 1 -+ */ -+ u64 val_at; -+ -+ /* running avg as of the latest completed seq */ -+ u64 old; -+ -+ /* -+ * Accumulated value of the current period. Input value is 48bits and we -+ * normalize half-life to 16bit, so it should fit in an u64. -+ */ -+ u64 cur; -+}; -+ -+#endif /* __SCX_RAVG_BPF_H__ */ -diff --git a/tools/sched_ext/ravg_impl.bpf.h b/tools/sched_ext/ravg_impl.bpf.h -new file mode 100644 -index 000000000..4922a3e68 ---- /dev/null -+++ b/tools/sched_ext/ravg_impl.bpf.h -@@ -0,0 +1,358 @@ -+/* to be included in the main bpf.c file */ -+#include "ravg.bpf.h" -+ -+#define RAVG_FN_ATTRS inline __attribute__((unused, always_inline)) -+ -+static RAVG_FN_ATTRS void ravg_add(u64 *sum, u64 addend) -+{ -+ u64 new = *sum + addend; -+ -+ if (new >= *sum) -+ *sum = new; -+ else -+ *sum = -1; -+} -+ -+static RAVG_FN_ATTRS u64 ravg_decay(u64 v, u32 shift) -+{ -+ if (shift >= 64) -+ return 0; -+ else -+ return v >> shift; -+} -+ -+static RAVG_FN_ATTRS u32 ravg_normalize_dur(u32 dur, u32 half_life) -+{ -+ if (dur < half_life) -+ return (((u64)dur << RAVG_FRAC_BITS) + half_life - 1) / -+ half_life; -+ else -+ return 1 << RAVG_FRAC_BITS; -+} -+ -+/* -+ * Pre-computed decayed full-period values. This is quicker and keeps the bpf -+ * verifier happy by removing the need for looping. -+ * -+ * [0] = ravg_decay(1 << RAVG_FRAC_BITS, 1) -+ * [1] = [0] + ravg_decay(1 << RAVG_FRAC_BITS, 2) -+ * [2] = [1] + ravg_decay(1 << RAVG_FRAC_BITS, 3) -+ * ... -+ */ -+static u64 ravg_full_sum[] = { -+ 524288, 786432, 917504, 983040, -+ 1015808, 1032192, 1040384, 1044480, -+ 1046528, 1047552, 1048064, 1048320, -+ 1048448, 1048512, 1048544, 1048560, -+ 1048568, 1048572, 1048574, 1048575, -+ /* the same from here on */ -+}; -+ -+static const int ravg_full_sum_len = sizeof(ravg_full_sum) / sizeof(ravg_full_sum[0]); -+ -+/** -+ * ravg_accumulate - Accumulate a new value -+ * @rd: ravg_data to accumulate into -+ * @new_val: new value -+ * @now: current timestamp -+ * @half_life: decay period, must be the same across calls -+ * -+ * The current value is changing to @val at @now. Accumulate accordingly. 
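The precomputed ravg_full_sum[] table above is just the cumulative sum of one full-period value decayed by successive half-lives in RAVG_FRAC_BITS fixed point (1.0 == 1048576); a standalone userspace check of the first few entries:

```c
#include <assert.h>
#include <stdint.h>

#define RAVG_FRAC_BITS 20

int main(void)
{
	const uint64_t expect[] = { 524288, 786432, 917504, 983040 };
	uint64_t sum = 0, one = 1ULL << RAVG_FRAC_BITS;

	for (int n = 1; n <= 4; n++) {
		sum += one >> n;	/* contribution of a period n half-lives old */
		assert(sum == expect[n - 1]);
	}
	return 0;
}
```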
-+ */ -+static RAVG_FN_ATTRS void ravg_accumulate(struct ravg_data *rd, u64 new_val, u64 now, -+ u32 half_life) -+{ -+ u32 cur_seq, val_seq, seq_delta; -+ -+ /* -+ * It may be difficult for the caller to guarantee monotonic progress if -+ * multiple CPUs accumulate to the same ravg_data. Handle @now being in -+ * the past of @rd->val_at. -+ */ -+ if (now < rd->val_at) -+ now = rd->val_at; -+ -+ cur_seq = now / half_life; -+ val_seq = rd->val_at / half_life; -+ seq_delta = cur_seq - val_seq; -+ -+ /* -+ * Decay ->old and fold ->cur into it. -+ * -+ * @end -+ * v -+ * timeline |---------|---------|---------|---------|---------| -+ * seq delta 4 3 2 1 0 -+ * seq ->seq cur_seq -+ * val ->old ->cur ^ -+ * | | | -+ * \---------+------------------/ -+ */ -+ if (seq_delta > 0) { -+ /* decay ->old to bring it upto the cur_seq - 1 */ -+ rd->old = ravg_decay(rd->old, seq_delta); -+ /* non-zero ->cur must be from val_seq, calc and fold */ -+ ravg_add(&rd->old, ravg_decay(rd->cur, seq_delta)); -+ /* clear */ -+ rd->cur = 0; -+ } -+ -+ if (!rd->val) -+ goto out; -+ -+ /* -+ * Accumulate @rd->val between @rd->val_at and @now. -+ * -+ * @rd->val_at @now -+ * v v -+ * timeline |---------|---------|---------|---------|---------| -+ * seq delta [ 3 | 2 | 1 | 0 ] -+ */ -+ if (seq_delta > 0) { -+ u32 dur; -+ -+ /* fold the oldest period which may be partial */ -+ dur = ravg_normalize_dur(half_life - rd->val_at % half_life, half_life); -+ ravg_add(&rd->old, rd->val * ravg_decay(dur, seq_delta)); -+ -+ /* fold the full periods in the middle with precomputed vals */ -+ if (seq_delta > 1) { -+ u32 idx = seq_delta - 2; -+ -+ if (idx >= ravg_full_sum_len) -+ idx = ravg_full_sum_len - 1; -+ -+ ravg_add(&rd->old, rd->val * ravg_full_sum[idx]); -+ } -+ -+ /* accumulate the current period duration into ->cur */ -+ rd->cur += rd->val * ravg_normalize_dur(now % half_life, -+ half_life); -+ } else { -+ rd->cur += rd->val * ravg_normalize_dur(now - rd->val_at, -+ half_life); -+ } -+out: -+ if (new_val >= 1LLU << RAVG_VAL_BITS) -+ rd->val = (1LLU << RAVG_VAL_BITS) - 1; -+ else -+ rd->val = new_val; -+ rd->val_at = now; -+} -+ -+/** -+ * ravg_transfer - Transfer in or out a component running avg -+ * @base: ravg_data to transfer @xfer into or out of -+ * @base_new_val: new value for @base -+ * @xfer: ravg_data to transfer -+ * @xfer_new_val: new value for @xfer -+ * @is_xfer_in: transfer direction -+ * -+ * An ravg may be a sum of component ravgs. For example, a scheduling domain's -+ * load is the sum of the load values of all member tasks. If a task is migrated -+ * to a different domain, its contribution should be subtracted from the source -+ * ravg and added to the destination one. -+ * -+ * This function can be used for such component transfers. Both @base and @xfer -+ * must have been accumulated at the same timestamp. @xfer's contribution is -+ * subtracted if @is_fer_in is %false and added if %true. 
-+ */ -+static RAVG_FN_ATTRS void ravg_transfer(struct ravg_data *base, u64 base_new_val, -+ struct ravg_data *xfer, u64 xfer_new_val, -+ u32 half_life, bool is_xfer_in) -+{ -+ /* synchronize @base and @xfer */ -+ if ((s64)(base->val_at - xfer->val_at) < 0) -+ ravg_accumulate(base, base_new_val, xfer->val_at, half_life); -+ else if ((s64)(base->val_at - xfer->val_at) > 0) -+ ravg_accumulate(xfer, xfer_new_val, base->val_at, half_life); -+ -+ /* transfer */ -+ if (is_xfer_in) { -+ base->old += xfer->old; -+ base->cur += xfer->cur; -+ } else { -+ if (base->old > xfer->old) -+ base->old -= xfer->old; -+ else -+ base->old = 0; -+ -+ if (base->cur > xfer->cur) -+ base->cur -= xfer->cur; -+ else -+ base->cur = 0; -+ } -+} -+ -+/** -+ * u64_x_u32_rshift - Calculate ((u64 * u32) >> rshift) -+ * @a: multiplicand -+ * @b: multiplier -+ * @rshift: number of bits to shift right -+ * -+ * Poor man's 128bit arithmetic. Calculate ((@a * @b) >> @rshift) where @a is -+ * u64 and @b is u32 and (@a * @b) may be bigger than #U64_MAX. The caller must -+ * ensure that the final shifted result fits in u64. -+ */ -+static inline __attribute__((always_inline)) -+u64 u64_x_u32_rshift(u64 a, u32 b, u32 rshift) -+{ -+ const u64 mask32 = (u32)-1; -+ u64 al = a & mask32; -+ u64 ah = (a & (mask32 << 32)) >> 32; -+ -+ /* -+ * ah: high 32 al: low 32 -+ * a |--------------||--------------| -+ * -+ * ah * b |--------------||--------------| -+ * al * b |--------------||--------------| -+ */ -+ al *= b; -+ ah *= b; -+ -+ /* -+ * (ah * b) >> rshift |--------------||--------------| -+ * (al * b) >> rshift |--------------||--------| -+ * <--------> -+ * 32 - rshift -+ */ -+ al >>= rshift; -+ if (rshift <= 32) -+ ah <<= 32 - rshift; -+ else -+ ah >>= rshift - 32; -+ -+ return al + ah; -+} -+ -+/** -+ * ravg_scale - Scale a running avg -+ * @rd: ravg_data to scale -+ * @mult: multipler -+ * @rshift: right shift amount -+ * -+ * Scale @rd by multiplying the tracked values by @mult and shifting right by -+ * @rshift. -+ */ -+static RAVG_FN_ATTRS void ravg_scale(struct ravg_data *rd, u32 mult, u32 rshift) -+{ -+ rd->val = u64_x_u32_rshift(rd->val, mult, rshift); -+ rd->old = u64_x_u32_rshift(rd->old, mult, rshift); -+ rd->cur = u64_x_u32_rshift(rd->cur, mult, rshift); -+} -+ -+/** -+ * ravg_read - Read the current running avg -+ * @rd: ravg_data to read from -+ * @now: timestamp as of which to read the running avg -+ * @half_life: decay period, must match ravg_accumulate()'s -+ * -+ * Read running avg from @rd as of @now. -+ */ -+static RAVG_FN_ATTRS u64 ravg_read(struct ravg_data *rd, u64 now, u64 half_life) -+{ -+ struct ravg_data trd; -+ u32 elapsed; -+ -+ /* -+ * It may be difficult for the caller to guarantee monotonic progress if -+ * multiple CPUs accumulate to the same ravg_data. Handle @now being in -+ * the past of @rd->val_at. -+ */ -+ if (now < rd->val_at) -+ now = rd->val_at; -+ -+ elapsed = now % half_life; -+ -+ /* -+ * Accumulate the ongoing period into a temporary copy. This allows -+ * external readers to access up-to-date avg without strongly -+ * synchronizing with the updater (we need to add a seq lock tho). -+ */ -+ trd = *rd; -+ rd = &trd; -+ ravg_accumulate(rd, 0, now, half_life); -+ -+ /* -+ * At the beginning of a new half_life period, the running avg is the -+ * same as @rd->old. At the beginning of the next, it'd be old load / 2 -+ * + current load / 2. Inbetween, we blend the two linearly. 
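u64_x_u32_rshift() above splits the 64-bit multiplicand so that neither partial product can overflow before the shift; for the shifts used here (rshift <= 32) the result matches full 128-bit arithmetic exactly. A standalone userspace sanity sketch (unsigned __int128 is a GCC/Clang extension):

```c
#include <assert.h>
#include <stdint.h>

static uint64_t u64_x_u32_rshift(uint64_t a, uint32_t b, uint32_t rshift)
{
	uint64_t al = (a & 0xffffffffULL) * b;	/* low 32 bits times b */
	uint64_t ah = (a >> 32) * b;		/* high 32 bits times b */

	al >>= rshift;
	if (rshift <= 32)
		ah <<= 32 - rshift;
	else
		ah >>= rshift - 32;
	return al + ah;
}

int main(void)
{
	uint64_t a = (1ULL << 44) - 1;	/* max 44-bit ravg value */
	uint32_t b = 1u << 20;		/* RAVG_FRAC_BITS scale factor */

	/* the full product overflows u64, the shifted result does not */
	assert(u64_x_u32_rshift(a, b, 20) ==
	       (uint64_t)(((unsigned __int128)a * b) >> 20));
	return 0;
}
```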
-+ */ -+ if (elapsed) { -+ u32 progress = ravg_normalize_dur(elapsed, half_life); -+ /* -+ * `H` is the duration of the half-life window, and `E` is how -+ * much time has elapsed in this window. `P` is [0.0, 1.0] -+ * representing how much the current window has progressed: -+ * -+ * P = E / H -+ * -+ * If `old` is @rd->old, we would want to calculate the -+ * following for blending: -+ * -+ * old * (1.0 - P / 2) -+ * -+ * Because @progress is [0, 1 << RAVG_FRAC_BITS], let's multiply -+ * and then divide by 1 << RAVG_FRAC_BITS: -+ * -+ * (1 << RAVG_FRAC_BITS) - (1 << RAVG_FRAC_BITS) * P / 2 -+ * old * ----------------------------------------------------- -+ * 1 << RAVG_FRAC_BITS -+ * -+ * As @progress is (1 << RAVG_FRAC_BITS) * P: -+ * -+ * (1 << RAVG_FRAC_BITS) - progress / 2 -+ * old * ------------------------------------ -+ * 1 << RAVG_FRAC_BITS -+ * -+ * As @rd->old uses full 64bit, the multiplication can overflow, -+ * but we also know that the final result is gonna be smaller -+ * than @rd->old and thus fit. Use u64_x_u32_rshift() to handle -+ * the interim multiplication correctly. -+ */ -+ u64 old = u64_x_u32_rshift(rd->old, -+ (1 << RAVG_FRAC_BITS) - progress / 2, -+ RAVG_FRAC_BITS); -+ /* -+ * If `S` is the Sum(val * duration) for this half-life window, -+ * the avg for this window is: -+ * -+ * S / E -+ * -+ * We would want to calculate the following for blending: -+ * -+ * S / E * (P / 2) -+ * -+ * As P = E / H, -+ * -+ * S / E * (E / H / 2) -+ * S / H / 2 -+ * -+ * Expanding S, the above becomes: -+ * -+ * Sum(val * duration) / H / 2 -+ * Sum(val * (duration / H)) / 2 -+ * -+ * As we use RAVG_FRAC_BITS bits for fixed point arithmetic, -+ * let's multiply the whole result accordingly: -+ * -+ * (Sum(val * (duration / H)) / 2) * (1 << RAVG_FRAC_BITS) -+ * -+ * duration * (1 << RAVG_FRAC_BITS) -+ * Sum(val * --------------------------------) / 2 -+ * H -+ * -+ * The righthand multiplier inside Sum() is the normalized -+ * duration returned from ravg_normalize_dur(), so, the whole -+ * Sum term equals @rd->cur. -+ * -+ * rd->cur / 2 -+ */ -+ u64 cur = rd->cur / 2; -+ -+ return old + cur; -+ } else { -+ return rd->old; -+ } -+} -diff --git a/tools/sched_ext/ravg_read.rs.h b/tools/sched_ext/ravg_read.rs.h -new file mode 100644 -index 000000000..4efaa2390 ---- /dev/null -+++ b/tools/sched_ext/ravg_read.rs.h -@@ -0,0 +1,82 @@ -+/// ravg_read() implementation for rust userland. See ravg_read() in -+/// ravg_impl.bpf.h. We don't yet have a good mechanism to share BPF and -+/// matching rust code across multiple schedulers. For now, include both BPF -+/// and rust code from scheduler implementations. -+fn ravg_read( -+ val: u64, -+ val_at: u64, -+ old: u64, -+ cur: u64, -+ now: u64, -+ half_life: u32, -+ frac_bits: u32, -+) -> f64 { -+ let ravg_1: f64 = (1 << frac_bits) as f64; -+ let half_life = half_life as u64; -+ let val = val as f64; -+ let mut old = old as f64 / ravg_1; -+ let mut cur = cur as f64 / ravg_1; -+ -+ let now = now.max(val_at); -+ let normalized_dur = |dur| dur as f64 / half_life as f64; -+ -+ // -+ // The following is f64 implementation of BPF ravg_accumulate(). -+ // -+ let cur_seq = (now / half_life) as i64; -+ let val_seq = (val_at / half_life) as i64; -+ let seq_delta = (cur_seq - val_seq) as i32; -+ -+ if seq_delta > 0 { -+ let full_decay = 2f64.powi(seq_delta); -+ -+ // Decay $old and fold $cur into it. -+ old /= full_decay; -+ old += cur / full_decay; -+ cur = 0.0; -+ -+ // Fold the oldest period whicy may be partial. 
-+ old += val * normalized_dur(half_life - val_at % half_life) / full_decay; -+ -+ // Pre-computed decayed full-period values. -+ const FULL_SUMS: [f64; 20] = [ -+ 0.5, -+ 0.75, -+ 0.875, -+ 0.9375, -+ 0.96875, -+ 0.984375, -+ 0.9921875, -+ 0.99609375, -+ 0.998046875, -+ 0.9990234375, -+ 0.99951171875, -+ 0.999755859375, -+ 0.9998779296875, -+ 0.99993896484375, -+ 0.999969482421875, -+ 0.9999847412109375, -+ 0.9999923706054688, -+ 0.9999961853027344, -+ 0.9999980926513672, -+ 0.9999990463256836, -+ // Use the same value beyond this point. -+ ]; -+ -+ // Fold the full periods in the middle. -+ if seq_delta >= 2 { -+ let idx = ((seq_delta - 2) as usize).min(FULL_SUMS.len() - 1); -+ old += val * FULL_SUMS[idx]; -+ } -+ -+ // Accumulate the current period duration into @cur. -+ cur += val * normalized_dur(now % half_life); -+ } else { -+ cur += val * normalized_dur(now - val_at); -+ } -+ -+ // -+ // The following is the blending part of BPF ravg_read(). -+ // -+ old * (1.0 - normalized_dur(now % half_life) / 2.0) + cur / 2.0 -+} -diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c -new file mode 100644 -index 000000000..890e97e22 ---- /dev/null -+++ b/tools/sched_ext/scx_central.bpf.c -@@ -0,0 +1,346 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A central FIFO sched_ext scheduler which demonstrates the followings: -+ * -+ * a. Making all scheduling decisions from one CPU: -+ * -+ * The central CPU is the only one making scheduling decisions. All other -+ * CPUs kick the central CPU when they run out of tasks to run. -+ * -+ * There is one global BPF queue and the central CPU schedules all CPUs by -+ * dispatching from the global queue to each CPU's local dsq from dispatch(). -+ * This isn't the most straightforward. e.g. It'd be easier to bounce -+ * through per-CPU BPF queues. The current design is chosen to maximally -+ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. -+ * -+ * b. Tickless operation -+ * -+ * All tasks are dispatched with the infinite slice which allows stopping the -+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full -+ * parameter. The tickless operation can be observed through -+ * /proc/interrupts. -+ * -+ * Periodic switching is enforced by a periodic timer checking all CPUs and -+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't -+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to -+ * the central CPU. -+ * -+ * c. Preemption -+ * -+ * Kthreads are unconditionally queued to the head of a matching local dsq -+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always -+ * prioritized over user threads, which is required for ensuring forward -+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the -+ * ksoftirqd gets starved by a user thread, there may not be anything else to -+ * vacate that user thread. -+ * -+ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the -+ * next tasks. -+ * -+ * This scheduler is designed to maximize usage of various SCX mechanisms. A -+ * more practical implementation would likely put the scheduling loop outside -+ * the central CPU's dispatch() path and add some form of priority mechanism. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MS_TO_NS = 1000LLU * 1000, -+ TIMER_INTERVAL_NS = 1 * MS_TO_NS, -+}; -+ -+const volatile bool switch_partial; -+const volatile s32 central_cpu; -+const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; -+u64 nr_overflows; -+ -+struct user_exit_info uei; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, s32); -+} central_q SEC(".maps"); -+ -+/* can't use percpu map due to bad lookups */ -+bool RESIZABLE_ARRAY(data, cpu_gimme_task); -+u64 RESIZABLE_ARRAY(data, cpu_started_at); -+ -+struct central_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct central_timer); -+} central_timer SEC(".maps"); -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* -+ * Steer wakeups to the central CPU as much as possible to avoid -+ * disturbing other CPUs. It's safe to blindly return the central cpu as -+ * select_cpu() is a hint and if @p can't be on it, the kernel will -+ * automatically pick a fallback CPU. -+ */ -+ return central_cpu; -+} -+ -+void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ /* -+ * Push per-cpu kthreads at the head of local dsq's and preempt the -+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked -+ * behind other threads which is necessary for forward progress -+ * guarantee as we depend on the BPF timer which may run from ksoftirqd. -+ */ -+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ __sync_fetch_and_add(&nr_locals, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, -+ enq_flags | SCX_ENQ_PREEMPT); -+ return; -+ } -+ -+ if (bpf_map_push_elem(¢ral_q, &pid, 0)) { -+ __sync_fetch_and_add(&nr_overflows, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_queued, 1); -+ -+ if (!scx_bpf_task_running(p)) -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+} -+ -+static bool dispatch_to_cpu(s32 cpu) -+{ -+ struct task_struct *p; -+ s32 pid; -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(¢ral_q, &pid)) -+ break; -+ -+ __sync_fetch_and_sub(&nr_queued, 1); -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) { -+ __sync_fetch_and_add(&nr_lost_pids, 1); -+ continue; -+ } -+ -+ /* -+ * If we can't run the task at the top, do the dumb thing and -+ * bounce it to the fallback dsq. 
-+ */ -+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ __sync_fetch_and_add(&nr_mismatches, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); -+ bpf_task_release(p); -+ continue; -+ } -+ -+ /* dispatch to local and mark that @cpu doesn't need more */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); -+ -+ if (cpu != central_cpu) -+ scx_bpf_kick_cpu(cpu, 0); -+ -+ bpf_task_release(p); -+ return true; -+ } -+ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (cpu == central_cpu) { -+ /* dispatch for all other CPUs first */ -+ __sync_fetch_and_add(&nr_dispatches, 1); -+ -+ bpf_for(cpu, 0, nr_cpu_ids) { -+ bool *gimme; -+ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ -+ /* central's gimme is never set */ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme && !*gimme) -+ continue; -+ -+ if (dispatch_to_cpu(cpu)) -+ *gimme = false; -+ } -+ -+ /* -+ * Retry if we ran out of dispatch buffer slots as we might have -+ * skipped some CPUs and also need to dispatch for self. The ext -+ * core automatically retries if the local dsq is empty but we -+ * can't rely on that as we're dispatching for other CPUs too. -+ * Kick self explicitly to retry. -+ */ -+ if (!scx_bpf_dispatch_nr_slots()) { -+ __sync_fetch_and_add(&nr_retries, 1); -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ return; -+ } -+ -+ /* look for a task to run on the central CPU */ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ dispatch_to_cpu(central_cpu); -+ } else { -+ bool *gimme; -+ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme) -+ *gimme = true; -+ -+ /* -+ * Force dispatch on the scheduling CPU so that it finds a task -+ * to run for us. -+ */ -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ } -+} -+ -+void BPF_STRUCT_OPS(central_running, struct task_struct *p) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ -+} -+ -+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = 0; -+} -+ -+static int central_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ u64 nr_to_kick = nr_queued; -+ s32 i, curr_cpu; -+ -+ curr_cpu = bpf_get_smp_processor_id(); -+ /* -+ * XXX BACKPORT NOTE - BPF_F_TIMER_CPU_PIN is not available in v6.6 and -+ * we can't guarantee that the central timer runs on the central CPU. 
-+ */ -+ /*if (curr_cpu != central_cpu) { -+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", -+ curr_cpu, central_cpu); -+ return 0; -+ }*/ -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ s32 cpu = (nr_timers + i) % nr_cpu_ids; -+ u64 *started_at; -+ -+ if (cpu == central_cpu) -+ continue; -+ -+ /* kick iff the current one exhausted its slice */ -+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at && *started_at && -+ vtime_before(now, *started_at + slice_ns)) -+ continue; -+ -+ /* and there's something pending */ -+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || -+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) -+ ; -+ else if (nr_to_kick) -+ nr_to_kick--; -+ else -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ } -+ -+ bpf_timer_start(timer, TIMER_INTERVAL_NS, 0 /*BPF_F_TIMER_CPU_PIN*/); -+ __sync_fetch_and_add(&nr_timers, 1); -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(central_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ int ret; -+ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ -+ ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ if (bpf_get_smp_processor_id() != central_cpu) -+ return -EINVAL; -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, central_timerfn); -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0 /*BPF_F_TIMER_CPU_PIN*/); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops central_ops = { -+ /* -+ * We are offloading all scheduling decisions to the central CPU and -+ * thus being the last task on a given CPU doesn't mean anything -+ * special. Enqueue the last tasks like any other tasks. -+ */ -+ .flags = SCX_OPS_ENQ_LAST, -+ -+ .select_cpu = (void *)central_select_cpu, -+ .enqueue = (void *)central_enqueue, -+ .dispatch = (void *)central_dispatch, -+ .running = (void *)central_running, -+ .stopping = (void *)central_stopping, -+ .init = (void *)central_init, -+ .exit = (void *)central_exit, -+ .name = "central", -+}; -diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c -new file mode 100644 -index 000000000..1e2985900 ---- /dev/null -+++ b/tools/sched_ext/scx_central.c -@@ -0,0 +1,123 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define _GNU_SOURCE -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_central.skel.h" -+ -+const char help_fmt[] = -+"A central FIFO sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-c CPU] [-p]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -c CPU Override the central CPU (default: 0)\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_central *skel; -+ struct bpf_link *link; -+ __u64 seq = 0; -+ __s32 opt; -+ cpu_set_t *cpuset; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_central__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->central_cpu = 0; -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:c:ph")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'c': -+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); -+ -+ SCX_BUG_ON(scx_central__load(skel), "Failed to load skel"); -+ -+ /* -+ * Affinitize the loading thread to the central CPU, as: -+ * - That's where the BPF timer is first invoked in the BPF program. -+ * - We probably don't want this user space component to take up a core -+ * from a task that would benefit from avoiding preemption on one of -+ * the tickless cores. -+ * -+ * Until BPF supports pinning the timer, it's not guaranteed that it -+ * will always be invoked on the central CPU. In practice, this -+ * suffices the majority of the time. 
-+ */ -+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); -+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); -+ CPU_ZERO(cpuset); -+ CPU_SET(skel->rodata->central_cpu, cpuset); -+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), -+ "Failed to affinitize to central CPU %d (max %d)", -+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); -+ CPU_FREE(cpuset); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.central_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf("total :%10lu local:%10lu queued:%10lu lost:%10lu\n", -+ skel->bss->nr_total, -+ skel->bss->nr_locals, -+ skel->bss->nr_queued, -+ skel->bss->nr_lost_pids); -+ printf("timer :%10lu dispatch:%10lu mismatch:%10lu retry:%10lu\n", -+ skel->bss->nr_timers, -+ skel->bss->nr_dispatches, -+ skel->bss->nr_mismatches, -+ skel->bss->nr_retries); -+ printf("overflow:%10lu\n", -+ skel->bss->nr_overflows); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_central__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_common.bpf.h b/tools/sched_ext/scx_common.bpf.h -new file mode 100644 -index 000000000..5c503c235 ---- /dev/null -+++ b/tools/sched_ext/scx_common.bpf.h -@@ -0,0 +1,244 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_BPF_H -+#define __SCHED_EXT_COMMON_BPF_H -+ -+#include "vmlinux.h" -+#include -+#include -+#include -+#include "user_exit_info.h" -+ -+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ -+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ -+#define PF_EXITING 0x00000004 -+#define CLOCK_MONOTONIC 1 -+ -+/* -+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can -+ * lead to really confusing misbehaviors. Let's trigger a build failure. -+ */ -+static inline void ___vmlinux_h_sanity_check___(void) -+{ -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); -+} -+ -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; -+ -+static inline __attribute__((format(printf, 1, 2))) -+void ___scx_bpf_error_format_checker(const char *fmt, ...) {} -+ -+/* -+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Note that __param[] must have at least one -+ * element to keep the verifier happy. -+ */ -+#define scx_bpf_error(fmt, args...) 
\ -+({ \ -+ static char ___fmt[] = fmt; \ -+ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ -+ \ -+ _Pragma("GCC diagnostic push") \ -+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ -+ ___bpf_fill(___param, args); \ -+ _Pragma("GCC diagnostic pop") \ -+ \ -+ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ -+ \ -+ ___scx_bpf_error_format_checker(fmt, ##args); \ -+}) -+ -+void scx_bpf_switch_all(void) __ksym; -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; -+bool scx_bpf_consume(u64 dsq_id) __ksym; -+u32 scx_bpf_dispatch_nr_slots(void) __ksym; -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; -+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; -+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; -+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; -+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; -+bool scx_bpf_task_running(const struct task_struct *p) __ksym; -+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; -+u32 scx_bpf_reenqueue_local(void) __ksym; -+ -+#define BPF_STRUCT_OPS(name, args...) \ -+SEC("struct_ops/"#name) \ -+BPF_PROG(name, ##args) -+ -+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ -+SEC("struct_ops.s/"#name) \ -+BPF_PROG(name, ##args) -+ -+/** -+ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized -+ * @elfsec: the data section of the BPF program in which to place the array -+ * @arr: the name of the array -+ * -+ * libbpf has an API for setting map value sizes. Since data sections (i.e. -+ * bss, data, rodata) themselves are maps, a data section can be resized. If -+ * a data section has an array as its last element, the BTF info for that -+ * array will be adjusted so that length of the array is extended to meet the -+ * new length of the data section. This macro annotates an array to have an -+ * element count of one with the assumption that this array can be resized -+ * within the userspace program. It also annotates the section specifier so -+ * this array exists in a custom sub data section which can be resized -+ * independently. -+ * -+ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an -+ * array declared with RESIZABLE_ARRAY(). -+ */ -+#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) -+ -+/** -+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member -+ * @base: struct or array to index -+ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) -+ * -+ * The verifier often gets confused by the instruction sequence the compiler -+ * generates for indexing struct fields or arrays. This macro forces the -+ * compiler to generate a code sequence which first calculates the byte offset, -+ * checks it against the struct or array size and add that byte offset to -+ * generate the pointer to the member to help the verifier. -+ * -+ * Ideally, we want to abort if the calculated offset is out-of-bounds. 
However, -+ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller -+ * must check for %NULL and take appropriate action to appease the verifier. To -+ * avoid confusing the verifier, it's best to check for %NULL and dereference -+ * immediately. -+ * -+ * vptr = MEMBER_VPTR(my_array, [i][j]); -+ * if (!vptr) -+ * return error; -+ * *vptr = new_value; -+ * -+ * sizeof(@base) should encompass the memory area to be accessed and thus can't -+ * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of -+ * `MEMBER_VPTR(ptr, ->member)`. -+ */ -+#define MEMBER_VPTR(base, member) (typeof((base) member) *)({ \ -+ u64 __base = (u64)&(base); \ -+ u64 __addr = (u64)&((base) member) - __base; \ -+ _Static_assert(sizeof(base) >= sizeof((base) member), \ -+ "@base is smaller than @member, is @base a pointer?"); \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"i"(sizeof(base) - sizeof((base) member))); \ -+ __addr; \ -+}) -+ -+/** -+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element -+ * @arr: array to index into -+ * @i: array index -+ * @n: number of elements in array -+ * -+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the -+ * element count needs to be explicit. -+ * It can be used in cases where a global array is defined with an initial -+ * size but is intended to be be resized before loading the BPF program. -+ * Without this version of the macro, MEMBER_VPTR() will use the compile time -+ * size of the array to compute the max, which will result in rejection by -+ * the verifier. -+ */ -+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *)({ \ -+ u64 __base = (u64)arr; \ -+ u64 __addr = (u64)&(arr[i]) - __base; \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ -+ __addr; \ -+}) -+ -+/* -+ * BPF core and other generic helpers -+ */ -+ -+/* list and rbtree */ -+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) -+#define private(name) SEC(".data." 
#name) __hidden __attribute__((aligned(8))) -+ -+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; -+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; -+ -+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) -+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -+ -+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; -+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; -+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, -+ struct bpf_rb_node *node) __ksym; -+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, -+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), -+ void *meta, __u64 off) __ksym; -+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) -+ -+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; -+ -+/* task */ -+struct task_struct *bpf_task_from_pid(s32 pid) __ksym; -+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; -+void bpf_task_release(struct task_struct *p) __ksym; -+ -+/* cgroup */ -+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; -+void bpf_cgroup_release(struct cgroup *cgrp) __ksym; -+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; -+ -+/* cpumask */ -+struct bpf_cpumask *bpf_cpumask_create(void) __ksym; -+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_xor(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+ -+/* rcu */ -+void bpf_rcu_read_lock(void) __ksym; -+void bpf_rcu_read_unlock(void) __ksym; -+ -+#endif /* __SCHED_EXT_COMMON_BPF_H */ -diff --git 
a/tools/sched_ext/scx_common.h b/tools/sched_ext/scx_common.h -new file mode 100644 -index 000000000..0e93d6b69 ---- /dev/null -+++ b/tools/sched_ext/scx_common.h -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_H -+#define __SCHED_EXT_COMMON_H -+ -+#include -+#include -+#include -+ -+#include "user_exit_info.h" -+ -+#ifdef __KERNEL__ -+#error "Should not be included by BPF programs" -+#endif -+ -+#define SCX_BUG(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ -+ strerror(errno)); \ -+ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ -+ fprintf(stderr, "\n"); \ -+ \ -+ exit(EXIT_FAILURE); \ -+ } while (0) -+ -+#define SCX_BUG_ON(__cond, __fmt, ...) \ -+ do { \ -+ if (__cond) \ -+ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ -+ } while (0) -+ -+/** -+ * RESIZE_ARRAY - Convenience macro for resizing a BPF array -+ * @elfsec: the data section of the BPF program in which to the array exists -+ * @arr: the name of the array -+ * @n: the desired array element count -+ * -+ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two -+ * operations. It resizes the map which corresponds to the custom data -+ * section that contains the target array. As a side effect, the BTF info for -+ * the array is adjusted so that the array length is sized to cover the new -+ * data section size. The second operation is reassigning the skeleton pointer -+ * for that custom data section so that it points to the newly memory mapped -+ * region. -+ */ -+#define RESIZE_ARRAY(elfsec, arr, n) \ -+ do { \ -+ size_t __sz; \ -+ bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ -+ sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ -+ skel->elfsec##_##arr = \ -+ bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ -+ } while (0) -+ -+#endif /* __SCHED_EXT_COMMON_H */ -diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c -new file mode 100644 -index 000000000..2db3d8d45 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.bpf.c -@@ -0,0 +1,912 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements -+ * hierarchical weight-based cgroup CPU control by flattening the cgroup -+ * hierarchy into a single layer by compounding the active weight share at each -+ * level. Consider the following hierarchy with weights in parentheses: -+ * -+ * R + A (100) + B (100) -+ * | \ C (100) -+ * \ D (200) -+ * -+ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. -+ * Let's say all three have runnable tasks. The total share that each of these -+ * three cgroups is entitled to can be calculated by compounding its share at -+ * each level. -+ * -+ * For example, B is competing against C and in that competition its share is -+ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's -+ * share in that competition is 200/(200+100) == 1/3. B's eventual share in the -+ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's -+ * eventual shaer is the same at 1/6. D is only competing at the top level and -+ * its share is 200/(100+200) == 2/3. 
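The same numbers can be reproduced with a few lines of standalone C (illustrative only, not part of the scheduler):

#include <stdio.h>

int main(void)
{
	/* weights from the example hierarchy: A=100 vs D=200, B=100 vs C=100 */
	double a_at_root = 100.0 / (100.0 + 200.0);	/* A vs D: 1/3 */
	double b_in_a    = 100.0 / (100.0 + 100.0);	/* B vs C: 1/2 */

	printf("B = %.4f\n", a_at_root * b_in_a);	/* 1/6 ~= 0.1667 */
	printf("C = %.4f\n", a_at_root * b_in_a);	/* same as B */
	printf("D = %.4f\n", 200.0 / (100.0 + 200.0));	/* 2/3 ~= 0.6667 */
	return 0;
}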
-+ * -+ * So, instead of hierarchically scheduling level-by-level, we can consider it -+ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 -+ * and keep updating the eventual shares as the cgroups' runnable states change. -+ * -+ * This flattening of hierarchy can bring a substantial performance gain when -+ * the cgroup hierarchy is nested multiple levels. in a simple benchmark using -+ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it -+ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two -+ * apache instances competing with 2:1 weight ratio nested four level deep. -+ * -+ * However, the gain comes at the cost of not being able to properly handle -+ * thundering herd of cgroups. For example, if many cgroups which are nested -+ * behind a low priority parent cgroup wake up around the same time, they may be -+ * able to consume more CPU cycles than they are entitled to. In many use cases, -+ * this isn't a real concern especially given the performance gain. Also, there -+ * are ways to mitigate the problem further by e.g. introducing an extra -+ * scheduling layer on cgroup delegation boundaries. -+ * -+ * The scheduler first picks the cgroup to run and then schedule the tasks -+ * within by using nested weighted vtime scheduling by default. The -+ * cgroup-internal scheduling can be switched to FIFO with the -f option. -+ */ -+#include "scx_common.bpf.h" -+#include "user_exit_info.h" -+#include "scx_flatcg.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ -+const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; -+const volatile bool fifo_sched; -+const volatile bool switch_partial; -+ -+u64 cvtime_now; -+struct user_exit_info uei; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, u64); -+ __uint(max_entries, FCG_NR_STATS); -+} stats SEC(".maps"); -+ -+static void stat_inc(enum fcg_stat_idx idx) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+struct fcg_cpu_ctx { -+ u64 cur_cgid; -+ u64 cur_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct fcg_cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctx SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_cgrp_ctx); -+} cgrp_ctx SEC(".maps"); -+ -+struct cgv_node { -+ struct bpf_rb_node rb_node; -+ __u64 cvtime; -+ __u64 cgid; -+}; -+ -+private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; -+private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); -+ -+struct cgv_node_stash { -+ struct cgv_node __kptr *node; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, 16384); -+ __type(key, __u64); -+ __type(value, struct cgv_node_stash); -+} cgv_node_stash SEC(".maps"); -+ -+struct fcg_task_ctx { -+ u64 bypassed_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_task_ctx); -+} task_ctx SEC(".maps"); -+ -+/* gets inc'd on weight tree changes to expire the cached hweights */ -+unsigned long hweight_gen = 1; -+ -+static u64 div_round_up(u64 dividend, u64 divisor) -+{ -+ return (dividend + divisor - 1) / divisor; -+} -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - 
b) < 0; -+} -+ -+static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) -+{ -+ struct cgv_node *cgc_a, *cgc_b; -+ -+ cgc_a = container_of(a, struct cgv_node, rb_node); -+ cgc_b = container_of(b, struct cgv_node, rb_node); -+ -+ return cgc_a->cvtime < cgc_b->cvtime; -+} -+ -+static struct fcg_cpu_ctx *find_cpu_ctx(void) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ u32 idx = 0; -+ -+ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); -+ if (!cpuc) { -+ scx_bpf_error("cpu_ctx lookup failed"); -+ return NULL; -+ } -+ return cpuc; -+} -+ -+static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ scx_bpf_error("cgrp_ctx lookup failed for cgid %llu", cgrp->kn->id); -+ return NULL; -+ } -+ return cgc; -+} -+ -+static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgrp = bpf_cgroup_ancestor(cgrp, level); -+ if (!cgrp) { -+ scx_bpf_error("ancestor cgroup lookup failed"); -+ return NULL; -+ } -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ scx_bpf_error("ancestor cgrp_ctx lookup failed"); -+ bpf_cgroup_release(cgrp); -+ return cgc; -+} -+ -+static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ int level; -+ -+ if (!cgc->nr_active) { -+ stat_inc(FCG_STAT_HWT_SKIP); -+ return; -+ } -+ -+ if (cgc->hweight_gen == hweight_gen) { -+ stat_inc(FCG_STAT_HWT_CACHE); -+ return; -+ } -+ -+ stat_inc(FCG_STAT_HWT_UPDATES); -+ bpf_for(level, 0, cgrp->level + 1) { -+ struct fcg_cgrp_ctx *cgc; -+ bool is_active; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ -+ if (!level) { -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ cgc->hweight_gen = hweight_gen; -+ } else { -+ struct fcg_cgrp_ctx *pcgc; -+ -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ -+ /* -+ * We can be oppotunistic here and not grab the -+ * cgv_tree_lock and deal with the occasional races. -+ * However, hweight updates are already cached and -+ * relatively low-frequency. Let's just do the -+ * straightforward thing. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ is_active = cgc->nr_active; -+ if (is_active) { -+ cgc->hweight_gen = pcgc->hweight_gen; -+ cgc->hweight = -+ div_round_up(pcgc->hweight * cgc->weight, -+ pcgc->child_weight_sum); -+ } -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!is_active) { -+ stat_inc(FCG_STAT_HWT_RACE); -+ break; -+ } -+ } -+ } -+} -+ -+static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) -+{ -+ u64 delta, cvtime, max_budget; -+ -+ /* -+ * A node which is on the rbtree can't be pointed to from elsewhere yet -+ * and thus can't be updated and repositioned. Instead, we collect the -+ * vtime deltas separately and apply it asynchronously here. -+ */ -+ delta = cgc->cvtime_delta; -+ __sync_fetch_and_sub(&cgc->cvtime_delta, delta); -+ cvtime = cgv_node->cvtime + delta; -+ -+ /* -+ * Allow a cgroup to carry the maximum budget proportional to its -+ * hweight such that a full-hweight cgroup can immediately take up half -+ * of the CPUs at the most while staying at the front of the rbtree. 
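To get a feel for the cap computed below, assume for illustration cgrp_slice_ns = 20ms and nr_cpus = 8 (numbers picked for the example, not defaults taken from this patch):

	hweight == FCG_HWEIGHT_ONE:      max_budget = 20ms * 8 / 2 = 80ms
	hweight == FCG_HWEIGHT_ONE / 2:  max_budget = 40ms

i.e. a full-hweight cgroup can bank at most one slice's worth of vtime across half of the 8 CPUs, and the allowance scales down with hweight.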
-+ */ -+ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / -+ (2 * FCG_HWEIGHT_ONE); -+ if (vtime_before(cvtime, cvtime_now - max_budget)) -+ cvtime = cvtime_now - max_budget; -+ -+ cgv_node->cvtime = cvtime; -+} -+ -+static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ u64 cgid = cgrp->kn->id; -+ -+ /* paired with cmpxchg in try_pick_next_cgroup() */ -+ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { -+ stat_inc(FCG_STAT_ENQ_SKIP); -+ return; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); -+ return; -+ } -+ -+ /* NULL if the node is already on the rbtree */ -+ cgv_node = bpf_kptr_xchg(&stash->node, NULL); -+ if (!cgv_node) { -+ stat_inc(FCG_STAT_ENQ_RACE); -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * If select_cpu_dfl() is recommending local enqueue, the target CPU is -+ * idle. Follow it and charge the cgroup later in fcg_stopping() after -+ * the fact. Use the same mechanism to deal with tasks with custom -+ * affinities so that we don't have to worry about per-cgroup dq's -+ * containing tasks that can't be executed from some CPUs. -+ */ -+ if ((enq_flags & SCX_ENQ_LOCAL) || p->nr_cpus_allowed != nr_cpus) { -+ /* -+ * Tell fcg_stopping() that this bypassed the regular scheduling -+ * path and should be force charged to the cgroup. 0 is used to -+ * indicate that the task isn't bypassing, so if the current -+ * runtime is 0, go back by one nanosecond. -+ */ -+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; -+ -+ /* -+ * The global dq is deprioritized as we don't want to let tasks -+ * to boost themselves by constraining its cpumask. The -+ * deprioritization is rather severe, so let's not apply that to -+ * per-cpu kernel threads. This is ham-fisted. We probably wanna -+ * implement per-cgroup fallback dq's instead so that we have -+ * more control over when tasks with custom cpumask get issued. -+ */ -+ if ((enq_flags & SCX_ENQ_LOCAL) || -+ (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD))) { -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ stat_inc(FCG_STAT_GLOBAL); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ return; -+ } -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ goto out_release; -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 tvtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) -+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, -+ tvtime, enq_flags); -+ } -+ -+ cgrp_enqueued(cgrp, cgc); -+out_release: -+ bpf_cgroup_release(cgrp); -+} -+ -+/* -+ * Walk the cgroup tree to update the active weight sums as tasks wake up and -+ * sleep. 
The weight sums are used as the base when calculating the proportion a -+ * given cgroup or task is entitled to at each level. -+ */ -+static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ bool updated = false; -+ int idx; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ /* -+ * In most cases, a hot cgroup would have multiple threads going to -+ * sleep and waking up while the whole cgroup stays active. In leaf -+ * cgroups, ->nr_runnable which is updated with __sync operations gates -+ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock -+ * repeatedly for a busy cgroup which is staying active. -+ */ -+ if (runnable) { -+ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_ACT); -+ } else { -+ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_DEACT); -+ } -+ -+ /* -+ * If @cgrp is becoming runnable, its hweight should be refreshed after -+ * it's added to the weight tree so that enqueue has the up-to-date -+ * value. If @cgrp is becoming quiescent, the hweight should be -+ * refreshed before it's removed from the weight tree so that the usage -+ * charging which happens afterwards has access to the latest value. -+ */ -+ if (!runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ /* propagate upwards */ -+ bpf_for(idx, 0, cgrp->level) { -+ int level = cgrp->level - idx; -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ bool propagate = false; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ if (level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ } -+ -+ /* -+ * We need the propagation protected by a lock to synchronize -+ * against weight changes. There's no reason to drop the lock at -+ * each level but bpf_spin_lock() doesn't want any function -+ * calls while locked. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ if (runnable) { -+ if (!cgc->nr_active++) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum += cgc->weight; -+ } -+ } -+ } else { -+ if (!--cgc->nr_active) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum -= cgc->weight; -+ } -+ } -+ } -+ -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!propagate) -+ break; -+ } -+ -+ if (updated) -+ __sync_fetch_and_add(&hweight_gen, 1); -+ -+ if (runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+} -+ -+void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, true); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) -+{ -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ if (fifo_sched) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ /* -+ * @cgc->tvtime_now always progresses forward as tasks start -+ * executing. The test and update can be performed concurrently -+ * from multiple CPUs and thus racy. Any error should be -+ * contained and temporary. Let's just live with it. -+ */ -+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) -+ cgc->tvtime_now = p->scx.dsq_vtime; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. 
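As a quick illustration of the charge below (assuming the nice-0 weight of 100 implied by the constant in the formula):

	weight 200, whole default slice consumed: (SCX_SLICE_DFL - 0) * 100 / 200 = half a slice of vtime
	weight  50, whole default slice consumed: (SCX_SLICE_DFL - 0) * 100 /  50 = two slices of vtime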
-+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ if (!fifo_sched) -+ p->scx.dsq_vtime += -+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (!taskc->bypassed_at) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ p->se.sum_exec_runtime - taskc->bypassed_at); -+ taskc->bypassed_at = 0; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, false); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{ -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ if (cgrp->level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); -+ if (!pcgc) -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ if (pcgc && cgc->nr_active) -+ pcgc->child_weight_sum += (s64)weight - cgc->weight; -+ cgc->weight = weight; -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static bool try_pick_next_cgroup(u64 *cgidp) -+{ -+ struct bpf_rb_node *rb_node; -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 cgid; -+ -+ /* pop the front cgroup and wind cvtime_now accordingly */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ rb_node = bpf_rbtree_first(&cgv_tree); -+ if (!rb_node) { -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_NO_CGRP); -+ *cgidp = 0; -+ return true; -+ } -+ -+ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!rb_node) { -+ /* -+ * This should never happen. bpf_rbtree_first() was called -+ * above while the tree lock was held, so the node should -+ * always be present. -+ */ -+ scx_bpf_error("node could not be removed"); -+ return true; -+ } -+ -+ cgv_node = container_of(rb_node, struct cgv_node, rb_node); -+ cgid = cgv_node->cgid; -+ -+ if (vtime_before(cvtime_now, cgv_node->cvtime)) -+ cvtime_now = cgv_node->cvtime; -+ -+ /* -+ * If lookup fails, the cgroup's gone. Free and move on. See -+ * fcg_cgroup_exit(). -+ */ -+ cgrp = bpf_cgroup_from_id(cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ if (!scx_bpf_consume(cgid)) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_EMPTY); -+ goto out_stash; -+ } -+ -+ /* -+ * Successfully consumed from the cgroup. This will be our current -+ * cgroup for the new slice. Refresh its hweight. -+ */ -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ bpf_cgroup_release(cgrp); -+ -+ /* -+ * As the cgroup may have more tasks, add it back to the rbtree. Note -+ * that here we charge the full slice upfront and then exact later -+ * according to the actual consumption. This prevents lowpri thundering -+ * herd from saturating the machine. 
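For example, following the charge below, a cgroup whose hweight is a quarter of FCG_HWEIGHT_ONE advances its cvtime by 4 * cgrp_slice_ns each time it is picked, so under contention it ends up being picked roughly a quarter as often as a full-hweight sibling.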
-+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ *cgidp = cgid; -+ stat_inc(FCG_STAT_PNC_NEXT); -+ return true; -+ -+out_stash: -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ /* -+ * Paired with cmpxchg in cgrp_enqueued(). If they see the following -+ * transition, they'll enqueue the cgroup. If they are earlier, we'll -+ * see their task in the dq below and requeue the cgroup. -+ */ -+ __sync_val_compare_and_swap(&cgc->queued, 1, 0); -+ -+ if (scx_bpf_dsq_nr_queued(cgid)) { -+ bpf_spin_lock(&cgv_tree_lock); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ goto out_free; -+ } -+ } -+ -+ return false; -+ -+out_free: -+ bpf_obj_drop(cgv_node); -+ return false; -+} -+ -+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 now = bpf_ktime_get_ns(); -+ -+ cpuc = find_cpu_ctx(); -+ if (!cpuc) -+ return; -+ -+ if (!cpuc->cur_cgid) -+ goto pick_next_cgroup; -+ -+ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { -+ if (scx_bpf_consume(cpuc->cur_cgid)) { -+ stat_inc(FCG_STAT_CNS_KEEP); -+ return; -+ } -+ stat_inc(FCG_STAT_CNS_EMPTY); -+ } else { -+ stat_inc(FCG_STAT_CNS_EXPIRE); -+ } -+ -+ /* -+ * The current cgroup is expiring. It was already charged a full slice. -+ * Calculate the actual usage and accumulate the delta. -+ */ -+ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_CNS_GONE); -+ goto pick_next_cgroup; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (cgc) { -+ /* -+ * We want to update the vtime delta and then look for the next -+ * cgroup to execute but the latter needs to be done in a loop -+ * and we can't keep the lock held. Oh well... -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ (cpuc->cur_at + cgrp_slice_ns - now) * -+ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ stat_inc(FCG_STAT_CNS_GONE); -+ } -+ -+ bpf_cgroup_release(cgrp); -+ -+pick_next_cgroup: -+ cpuc->cur_at = now; -+ -+ if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { -+ cpuc->cur_cgid = 0; -+ return; -+ } -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) -+ break; -+ } -+} -+ -+s32 BPF_STRUCT_OPS(fcg_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct fcg_task_ctx *taskc; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. 
-+ */ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!taskc) -+ return -ENOMEM; -+ -+ taskc->bypassed_at = 0; -+ -+ if (!(cgc = find_cgrp_ctx(args->cgroup))) -+ return -ENOENT; -+ -+ p->scx.dsq_vtime = cgc->tvtime_now; -+ -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ struct cgv_node *cgv_node; -+ struct cgv_node_stash empty_stash = {}, *stash; -+ u64 cgid = cgrp->kn->id; -+ int ret; -+ -+ /* -+ * Technically incorrect as cgroup ID is full 64bit while dq ID is -+ * 63bit. Should not be a problem in practice and easy to spot in the -+ * unlikely case that it breaks. -+ */ -+ ret = scx_bpf_create_dsq(cgid, -1); -+ if (ret) -+ return ret; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!cgc) { -+ ret = -ENOMEM; -+ goto err_destroy_dsq; -+ } -+ -+ cgc->weight = args->weight; -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ -+ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, -+ BPF_NOEXIST); -+ if (ret) { -+ if (ret != -ENOMEM) -+ scx_bpf_error("unexpected stash creation error (%d)", -+ ret); -+ goto err_destroy_dsq; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("unexpected cgv_node stash lookup failure"); -+ ret = -ENOENT; -+ goto err_destroy_dsq; -+ } -+ -+ cgv_node = bpf_obj_new(struct cgv_node); -+ if (!cgv_node) { -+ ret = -ENOMEM; -+ goto err_del_cgv_node; -+ } -+ -+ cgv_node->cgid = cgid; -+ cgv_node->cvtime = cvtime_now; -+ -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ ret = -EBUSY; -+ goto err_drop; -+ } -+ -+ return 0; -+ -+err_drop: -+ bpf_obj_drop(cgv_node); -+err_del_cgv_node: -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+err_destroy_dsq: -+ scx_bpf_destroy_dsq(cgid); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ -+ /* -+ * For now, there's no way find and remove the cgv_node if it's on the -+ * cgv_tree. Let's drain them in the dispatch path as they get popped -+ * off the front of the tree. 
-+ */ -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+ scx_bpf_destroy_dsq(cgid); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ struct fcg_cgrp_ctx *from_cgc, *to_cgc; -+ s64 vtime_delta; -+ -+ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ -+ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) -+ return; -+ -+ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; -+ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; -+} -+ -+s32 BPF_STRUCT_OPS(fcg_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops flatcg_ops = { -+ .enqueue = (void *)fcg_enqueue, -+ .dispatch = (void *)fcg_dispatch, -+ .runnable = (void *)fcg_runnable, -+ .running = (void *)fcg_running, -+ .stopping = (void *)fcg_stopping, -+ .quiescent = (void *)fcg_quiescent, -+ .prep_enable = (void *)fcg_prep_enable, -+ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, -+ .cgroup_init = (void *)fcg_cgroup_init, -+ .cgroup_exit = (void *)fcg_cgroup_exit, -+ .cgroup_move = (void *)fcg_cgroup_move, -+ .init = (void *)fcg_init, -+ .exit = (void *)fcg_exit, -+ .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, -+ .name = "flatcg", -+}; -diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c -new file mode 100644 -index 000000000..f824c4b34 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.c -@@ -0,0 +1,221 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_flatcg.h" -+#include "scx_flatcg.skel.h" -+ -+#ifndef FILEID_KERNFS -+#define FILEID_KERNFS 0xfe -+#endif -+ -+const char help_fmt[] = -+"A flattened cgroup hierarchy sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-p]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -i INTERVAL Report interval\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) -+{ -+ FILE *fp; -+ char buf[4096]; -+ char *line, *cur = NULL, *tok; -+ __u64 sum = 0, idle = 0; -+ __u64 delta_sum, delta_idle; -+ int idx; -+ -+ fp = fopen("/proc/stat", "r"); -+ if (!fp) { -+ perror("fopen(\"/proc/stat\")"); -+ return 0.0; -+ } -+ -+ if (!fgets(buf, sizeof(buf), fp)) { -+ perror("fgets(\"/proc/stat\")"); -+ fclose(fp); -+ return 0.0; -+ } -+ fclose(fp); -+ -+ line = buf; -+ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { -+ char *endp = NULL; -+ __u64 v; -+ -+ if (idx == 0) { -+ line = NULL; -+ continue; -+ } -+ v = strtoull(tok, &endp, 0); -+ if (!endp || *endp != '\0') { -+ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", -+ idx, tok); -+ continue; -+ } -+ sum += v; -+ if (idx == 4) -+ idle = v; -+ } -+ -+ delta_sum = sum - *last_sum; -+ delta_idle = idle - *last_idle; -+ *last_sum = sum; -+ *last_idle = idle; -+ -+ return delta_sum ? 
(float)(delta_sum - delta_idle) / delta_sum : 0.0; -+} -+ -+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) -+{ -+ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); -+ -+ for (idx = 0; idx < FCG_NR_STATS; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_flatcg *skel; -+ struct bpf_link *link; -+ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; -+ bool dump_cgrps = false; -+ __u64 last_cpu_sum = 0, last_cpu_idle = 0; -+ __u64 last_stats[FCG_NR_STATS] = {}; -+ unsigned long seq = 0; -+ __s32 opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_flatcg__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:i:dfph")) != -1) { -+ double v; -+ -+ switch (opt) { -+ case 's': -+ v = strtod(optarg, NULL); -+ skel->rodata->cgrp_slice_ns = v * 1000; -+ break; -+ case 'i': -+ v = strtod(optarg, NULL); -+ intv_ts.tv_sec = v; -+ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; -+ break; -+ case 'd': -+ dump_cgrps = true; -+ break; -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ case 'h': -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", -+ (double)skel->rodata->cgrp_slice_ns / 1000000.0, -+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, -+ dump_cgrps); -+ -+ SCX_BUG_ON(scx_flatcg__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.flatcg_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ __u64 acc_stats[FCG_NR_STATS]; -+ __u64 stats[FCG_NR_STATS]; -+ float cpu_util; -+ int i; -+ -+ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); -+ -+ fcg_read_stats(skel, acc_stats); -+ for (i = 0; i < FCG_NR_STATS; i++) -+ stats[i] = acc_stats[i] - last_stats[i]; -+ -+ memcpy(last_stats, acc_stats, sizeof(acc_stats)); -+ -+ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%lu]\n", -+ seq++, cpu_util * 100.0, skel->data->hweight_gen); -+ printf(" act:%6llu deact:%6llu local:%6llu global:%6llu\n", -+ stats[FCG_STAT_ACT], -+ stats[FCG_STAT_DEACT], -+ stats[FCG_STAT_LOCAL], -+ stats[FCG_STAT_GLOBAL]); -+ printf("HWT skip:%6llu race:%6llu cache:%6llu update:%6llu\n", -+ stats[FCG_STAT_HWT_SKIP], -+ stats[FCG_STAT_HWT_RACE], -+ stats[FCG_STAT_HWT_CACHE], -+ stats[FCG_STAT_HWT_UPDATES]); -+ printf("ENQ skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_ENQ_SKIP], -+ stats[FCG_STAT_ENQ_RACE]); -+ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_CNS_KEEP], -+ stats[FCG_STAT_CNS_EXPIRE], -+ stats[FCG_STAT_CNS_EMPTY], -+ stats[FCG_STAT_CNS_GONE]); -+ printf("PNC nocgrp:%6llu next:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_PNC_NO_CGRP], -+ stats[FCG_STAT_PNC_NEXT], -+ stats[FCG_STAT_PNC_EMPTY], -+ stats[FCG_STAT_PNC_GONE]); -+ printf("BAD remove:%6llu\n", -+ acc_stats[FCG_STAT_BAD_REMOVAL]); -+ -+ nanosleep(&intv_ts, NULL); -+ } -+ -+ bpf_link__destroy(link); -+ 
uei_print(&skel->bss->uei); -+ scx_flatcg__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h -new file mode 100644 -index 000000000..490758ed4 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.h -@@ -0,0 +1,49 @@ -+#ifndef __SCX_EXAMPLE_FLATCG_H -+#define __SCX_EXAMPLE_FLATCG_H -+ -+enum { -+ FCG_HWEIGHT_ONE = 1LLU << 16, -+}; -+ -+enum fcg_stat_idx { -+ FCG_STAT_ACT, -+ FCG_STAT_DEACT, -+ FCG_STAT_LOCAL, -+ FCG_STAT_GLOBAL, -+ -+ FCG_STAT_HWT_UPDATES, -+ FCG_STAT_HWT_CACHE, -+ FCG_STAT_HWT_SKIP, -+ FCG_STAT_HWT_RACE, -+ -+ FCG_STAT_ENQ_SKIP, -+ FCG_STAT_ENQ_RACE, -+ -+ FCG_STAT_CNS_KEEP, -+ FCG_STAT_CNS_EXPIRE, -+ FCG_STAT_CNS_EMPTY, -+ FCG_STAT_CNS_GONE, -+ -+ FCG_STAT_PNC_NO_CGRP, -+ FCG_STAT_PNC_NEXT, -+ FCG_STAT_PNC_EMPTY, -+ FCG_STAT_PNC_GONE, -+ -+ FCG_STAT_BAD_REMOVAL, -+ -+ FCG_NR_STATS, -+}; -+ -+struct fcg_cgrp_ctx { -+ u32 nr_active; -+ u32 nr_runnable; -+ u32 queued; -+ u32 weight; -+ u32 hweight; -+ u64 child_weight_sum; -+ u64 hweight_gen; -+ s64 cvtime_delta; -+ u64 tvtime_now; -+}; -+ -+#endif /* __SCX_EXAMPLE_FLATCG_H */ -diff --git a/tools/sched_ext/scx_layered/.gitignore b/tools/sched_ext/scx_layered/.gitignore -new file mode 100644 -index 000000000..186dba259 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/.gitignore -@@ -0,0 +1,3 @@ -+src/bpf/.output -+Cargo.lock -+target -diff --git a/tools/sched_ext/scx_layered/Cargo.toml b/tools/sched_ext/scx_layered/Cargo.toml -new file mode 100644 -index 000000000..6ba1b98d2 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/Cargo.toml -@@ -0,0 +1,30 @@ -+[package] -+name = "scx_layered" -+version = "0.0.1" -+authors = ["Tejun Heo ", "Meta"] -+edition = "2021" -+description = "Userspace scheduling with BPF for Ads" -+license = "GPL-2.0-only" -+ -+[dependencies] -+anyhow = "1.0" -+bitvec = "1.0" -+clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } -+ctrlc = { version = "3.1", features = ["termination"] } -+fb_procfs = "0.7" -+lazy_static = "1.4" -+libbpf-rs = "0.21" -+libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } -+libc = "0.2" -+log = "0.4" -+serde = { version = "1.0", features = ["derive"] } -+serde_json = "1.0" -+simplelog = "0.12" -+ -+[build-dependencies] -+bindgen = { version = "0.61" } -+libbpf-cargo = "0.21" -+glob = "0.3" -+ -+[features] -+enable_backtrace = [] -diff --git a/tools/sched_ext/scx_layered/build.rs b/tools/sched_ext/scx_layered/build.rs -new file mode 100644 -index 000000000..ea0bbd48a ---- /dev/null -+++ b/tools/sched_ext/scx_layered/build.rs -@@ -0,0 +1,77 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+extern crate bindgen; -+ -+use std::env; -+use std::fs::create_dir_all; -+use std::path::Path; -+use std::path::PathBuf; -+ -+use glob::glob; -+use libbpf_cargo::SkeletonBuilder; -+ -+const HEADER_PATH: &str = "src/bpf/layered.h"; -+ -+fn bindgen_layered() { -+ // Tell cargo to invalidate the built crate whenever the wrapper changes -+ println!("cargo:rerun-if-changed={}", HEADER_PATH); -+ -+ // The bindgen::Builder is the main entry point -+ // to bindgen, and lets you build up options for -+ // the resulting bindings. -+ let bindings = bindgen::Builder::default() -+ // The input header we would like to generate -+ // bindings for. -+ .header(HEADER_PATH) -+ // Tell cargo to invalidate the built crate whenever any of the -+ // included header files changed. 
-+ .parse_callbacks(Box::new(bindgen::CargoCallbacks)) -+ // Finish the builder and generate the bindings. -+ .generate() -+ // Unwrap the Result and panic on failure. -+ .expect("Unable to generate bindings"); -+ -+ // Write the bindings to the $OUT_DIR/bindings.rs file. -+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); -+ bindings -+ .write_to_file(out_path.join("layered_sys.rs")) -+ .expect("Couldn't write bindings!"); -+} -+ -+fn gen_bpf_sched(name: &str) { -+ let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); -+ let clang = env::var("SCX_RUST_CLANG").unwrap(); -+ eprintln!("{}", clang); -+ let outpath = format!("./src/bpf/.output/{}.skel.rs", name); -+ let skel = Path::new(&outpath); -+ let src = format!("./src/bpf/{}.bpf.c", name); -+ let obj = format!("./src/bpf/.output/{}.bpf.o", name); -+ SkeletonBuilder::new() -+ .source(src.clone()) -+ .obj(obj) -+ .clang(clang) -+ .clang_args(bpf_cflags) -+ .build_and_generate(skel) -+ .unwrap(); -+ -+ // Trigger rebuild if any .[hc] files are changed in the directory. -+ for path in glob("./src/bpf/*.[hc]").unwrap().filter_map(Result::ok) { -+ println!("cargo:rerun-if-changed={}", path.to_str().unwrap()); -+ } -+} -+ -+fn main() { -+ bindgen_layered(); -+ // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. -+ // Reasons are because the generated skeleton contains compiler attributes -+ // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` -+ // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside -+ // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). -+ // -+ // However, there is hope! When the above feature stabilizes we can clean this -+ // all up. -+ create_dir_all("./src/bpf/.output").unwrap(); -+ gen_bpf_sched("layered"); -+} -diff --git a/tools/sched_ext/scx_layered/rustfmt.toml b/tools/sched_ext/scx_layered/rustfmt.toml -new file mode 100644 -index 000000000..b7258ed0a ---- /dev/null -+++ b/tools/sched_ext/scx_layered/rustfmt.toml -@@ -0,0 +1,8 @@ -+# Get help on options with `rustfmt --help=config` -+# Please keep these in alphabetical order. -+edition = "2021" -+group_imports = "StdExternalCrate" -+imports_granularity = "Item" -+merge_derives = false -+use_field_init_shorthand = true -+version = "Two" -diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c -new file mode 100644 -index 000000000..b0a27f3c7 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/bpf/layered.bpf.c -@@ -0,0 +1,974 @@ -+/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -+#include "../../../scx_common.bpf.h" -+#include "layered.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 debug = 0; -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile u32 nr_possible_cpus = 1; -+const volatile u32 nr_layers = 1; -+const volatile bool smt_enabled = true; -+const volatile unsigned char all_cpus[MAX_CPUS_U8]; -+ -+private(all_cpumask) struct bpf_cpumask __kptr *all_cpumask; -+struct layer layers[MAX_LAYERS]; -+u32 fallback_cpu; -+static u32 preempt_cursor; -+ -+#define dbg(fmt, args...) do { if (debug) bpf_printk(fmt, ##args); } while (0) -+#define trace(fmt, args...) 
do { if (debug > 1) bpf_printk(fmt, ##args); } while (0) -+ -+#include "util.bpf.c" -+#include "../../../ravg_impl.bpf.h" -+ -+struct user_exit_info uei; -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctxs SEC(".maps"); -+ -+static struct cpu_ctx *lookup_cpu_ctx(int cpu) -+{ -+ struct cpu_ctx *cctx; -+ u32 zero = 0; -+ -+ if (cpu < 0) -+ cctx = bpf_map_lookup_elem(&cpu_ctxs, &zero); -+ else -+ cctx = bpf_map_lookup_percpu_elem(&cpu_ctxs, &zero, cpu); -+ -+ if (!cctx) { -+ scx_bpf_error("no cpu_ctx for cpu %d", cpu); -+ return NULL; -+ } -+ -+ return cctx; -+} -+ -+static void gstat_inc(enum global_stat_idx idx, struct cpu_ctx *cctx) -+{ -+ if (idx < 0 || idx >= NR_GSTATS) { -+ scx_bpf_error("invalid global stat idx %d", idx); -+ return; -+ } -+ -+ cctx->gstats[idx]++; -+} -+ -+static void lstat_inc(enum layer_stat_idx idx, struct layer *layer, struct cpu_ctx *cctx) -+{ -+ u64 *vptr; -+ -+ if ((vptr = MEMBER_VPTR(*cctx, .lstats[layer->idx][idx]))) -+ (*vptr)++; -+ else -+ scx_bpf_error("invalid layer or stat idxs: %d, %d", idx, layer->idx); -+} -+ -+struct lock_wrapper { -+ struct bpf_spin_lock lock; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct lock_wrapper); -+ __uint(max_entries, MAX_LAYERS); -+ __uint(map_flags, 0); -+} layer_load_locks SEC(".maps"); -+ -+static void adj_load(u32 layer_idx, s64 adj, u64 now) -+{ -+ struct layer *layer; -+ struct lock_wrapper *lockw; -+ -+ layer = MEMBER_VPTR(layers, [layer_idx]); -+ lockw = bpf_map_lookup_elem(&layer_load_locks, &layer_idx); -+ -+ if (!layer || !lockw) { -+ scx_bpf_error("Can't access layer%d or its load_lock", layer_idx); -+ return; -+ } -+ -+ bpf_spin_lock(&lockw->lock); -+ layer->load += adj; -+ ravg_accumulate(&layer->load_rd, layer->load, now, USAGE_HALF_LIFE); -+ bpf_spin_unlock(&lockw->lock); -+ -+ if (debug && adj < 0 && (s64)layer->load < 0) -+ scx_bpf_error("cpu%d layer%d load underflow (load=%lld adj=%lld)", -+ bpf_get_smp_processor_id(), layer_idx, layer->load, adj); -+} -+ -+struct layer_cpumask_wrapper { -+ struct bpf_cpumask __kptr *cpumask; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct layer_cpumask_wrapper); -+ __uint(max_entries, MAX_LAYERS); -+ __uint(map_flags, 0); -+} layer_cpumasks SEC(".maps"); -+ -+static struct cpumask *lookup_layer_cpumask(int idx) -+{ -+ struct layer_cpumask_wrapper *cpumaskw; -+ -+ if ((cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx))) { -+ return (struct cpumask *)cpumaskw->cpumask; -+ } else { -+ scx_bpf_error("no layer_cpumask"); -+ return NULL; -+ } -+} -+ -+static void refresh_cpumasks(int idx) -+{ -+ struct layer_cpumask_wrapper *cpumaskw; -+ struct layer *layer; -+ int cpu, total = 0; -+ -+ if (!__sync_val_compare_and_swap(&layers[idx].refresh_cpus, 1, 0)) -+ return; -+ -+ cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &idx); -+ -+ bpf_for(cpu, 0, nr_possible_cpus) { -+ u8 *u8_ptr; -+ -+ if ((u8_ptr = MEMBER_VPTR(layers, [idx].cpus[cpu / 8]))) { -+ /* -+ * XXX - The following test should be outside the loop -+ * but that makes the verifier think that -+ * cpumaskw->cpumask might be NULL in the loop. 
-+ */ -+ barrier_var(cpumaskw); -+ if (!cpumaskw || !cpumaskw->cpumask) { -+ scx_bpf_error("can't happen"); -+ return; -+ } -+ -+ if (*u8_ptr & (1 << (cpu % 8))) { -+ bpf_cpumask_set_cpu(cpu, cpumaskw->cpumask); -+ total++; -+ } else { -+ bpf_cpumask_clear_cpu(cpu, cpumaskw->cpumask); -+ } -+ } else { -+ scx_bpf_error("can't happen"); -+ } -+ } -+ -+ // XXX - shouldn't be necessary -+ layer = MEMBER_VPTR(layers, [idx]); -+ if (!layer) { -+ scx_bpf_error("can't happen"); -+ return; -+ } -+ -+ layer->nr_cpus = total; -+ __sync_fetch_and_add(&layer->cpus_seq, 1); -+ trace("LAYER[%d] now has %d cpus, seq=%llu", idx, layer->nr_cpus, layer->cpus_seq); -+} -+ -+SEC("fentry/scheduler_tick") -+int scheduler_tick_fentry(const void *ctx) -+{ -+ int idx; -+ -+ if (bpf_get_smp_processor_id() == 0) -+ bpf_for(idx, 0, nr_layers) -+ refresh_cpumasks(idx); -+ return 0; -+} -+ -+struct task_ctx { -+ int pid; -+ -+ int layer; -+ bool refresh_layer; -+ u64 layer_cpus_seq; -+ struct bpf_cpumask __kptr *layered_cpumask; -+ -+ bool all_cpus_allowed; -+ bool dispatch_local; -+ u64 started_running_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __type(key, pid_t); -+ __type(value, struct task_ctx); -+ __uint(max_entries, MAX_TASKS); -+ __uint(map_flags, 0); -+} task_ctxs SEC(".maps"); -+ -+struct task_ctx *lookup_task_ctx_may_fail(struct task_struct *p) -+{ -+ s32 pid = p->pid; -+ -+ return bpf_map_lookup_elem(&task_ctxs, &pid); -+} -+ -+struct task_ctx *lookup_task_ctx(struct task_struct *p) -+{ -+ struct task_ctx *tctx; -+ s32 pid = p->pid; -+ -+ if ((tctx = bpf_map_lookup_elem(&task_ctxs, &pid))) { -+ return tctx; -+ } else { -+ scx_bpf_error("task_ctx lookup failed"); -+ return NULL; -+ } -+} -+ -+struct layer *lookup_layer(int idx) -+{ -+ if (idx < 0 || idx >= nr_layers) { -+ scx_bpf_error("invalid layer %d", idx); -+ return NULL; -+ } -+ return &layers[idx]; -+} -+ -+SEC("tp_btf/cgroup_attach_task") -+int BPF_PROG(tp_cgroup_attach_task, struct cgroup *cgrp, const char *cgrp_path, -+ struct task_struct *leader, bool threadgroup) -+{ -+ struct task_struct *next; -+ struct task_ctx *tctx; -+ int leader_pid = leader->pid; -+ -+ if (!(tctx = lookup_task_ctx_may_fail(leader))) -+ return 0; -+ tctx->refresh_layer = true; -+ -+ if (!threadgroup) -+ return 0; -+ -+ if (!(next = bpf_task_acquire(leader))) { -+ scx_bpf_error("failed to acquire leader"); -+ return 0; -+ } -+ -+ bpf_repeat(MAX_TASKS) { -+ struct task_struct *p; -+ int pid; -+ -+ p = container_of(next->thread_group.next, struct task_struct, thread_group); -+ bpf_task_release(next); -+ -+ pid = BPF_CORE_READ(p, pid); -+ if (pid == leader_pid) { -+ next = NULL; -+ break; -+ } -+ -+ next = bpf_task_from_pid(pid); -+ if (!next) { -+ scx_bpf_error("thread iteration failed"); -+ break; -+ } -+ -+ if ((tctx = lookup_task_ctx(next))) -+ tctx->refresh_layer = true; -+ } -+ -+ if (next) -+ bpf_task_release(next); -+ return 0; -+} -+ -+SEC("tp_btf/task_rename") -+int BPF_PROG(tp_task_rename, struct task_struct *p, const char *buf) -+{ -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx_may_fail(p))) -+ tctx->refresh_layer = true; -+ return 0; -+} -+ -+static void maybe_refresh_layered_cpumask(struct cpumask *layered_cpumask, -+ struct task_struct *p, struct task_ctx *tctx, -+ const struct cpumask *layer_cpumask) -+{ -+ u64 layer_seq = layers->cpus_seq; -+ -+ if (tctx->layer_cpus_seq == layer_seq) -+ return; -+ -+ /* -+ * XXX - We're assuming that the updated @layer_cpumask matching the new -+ * @layer_seq is visible which may not be true. 
For now, leave it as-is. -+ * Let's update once BPF grows enough memory ordering constructs. -+ */ -+ bpf_cpumask_and((struct bpf_cpumask *)layered_cpumask, layer_cpumask, p->cpus_ptr); -+ tctx->layer_cpus_seq = layer_seq; -+ trace("%s[%d] cpumask refreshed to seq %llu", p->comm, p->pid, layer_seq); -+} -+ -+static s32 pick_idle_cpu_from(const struct cpumask *cand_cpumask, s32 prev_cpu, -+ const struct cpumask *idle_cpumask, -+ const struct cpumask *idle_smtmask) -+{ -+ bool prev_in_cand = bpf_cpumask_test_cpu(prev_cpu, cand_cpumask); -+ s32 cpu; -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (smt_enabled) { -+ if (prev_in_cand && -+ bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) -+ return prev_cpu; -+ -+ cpu = scx_bpf_pick_idle_cpu(cand_cpumask, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) -+ return cpu; -+ } -+ -+ if (prev_in_cand && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) -+ return prev_cpu; -+ -+ return scx_bpf_pick_idle_cpu(cand_cpumask, 0); -+} -+ -+s32 BPF_STRUCT_OPS(layered_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ const struct cpumask *idle_cpumask, *idle_smtmask; -+ struct cpumask *layer_cpumask, *layered_cpumask; -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ struct layer *layer; -+ s32 cpu; -+ -+ /* look up everything we need */ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || -+ !(layered_cpumask = (struct cpumask *)tctx->layered_cpumask)) -+ return prev_cpu; -+ -+ /* -+ * We usually update the layer in layered_runnable() to avoid confusing. -+ * As layered_select_cpu() takes place before runnable, new tasks would -+ * still have -1 layer. Just return @prev_cpu. -+ */ -+ if (tctx->layer < 0) -+ return prev_cpu; -+ -+ if (!(layer = lookup_layer(tctx->layer)) || -+ !(layer_cpumask = lookup_layer_cpumask(tctx->layer))) -+ return prev_cpu; -+ -+ if (!(idle_cpumask = scx_bpf_get_idle_cpumask())) -+ return prev_cpu; -+ -+ if (!(idle_smtmask = scx_bpf_get_idle_smtmask())) { -+ cpu = prev_cpu; -+ goto out_put_idle_cpumask; -+ } -+ -+ /* not much to do if bound to a single CPU */ -+ if (p->nr_cpus_allowed == 1) { -+ cpu = prev_cpu; -+ if (scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ if (!bpf_cpumask_test_cpu(cpu, layer_cpumask)) -+ lstat_inc(LSTAT_AFFN_VIOL, layer, cctx); -+ goto dispatch_local; -+ } else { -+ goto out_put_cpumasks; -+ } -+ } -+ -+ maybe_refresh_layered_cpumask(layered_cpumask, p, tctx, layer_cpumask); -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if ((cpu = pick_idle_cpu_from(layered_cpumask, prev_cpu, -+ idle_cpumask, idle_smtmask)) >= 0) -+ goto dispatch_local; -+ -+ /* -+ * If the layer is an open one, we can try the whole machine. 
-+ */ -+ if (layer->open && -+ ((cpu = pick_idle_cpu_from(p->cpus_ptr, prev_cpu, -+ idle_cpumask, idle_smtmask)) >= 0)) { -+ lstat_inc(LSTAT_OPEN_IDLE, layer, cctx); -+ goto dispatch_local; -+ } -+ -+ cpu = prev_cpu; -+ goto out_put_cpumasks; -+ -+dispatch_local: -+ tctx->dispatch_local = true; -+out_put_cpumasks: -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+out_put_idle_cpumask: -+ scx_bpf_put_idle_cpumask(idle_cpumask); -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(layered_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ struct layer *layer; -+ u64 vtime = p->scx.dsq_vtime; -+ u32 idx; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || -+ !(layer = lookup_layer(tctx->layer))) -+ return; -+ -+ if (tctx->dispatch_local) { -+ tctx->dispatch_local = false; -+ lstat_inc(LSTAT_LOCAL, layer, cctx); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ lstat_inc(LSTAT_GLOBAL, layer, cctx); -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, layer->vtime_now - slice_ns)) -+ vtime = layer->vtime_now - slice_ns; -+ -+ if (!tctx->all_cpus_allowed) { -+ lstat_inc(LSTAT_AFFN_VIOL, layer, cctx); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ scx_bpf_dispatch_vtime(p, tctx->layer, slice_ns, vtime, enq_flags); -+ -+ if (!layer->preempt) -+ return; -+ -+ bpf_for(idx, 0, nr_possible_cpus) { -+ struct cpu_ctx *cand_cctx; -+ u32 cpu = (preempt_cursor + idx) % nr_possible_cpus; -+ -+ if (!all_cpumask || -+ !bpf_cpumask_test_cpu(cpu, (const struct cpumask *)all_cpumask)) -+ continue; -+ if (!(cand_cctx = lookup_cpu_ctx(cpu)) || cand_cctx->current_preempt) -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ -+ /* -+ * Round-robining doesn't have to be strict. Let's not bother -+ * with atomic ops on $preempt_cursor. 
-+ */ -+ preempt_cursor = (cpu + 1) % nr_possible_cpus; -+ -+ lstat_inc(LSTAT_PREEMPT, layer, cctx); -+ break; -+ } -+} -+ -+void BPF_STRUCT_OPS(layered_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ int idx; -+ -+ /* consume preempting layers first */ -+ bpf_for(idx, 0, nr_layers) -+ if (layers[idx].preempt && scx_bpf_consume(idx)) -+ return; -+ -+ /* consume !open layers second */ -+ bpf_for(idx, 0, nr_layers) { -+ struct layer *layer = &layers[idx]; -+ struct cpumask *layer_cpumask; -+ -+ if (layer->open) -+ continue; -+ -+ /* consume matching layers */ -+ if (!(layer_cpumask = lookup_layer_cpumask(idx))) -+ return; -+ -+ if (bpf_cpumask_test_cpu(cpu, layer_cpumask) || -+ (cpu == fallback_cpu && layer->nr_cpus == 0)) { -+ if (scx_bpf_consume(idx)) -+ return; -+ } -+ } -+ -+ /* consume !preempting open layers */ -+ bpf_for(idx, 0, nr_layers) { -+ if (!layers[idx].preempt && layers[idx].open && -+ scx_bpf_consume(idx)) -+ return; -+ } -+} -+ -+static bool match_one(struct layer_match *match, struct task_struct *p, const char *cgrp_path) -+{ -+ switch (match->kind) { -+ case MATCH_CGROUP_PREFIX: { -+ return match_prefix(match->cgroup_prefix, cgrp_path, MAX_PATH); -+ } -+ case MATCH_COMM_PREFIX: { -+ char comm[MAX_COMM]; -+ memcpy(comm, p->comm, MAX_COMM); -+ return match_prefix(match->comm_prefix, comm, MAX_COMM); -+ } -+ case MATCH_NICE_ABOVE: -+ return (s32)p->static_prio - 120 > match->nice_above_or_below; -+ case MATCH_NICE_BELOW: -+ return (s32)p->static_prio - 120 < match->nice_above_or_below; -+ default: -+ scx_bpf_error("invalid match kind %d", match->kind); -+ return false; -+ } -+} -+ -+static bool match_layer(struct layer *layer, struct task_struct *p, const char *cgrp_path) -+{ -+ u32 nr_match_ors = layer->nr_match_ors; -+ u64 or_idx, and_idx; -+ -+ if (nr_match_ors > MAX_LAYER_MATCH_ORS) { -+ scx_bpf_error("too many ORs"); -+ return false; -+ } -+ -+ bpf_for(or_idx, 0, nr_match_ors) { -+ struct layer_match_ands *ands; -+ bool matched = true; -+ -+ barrier_var(or_idx); -+ if (or_idx >= MAX_LAYER_MATCH_ORS) -+ return false; /* can't happen */ -+ ands = &layer->matches[or_idx]; -+ -+ if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { -+ scx_bpf_error("too many ANDs"); -+ return false; -+ } -+ -+ bpf_for(and_idx, 0, ands->nr_match_ands) { -+ struct layer_match *match; -+ -+ barrier_var(and_idx); -+ if (and_idx >= NR_LAYER_MATCH_KINDS) -+ return false; /* can't happen */ -+ match = &ands->matches[and_idx]; -+ -+ if (!match_one(match, p, cgrp_path)) { -+ matched = false; -+ break; -+ } -+ } -+ -+ if (matched) -+ return true; -+ } -+ -+ return false; -+} -+ -+static void maybe_refresh_layer(struct task_struct *p, struct task_ctx *tctx) -+{ -+ const char *cgrp_path; -+ bool matched = false; -+ u64 idx; // XXX - int makes verifier unhappy -+ -+ if (!tctx->refresh_layer) -+ return; -+ tctx->refresh_layer = false; -+ -+ if (!(cgrp_path = format_cgrp_path(p->cgroups->dfl_cgrp))) -+ return; -+ -+ if (tctx->layer >= 0 && tctx->layer < nr_layers) -+ __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); -+ -+ bpf_for(idx, 0, nr_layers) { -+ if (match_layer(&layers[idx], p, cgrp_path)) { -+ matched = true; -+ break; -+ } -+ } -+ -+ if (matched) { -+ struct layer *layer = &layers[idx]; -+ -+ tctx->layer = idx; -+ tctx->layer_cpus_seq = layer->cpus_seq - 1; -+ __sync_fetch_and_add(&layer->nr_tasks, 1); -+ /* -+ * XXX - To be correct, we'd need to calculate the vtime -+ * delta in the previous layer, scale it by the load -+ * fraction difference and then offset from the new -+ * layer's 
vtime_now. For now, just do the simple thing -+ * and assume the offset to be zero. -+ * -+ * Revisit if high frequency dynamic layer switching -+ * needs to be supported. -+ */ -+ p->scx.dsq_vtime = layer->vtime_now; -+ } else { -+ scx_bpf_error("[%s]%d didn't match any layer", p->comm, p->pid); -+ } -+ -+ if (tctx->layer < nr_layers - 1) -+ trace("LAYER=%d %s[%d] cgrp=\"%s\"", -+ tctx->layer, p->comm, p->pid, cgrp_path); -+} -+ -+void BPF_STRUCT_OPS(layered_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ struct task_ctx *tctx; -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return; -+ -+ maybe_refresh_layer(p, tctx); -+ -+ adj_load(tctx->layer, p->scx.weight, now); -+} -+ -+void BPF_STRUCT_OPS(layered_running, struct task_struct *p) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ struct layer *layer; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p)) || -+ !(layer = lookup_layer(tctx->layer))) -+ return; -+ -+ if (vtime_before(layer->vtime_now, p->scx.dsq_vtime)) -+ layer->vtime_now = p->scx.dsq_vtime; -+ -+ cctx->current_preempt = layer->preempt; -+ tctx->started_running_at = bpf_ktime_get_ns(); -+} -+ -+void BPF_STRUCT_OPS(layered_stopping, struct task_struct *p, bool runnable) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ u64 used; -+ u32 layer; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) -+ return; -+ -+ layer = tctx->layer; -+ if (layer >= nr_layers) { -+ scx_bpf_error("invalid layer %u", layer); -+ return; -+ } -+ -+ used = bpf_ktime_get_ns() - tctx->started_running_at; -+ cctx->layer_cycles[layer] += used; -+ cctx->current_preempt = false; -+ -+ /* scale the execution time by the inverse of the weight and charge */ -+ p->scx.dsq_vtime += used * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(layered_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx(p))) -+ adj_load(tctx->layer, -(s64)p->scx.weight, bpf_ktime_get_ns()); -+} -+ -+void BPF_STRUCT_OPS(layered_set_weight, struct task_struct *p, u32 weight) -+{ -+ struct task_ctx *tctx; -+ -+ if ((tctx = lookup_task_ctx(p))) -+ tctx->refresh_layer = true; -+} -+ -+void BPF_STRUCT_OPS(layered_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{ -+ struct task_ctx *tctx; -+ -+ if (!(tctx = lookup_task_ctx(p))) -+ return; -+ -+ if (!all_cpumask) { -+ scx_bpf_error("NULL all_cpumask"); -+ return; -+ } -+ -+ tctx->all_cpus_allowed = -+ bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); -+} -+ -+s32 BPF_STRUCT_OPS(layered_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct task_ctx tctx_init = { -+ .pid = p->pid, -+ .layer = -1, -+ .refresh_layer = true, -+ }; -+ struct task_ctx *tctx; -+ struct bpf_cpumask *cpumask; -+ s32 pid = p->pid; -+ s32 ret; -+ -+ if (all_cpumask) -+ tctx_init.all_cpus_allowed = -+ bpf_cpumask_subset((const struct cpumask *)all_cpumask, p->cpus_ptr); -+ else -+ scx_bpf_error("missing all_cpumask"); -+ -+ /* -+ * XXX - We want BPF_NOEXIST but bpf_map_delete_elem() in .disable() may -+ * fail spuriously due to BPF recursion protection triggering -+ * unnecessarily. -+ */ -+ if ((ret = bpf_map_update_elem(&task_ctxs, &pid, &tctx_init, 0 /*BPF_NOEXIST*/))) { -+ scx_bpf_error("task_ctx allocation failure, ret=%d", ret); -+ return ret; -+ } -+ -+ /* -+ * Read the entry from the map immediately so we can add the cpumask -+ * with bpf_kptr_xchg(). 
-+ */ -+ if (!(tctx = lookup_task_ctx(p))) -+ return -ENOENT; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ bpf_map_delete_elem(&task_ctxs, &pid); -+ return -ENOMEM; -+ } -+ -+ cpumask = bpf_kptr_xchg(&tctx->layered_cpumask, cpumask); -+ if (cpumask) { -+ /* Should never happen as we just inserted it above. */ -+ bpf_cpumask_release(cpumask); -+ bpf_map_delete_elem(&task_ctxs, &pid); -+ return -EINVAL; -+ } -+ -+ /* -+ * We are matching cgroup hierarchy path directly rather than the CPU -+ * controller path. As the former isn't available during the scheduler -+ * fork path, let's delay the layer selection until the first -+ * runnable(). -+ */ -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(layered_cancel_enable, struct task_struct *p) -+{ -+ s32 pid = p->pid; -+ -+ bpf_map_delete_elem(&task_ctxs, &pid); -+} -+ -+void BPF_STRUCT_OPS(layered_disable, struct task_struct *p) -+{ -+ struct cpu_ctx *cctx; -+ struct task_ctx *tctx; -+ s32 pid = p->pid; -+ int ret; -+ -+ if (!(cctx = lookup_cpu_ctx(-1)) || !(tctx = lookup_task_ctx(p))) -+ return; -+ -+ if (tctx->layer >= 0 && tctx->layer < nr_layers) -+ __sync_fetch_and_add(&layers[tctx->layer].nr_tasks, -1); -+ -+ /* -+ * XXX - There's no reason delete should fail here but BPF's recursion -+ * protection can unnecessarily fail the operation. The fact that -+ * deletions aren't reliable means that we sometimes leak task_ctx and -+ * can't use BPF_NOEXIST on allocation in .prep_enable(). -+ */ -+ ret = bpf_map_delete_elem(&task_ctxs, &pid); -+ if (ret) -+ gstat_inc(GSTAT_TASK_CTX_FREE_FAILED, cctx); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(layered_init) -+{ -+ struct bpf_cpumask *cpumask; -+ int i, j, k, nr_online_cpus, ret; -+ -+ scx_bpf_switch_all(); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ nr_online_cpus = 0; -+ bpf_for(i, 0, nr_possible_cpus) { -+ const volatile u8 *u8_ptr; -+ -+ if ((u8_ptr = MEMBER_VPTR(all_cpus, [i / 8]))) { -+ if (*u8_ptr & (1 << (i % 8))) { -+ bpf_cpumask_set_cpu(i, cpumask); -+ nr_online_cpus++; -+ } -+ } else { -+ return -EINVAL; -+ } -+ } -+ -+ cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ dbg("CFG: Dumping configuration, nr_online_cpus=%d smt_enabled=%d", -+ nr_online_cpus, smt_enabled); -+ -+ bpf_for(i, 0, nr_layers) { -+ struct layer *layer = &layers[i]; -+ -+ dbg("CFG LAYER[%d] open=%d preempt=%d", -+ i, layer->open, layer->preempt); -+ -+ if (layer->nr_match_ors > MAX_LAYER_MATCH_ORS) { -+ scx_bpf_error("too many ORs"); -+ return -EINVAL; -+ } -+ -+ bpf_for(j, 0, layer->nr_match_ors) { -+ struct layer_match_ands *ands = MEMBER_VPTR(layers, [i].matches[j]); -+ if (!ands) { -+ scx_bpf_error("shouldn't happen"); -+ return -EINVAL; -+ } -+ -+ if (ands->nr_match_ands > NR_LAYER_MATCH_KINDS) { -+ scx_bpf_error("too many ANDs"); -+ return -EINVAL; -+ } -+ -+ dbg("CFG OR[%02d]", j); -+ -+ bpf_for(k, 0, ands->nr_match_ands) { -+ char header[32]; -+ u64 header_data[1] = { k }; -+ struct layer_match *match; -+ -+ bpf_snprintf(header, sizeof(header), "CFG AND[%02d]:", -+ header_data, sizeof(header_data)); -+ -+ match = MEMBER_VPTR(layers, [i].matches[j].matches[k]); -+ if (!match) { -+ scx_bpf_error("shouldn't happen"); -+ return -EINVAL; -+ } -+ -+ switch (match->kind) { -+ case MATCH_CGROUP_PREFIX: -+ dbg("%s CGROUP_PREFIX \"%s\"", header, match->cgroup_prefix); -+ break; -+ case MATCH_COMM_PREFIX: -+ dbg("%s COMM_PREFIX \"%s\"", header, match->comm_prefix); -+ break; -+ case MATCH_NICE_ABOVE: -+ dbg("%s NICE_ABOVE %d", 
header, match->nice_above_or_below); -+ break; -+ case MATCH_NICE_BELOW: -+ dbg("%s NICE_BELOW %d", header, match->nice_above_or_below); -+ break; -+ default: -+ scx_bpf_error("%s Invalid kind", header); -+ return -EINVAL; -+ } -+ } -+ if (ands->nr_match_ands == 0) -+ dbg("CFG DEFAULT"); -+ } -+ } -+ -+ bpf_for(i, 0, nr_layers) { -+ struct layer_cpumask_wrapper *cpumaskw; -+ -+ layers[i].idx = i; -+ -+ ret = scx_bpf_create_dsq(i, -1); -+ if (ret < 0) -+ return ret; -+ -+ if (!(cpumaskw = bpf_map_lookup_elem(&layer_cpumasks, &i))) -+ return -ENONET; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ /* -+ * Start all layers with full cpumask so that everything runs -+ * everywhere. This will soon be updated by refresh_cpumasks() -+ * once the scheduler starts running. -+ */ -+ bpf_cpumask_setall(cpumask); -+ -+ cpumask = bpf_kptr_xchg(&cpumaskw->cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ } -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(layered_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops layered = { -+ .select_cpu = (void *)layered_select_cpu, -+ .enqueue = (void *)layered_enqueue, -+ .dispatch = (void *)layered_dispatch, -+ .runnable = (void *)layered_runnable, -+ .running = (void *)layered_running, -+ .stopping = (void *)layered_stopping, -+ .quiescent = (void *)layered_quiescent, -+ .set_weight = (void *)layered_set_weight, -+ .set_cpumask = (void *)layered_set_cpumask, -+ .prep_enable = (void *)layered_prep_enable, -+ .cancel_enable = (void *)layered_cancel_enable, -+ .disable = (void *)layered_disable, -+ .init = (void *)layered_init, -+ .exit = (void *)layered_exit, -+ .name = "layered", -+}; -diff --git a/tools/sched_ext/scx_layered/src/bpf/layered.h b/tools/sched_ext/scx_layered/src/bpf/layered.h -new file mode 100644 -index 000000000..bedfa0650 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/bpf/layered.h -@@ -0,0 +1,100 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. 
-+#ifndef __LAYERED_H -+#define __LAYERED_H -+ -+#include -+#ifndef __kptr -+#ifdef __KERNEL__ -+#error "__kptr_ref not defined in the kernel" -+#endif -+#define __kptr -+#endif -+ -+#ifndef __KERNEL__ -+typedef unsigned long long u64; -+typedef long long s64; -+#endif -+ -+#include "../../../ravg.bpf.h" -+ -+enum consts { -+ MAX_CPUS_SHIFT = 9, -+ MAX_CPUS = 1 << MAX_CPUS_SHIFT, -+ MAX_CPUS_U8 = MAX_CPUS / 8, -+ MAX_TASKS = 131072, -+ MAX_PATH = 4096, -+ MAX_COMM = 16, -+ MAX_LAYER_MATCH_ORS = 32, -+ MAX_LAYERS = 16, -+ USAGE_HALF_LIFE = 100000000, /* 100ms */ -+ -+ /* XXX remove */ -+ MAX_CGRP_PREFIXES = 32 -+}; -+ -+/* Statistics */ -+enum global_stat_idx { -+ GSTAT_TASK_CTX_FREE_FAILED, -+ NR_GSTATS, -+}; -+ -+enum layer_stat_idx { -+ LSTAT_LOCAL, -+ LSTAT_GLOBAL, -+ LSTAT_OPEN_IDLE, -+ LSTAT_AFFN_VIOL, -+ LSTAT_PREEMPT, -+ NR_LSTATS, -+}; -+ -+struct cpu_ctx { -+ bool current_preempt; -+ u64 layer_cycles[MAX_LAYERS]; -+ u64 gstats[NR_GSTATS]; -+ u64 lstats[MAX_LAYERS][NR_LSTATS]; -+}; -+ -+enum layer_match_kind { -+ MATCH_CGROUP_PREFIX, -+ MATCH_COMM_PREFIX, -+ MATCH_NICE_ABOVE, -+ MATCH_NICE_BELOW, -+ -+ NR_LAYER_MATCH_KINDS, -+}; -+ -+struct layer_match { -+ int kind; -+ char cgroup_prefix[MAX_PATH]; -+ char comm_prefix[MAX_COMM]; -+ int nice_above_or_below; -+}; -+ -+struct layer_match_ands { -+ struct layer_match matches[NR_LAYER_MATCH_KINDS]; -+ int nr_match_ands; -+}; -+ -+struct layer { -+ struct layer_match_ands matches[MAX_LAYER_MATCH_ORS]; -+ unsigned int nr_match_ors; -+ unsigned int idx; -+ bool open; -+ bool preempt; -+ -+ u64 vtime_now; -+ u64 nr_tasks; -+ -+ u64 load; -+ struct ravg_data load_rd; -+ -+ u64 cpus_seq; -+ unsigned int refresh_cpus; -+ unsigned char cpus[MAX_CPUS_U8]; -+ unsigned int nr_cpus; // managed from BPF side -+}; -+ -+#endif /* __LAYERED_H */ -diff --git a/tools/sched_ext/scx_layered/src/bpf/util.bpf.c b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c -new file mode 100644 -index 000000000..703e0eece ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/bpf/util.bpf.c -@@ -0,0 +1,68 @@ -+/* to be included in the main bpf.c file */ -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ /* double size because verifier can't follow length calculation */ -+ __uint(value_size, 2 * MAX_PATH); -+ __uint(max_entries, 1); -+} cgrp_path_bufs SEC(".maps"); -+ -+static char *format_cgrp_path(struct cgroup *cgrp) -+{ -+ u32 zero = 0; -+ char *path = bpf_map_lookup_elem(&cgrp_path_bufs, &zero); -+ u32 len = 0, level, max_level; -+ -+ if (!path) { -+ scx_bpf_error("cgrp_path_buf lookup failed"); -+ return NULL; -+ } -+ -+ max_level = cgrp->level; -+ if (max_level > 127) -+ max_level = 127; -+ -+ bpf_for(level, 1, max_level + 1) { -+ int ret; -+ -+ if (level > 1 && len < MAX_PATH - 1) -+ path[len++] = '/'; -+ -+ if (len >= MAX_PATH - 1) { -+ scx_bpf_error("cgrp_path_buf overflow"); -+ return NULL; -+ } -+ -+ ret = bpf_probe_read_kernel_str(path + len, MAX_PATH - len - 1, -+ BPF_CORE_READ(cgrp, ancestors[level], kn, name)); -+ if (ret < 0) { -+ scx_bpf_error("bpf_probe_read_kernel_str failed"); -+ return NULL; -+ } -+ -+ len += ret - 1; -+ } -+ -+ if (len >= MAX_PATH - 2) { -+ scx_bpf_error("cgrp_path_buf overflow"); -+ return NULL; -+ } -+ path[len] = '/'; -+ path[len + 1] = '\0'; -+ -+ return path; -+} -+ -+static inline bool match_prefix(const char *prefix, const char *str, u32 max_len) -+{ -+ int c; -+ -+ bpf_for(c, 0, max_len) { -+ if (prefix[c] == '\0') -+ return true; -+ if (str[c] != prefix[c]) -+ return false; -+ } -+ 
return false; -+} -diff --git a/tools/sched_ext/scx_layered/src/layered_sys.rs b/tools/sched_ext/scx_layered/src/layered_sys.rs -new file mode 100644 -index 000000000..afc821d38 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/layered_sys.rs -@@ -0,0 +1,10 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+#![allow(non_upper_case_globals)] -+#![allow(non_camel_case_types)] -+#![allow(non_snake_case)] -+#![allow(dead_code)] -+ -+include!(concat!(env!("OUT_DIR"), "/layered_sys.rs")); -diff --git a/tools/sched_ext/scx_layered/src/main.rs b/tools/sched_ext/scx_layered/src/main.rs -new file mode 100644 -index 000000000..7eb2edf53 ---- /dev/null -+++ b/tools/sched_ext/scx_layered/src/main.rs -@@ -0,0 +1,1641 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+#[path = "bpf/.output/layered.skel.rs"] -+mod layered; -+pub use layered::*; -+pub mod layered_sys; -+ -+use std::collections::BTreeMap; -+use std::collections::BTreeSet; -+use std::ffi::CStr; -+use std::ffi::CString; -+use std::fs; -+use std::io::Read; -+use std::io::Write; -+use std::ops::Sub; -+use std::sync::atomic::AtomicBool; -+use std::sync::atomic::Ordering; -+use std::sync::Arc; -+use std::time::Duration; -+use std::time::Instant; -+ -+use ::fb_procfs as procfs; -+use anyhow::anyhow; -+use anyhow::bail; -+use anyhow::Context; -+use anyhow::Result; -+use bitvec::prelude::*; -+use clap::Parser; -+use libbpf_rs::skel::OpenSkel as _; -+use libbpf_rs::skel::Skel as _; -+use libbpf_rs::skel::SkelBuilder as _; -+use log::debug; -+use log::info; -+use log::trace; -+use serde::Deserialize; -+use serde::Serialize; -+ -+const RAVG_FRAC_BITS: u32 = layered_sys::ravg_consts_RAVG_FRAC_BITS; -+const MAX_CPUS: usize = layered_sys::consts_MAX_CPUS as usize; -+const MAX_PATH: usize = layered_sys::consts_MAX_PATH as usize; -+const MAX_COMM: usize = layered_sys::consts_MAX_COMM as usize; -+const MAX_LAYER_MATCH_ORS: usize = layered_sys::consts_MAX_LAYER_MATCH_ORS as usize; -+const MAX_LAYERS: usize = layered_sys::consts_MAX_LAYERS as usize; -+const USAGE_HALF_LIFE: u32 = layered_sys::consts_USAGE_HALF_LIFE; -+const USAGE_HALF_LIFE_F64: f64 = USAGE_HALF_LIFE as f64 / 1_000_000_000.0; -+const NR_GSTATS: usize = layered_sys::global_stat_idx_NR_GSTATS as usize; -+const NR_LSTATS: usize = layered_sys::layer_stat_idx_NR_LSTATS as usize; -+const NR_LAYER_MATCH_KINDS: usize = layered_sys::layer_match_kind_NR_LAYER_MATCH_KINDS as usize; -+const CORE_CACHE_LEVEL: u32 = 2; -+ -+include!("../../ravg_read.rs.h"); -+ -+lazy_static::lazy_static! { -+ static ref NR_POSSIBLE_CPUS: usize = libbpf_rs::num_possible_cpus().unwrap(); -+ static ref USAGE_DECAY: f64 = 0.5f64.powf(1.0 / USAGE_HALF_LIFE_F64); -+} -+ -+/// scx_layered: A highly configurable multi-layer sched_ext scheduler -+/// -+/// scx_layered allows classifying tasks into multiple layers and applying -+/// different scheduling policies to them. The configuration is specified in -+/// json and composed of two parts - matches and policies. -+/// -+/// Matches -+/// ======= -+/// -+/// Whenever a task is forked or its attributes are changed, the task goes -+/// through a series of matches to determine the layer it belongs to. A -+/// match set is composed of OR groups of AND blocks. 
An example: -+/// -+/// "matches": [ -+/// [ -+/// { -+/// "CgroupPrefix": "system.slice/" -+/// } -+/// ], -+/// [ -+/// { -+/// "CommPrefix": "fbagent" -+/// }, -+/// { -+/// "NiceAbove": 0 -+/// } -+/// ] -+/// ], -+/// -+/// The outer array contains the OR groups and the inner AND blocks, so the -+/// above matches: -+/// -+/// * Tasks which are in the cgroup sub-hierarchy under "system.slice". -+/// * Or tasks whose comm starts with "fbagent" and have a nice value > 0. -+/// -+/// Currently, the following matches are supported: -+/// -+/// * CgroupPrefix: Matches the prefix of the cgroup that the task belongs -+/// to. As this is a string match, whether the pattern has the trailing -+/// '/' makes a difference. For example, "TOP/CHILD/" only matches tasks -+/// which are under that particular cgroup while "TOP/CHILD" also matches -+/// tasks under "TOP/CHILD0/" or "TOP/CHILD1/". -+/// -+/// * CommPrefix: Matches the task's comm prefix. -+/// -+/// * NiceAbove: Matches if the task's nice value is greater than the -+/// pattern. -+/// -+/// * NiceBelow: Matches if the task's nice value is smaller than the -+/// pattern. -+/// -+/// While there are complexity limitations as the matches are performed in -+/// BPF, it is straightforward to add more types of matches. -+/// -+/// Policies -+/// ======== -+/// -+/// The following is an example policy configuration for a layer. -+/// -+/// "kind": { -+/// "Confined": { -+/// "cpus_range": [1, 8], -+/// "util_range": [0.8, 0.9] -+/// } -+/// } -+/// -+/// It's of "Confined" kind, which tries to concentrate the layer's tasks -+/// into a limited number of CPUs. In the above case, the number of CPUs -+/// assigned to the layer is scaled between 1 and 8 so that the per-cpu -+/// utilization is kept between 80% and 90%. If the CPUs are loaded higher -+/// than 90%, more CPUs are allocated to the layer. If the utilization drops -+/// below 80%, the layer loses CPUs. -+/// -+/// Currently, the following policy kinds are supported: -+/// -+/// * Confined: Tasks are restricted to the allocated CPUs. The number of -+/// CPUs allocated is modulated to keep the per-CPU utilization in -+/// "util_range". The range can optionally be restricted with the -+/// "cpus_range" property. -+/// -+/// * Grouped: Similar to Confined but tasks may spill outside if there are -+/// idle CPUs outside the allocated ones. If "preempt" is true, tasks in -+/// this layer will preempt tasks which belong to other non-preempting -+/// layers when no idle CPUs are available. -+/// -+/// * Open: Prefer the CPUs which are not occupied by Confined or Grouped -+/// layers. Tasks in this group will spill into occupied CPUs if there are -+/// no unoccupied idle CPUs. If "preempt" is true, tasks in this layer -+/// will preempt tasks which belong to other non-preempting layers when no -+/// idle CPUs are available. -+/// -+/// Similar to matches, adding new policies and extending existing ones -+/// should be relatively straightforward. -+/// -+/// Configuration example and running scx_layered -+/// ============================================= -+/// -+/// A scx_layered config is composed of layer configs and a layer config is -+/// composed of a name, a set of matches and a policy block. Running the -+/// following will write an example configuration into example.json. -+/// -+/// $ scx_layered -e example.json -+/// -+/// Note that the last layer in the configuration must have an empty match -+/// set as it must match all tasks which haven't been matched into previous -+/// layers. 
-+/// -+/// The configuration can be specified in multiple json files and command -+/// line arguments. Each must contain valid layer configurations and they're -+/// concatenated in the specified order. In most cases, something like the -+/// following should do. -+/// -+/// $ scx_layered file:example.json -+/// -+/// Statistics -+/// ========== -+/// -+/// scx_layered will print out a set of statistics every monitoring -+/// interval. -+/// -+/// tot= 117909 local=86.20 open_idle= 0.21 affn_viol= 1.37 tctx_err=9 proc=6ms -+/// busy= 34.2 util= 1733.6 load= 21744.1 fallback_cpu= 1 -+/// batch : util/frac= 11.8/ 0.7 load/frac= 29.7: 0.1 tasks= 2597 -+/// tot= 3478 local=67.80 open_idle= 0.00 preempt= 0.00 affn_viol= 0.00 -+/// cpus= 2 [ 2, 2] 04000001 00000000 -+/// immediate: util/frac= 1218.8/ 70.3 load/frac= 21399.9: 98.4 tasks= 1107 -+/// tot= 68997 local=90.57 open_idle= 0.26 preempt= 9.36 affn_viol= 0.00 -+/// cpus= 50 [ 50, 50] fbfffffe 000fffff -+/// normal : util/frac= 502.9/ 29.0 load/frac= 314.5: 1.4 tasks= 3512 -+/// tot= 45434 local=80.97 open_idle= 0.16 preempt= 0.00 affn_viol= 3.56 -+/// cpus= 50 [ 50, 50] fbfffffe 000fffff -+/// -+/// Global statistics: -+/// -+/// - tot: Total scheduling events in the period. -+/// -+/// - local: % that got scheduled directly into an idle CPU. -+/// -+/// - open_idle: % of open layer tasks scheduled into occupied idle CPUs. -+/// -+/// - affn_viol: % which violated configured policies due to CPU affinity -+/// restrictions. -+/// -+/// - proc: CPU time this binary consumed during the period. -+/// -+/// - busy: CPU busy % (100% means all CPUs were fully occupied) -+/// -+/// - util: CPU utilization % (100% means one CPU was fully occupied) -+/// -+/// - load: Sum of weight * duty_cycle for all tasks -+/// -+/// Per-layer statistics: -+/// -+/// - util/frac: CPU utilization and fraction % (sum of fractions across -+/// layers is always 100%). -+/// -+/// - load/frac: Load sum and fraction %. -+/// -+/// - tasks: Number of tasks. -+/// -+/// - tot: Total scheduling events. -+/// -+/// - open_idle: % of tasks scheduled into idle CPUs occupied by other layers. -+/// -+/// - preempt: % of tasks that preempted other tasks. -+/// -+/// - affn_viol: % which violated configured policies due to CPU affinity -+/// restrictions. -+/// -+/// - cpus: CUR_NR_CPUS [MIN_NR_CPUS, MAX_NR_CPUS] CUR_CPU_MASK -+/// -+#[derive(Debug, Parser)] -+#[command(verbatim_doc_comment)] -+struct Opts { -+ /// Scheduling slice duration in microseconds. -+ #[clap(short = 's', long, default_value = "20000")] -+ slice_us: u64, -+ -+ /// Scheduling interval in seconds. -+ #[clap(short = 'i', long, default_value = "0.1")] -+ interval: f64, -+ -+ /// Monitoring interval in seconds. -+ #[clap(short = 'm', long, default_value = "2.0")] -+ monitor: f64, -+ -+ /// Disable load-fraction based max layer CPU limit. ***NOTE*** -+ /// load-fraction calculation is currently broken due to lack of -+ /// infeasible weight adjustments. Setting this option is recommended. -+ #[clap(short = 'n', long)] -+ no_load_frac_limit: bool, -+ -+ /// Enable verbose output including libbpf details. Specify multiple -+ /// times to increase verbosity. -+ #[clap(short = 'v', long, action = clap::ArgAction::Count)] -+ verbose: u8, -+ -+ /// Write example layer specifications into the file and exit. -+ #[clap(short = 'e', long)] -+ example: Option, -+ -+ /// Layer specification. See --help. 
-+ specs: Vec, -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+enum LayerMatch { -+ CgroupPrefix(String), -+ CommPrefix(String), -+ NiceAbove(i32), -+ NiceBelow(i32), -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+enum LayerKind { -+ Confined { -+ cpus_range: Option<(usize, usize)>, -+ util_range: (f64, f64), -+ }, -+ Grouped { -+ cpus_range: Option<(usize, usize)>, -+ util_range: (f64, f64), -+ preempt: bool, -+ }, -+ Open { -+ preempt: bool, -+ }, -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+struct LayerSpec { -+ name: String, -+ comment: Option, -+ matches: Vec>, -+ kind: LayerKind, -+} -+ -+impl LayerSpec { -+ fn parse(input: &str) -> Result> { -+ let config: LayerConfig = if input.starts_with("f:") || input.starts_with("file:") { -+ let mut f = fs::OpenOptions::new() -+ .read(true) -+ .open(input.split_once(':').unwrap().1)?; -+ let mut content = String::new(); -+ f.read_to_string(&mut content)?; -+ serde_json::from_str(&content)? -+ } else { -+ serde_json::from_str(input)? -+ }; -+ Ok(config.specs) -+ } -+} -+ -+#[derive(Clone, Debug, Serialize, Deserialize)] -+#[serde(transparent)] -+struct LayerConfig { -+ specs: Vec, -+} -+ -+fn now_monotonic() -> u64 { -+ let mut time = libc::timespec { -+ tv_sec: 0, -+ tv_nsec: 0, -+ }; -+ let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; -+ assert!(ret == 0); -+ time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 -+} -+ -+fn read_total_cpu(reader: &procfs::ProcReader) -> Result { -+ reader -+ .read_stat() -+ .context("Failed to read procfs")? -+ .total_cpu -+ .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) -+} -+ -+fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { -+ match (curr, prev) { -+ ( -+ procfs::CpuStat { -+ user_usec: Some(curr_user), -+ nice_usec: Some(curr_nice), -+ system_usec: Some(curr_system), -+ idle_usec: Some(curr_idle), -+ iowait_usec: Some(curr_iowait), -+ irq_usec: Some(curr_irq), -+ softirq_usec: Some(curr_softirq), -+ stolen_usec: Some(curr_stolen), -+ .. -+ }, -+ procfs::CpuStat { -+ user_usec: Some(prev_user), -+ nice_usec: Some(prev_nice), -+ system_usec: Some(prev_system), -+ idle_usec: Some(prev_idle), -+ iowait_usec: Some(prev_iowait), -+ irq_usec: Some(prev_irq), -+ softirq_usec: Some(prev_softirq), -+ stolen_usec: Some(prev_stolen), -+ .. 
-+ }, -+ ) => { -+ let idle_usec = curr_idle - prev_idle; -+ let iowait_usec = curr_iowait - prev_iowait; -+ let user_usec = curr_user - prev_user; -+ let system_usec = curr_system - prev_system; -+ let nice_usec = curr_nice - prev_nice; -+ let irq_usec = curr_irq - prev_irq; -+ let softirq_usec = curr_softirq - prev_softirq; -+ let stolen_usec = curr_stolen - prev_stolen; -+ -+ let busy_usec = -+ user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; -+ let total_usec = idle_usec + busy_usec + iowait_usec; -+ if total_usec > 0 { -+ Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) -+ } else { -+ Ok(1.0) -+ } -+ } -+ _ => { -+ bail!("Missing stats in cpustat"); -+ } -+ } -+} -+ -+fn copy_into_cstr(dst: &mut [i8], src: &str) { -+ let cstr = CString::new(src).unwrap(); -+ let bytes = unsafe { std::mem::transmute::<&[u8], &[i8]>(cstr.as_bytes_with_nul()) }; -+ dst[0..bytes.len()].copy_from_slice(bytes); -+} -+ -+fn format_bitvec(bitvec: &BitVec) -> String { -+ let mut vals = Vec::::new(); -+ let mut val: u32 = 0; -+ for (idx, bit) in bitvec.iter().enumerate() { -+ if idx > 0 && idx % 32 == 0 { -+ vals.push(val); -+ val = 0; -+ } -+ if *bit { -+ val |= 1 << (idx % 32); -+ } -+ } -+ vals.push(val); -+ let mut output = vals -+ .iter() -+ .fold(String::new(), |string, v| format!("{}{:08x} ", string, v)); -+ output.pop(); -+ output -+} -+ -+fn read_cpu_ctxs(skel: &LayeredSkel) -> Result> { -+ let mut cpu_ctxs = vec![]; -+ let cpu_ctxs_vec = skel -+ .maps() -+ .cpu_ctxs() -+ .lookup_percpu(&0u32.to_ne_bytes(), libbpf_rs::MapFlags::ANY) -+ .context("Failed to lookup cpu_ctx")? -+ .unwrap(); -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ cpu_ctxs.push(*unsafe { -+ &*(cpu_ctxs_vec[cpu].as_slice().as_ptr() as *const layered_sys::cpu_ctx) -+ }); -+ } -+ Ok(cpu_ctxs) -+} -+ -+#[derive(Clone, Debug)] -+struct BpfStats { -+ gstats: Vec, -+ lstats: Vec>, -+ lstats_sums: Vec, -+} -+ -+impl BpfStats { -+ fn read(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Self { -+ let mut gstats = vec![0u64; NR_GSTATS]; -+ let mut lstats = vec![vec![0u64; NR_LSTATS]; nr_layers]; -+ -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ for stat in 0..NR_GSTATS { -+ gstats[stat] += cpu_ctxs[cpu].gstats[stat]; -+ } -+ for layer in 0..nr_layers { -+ for stat in 0..NR_LSTATS { -+ lstats[layer][stat] += cpu_ctxs[cpu].lstats[layer][stat]; -+ } -+ } -+ } -+ -+ let mut lstats_sums = vec![0u64; NR_LSTATS]; -+ for layer in 0..nr_layers { -+ for stat in 0..NR_LSTATS { -+ lstats_sums[stat] += lstats[layer][stat]; -+ } -+ } -+ -+ Self { -+ gstats, -+ lstats, -+ lstats_sums, -+ } -+ } -+} -+ -+impl<'a, 'b> Sub<&'b BpfStats> for &'a BpfStats { -+ type Output = BpfStats; -+ -+ fn sub(self, rhs: &'b BpfStats) -> BpfStats { -+ let vec_sub = |l: &[u64], r: &[u64]| l.iter().zip(r.iter()).map(|(l, r)| *l - *r).collect(); -+ BpfStats { -+ gstats: vec_sub(&self.gstats, &rhs.gstats), -+ lstats: self -+ .lstats -+ .iter() -+ .zip(rhs.lstats.iter()) -+ .map(|(l, r)| vec_sub(l, r)) -+ .collect(), -+ lstats_sums: vec_sub(&self.lstats_sums, &rhs.lstats_sums), -+ } -+ } -+} -+ -+struct Stats { -+ nr_layers: usize, -+ at: Instant, -+ -+ nr_layer_tasks: Vec, -+ -+ total_load: f64, -+ layer_loads: Vec, -+ -+ total_util: f64, // Running AVG of sum of layer_utils -+ layer_utils: Vec, -+ prev_layer_cycles: Vec, -+ -+ cpu_busy: f64, // Read from /proc, maybe higher than total_util -+ prev_total_cpu: procfs::CpuStat, -+ -+ bpf_stats: BpfStats, -+ prev_bpf_stats: BpfStats, -+} -+ -+impl Stats { -+ fn read_layer_loads(skel: &mut 
LayeredSkel, nr_layers: usize) -> (f64, Vec) { -+ let now_mono = now_monotonic(); -+ let layer_loads: Vec = skel -+ .bss() -+ .layers -+ .iter() -+ .take(nr_layers) -+ .map(|layer| { -+ let rd = &layer.load_rd; -+ ravg_read( -+ rd.val, -+ rd.val_at, -+ rd.old, -+ rd.cur, -+ now_mono, -+ USAGE_HALF_LIFE, -+ RAVG_FRAC_BITS, -+ ) -+ }) -+ .collect(); -+ (layer_loads.iter().sum(), layer_loads) -+ } -+ -+ fn read_layer_cycles(cpu_ctxs: &[layered_sys::cpu_ctx], nr_layers: usize) -> Vec { -+ let mut layer_cycles = vec![0u64; nr_layers]; -+ -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ for layer in 0..nr_layers { -+ layer_cycles[layer] += cpu_ctxs[cpu].layer_cycles[layer]; -+ } -+ } -+ -+ layer_cycles -+ } -+ -+ fn new(skel: &mut LayeredSkel, proc_reader: &procfs::ProcReader) -> Result { -+ let nr_layers = skel.rodata().nr_layers as usize; -+ let bpf_stats = BpfStats::read(&read_cpu_ctxs(skel)?, nr_layers); -+ -+ Ok(Self { -+ at: Instant::now(), -+ nr_layers, -+ -+ nr_layer_tasks: vec![0; nr_layers], -+ -+ total_load: 0.0, -+ layer_loads: vec![0.0; nr_layers], -+ -+ total_util: 0.0, -+ layer_utils: vec![0.0; nr_layers], -+ prev_layer_cycles: vec![0; nr_layers], -+ -+ cpu_busy: 0.0, -+ prev_total_cpu: read_total_cpu(&proc_reader)?, -+ -+ bpf_stats: bpf_stats.clone(), -+ prev_bpf_stats: bpf_stats, -+ }) -+ } -+ -+ fn refresh( -+ &mut self, -+ skel: &mut LayeredSkel, -+ proc_reader: &procfs::ProcReader, -+ now: Instant, -+ ) -> Result<()> { -+ let elapsed = now.duration_since(self.at).as_secs_f64() as f64; -+ let cpu_ctxs = read_cpu_ctxs(skel)?; -+ -+ let nr_layer_tasks: Vec = skel -+ .bss() -+ .layers -+ .iter() -+ .take(self.nr_layers) -+ .map(|layer| layer.nr_tasks as usize) -+ .collect(); -+ -+ let (total_load, layer_loads) = Self::read_layer_loads(skel, self.nr_layers); -+ -+ let cur_layer_cycles = Self::read_layer_cycles(&cpu_ctxs, self.nr_layers); -+ let cur_layer_utils: Vec = cur_layer_cycles -+ .iter() -+ .zip(self.prev_layer_cycles.iter()) -+ .map(|(cur, prev)| (cur - prev) as f64 / 1_000_000_000.0 / elapsed) -+ .collect(); -+ let layer_utils: Vec = cur_layer_utils -+ .iter() -+ .zip(self.layer_utils.iter()) -+ .map(|(cur, prev)| { -+ let decay = USAGE_DECAY.powf(elapsed); -+ prev * decay + cur * (1.0 - decay) -+ }) -+ .collect(); -+ -+ let cur_total_cpu = read_total_cpu(proc_reader)?; -+ let cpu_busy = calc_util(&cur_total_cpu, &self.prev_total_cpu)?; -+ -+ let cur_bpf_stats = BpfStats::read(&cpu_ctxs, self.nr_layers); -+ let bpf_stats = &cur_bpf_stats - &self.prev_bpf_stats; -+ -+ *self = Self { -+ at: now, -+ nr_layers: self.nr_layers, -+ -+ nr_layer_tasks, -+ -+ total_load, -+ layer_loads, -+ -+ total_util: layer_utils.iter().sum(), -+ layer_utils: layer_utils.try_into().unwrap(), -+ prev_layer_cycles: cur_layer_cycles, -+ -+ cpu_busy, -+ prev_total_cpu: cur_total_cpu, -+ -+ bpf_stats, -+ prev_bpf_stats: cur_bpf_stats, -+ }; -+ Ok(()) -+ } -+} -+ -+#[derive(Debug, Default)] -+struct UserExitInfo { -+ kind: i32, -+ reason: Option, -+ msg: Option, -+} -+ -+impl UserExitInfo { -+ fn read(bpf_uei: &layered_bss_types::user_exit_info) -> Result { -+ let kind = unsafe { std::ptr::read_volatile(&bpf_uei.kind as *const _) }; -+ -+ let (reason, msg) = if kind != 0 { -+ ( -+ Some( -+ unsafe { CStr::from_ptr(bpf_uei.reason.as_ptr() as *const _) } -+ .to_str() -+ .context("Failed to convert reason to string")? -+ .to_string(), -+ ) -+ .filter(|s| !s.is_empty()), -+ Some( -+ unsafe { CStr::from_ptr(bpf_uei.msg.as_ptr() as *const _) } -+ .to_str() -+ .context("Failed to convert msg to string")? 
-+ .to_string(), -+ ) -+ .filter(|s| !s.is_empty()), -+ ) -+ } else { -+ (None, None) -+ }; -+ -+ Ok(Self { kind, reason, msg }) -+ } -+ -+ fn exited(bpf_uei: &layered_bss_types::user_exit_info) -> Result { -+ Ok(Self::read(bpf_uei)?.kind != 0) -+ } -+ -+ fn report(&self) -> Result<()> { -+ let why = match (&self.reason, &self.msg) { -+ (Some(reason), None) => format!("{}", reason), -+ (Some(reason), Some(msg)) => format!("{} ({})", reason, msg), -+ _ => "".into(), -+ }; -+ -+ match self.kind { -+ 0 => Ok(()), -+ etype => { -+ if etype != 64 { -+ bail!("EXIT: kind={} {}", etype, why); -+ } else { -+ info!("EXIT: {}", why); -+ Ok(()) -+ } -+ } -+ } -+ } -+} -+ -+#[derive(Debug)] -+struct CpuPool { -+ nr_cores: usize, -+ nr_cpus: usize, -+ all_cpus: BitVec, -+ core_cpus: Vec, -+ cpu_core: Vec, -+ available_cores: BitVec, -+ first_cpu: usize, -+ fallback_cpu: usize, // next free or the first CPU if none is free -+} -+ -+impl CpuPool { -+ fn new() -> Result { -+ if *NR_POSSIBLE_CPUS > MAX_CPUS { -+ bail!( -+ "NR_POSSIBLE_CPUS {} > MAX_CPUS {}", -+ *NR_POSSIBLE_CPUS, -+ MAX_CPUS -+ ); -+ } -+ -+ let mut cpu_to_cache = vec![]; // (cpu_id, Option) -+ let mut cache_ids = BTreeSet::::new(); -+ let mut nr_offline = 0; -+ -+ // Build cpu -> cache ID mapping. -+ for cpu in 0..*NR_POSSIBLE_CPUS { -+ let path = format!( -+ "/sys/devices/system/cpu/cpu{}/cache/index{}/id", -+ cpu, CORE_CACHE_LEVEL -+ ); -+ let id = match std::fs::read_to_string(&path) { -+ Ok(val) => Some(val.trim().parse::().with_context(|| { -+ format!("Failed to parse {:?}'s content {:?}", &path, &val) -+ })?), -+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => { -+ nr_offline += 1; -+ None -+ } -+ Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), -+ }; -+ -+ cpu_to_cache.push(id); -+ if let Some(id) = id { -+ cache_ids.insert(id); -+ } -+ } -+ -+ let nr_cpus = *NR_POSSIBLE_CPUS - nr_offline; -+ -+ // Cache IDs may have holes. Assign consecutive core IDs to existing -+ // cache IDs. -+ let mut cache_to_core = BTreeMap::::new(); -+ let mut nr_cores = 0; -+ for cache_id in cache_ids.iter() { -+ cache_to_core.insert(*cache_id, nr_cores); -+ nr_cores += 1; -+ } -+ -+ // Build core -> cpumask and cpu -> core mappings. 
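Worth pausing on the cache-ID pass a few lines up: the IDs read from /sys/devices/system/cpu/cpuN/cache/indexM/id can be sparse and non-contiguous, so CpuPool::new() renumbers the distinct IDs it finds into consecutive core indices before sizing any per-core bitmaps. Below is a minimal, self-contained Rust sketch of just that renumbering step; the sample IDs are made up for illustration and the patch itself goes through a BTreeSet before the BTreeMap, which this sketch collapses into one pass.

use std::collections::BTreeMap;

fn main() {
    // Hypothetical per-CPU cache IDs as sysfs might report them: sparse,
    // with one offline CPU (None) that has no id file at all.
    let cpu_to_cache: Vec<Option<usize>> = vec![Some(0), Some(0), Some(8), Some(8), None, Some(16)];

    // Renumber distinct cache IDs into consecutive core indices 0..nr_cores.
    let mut cache_to_core: BTreeMap<usize, usize> = BTreeMap::new();
    for id in cpu_to_cache.iter().flatten() {
        let next = cache_to_core.len();
        cache_to_core.entry(*id).or_insert(next);
    }

    for (cpu, id) in cpu_to_cache.iter().enumerate() {
        match id {
            Some(id) => println!("cpu{} -> core{}", cpu, cache_to_core[id]),
            None => println!("cpu{} -> offline (no cache id)", cpu),
        }
    }
}

Because BTreeMap iterates in sorted key order, the renumbering is stable with respect to cache ID, matching the ordered walk over the cache-ID set in the patch.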
-+ let mut all_cpus = bitvec![0; *NR_POSSIBLE_CPUS]; -+ let mut core_cpus = vec![bitvec![0; *NR_POSSIBLE_CPUS]; nr_cores]; -+ let mut cpu_core = vec![]; -+ -+ for (cpu, cache) in cpu_to_cache.iter().enumerate().take(*NR_POSSIBLE_CPUS) { -+ if let Some(cache_id) = cache { -+ let core_id = cache_to_core[cache_id]; -+ all_cpus.set(cpu, true); -+ core_cpus[core_id].set(cpu, true); -+ cpu_core.push(core_id); -+ } -+ } -+ -+ info!( -+ "CPUs: online/possible={}/{} nr_cores={}", -+ nr_cpus, *NR_POSSIBLE_CPUS, nr_cores, -+ ); -+ -+ let first_cpu = core_cpus[0].first_one().unwrap(); -+ -+ let mut cpu_pool = Self { -+ nr_cores, -+ nr_cpus, -+ all_cpus, -+ core_cpus, -+ cpu_core, -+ available_cores: bitvec![1; nr_cores], -+ first_cpu, -+ fallback_cpu: first_cpu, -+ }; -+ cpu_pool.update_fallback_cpu(); -+ Ok(cpu_pool) -+ } -+ -+ fn update_fallback_cpu(&mut self) { -+ match self.available_cores.first_one() { -+ Some(next) => self.fallback_cpu = self.core_cpus[next].first_one().unwrap(), -+ None => self.fallback_cpu = self.first_cpu, -+ } -+ } -+ -+ fn alloc<'a>(&'a mut self) -> Option<&'a BitVec> { -+ let core = self.available_cores.first_one()?; -+ self.available_cores.set(core, false); -+ self.update_fallback_cpu(); -+ Some(&self.core_cpus[core]) -+ } -+ -+ fn cpus_to_cores(&self, cpus_to_match: &BitVec) -> Result { -+ let mut cpus = cpus_to_match.clone(); -+ let mut cores = bitvec![0; self.nr_cores]; -+ -+ while let Some(cpu) = cpus.first_one() { -+ let core = self.cpu_core[cpu]; -+ -+ if (self.core_cpus[core].clone() & !cpus.clone()).count_ones() != 0 { -+ bail!( -+ "CPUs {} partially intersect with core {} ({})", -+ cpus_to_match, -+ core, -+ self.core_cpus[core], -+ ); -+ } -+ -+ cpus &= !self.core_cpus[core].clone(); -+ cores.set(core, true); -+ } -+ -+ Ok(cores) -+ } -+ -+ fn free<'a>(&'a mut self, cpus_to_free: &BitVec) -> Result<()> { -+ let cores = self.cpus_to_cores(cpus_to_free)?; -+ if (self.available_cores.clone() & &cores).any() { -+ bail!("Some of CPUs {} are already free", cpus_to_free); -+ } -+ self.available_cores |= cores; -+ self.update_fallback_cpu(); -+ Ok(()) -+ } -+ -+ fn next_to_free<'a>(&'a self, cands: &BitVec) -> Result> { -+ let last = match cands.last_one() { -+ Some(ret) => ret, -+ None => return Ok(None), -+ }; -+ let core = self.cpu_core[last]; -+ if (self.core_cpus[core].clone() & !cands.clone()).count_ones() != 0 { -+ bail!( -+ "CPUs{} partially intersect with core {} ({})", -+ cands, -+ core, -+ self.core_cpus[core] -+ ); -+ } -+ -+ Ok(Some(&self.core_cpus[core])) -+ } -+ -+ fn available_cpus(&self) -> BitVec { -+ let mut cpus = bitvec![0; self.nr_cpus]; -+ for core in self.available_cores.iter_ones() { -+ cpus |= &self.core_cpus[core]; -+ } -+ cpus -+ } -+} -+ -+#[derive(Debug)] -+struct Layer { -+ name: String, -+ kind: LayerKind, -+ -+ nr_cpus: usize, -+ cpus: BitVec, -+} -+ -+impl Layer { -+ fn new(cpu_pool: &mut CpuPool, name: &str, kind: LayerKind) -> Result { -+ match &kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } => { -+ let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); -+ if cpus_range.0 > cpus_range.1 || cpus_range.1 == 0 { -+ bail!("invalid cpus_range {:?}", cpus_range); -+ } -+ if util_range.0 < 0.0 -+ || util_range.0 > 1.0 -+ || util_range.1 < 0.0 -+ || util_range.1 > 1.0 -+ || util_range.0 >= util_range.1 -+ { -+ bail!("invalid util_range {:?}", util_range); -+ } -+ } -+ _ => {} -+ } -+ -+ let nr_cpus = cpu_pool.nr_cpus; -+ -+ let mut layer = Self { -+ name: name.into(), -+ kind, -+ -+ nr_cpus: 0, -+ cpus: bitvec![0; 
nr_cpus], -+ }; -+ -+ match &layer.kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } -+ | LayerKind::Grouped { -+ cpus_range, -+ util_range, -+ .. -+ } => { -+ layer.resize_confined_or_grouped( -+ cpu_pool, -+ *cpus_range, -+ *util_range, -+ (0.0, 0.0), -+ (0.0, 0.0), -+ false, -+ )?; -+ } -+ _ => {} -+ } -+ -+ Ok(layer) -+ } -+ -+ fn grow_confined_or_grouped( -+ &mut self, -+ cpu_pool: &mut CpuPool, -+ (cpus_min, cpus_max): (usize, usize), -+ (_util_low, util_high): (f64, f64), -+ (layer_load, total_load): (f64, f64), -+ (layer_util, _total_util): (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result { -+ if self.nr_cpus >= cpus_max { -+ return Ok(false); -+ } -+ -+ // Do we already have enough? -+ if self.nr_cpus >= cpus_min -+ && (layer_util == 0.0 -+ || (self.nr_cpus > 0 && layer_util / self.nr_cpus as f64 <= util_high)) -+ { -+ return Ok(false); -+ } -+ -+ // Can't have more CPUs than our load fraction. -+ if !no_load_frac_limit -+ && self.nr_cpus >= cpus_min -+ && (total_load >= 0.0 -+ && self.nr_cpus as f64 / cpu_pool.nr_cpus as f64 >= layer_load / total_load) -+ { -+ trace!( -+ "layer-{} needs more CPUs (util={:.3}) but is over the load fraction", -+ &self.name, -+ layer_util -+ ); -+ return Ok(false); -+ } -+ -+ let new_cpus = match cpu_pool.alloc().clone() { -+ Some(ret) => ret.clone(), -+ None => { -+ trace!("layer-{} can't grow, no CPUs", &self.name); -+ return Ok(false); -+ } -+ }; -+ -+ trace!( -+ "layer-{} adding {} CPUs to {} CPUs", -+ &self.name, -+ new_cpus.count_ones(), -+ self.nr_cpus -+ ); -+ -+ self.nr_cpus += new_cpus.count_ones(); -+ self.cpus |= &new_cpus; -+ Ok(true) -+ } -+ -+ fn cpus_to_free( -+ &self, -+ cpu_pool: &mut CpuPool, -+ (cpus_min, _cpus_max): (usize, usize), -+ (util_low, util_high): (f64, f64), -+ (layer_load, total_load): (f64, f64), -+ (layer_util, _total_util): (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result> { -+ if self.nr_cpus <= cpus_min { -+ return Ok(None); -+ } -+ -+ let cpus_to_free = match cpu_pool.next_to_free(&self.cpus)? { -+ Some(ret) => ret.clone(), -+ None => return Ok(None), -+ }; -+ -+ let nr_to_free = cpus_to_free.count_ones(); -+ -+ // If we'd be over the load fraction even after freeing -+ // $cpus_to_free, we have to free. -+ if !no_load_frac_limit -+ && total_load >= 0.0 -+ && (self.nr_cpus - nr_to_free) as f64 / cpu_pool.nr_cpus as f64 -+ >= layer_load / total_load -+ { -+ return Ok(Some(cpus_to_free)); -+ } -+ -+ if layer_util / self.nr_cpus as f64 >= util_low { -+ return Ok(None); -+ } -+ -+ // Can't shrink if losing the CPUs pushes us over @util_high. -+ match self.nr_cpus - nr_to_free { -+ 0 => { -+ if layer_util > 0.0 { -+ return Ok(None); -+ } -+ } -+ nr_left => { -+ if layer_util / nr_left as f64 >= util_high { -+ return Ok(None); -+ } -+ } -+ } -+ -+ return Ok(Some(cpus_to_free)); -+ } -+ -+ fn shrink_confined_or_grouped( -+ &mut self, -+ cpu_pool: &mut CpuPool, -+ cpus_range: (usize, usize), -+ util_range: (f64, f64), -+ load: (f64, f64), -+ util: (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result { -+ match self.cpus_to_free( -+ cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ no_load_frac_limit, -+ )? 
{ -+ Some(cpus_to_free) => { -+ trace!("freeing CPUs {}", &cpus_to_free); -+ self.nr_cpus -= cpus_to_free.count_ones(); -+ self.cpus &= !cpus_to_free.clone(); -+ cpu_pool.free(&cpus_to_free)?; -+ Ok(true) -+ } -+ None => Ok(false), -+ } -+ } -+ -+ fn resize_confined_or_grouped( -+ &mut self, -+ cpu_pool: &mut CpuPool, -+ cpus_range: Option<(usize, usize)>, -+ util_range: (f64, f64), -+ load: (f64, f64), -+ util: (f64, f64), -+ no_load_frac_limit: bool, -+ ) -> Result { -+ let cpus_range = cpus_range.unwrap_or((0, std::usize::MAX)); -+ let mut adjusted = 0; -+ -+ while self.grow_confined_or_grouped( -+ cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ no_load_frac_limit, -+ )? { -+ adjusted += 1; -+ trace!("{} grew, adjusted={}", &self.name, adjusted); -+ } -+ -+ if adjusted == 0 { -+ while self.shrink_confined_or_grouped( -+ cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ no_load_frac_limit, -+ )? { -+ adjusted -= 1; -+ trace!("{} shrunk, adjusted={}", &self.name, adjusted); -+ } -+ } -+ -+ if adjusted != 0 { -+ trace!("{} done resizing, adjusted={}", &self.name, adjusted); -+ } -+ Ok(adjusted) -+ } -+} -+ -+struct Scheduler<'a> { -+ skel: LayeredSkel<'a>, -+ struct_ops: Option, -+ layer_specs: Vec, -+ -+ sched_intv: Duration, -+ monitor_intv: Duration, -+ no_load_frac_limit: bool, -+ -+ cpu_pool: CpuPool, -+ layers: Vec, -+ -+ proc_reader: procfs::ProcReader, -+ sched_stats: Stats, -+ report_stats: Stats, -+ -+ nr_layer_cpus_min_max: Vec<(usize, usize)>, -+ processing_dur: Duration, -+ prev_processing_dur: Duration, -+} -+ -+impl<'a> Scheduler<'a> { -+ fn init_layers(skel: &mut OpenLayeredSkel, specs: &Vec) -> Result<()> { -+ skel.rodata().nr_layers = specs.len() as u32; -+ -+ for (spec_i, spec) in specs.iter().enumerate() { -+ let layer = &mut skel.bss().layers[spec_i]; -+ -+ for (or_i, or) in spec.matches.iter().enumerate() { -+ for (and_i, and) in or.iter().enumerate() { -+ let mt = &mut layer.matches[or_i].matches[and_i]; -+ match and { -+ LayerMatch::CgroupPrefix(prefix) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_CGROUP_PREFIX as i32; -+ copy_into_cstr(&mut mt.cgroup_prefix, prefix.as_str()); -+ } -+ LayerMatch::CommPrefix(prefix) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_COMM_PREFIX as i32; -+ copy_into_cstr(&mut mt.comm_prefix, prefix.as_str()); -+ } -+ LayerMatch::NiceAbove(nice) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_NICE_ABOVE as i32; -+ mt.nice_above_or_below = *nice; -+ } -+ LayerMatch::NiceBelow(nice) => { -+ mt.kind = layered_sys::layer_match_kind_MATCH_NICE_BELOW as i32; -+ mt.nice_above_or_below = *nice; -+ } -+ } -+ } -+ layer.matches[or_i].nr_match_ands = or.len() as i32; -+ } -+ -+ layer.nr_match_ors = spec.matches.len() as u32; -+ -+ match &spec.kind { -+ LayerKind::Open { preempt } | LayerKind::Grouped { preempt, .. } => { -+ layer.open = true; -+ layer.preempt = *preempt; -+ } -+ _ => {} -+ } -+ } -+ -+ Ok(()) -+ } -+ -+ fn init(opts: &Opts, layer_specs: Vec) -> Result { -+ let nr_layers = layer_specs.len(); -+ let mut cpu_pool = CpuPool::new()?; -+ -+ // Open the BPF prog first for verification. -+ let mut skel_builder = LayeredSkelBuilder::default(); -+ skel_builder.obj_builder.debug(opts.verbose > 1); -+ let mut skel = skel_builder.open().context("Failed to open BPF program")?; -+ -+ // Initialize skel according to @opts. 
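One detail of init_layers() above that is easy to gloss over: each cgroup/comm prefix is written into a fixed-size signed-byte array of the BPF-visible layer struct via copy_into_cstr(), trailing NUL included, which is what lets the BPF side treat the field as an ordinary C string. The Rust sketch below shows that copy in isolation; the buffer length and helper name here are made up, and the patch itself uses a transmute plus copy_from_slice rather than a per-byte loop.

use std::ffi::CString;

// Illustrative stand-in for a generated fixed-size match field such as
// layer_match.cgroup_prefix; the real length comes from the BPF headers.
const PREFIX_LEN: usize = 32;

fn copy_prefix(dst: &mut [i8; PREFIX_LEN], src: &str) {
    let cstr = CString::new(src).expect("prefix must not contain interior NUL bytes");
    let bytes = cstr.as_bytes_with_nul();
    assert!(bytes.len() <= dst.len(), "prefix too long for the fixed buffer");
    // Copy the string plus its terminating NUL; the rest of the buffer
    // stays zeroed, so the BPF side sees a well-formed C string.
    for (d, s) in dst.iter_mut().zip(bytes.iter()) {
        *d = *s as i8;
    }
}

fn main() {
    let mut buf = [0i8; PREFIX_LEN];
    copy_prefix(&mut buf, "system.slice/");
    println!("first byte = {}, terminator = {}", buf[0], buf["system.slice/".len()]);
}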
-+ skel.rodata().debug = opts.verbose as u32; -+ skel.rodata().slice_ns = opts.slice_us * 1000; -+ skel.rodata().nr_possible_cpus = *NR_POSSIBLE_CPUS as u32; -+ skel.rodata().smt_enabled = cpu_pool.nr_cpus > cpu_pool.nr_cores; -+ for cpu in cpu_pool.all_cpus.iter_ones() { -+ skel.rodata().all_cpus[cpu / 8] |= 1 << (cpu % 8); -+ } -+ Self::init_layers(&mut skel, &layer_specs)?; -+ -+ // Attach. -+ let mut skel = skel.load().context("Failed to load BPF program")?; -+ skel.attach().context("Failed to attach BPF program")?; -+ let struct_ops = Some( -+ skel.maps_mut() -+ .layered() -+ .attach_struct_ops() -+ .context("Failed to attach layered struct ops")?, -+ ); -+ info!("Layered Scheduler Attached"); -+ -+ let mut layers = vec![]; -+ for spec in layer_specs.iter() { -+ layers.push(Layer::new(&mut cpu_pool, &spec.name, spec.kind.clone())?); -+ } -+ -+ // Other stuff. -+ let proc_reader = procfs::ProcReader::new(); -+ -+ Ok(Self { -+ struct_ops, // should be held to keep it attached -+ layer_specs, -+ -+ sched_intv: Duration::from_secs_f64(opts.interval), -+ monitor_intv: Duration::from_secs_f64(opts.monitor), -+ no_load_frac_limit: opts.no_load_frac_limit, -+ -+ cpu_pool, -+ layers, -+ -+ sched_stats: Stats::new(&mut skel, &proc_reader)?, -+ report_stats: Stats::new(&mut skel, &proc_reader)?, -+ -+ nr_layer_cpus_min_max: vec![(0, 0); nr_layers], -+ processing_dur: Duration::from_millis(0), -+ prev_processing_dur: Duration::from_millis(0), -+ -+ proc_reader, -+ skel, -+ }) -+ } -+ -+ fn update_bpf_layer_cpumask(layer: &Layer, bpf_layer: &mut layered_bss_types::layer) { -+ for bit in 0..layer.cpus.len() { -+ if layer.cpus[bit] { -+ bpf_layer.cpus[bit / 8] |= 1 << (bit % 8); -+ } else { -+ bpf_layer.cpus[bit / 8] &= !(1 << (bit % 8)); -+ } -+ } -+ bpf_layer.refresh_cpus = 1; -+ } -+ -+ fn step(&mut self) -> Result<()> { -+ let started_at = Instant::now(); -+ self.sched_stats -+ .refresh(&mut self.skel, &self.proc_reader, started_at)?; -+ let mut updated = false; -+ -+ for idx in 0..self.layers.len() { -+ match self.layers[idx].kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } -+ | LayerKind::Grouped { -+ cpus_range, -+ util_range, -+ .. -+ } => { -+ let load = ( -+ self.sched_stats.layer_loads[idx], -+ self.sched_stats.total_load, -+ ); -+ let util = ( -+ self.sched_stats.layer_utils[idx], -+ self.sched_stats.total_util, -+ ); -+ if self.layers[idx].resize_confined_or_grouped( -+ &mut self.cpu_pool, -+ cpus_range, -+ util_range, -+ load, -+ util, -+ self.no_load_frac_limit, -+ )? != 0 -+ { -+ Self::update_bpf_layer_cpumask( -+ &self.layers[idx], -+ &mut self.skel.bss().layers[idx], -+ ); -+ updated = true; -+ } -+ } -+ _ => {} -+ } -+ } -+ -+ if updated { -+ let available_cpus = self.cpu_pool.available_cpus(); -+ let nr_available_cpus = available_cpus.count_ones(); -+ for idx in 0..self.layers.len() { -+ let layer = &mut self.layers[idx]; -+ let bpf_layer = &mut self.skel.bss().layers[idx]; -+ match &layer.kind { -+ LayerKind::Open { .. 
} => { -+ layer.cpus.copy_from_bitslice(&available_cpus); -+ layer.nr_cpus = nr_available_cpus; -+ Self::update_bpf_layer_cpumask(layer, bpf_layer); -+ } -+ _ => {} -+ } -+ } -+ -+ self.skel.bss().fallback_cpu = self.cpu_pool.fallback_cpu as u32; -+ -+ for (lidx, layer) in self.layers.iter().enumerate() { -+ self.nr_layer_cpus_min_max[lidx] = ( -+ self.nr_layer_cpus_min_max[lidx].0.min(layer.nr_cpus), -+ self.nr_layer_cpus_min_max[lidx].1.max(layer.nr_cpus), -+ ); -+ } -+ } -+ -+ self.processing_dur += Instant::now().duration_since(started_at); -+ Ok(()) -+ } -+ -+ fn report(&mut self) -> Result<()> { -+ let started_at = Instant::now(); -+ self.report_stats -+ .refresh(&mut self.skel, &self.proc_reader, started_at)?; -+ let stats = &self.report_stats; -+ -+ let processing_dur = self.processing_dur - self.prev_processing_dur; -+ self.prev_processing_dur = self.processing_dur; -+ -+ let lsum = |idx| stats.bpf_stats.lstats_sums[idx as usize]; -+ let total = lsum(layered_sys::layer_stat_idx_LSTAT_LOCAL) -+ + lsum(layered_sys::layer_stat_idx_LSTAT_GLOBAL); -+ let lsum_pct = |idx| { -+ if total != 0 { -+ lsum(idx) as f64 / total as f64 * 100.0 -+ } else { -+ 0.0 -+ } -+ }; -+ -+ info!( -+ "tot={:7} local={:5.2} open_idle={:5.2} affn_viol={:5.2} tctx_err={} proc={:?}ms", -+ total, -+ lsum_pct(layered_sys::layer_stat_idx_LSTAT_LOCAL), -+ lsum_pct(layered_sys::layer_stat_idx_LSTAT_OPEN_IDLE), -+ lsum_pct(layered_sys::layer_stat_idx_LSTAT_AFFN_VIOL), -+ stats.prev_bpf_stats.gstats -+ [layered_sys::global_stat_idx_GSTAT_TASK_CTX_FREE_FAILED as usize], -+ processing_dur.as_millis(), -+ ); -+ -+ info!( -+ "busy={:5.1} util={:7.1} load={:9.1} fallback_cpu={:3}", -+ stats.cpu_busy * 100.0, -+ stats.total_util * 100.0, -+ stats.total_load, -+ self.cpu_pool.fallback_cpu, -+ ); -+ -+ let header_width = self -+ .layer_specs -+ .iter() -+ .map(|spec| spec.name.len()) -+ .max() -+ .unwrap() -+ .max(4); -+ -+ let calc_frac = |a, b| { -+ if b != 0.0 { a / b * 100.0 } else { 0.0 } -+ }; -+ -+ for (lidx, (spec, layer)) in self.layer_specs.iter().zip(self.layers.iter()).enumerate() { -+ let lstat = |sidx| stats.bpf_stats.lstats[lidx][sidx as usize]; -+ let ltotal = lstat(layered_sys::layer_stat_idx_LSTAT_LOCAL) -+ + lstat(layered_sys::layer_stat_idx_LSTAT_GLOBAL); -+ let lstat_pct = |sidx| { -+ if ltotal != 0 { -+ lstat(sidx) as f64 / ltotal as f64 * 100.0 -+ } else { -+ 0.0 -+ } -+ }; -+ -+ info!( -+ " {:) -> Result<()> { -+ let now = Instant::now(); -+ let mut next_sched_at = now + self.sched_intv; -+ let mut next_monitor_at = now + self.monitor_intv; -+ -+ while !shutdown.load(Ordering::Relaxed) && !UserExitInfo::exited(&self.skel.bss().uei)? 
{ -+ let now = Instant::now(); -+ -+ if now >= next_sched_at { -+ self.step()?; -+ while next_sched_at < now { -+ next_sched_at += self.sched_intv; -+ } -+ } -+ -+ if now >= next_monitor_at { -+ self.report()?; -+ while next_monitor_at < now { -+ next_monitor_at += self.monitor_intv; -+ } -+ } -+ -+ std::thread::sleep( -+ next_sched_at -+ .min(next_monitor_at) -+ .duration_since(Instant::now()), -+ ); -+ } -+ -+ self.struct_ops.take(); -+ UserExitInfo::read(&self.skel.bss().uei)?.report() -+ } -+} -+ -+impl<'a> Drop for Scheduler<'a> { -+ fn drop(&mut self) { -+ if let Some(struct_ops) = self.struct_ops.take() { -+ drop(struct_ops); -+ } -+ } -+} -+ -+fn write_example_file(path: &str) -> Result<()> { -+ let example = LayerConfig { -+ specs: vec![ -+ LayerSpec { -+ name: "batch".into(), -+ comment: Some("tasks under system.slice or tasks with nice value > 0".into()), -+ matches: vec![ -+ vec![LayerMatch::CgroupPrefix("system.slice/".into())], -+ vec![LayerMatch::NiceAbove(0)], -+ ], -+ kind: LayerKind::Confined { -+ cpus_range: Some((0, 16)), -+ util_range: (0.8, 0.9), -+ }, -+ }, -+ LayerSpec { -+ name: "immediate".into(), -+ comment: Some("tasks under workload.slice with nice value < 0".into()), -+ matches: vec![vec![ -+ LayerMatch::CgroupPrefix("workload.slice/".into()), -+ LayerMatch::NiceBelow(0), -+ ]], -+ kind: LayerKind::Open { preempt: true }, -+ }, -+ LayerSpec { -+ name: "normal".into(), -+ comment: Some("the rest".into()), -+ matches: vec![vec![]], -+ kind: LayerKind::Grouped { -+ cpus_range: None, -+ util_range: (0.5, 0.6), -+ preempt: false, -+ }, -+ }, -+ ], -+ }; -+ -+ let mut f = fs::OpenOptions::new() -+ .create_new(true) -+ .write(true) -+ .open(path)?; -+ Ok(f.write_all(serde_json::to_string_pretty(&example)?.as_bytes())?) -+} -+ -+fn verify_layer_specs(specs: &[LayerSpec]) -> Result<()> { -+ let nr_specs = specs.len(); -+ if nr_specs == 0 { -+ bail!("No layer spec"); -+ } -+ if nr_specs > MAX_LAYERS { -+ bail!("Too many layer specs"); -+ } -+ -+ for (idx, spec) in specs.iter().enumerate() { -+ if idx < nr_specs - 1 { -+ if spec.matches.len() == 0 { -+ bail!("Non-terminal spec {:?} has NULL matches", spec.name); -+ } -+ } else { -+ if spec.matches.len() != 1 || spec.matches[0].len() != 0 { -+ bail!("Terminal spec {:?} must have an empty match", spec.name); -+ } -+ } -+ -+ if spec.matches.len() > MAX_LAYER_MATCH_ORS { -+ bail!( -+ "Spec {:?} has too many ({}) OR match blocks", -+ spec.name, -+ spec.matches.len() -+ ); -+ } -+ -+ for (ands_idx, ands) in spec.matches.iter().enumerate() { -+ if ands.len() > NR_LAYER_MATCH_KINDS { -+ bail!( -+ "Spec {:?}'s {}th OR block has too many ({}) match conditions", -+ spec.name, -+ ands_idx, -+ ands.len() -+ ); -+ } -+ for one in ands.iter() { -+ match one { -+ LayerMatch::CgroupPrefix(prefix) => { -+ if prefix.len() > MAX_PATH { -+ bail!("Spec {:?} has too long a cgroup prefix", spec.name); -+ } -+ } -+ LayerMatch::CommPrefix(prefix) => { -+ if prefix.len() > MAX_COMM { -+ bail!("Spec {:?} has too long a comm prefix", spec.name); -+ } -+ } -+ _ => {} -+ } -+ } -+ } -+ -+ match spec.kind { -+ LayerKind::Confined { -+ cpus_range, -+ util_range, -+ } -+ | LayerKind::Grouped { -+ cpus_range, -+ util_range, -+ .. 
-+ } => { -+ if let Some((cpus_min, cpus_max)) = cpus_range { -+ if cpus_min > cpus_max { -+ bail!( -+ "Spec {:?} has invalid cpus_range({}, {})", -+ spec.name, -+ cpus_min, -+ cpus_max -+ ); -+ } -+ } -+ if util_range.0 >= util_range.1 { -+ bail!( -+ "Spec {:?} has invalid util_range ({}, {})", -+ spec.name, -+ util_range.0, -+ util_range.1 -+ ); -+ } -+ } -+ _ => {} -+ } -+ } -+ -+ Ok(()) -+} -+ -+fn main() -> Result<()> { -+ let opts = Opts::parse(); -+ -+ let llv = match opts.verbose { -+ 0 => simplelog::LevelFilter::Info, -+ 1 => simplelog::LevelFilter::Debug, -+ _ => simplelog::LevelFilter::Trace, -+ }; -+ let mut lcfg = simplelog::ConfigBuilder::new(); -+ lcfg.set_time_level(simplelog::LevelFilter::Error) -+ .set_location_level(simplelog::LevelFilter::Off) -+ .set_target_level(simplelog::LevelFilter::Off) -+ .set_thread_level(simplelog::LevelFilter::Off); -+ simplelog::TermLogger::init( -+ llv, -+ lcfg.build(), -+ simplelog::TerminalMode::Stderr, -+ simplelog::ColorChoice::Auto, -+ )?; -+ -+ debug!("opts={:?}", &opts); -+ -+ if let Some(path) = &opts.example { -+ write_example_file(path)?; -+ return Ok(()); -+ } -+ -+ let mut layer_config = LayerConfig { specs: vec![] }; -+ for (idx, input) in opts.specs.iter().enumerate() { -+ layer_config.specs.append( -+ &mut LayerSpec::parse(input) -+ .context(format!("Failed to parse specs[{}] ({:?})", idx, input))?, -+ ); -+ } -+ -+ debug!("specs={}", serde_json::to_string_pretty(&layer_config)?); -+ verify_layer_specs(&layer_config.specs)?; -+ -+ let mut sched = Scheduler::init(&opts, layer_config.specs)?; -+ -+ let shutdown = Arc::new(AtomicBool::new(false)); -+ let shutdown_clone = shutdown.clone(); -+ ctrlc::set_handler(move || { -+ shutdown_clone.store(true, Ordering::Relaxed); -+ }) -+ .context("Error setting Ctrl-C handler")?; -+ -+ sched.run(shutdown) -+} -diff --git a/tools/sched_ext/scx_nest.bpf.c b/tools/sched_ext/scx_nest.bpf.c -new file mode 100644 -index 000000000..3ab6d52d0 ---- /dev/null -+++ b/tools/sched_ext/scx_nest.bpf.c -@@ -0,0 +1,681 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * As described in [0], a Nest scheduler which encourages task placement on -+ * cores that are likely to be running at higher frequency, based upon recent usage. -+ * -+ * [0]: https://hal.inria.fr/hal-03612592/file/paper.pdf -+ * -+ * It operates as a global weighted vtime scheduler (similarly to CFS), while -+ * using the Nest algorithm to choose idle cores at wakup time. -+ * -+ * It also demonstrates the following niceties. -+ * -+ * - More robust task placement policies. -+ * - Termination notification for userspace. -+ * -+ * While rather simple, this scheduler should work reasonably well on CPUs with -+ * a uniform L3 cache topology. While preemption is not implemented, the fact -+ * that the scheduling queue is shared across all CPUs means that whatever is -+ * at the front of the queue is likely to be executed fairly quickly given -+ * enough number of CPUs. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include "scx_common.bpf.h" -+#include "vmlinux.h" -+#include "scx_nest.h" -+ -+#define TASK_DEAD 0x00000080 -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MSEC_PER_SEC = 1000LLU, -+ USEC_PER_MSEC = 1000LLU, -+ NSEC_PER_USEC = 1000LLU, -+ NSEC_PER_MSEC = USEC_PER_MSEC * NSEC_PER_USEC, -+ USEC_PER_SEC = USEC_PER_MSEC * MSEC_PER_SEC, -+ NSEC_PER_SEC = NSEC_PER_USEC * USEC_PER_SEC, -+}; -+ -+#define CLOCK_BOOTTIME 7 -+#define NUMA_NO_NODE -1 -+ -+const volatile u64 p_remove_ns = 2 * NSEC_PER_MSEC; -+const volatile u64 r_max = 5; -+const volatile u64 r_impatient = 2; -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile bool find_fully_idle = false; -+const volatile u64 sampling_cadence_ns = 1 * NSEC_PER_SEC; -+const volatile u64 r_depth = 5; -+ -+// Used for stats tracking. May be stale at any given time. -+u64 stats_primary_mask, stats_reserved_mask, stats_other_mask, stats_idle_mask; -+ -+// Used for internal tracking. -+static s32 nr_reserved; -+ -+static u64 vtime_now; -+struct user_exit_info uei; -+ -+extern unsigned long CONFIG_HZ __kconfig; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ /* -+ * A temporary cpumask for calculating a task's primary and reserve -+ * mask. -+ */ -+ struct bpf_cpumask __kptr *tmp_mask; -+ -+ /* -+ * The number of times that a task observes that its previous core is -+ * not idle. If this occurs r_impatient times in a row, a core is -+ * attempted to be retrieved from either the reserve nest, or the -+ * fallback nest. -+ */ -+ u32 prev_misses; -+ -+ /* -+ * A core that the task is "attached" to, meaning the last core that it -+ * executed on at least twice in a row, and the core that it first -+ * tries to migrate to on wakeup. The task only migrates to the -+ * attached core if it is idle and in the primary nest. -+ */ -+ s32 attached_core; -+ -+ /* -+ * The last core that the task executed on. This is used to determine -+ * if the task should attach to the core that it will execute on next. -+ */ -+ s32 prev_cpu; -+ -+ /* Dispatch directly to local_dsq */ -+ bool force_local; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+struct pcpu_ctx { -+ /* The timer used to compact the core from the primary nest. */ -+ struct bpf_timer timer; -+ -+ /* Whether the current core has been scheduled for compaction. */ -+ bool scheduled_compaction; -+ -+ /* Number of times a primary core has been scheduled for compaction. */ -+ u32 num_schedulings; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1024); -+ __type(key, s32); -+ __type(value, struct pcpu_ctx); -+} pcpu_ctxs SEC(".maps"); -+ -+struct stats_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct stats_timer); -+} stats_timer SEC(".maps"); -+ -+const volatile u32 nr_cpus = 1; /* !0 for veristat, set during init. 
*/ -+ -+private(NESTS) struct bpf_cpumask __kptr *primary_cpumask; -+private(NESTS) struct bpf_cpumask __kptr *reserve_cpumask; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, NEST_STAT(NR)); -+} stats SEC(".maps"); -+ -+ -+static __attribute__((always_inline)) void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static const struct cpumask *cast_mask(struct bpf_cpumask *mask) -+{ -+ return (const struct cpumask *)mask; -+} -+ -+static __attribute__((always_inline)) void -+try_make_core_reserved(s32 cpu, struct bpf_cpumask * reserved, bool promotion) -+{ -+ s32 tmp_nr_reserved; -+ -+ /* -+ * This check is racy, but that's OK. If we incorrectly fail to promote -+ * a core to reserve, it's because another context added or removed a -+ * core from reserved in this small window. It will balance out over -+ * subsequent wakeups. -+ */ -+ tmp_nr_reserved = nr_reserved; -+ if (tmp_nr_reserved < r_max) { -+ /* -+ * It's possible that we could exceed r_max for a time here, -+ * but that should balance out as more cores are either demoted -+ * or fail to be promoted into the reserve nest. -+ */ -+ __sync_fetch_and_add(&nr_reserved, 1); -+ bpf_cpumask_set_cpu(cpu, reserved); -+ if (promotion) -+ stat_inc(NEST_STAT(PROMOTED_TO_RESERVED)); -+ else -+ stat_inc(NEST_STAT(DEMOTED_TO_RESERVED)); -+ } else { -+ bpf_cpumask_clear_cpu(cpu, reserved); -+ stat_inc(NEST_STAT(RESERVED_AT_CAPACITY)); -+ } -+} -+ -+static void update_attached(struct task_ctx *tctx, s32 prev_cpu, s32 new_cpu) -+{ -+ if (tctx->prev_cpu == new_cpu) -+ tctx->attached_core = new_cpu; -+ tctx->prev_cpu = prev_cpu; -+} -+ -+s32 BPF_STRUCT_OPS(nest_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ struct bpf_cpumask *p_mask, *primary, *reserve; -+ s32 cpu; -+ struct task_ctx *tctx; -+ struct pcpu_ctx *pcpu_ctx; -+ bool direct_to_primary = false; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) -+ return -ENOENT; -+ -+ bpf_rcu_read_lock(); -+ p_mask = tctx->tmp_mask; -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!p_mask || !primary || !reserve) { -+ bpf_rcu_read_unlock(); -+ return -ENOENT; -+ } -+ -+ // Unset below if we can't find a core to migrate to. -+ tctx->force_local = true; -+ tctx->prev_cpu = prev_cpu; -+ -+ bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(primary)); -+ -+ /* First try to wake the task on its attached core. */ -+ if (bpf_cpumask_test_cpu(tctx->attached_core, cast_mask(p_mask)) && -+ scx_bpf_test_and_clear_cpu_idle(tctx->attached_core)) { -+ cpu = tctx->attached_core; -+ tctx->prev_misses = 0; -+ stat_inc(NEST_STAT(WAKEUP_ATTACHED)); -+ goto migrate_primary; -+ } -+ -+ /* -+ * Try to stay on the previous core if it's in the primary set, and -+ * there's no hypertwin. If the previous core is the core the task is -+ * attached to, don't bother as we already just tried that above. 
-+ */ -+ if (prev_cpu != tctx->attached_core && -+ bpf_cpumask_test_cpu(prev_cpu, cast_mask(p_mask)) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ tctx->prev_misses = 0; -+ stat_inc(NEST_STAT(WAKEUP_PREV_PRIMARY)); -+ goto migrate_primary; -+ } -+ -+ if (r_impatient > 0 && ++tctx->prev_misses >= r_impatient) { -+ direct_to_primary = true; -+ tctx->prev_misses = 0; -+ stat_inc(NEST_STAT(TASK_IMPATIENT)); -+ goto search_reserved; -+ } -+ -+ if (find_fully_idle) { -+ /* Then try any fully idle core in primary. */ -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_PRIMARY)); -+ goto migrate_primary; -+ } -+ } -+ -+ /* Then try _any_ idle core in primary, even if its hypertwin is active. */ -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_PRIMARY)); -+ goto migrate_primary; -+ } -+ -+search_reserved: -+ /* Then try any fully idle core in reserve. */ -+ bpf_cpumask_and(p_mask, p->cpus_ptr, cast_mask(reserve)); -+ if (find_fully_idle) { -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_FULLY_IDLE_RESERVE)); -+ goto promote_to_primary; -+ } -+ } -+ -+ /* Then try _any_ idle core in reserve, even if its hypertwin is active. */ -+ cpu = scx_bpf_pick_idle_cpu(cast_mask(p_mask), 0); -+ if (cpu >= 0) { -+ stat_inc(NEST_STAT(WAKEUP_ANY_IDLE_RESERVE)); -+ goto promote_to_primary; -+ } -+ -+ /* Then try _any_ idle core in the task's cpumask. */ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) { -+ /* -+ * We found a core that (we didn't _think_) is in any nest. -+ * This means that we need to either promote the core to the -+ * reserve nest, or if we're going direct to primary due to -+ * r_impatient being exceeded, promote directly to primary. -+ * -+ * We have to do one final check here to see if the core is in -+ * the primary or reserved cpumask because we could potentially -+ * race with the core changing states between AND'ing the -+ * primary and reserve masks with p->cpus_ptr above, and -+ * atomically reserving it from the idle mask with -+ * scx_bpf_pick_idle_cpu(). This is also technically true of -+ * the checks above, but in all of those cases we just put the -+ * core directly into the primary mask so it's not really that -+ * big of a problem. Here, we want to make sure that we don't -+ * accidentally put a core into the reserve nest that was e.g. -+ * already in the primary nest. This is unlikely, but we check -+ * for it on what should be a relatively cold path regardless. 
-+ */ -+ stat_inc(NEST_STAT(WAKEUP_IDLE_OTHER)); -+ if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) -+ goto migrate_primary; -+ else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) -+ goto promote_to_primary; -+ else if (direct_to_primary) -+ goto promote_to_primary; -+ else -+ try_make_core_reserved(cpu, reserve, true); -+ bpf_rcu_read_unlock(); -+ return cpu; -+ } -+ -+ bpf_rcu_read_unlock(); -+ tctx->force_local = false; -+ return prev_cpu; -+ -+promote_to_primary: -+ stat_inc(NEST_STAT(PROMOTED_TO_PRIMARY)); -+migrate_primary: -+ pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); -+ if (pcpu_ctx) { -+ if (pcpu_ctx->scheduled_compaction) { -+ if (bpf_timer_cancel(&pcpu_ctx->timer) < 0) -+ scx_bpf_error("Failed to cancel pcpu timer"); -+ pcpu_ctx->scheduled_compaction = false; -+ stat_inc(NEST_STAT(CANCELLED_COMPACTION)); -+ } -+ } else { -+ scx_bpf_error("Failed to lookup pcpu ctx"); -+ } -+ bpf_cpumask_set_cpu(cpu, primary); -+ /* -+ * Check to see whether the CPU is in the reserved nest. This can -+ * happen if the core is compacted concurrently with us trying to place -+ * the currently-waking task onto it. Similarly, this is the expected -+ * state of the core if we found the core in the reserve nest and are -+ * promoting it. -+ * -+ * We don't have to worry about racing with any other waking task here -+ * because we've atomically reserved the core with (some variant of) -+ * scx_bpf_pick_idle_cpu(). -+ */ -+ if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) { -+ __sync_sub_and_fetch(&nr_reserved, 1); -+ bpf_cpumask_clear_cpu(cpu, reserve); -+ } -+ bpf_rcu_read_unlock(); -+ update_attached(tctx, prev_cpu, cpu); -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(nest_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct task_ctx *tctx; -+ u64 vtime = p->scx.dsq_vtime; -+ s32 cpu = bpf_get_smp_processor_id(); -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("Unable to find task ctx"); -+ return; -+ } -+ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LOCAL)) { -+ tctx->force_local = false; -+ if (enq_flags & SCX_ENQ_LOCAL) -+ update_attached(tctx, tctx->prev_cpu, cpu); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, vtime_now - slice_ns)) -+ vtime = vtime_now - slice_ns; -+ -+ scx_bpf_dispatch_vtime(p, FALLBACK_DSQ_ID, slice_ns, vtime, -+ enq_flags); -+} -+ -+void BPF_STRUCT_OPS(nest_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct pcpu_ctx *pcpu_ctx; -+ struct bpf_cpumask *primary, *reserve; -+ s32 key = cpu; -+ bool in_primary; -+ -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!primary || !reserve) { -+ scx_bpf_error("No primary or reserve cpumask"); -+ return; -+ } -+ -+ if (!scx_bpf_consume(FALLBACK_DSQ_ID)) { -+ in_primary = bpf_cpumask_test_cpu(cpu, cast_mask(primary)); -+ -+ if (prev && (prev->scx.flags & SCX_TASK_QUEUED) && in_primary) { -+ scx_bpf_dispatch(prev, SCX_DSQ_LOCAL, slice_ns, 0); -+ return; -+ } -+ -+ stat_inc(NEST_STAT(NOT_CONSUMED)); -+ if (in_primary) { -+ pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); -+ if (!pcpu_ctx) { -+ scx_bpf_error("Failed to lookup pcpu ctx"); -+ return; -+ } -+ -+ /* -+ * Immediately demote a primary core if: -+ * - It's been scheduled for compaction at least -+ * r_depth times without actually being compacted. 
-+ * - The previous task on it is dying -+ * -+ * Note that we elect to not compact the "first" CPU in -+ * the mask so as to encourage at least one core to -+ * remain in the nest. It would be better to check for -+ * whether there is only one core remaining in the -+ * nest, but BPF doesn't yet have a kfunc for querying -+ * cpumask weight. -+ */ -+ if ((prev && prev->__state == TASK_DEAD) || -+ (cpu != bpf_cpumask_first(cast_mask(primary)) && pcpu_ctx->num_schedulings >= r_depth)) { -+ stat_inc(NEST_STAT(COMPACTED)); -+ bpf_cpumask_clear_cpu(cpu, primary); -+ try_make_core_reserved(cpu, reserve, false); -+ pcpu_ctx->num_schedulings = 0; -+ } else { -+ pcpu_ctx->scheduled_compaction = true; -+ /* -+ * The core isn't being used anymore. Set a -+ * timer to remove the core from the nest in -+ * p_remove if it's still unused by that point. -+ */ -+ bpf_timer_start(&pcpu_ctx->timer, p_remove_ns, -+ 0 /*BPF_F_TIMER_CPU_PIN*/); -+ pcpu_ctx->num_schedulings++; -+ stat_inc(NEST_STAT(SCHEDULED_COMPACTION)); -+ } -+ } -+ return; -+ } -+ stat_inc(NEST_STAT(CONSUMED)); -+} -+ -+void BPF_STRUCT_OPS(nest_running, struct task_struct *p) -+{ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. -+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(nest_stopping, struct task_struct *p, bool runnable) -+{ -+ /* scale the execution time by the inverse of the weight and charge */ -+ p->scx.dsq_vtime += (slice_ns - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+s32 BPF_STRUCT_OPS(nest_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct task_ctx *tctx; -+ struct bpf_cpumask *cpumask; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!tctx) -+ return -ENOMEM; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ cpumask = bpf_kptr_xchg(&tctx->tmp_mask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ tctx->attached_core = -1; -+ tctx->prev_cpu = -1; -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(nest_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+static int compact_primary_core(void *map, int *key, struct bpf_timer *timer) -+{ -+ struct bpf_cpumask *primary, *reserve; -+ s32 cpu = bpf_get_smp_processor_id(); -+ struct pcpu_ctx *pcpu_ctx; -+ -+ stat_inc(NEST_STAT(COMPACTED)); -+ /* -+ * If we made it to this callback, it means that the timer callback was -+ * never cancelled, and so the core needs to be demoted from the -+ * primary nest. 
-+ */ -+ pcpu_ctx = bpf_map_lookup_elem(&pcpu_ctxs, &cpu); -+ if (!pcpu_ctx) { -+ scx_bpf_error("Couldn't lookup pcpu ctx"); -+ return 0; -+ } -+ bpf_rcu_read_lock(); -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!primary || !reserve) { -+ scx_bpf_error("Couldn't find primary or reserve"); -+ bpf_rcu_read_unlock(); -+ return 0; -+ } -+ -+ bpf_cpumask_clear_cpu(cpu, primary); -+ try_make_core_reserved(cpu, reserve, false); -+ bpf_rcu_read_unlock(); -+ pcpu_ctx->num_schedulings = 0; -+ pcpu_ctx->scheduled_compaction = false; -+ return 0; -+} -+ -+static int stats_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ s32 cpu; -+ struct bpf_cpumask *primary, *reserve; -+ const struct cpumask *idle; -+ stats_primary_mask = 0; -+ stats_reserved_mask = 0; -+ stats_other_mask = 0; -+ stats_idle_mask = 0; -+ long err; -+ -+ bpf_rcu_read_lock(); -+ primary = primary_cpumask; -+ reserve = reserve_cpumask; -+ if (!primary || !reserve) { -+ bpf_rcu_read_unlock(); -+ scx_bpf_error("Failed to lookup primary or reserve"); -+ return 0; -+ } -+ -+ idle = scx_bpf_get_idle_cpumask(); -+ bpf_for(cpu, 0, nr_cpus) { -+ if (bpf_cpumask_test_cpu(cpu, cast_mask(primary))) -+ stats_primary_mask |= (1ULL << cpu); -+ else if (bpf_cpumask_test_cpu(cpu, cast_mask(reserve))) -+ stats_reserved_mask |= (1ULL << cpu); -+ else -+ stats_other_mask |= (1ULL << cpu); -+ -+ if (bpf_cpumask_test_cpu(cpu, idle)) -+ stats_idle_mask |= (1ULL << cpu); -+ } -+ bpf_rcu_read_unlock(); -+ scx_bpf_put_idle_cpumask(idle); -+ -+ err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); -+ if (err) -+ scx_bpf_error("Failed to arm stats timer"); -+ -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(nest_init) -+{ -+ struct bpf_cpumask *cpumask; -+ s32 cpu; -+ int err; -+ struct bpf_timer *timer; -+ u32 key = 0; -+ -+ scx_bpf_switch_all(); -+ -+ err = scx_bpf_create_dsq(FALLBACK_DSQ_ID, NUMA_NO_NODE); -+ if (err) { -+ scx_bpf_error("Failed to create fallback DSQ"); -+ return err; -+ } -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ bpf_cpumask_clear(cpumask); -+ cpumask = bpf_kptr_xchg(&primary_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ -+ bpf_cpumask_clear(cpumask); -+ cpumask = bpf_kptr_xchg(&reserve_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ bpf_for(cpu, 0, nr_cpus) { -+ s32 key = cpu; -+ struct pcpu_ctx *ctx = bpf_map_lookup_elem(&pcpu_ctxs, &key); -+ -+ if (!ctx) { -+ scx_bpf_error("Failed to lookup pcpu_ctx"); -+ return -ENOENT; -+ } -+ ctx->scheduled_compaction = false; -+ if (bpf_timer_init(&ctx->timer, &pcpu_ctxs, CLOCK_BOOTTIME)) { -+ scx_bpf_error("Failed to initialize pcpu timer"); -+ return -EINVAL; -+ } -+ ctx->num_schedulings = 0; -+ bpf_timer_set_callback(&ctx->timer, compact_primary_core); -+ } -+ -+ timer = bpf_map_lookup_elem(&stats_timer, &key); -+ if (!timer) { -+ scx_bpf_error("Failed to lookup central timer"); -+ return -ESRCH; -+ } -+ bpf_timer_init(timer, &stats_timer, CLOCK_BOOTTIME); -+ bpf_timer_set_callback(timer, stats_timerfn); -+ err = bpf_timer_start(timer, sampling_cadence_ns - 5000, 0); -+ if (err) -+ scx_bpf_error("Failed to arm stats timer"); -+ -+ return err; -+} -+ -+void BPF_STRUCT_OPS(nest_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops nest_ops = { -+ .select_cpu = (void *)nest_select_cpu, -+ .enqueue = (void *)nest_enqueue, -+ .dispatch = (void 
*)nest_dispatch, -+ .running = (void *)nest_running, -+ .stopping = (void *)nest_stopping, -+ .prep_enable = (void *)nest_prep_enable, -+ .enable = (void *)nest_enable, -+ .init = (void *)nest_init, -+ .exit = (void *)nest_exit, -+ .flags = 0, -+ .name = "nest", -+}; -diff --git a/tools/sched_ext/scx_nest.c b/tools/sched_ext/scx_nest.c -new file mode 100644 -index 000000000..90f5a8bd2 ---- /dev/null -+++ b/tools/sched_ext/scx_nest.c -@@ -0,0 +1,227 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "user_exit_info.h" -+#include "scx_nest.skel.h" -+#include "scx_common.h" -+#include "scx_nest.h" -+ -+#define SAMPLING_CADENCE_S 2 -+ -+const char help_fmt[] = -+"A Nest sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-p] [-d DELAY] [-m ] [-i ITERS]\n" -+"\n" -+" -d DELAY_US Delay (us), before removing an idle core from the primary nest (default 2000us / 2ms)\n" -+" -m R_MAX Maximum number of cores in the reserve nest (default 5)\n" -+" -i ITERS Number of successive placement failures tolerated before trying to aggressively expand primary nest (default 2), or 0 to disable\n" -+" -s SLICE_US Override slice duration in us (default 20000us / 20ms)\n" -+" -D R_SCHED Override the number of times that a core may be scheduled for compaction before having compaction happen immediately (default 5), or -1 to disable\n" -+" -I First try to find a fully idle core, and then any idle core, when searching nests. Default behavior is to ignore hypertwins and check for any idle core.\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int nest) -+{ -+ exit_req = 1; -+} -+ -+struct nest_stat { -+ const char *label; -+ enum nest_stat_group group; -+ enum nest_stat_idx idx; -+}; -+ -+#define NEST_ST(__stat, __grp, __desc) { \ -+ .label = #__stat, \ -+ .group = __grp, \ -+ .idx = NEST_STAT(__stat) \ -+}, -+static struct nest_stat nest_stats[NEST_STAT(NR)] = { -+#include "scx_nest_stats_table.h" -+}; -+#undef NEST_ST -+ -+static void read_stats(struct scx_nest *skel, u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ u64 cnts[NEST_STAT(NR)][nr_cpus]; -+ u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * NEST_STAT(NR)); -+ -+ for (idx = 0; idx < NEST_STAT(NR); idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+static void print_underline(const char *str) -+{ -+ char buf[64]; -+ size_t len; -+ -+ len = strlen(str); -+ memset(buf, '-', len); -+ buf[len] = '\0'; -+ printf("\n\n%s\n%s\n", str, buf); -+} -+ -+static void print_stat_grp(enum nest_stat_group grp) -+{ -+ const char *group; -+ -+ switch (grp) { -+ case STAT_GRP_WAKEUP: -+ group = "Wakeup stats"; -+ break; -+ case STAT_GRP_NEST: -+ group = "Nest stats"; -+ break; -+ case STAT_GRP_CONSUME: -+ group = "Consume stats"; -+ break; -+ default: -+ group = "Unknown stats"; -+ break; -+ } -+ -+ print_underline(group); -+} -+ -+static void print_active_nests(const struct scx_nest *skel) -+{ -+ u64 primary = skel->bss->stats_primary_mask; -+ u64 reserved = skel->bss->stats_reserved_mask; -+ u64 other = skel->bss->stats_other_mask; -+ u64 idle = skel->bss->stats_idle_mask; 
-+ u32 nr_cpus = skel->rodata->nr_cpus, cpu; -+ int idx; -+ char cpus[nr_cpus + 1]; -+ -+ memset(cpus, 0, nr_cpus + 1); -+ print_underline("Masks"); -+ for (idx = 0; idx < 4; idx++) { -+ const char *mask_str; -+ u64 mask, total = 0; -+ -+ memset(cpus, '-', nr_cpus); -+ if (idx == 0) { -+ mask_str = "PRIMARY"; -+ mask = primary; -+ } else if (idx == 1) { -+ mask_str = "RESERVED"; -+ mask = reserved; -+ } else if (idx == 2) { -+ mask_str = "OTHER"; -+ mask = other; -+ } else { -+ mask_str = "IDLE"; -+ mask = idle; -+ } -+ for (cpu = 0; cpu < nr_cpus; cpu++) { -+ if (mask & (1ULL << cpu)) { -+ cpus[cpu] = '*'; -+ total++; -+ } -+ } -+ printf("%-9s(%2lu): | %s |\n", mask_str, total, cpus); -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_nest *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_nest__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ skel->rodata->sampling_cadence_ns = SAMPLING_CADENCE_S * 1000 * 1000 * 1000; -+ -+ while ((opt = getopt(argc, argv, "hId:D:m:i:s:")) != -1) { -+ switch (opt) { -+ case 'd': -+ skel->rodata->p_remove_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'D': -+ skel->rodata->r_depth = strtoull(optarg, NULL, 0); -+ break; -+ case 'm': -+ skel->rodata->r_max = strtoull(optarg, NULL, 0); -+ break; -+ case 'i': -+ skel->rodata->r_impatient = strtoull(optarg, NULL, 0); -+ break; -+ case 'I': -+ skel->rodata->find_fully_idle = true; -+ break; -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_BUG_ON(scx_nest__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.nest_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ u64 stats[NEST_STAT(NR)]; -+ enum nest_stat_idx i; -+ enum nest_stat_group last_grp = -1; -+ -+ read_stats(skel, stats); -+ for (i = 0; i < NEST_STAT(NR); i++) { -+ struct nest_stat *nest_stat; -+ -+ nest_stat = &nest_stats[i]; -+ if (nest_stat->group != last_grp) { -+ print_stat_grp(nest_stat->group); -+ last_grp = nest_stat->group; -+ } -+ printf("%s=%lu\n", nest_stat->label, stats[nest_stat->idx]); -+ } -+ printf("\n"); -+ print_active_nests(skel); -+ printf("\n"); -+ printf("\n"); -+ printf("\n"); -+ fflush(stdout); -+ sleep(SAMPLING_CADENCE_S); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_nest__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_nest.h b/tools/sched_ext/scx_nest.h -new file mode 100644 -index 000000000..060444f81 ---- /dev/null -+++ b/tools/sched_ext/scx_nest.h -@@ -0,0 +1,18 @@ -+#ifndef __SCX_NEST_H -+#define __SCX_NEST_H -+ -+enum nest_stat_group { -+ STAT_GRP_WAKEUP, -+ STAT_GRP_NEST, -+ STAT_GRP_CONSUME, -+}; -+ -+#define NEST_STAT(__stat) BPFSTAT_##__stat -+#define NEST_ST(__stat, __grp, __desc) NEST_STAT(__stat), -+enum nest_stat_idx { -+#include "scx_nest_stats_table.h" -+ NEST_ST(NR, 0, 0) -+}; -+#undef NEST_ST -+ -+#endif /* __SCX_NEST_H */ -diff --git a/tools/sched_ext/scx_nest_stats_table.h b/tools/sched_ext/scx_nest_stats_table.h -new file mode 100644 -index 000000000..b6ef2e4d3 ---- /dev/null -+++ b/tools/sched_ext/scx_nest_stats_table.h -@@ -0,0 +1,19 @@ -+NEST_ST(WAKEUP_ATTACHED, STAT_GRP_WAKEUP, "Attached CPU was 
idle, and in primary nest") -+NEST_ST(WAKEUP_PREV_PRIMARY, STAT_GRP_WAKEUP, "Previous CPU was idle, and in primary nest") -+NEST_ST(WAKEUP_FULLY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to fully idle primary nest core") -+NEST_ST(WAKEUP_ANY_IDLE_PRIMARY, STAT_GRP_WAKEUP, "Woken up to idle logical primary nest core") -+NEST_ST(WAKEUP_FULLY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to fully idle reserve nest core") -+NEST_ST(WAKEUP_ANY_IDLE_RESERVE, STAT_GRP_WAKEUP, "Woken up to idle logical reserve nest core") -+NEST_ST(WAKEUP_IDLE_OTHER, STAT_GRP_WAKEUP, "Woken to any idle logical core in p->cpus_ptr") -+ -+NEST_ST(TASK_IMPATIENT, STAT_GRP_NEST, "A task was found to be impatient") -+NEST_ST(PROMOTED_TO_PRIMARY, STAT_GRP_NEST, "A core was promoted into the primary nest") -+NEST_ST(PROMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was promoted into the reserve nest") -+NEST_ST(DEMOTED_TO_RESERVED, STAT_GRP_NEST, "A core was demoted into the reserve nest") -+NEST_ST(RESERVED_AT_CAPACITY, STAT_GRP_NEST, "Reserved nest was at capacity") -+NEST_ST(SCHEDULED_COMPACTION, STAT_GRP_NEST, "Scheduled a primary core to be compacted") -+NEST_ST(CANCELLED_COMPACTION, STAT_GRP_NEST, "Cancelled a primary core from being compacted at task wakeup time") -+NEST_ST(COMPACTED, STAT_GRP_NEST, "A core was compacted") -+ -+NEST_ST(CONSUMED, STAT_GRP_CONSUME, "A task was consumed from the global DSQ") -+NEST_ST(NOT_CONSUMED, STAT_GRP_CONSUME, "There was no task in the global DSQ") -diff --git a/tools/sched_ext/scx_pair.bpf.c b/tools/sched_ext/scx_pair.bpf.c -new file mode 100644 -index 000000000..9c9cf97f4 ---- /dev/null -+++ b/tools/sched_ext/scx_pair.bpf.c -@@ -0,0 +1,626 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext core-scheduler which always makes every sibling CPU pair -+ * execute from the same CPU cgroup. -+ * -+ * This scheduler is a minimal implementation and would need some form of -+ * priority handling both inside each cgroup and across the cgroups to be -+ * practically useful. -+ * -+ * Each CPU in the system is paired with exactly one other CPU, according to a -+ * "stride" value that can be specified when the BPF scheduler program is first -+ * loaded. Throughout the runtime of the scheduler, these CPU pairs guarantee -+ * that they will only ever schedule tasks that belong to the same CPU cgroup. -+ * -+ * Scheduler Initialization -+ * ------------------------ -+ * -+ * The scheduler BPF program is first initialized from user space, before it is -+ * enabled. During this initialization process, each CPU on the system is -+ * assigned several values that are constant throughout its runtime: -+ * -+ * 1. *Pair CPU*: The CPU that it synchronizes with when making scheduling -+ * decisions. Paired CPUs always schedule tasks from the same -+ * CPU cgroup, and synchronize with each other to guarantee -+ * that this constraint is not violated. -+ * 2. *Pair ID*: Each CPU pair is assigned a Pair ID, which is used to access -+ * a struct pair_ctx object that is shared between the pair. -+ * 3. *In-pair-index*: An index, 0 or 1, that is assigned to each core in the -+ * pair. Each struct pair_ctx has an active_mask field, -+ * which is a bitmap used to indicate whether each core -+ * in the pair currently has an actively running task. -+ * This index specifies which entry in the bitmap corresponds -+ * to each CPU in the pair. 
-+ * -+ * During this initialization, the CPUs are paired according to a "stride" that -+ * may be specified when invoking the user space program that initializes and -+ * loads the scheduler. By default, the stride is 1/2 the total number of CPUs. -+ * -+ * Tasks and cgroups -+ * ----------------- -+ * -+ * Every cgroup in the system is registered with the scheduler using the -+ * pair_cgroup_init() callback, and every task in the system is associated with -+ * exactly one cgroup. At a high level, the idea with the pair scheduler is to -+ * always schedule tasks from the same cgroup within a given CPU pair. When a -+ * task is enqueued (i.e. passed to the pair_enqueue() callback function), its -+ * cgroup ID is read from its task struct, and then a corresponding queue map -+ * is used to FIFO-enqueue the task for that cgroup. -+ * -+ * If you look through the implementation of the scheduler, you'll notice that -+ * there is quite a bit of complexity involved with looking up the per-cgroup -+ * FIFO queue that we enqueue tasks in. For example, there is a cgrp_q_idx_hash -+ * BPF hash map that is used to map a cgroup ID to a globally unique ID that's -+ * allocated in the BPF program. This is done because we use separate maps to -+ * store the FIFO queue of tasks, and the length of that map, per cgroup. This -+ * complexity is only present because of current deficiencies in BPF that will -+ * soon be addressed. The main point to keep in mind is that newly enqueued -+ * tasks are added to their cgroup's FIFO queue. -+ * -+ * Dispatching tasks -+ * ----------------- -+ * -+ * This section will describe how enqueued tasks are dispatched and scheduled. -+ * Tasks are dispatched in pair_dispatch(), and at a high level the workflow is -+ * as follows: -+ * -+ * 1. Fetch the struct pair_ctx for the current CPU. As mentioned above, this is -+ * the structure that's used to synchronize amongst the two pair CPUs in their -+ * scheduling decisions. After any of the following events have occurred: -+ * -+ * - The cgroup's slice run has expired, or -+ * - The cgroup becomes empty, or -+ * - Either CPU in the pair is preempted by a higher priority scheduling class -+ * -+ * The cgroup transitions to the draining state and stops executing new tasks -+ * from the cgroup. -+ * -+ * 2. If the pair is still executing a task, mark the pair_ctx as draining, and -+ * wait for the pair CPU to be preempted. -+ * -+ * 3. Otherwise, if the pair CPU is not running a task, we can move onto -+ * scheduling new tasks. Pop the next cgroup id from the top_q queue. -+ * -+ * 4. Pop a task from that cgroup's FIFO task queue, and begin executing it. -+ * -+ * Note again that this scheduling behavior is simple, but the implementation -+ * is complex mostly because this it hits several BPF shortcomings and has to -+ * work around in often awkward ways. Most of the shortcomings are expected to -+ * be resolved in the near future which should allow greatly simplifying this -+ * scheduler. -+ * -+ * Dealing with preemption -+ * ----------------------- -+ * -+ * SCX is the lowest priority sched_class, and could be preempted by them at -+ * any time. To address this, the scheduler implements pair_cpu_release() and -+ * pair_cpu_acquire() callbacks which are invoked by the core scheduler when -+ * the scheduler loses and gains control of the CPU respectively. 
-+ * -+ * In pair_cpu_release(), we mark the pair_ctx as having been preempted, and -+ * then invoke: -+ * -+ * scx_bpf_kick_cpu(pair_cpu, SCX_KICK_PREEMPT | SCX_KICK_WAIT); -+ * -+ * This preempts the pair CPU, and waits until it has re-entered the scheduler -+ * before returning. This is necessary to ensure that the higher priority -+ * sched_class that preempted our scheduler does not schedule a task -+ * concurrently with our pair CPU. -+ * -+ * When the CPU is re-acquired in pair_cpu_acquire(), we unmark the preemption -+ * in the pair_ctx, and send another resched IPI to the pair CPU to re-enable -+ * pair scheduling. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+#include "scx_pair.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool switch_partial; -+ -+/* !0 for veristat, set during init */ -+const volatile u32 nr_cpu_ids = 1; -+ -+/* a pair of CPUs stay on a cgroup for this duration */ -+const volatile u32 pair_batch_dur_ns = SCX_SLICE_DFL; -+ -+/* cpu ID -> pair cpu ID */ -+const volatile s32 RESIZABLE_ARRAY(rodata, pair_cpu); -+ -+/* cpu ID -> pair_id */ -+const volatile u32 RESIZABLE_ARRAY(rodata, pair_id); -+ -+/* CPU ID -> CPU # in the pair (0 or 1) */ -+const volatile u32 RESIZABLE_ARRAY(rodata, in_pair_idx); -+ -+struct pair_ctx { -+ struct bpf_spin_lock lock; -+ -+ /* the cgroup the pair is currently executing */ -+ u64 cgid; -+ -+ /* the pair started executing the current cgroup at */ -+ u64 started_at; -+ -+ /* whether the current cgroup is draining */ -+ bool draining; -+ -+ /* the CPUs that are currently active on the cgroup */ -+ u32 active_mask; -+ -+ /* -+ * the CPUs that are currently preempted and running tasks in a -+ * different scheduler. -+ */ -+ u32 preempted_mask; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct pair_ctx); -+} pair_ctx SEC(".maps"); -+ -+/* queue of cgrp_q's possibly with tasks on them */ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ /* -+ * Because it's difficult to build strong synchronization encompassing -+ * multiple non-trivial operations in BPF, this queue is managed in an -+ * opportunistic way so that we guarantee that a cgroup w/ active tasks -+ * is always on it but possibly multiple times. Once we have more robust -+ * synchronization constructs and e.g. linked list, we should be able to -+ * do this in a prettier way but for now just size it big enough. -+ */ -+ __uint(max_entries, 4 * MAX_CGRPS); -+ __type(value, u64); -+} top_q SEC(".maps"); -+ -+/* per-cgroup q which FIFOs the tasks from the cgroup */ -+struct cgrp_q { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, MAX_QUEUED); -+ __type(value, u32); -+}; -+ -+/* -+ * Ideally, we want to allocate cgrp_q and cgrq_q_len in the cgroup local -+ * storage; however, a cgroup local storage can only be accessed from the BPF -+ * progs attached to the cgroup. For now, work around by allocating array of -+ * cgrp_q's and then allocating per-cgroup indices. -+ * -+ * Another caveat: It's difficult to populate a large array of maps statically -+ * or from BPF. Initialize it from userland. -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, MAX_CGRPS); -+ __type(key, s32); -+ __array(values, struct cgrp_q); -+} cgrp_q_arr SEC(".maps"); -+ -+static u64 cgrp_q_len[MAX_CGRPS]; -+ -+/* -+ * This and cgrp_q_idx_hash combine into a poor man's IDR. 
This likely would be -+ * useful to have as a map type. -+ */ -+static u32 cgrp_q_idx_cursor; -+static u64 cgrp_q_idx_busy[MAX_CGRPS]; -+ -+/* -+ * All added up, the following is what we do: -+ * -+ * 1. When a cgroup is enabled, RR cgroup_q_idx_busy array doing cmpxchg looking -+ * for a free ID. If not found, fail cgroup creation with -EBUSY. -+ * -+ * 2. Hash the cgroup ID to the allocated cgrp_q_idx in the following -+ * cgrp_q_idx_hash. -+ * -+ * 3. Whenever a cgrp_q needs to be accessed, first look up the cgrp_q_idx from -+ * cgrp_q_idx_hash and then access the corresponding entry in cgrp_q_arr. -+ * -+ * This is sadly complicated for something pretty simple. Hopefully, we should -+ * be able to simplify in the future. -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, MAX_CGRPS); -+ __uint(key_size, sizeof(u64)); /* cgrp ID */ -+ __uint(value_size, sizeof(s32)); /* cgrp_q idx */ -+} cgrp_q_idx_hash SEC(".maps"); -+ -+/* statistics */ -+u64 nr_total, nr_dispatched, nr_missing, nr_kicks, nr_preemptions; -+u64 nr_exps, nr_exp_waits, nr_exp_empty; -+u64 nr_cgrp_next, nr_cgrp_coll, nr_cgrp_empty; -+ -+struct user_exit_info uei; -+ -+static bool time_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+void BPF_STRUCT_OPS(pair_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ struct cgrp_q *cgq; -+ s32 pid = p->pid; -+ u64 cgid; -+ u32 *q_idx; -+ u64 *cgq_len; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgid = cgrp->kn->id; -+ bpf_cgroup_release(cgrp); -+ -+ /* find the cgroup's q and push @p into it */ -+ q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); -+ if (!q_idx) { -+ scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); -+ return; -+ } -+ -+ cgq = bpf_map_lookup_elem(&cgrp_q_arr, q_idx); -+ if (!cgq) { -+ scx_bpf_error("failed to lookup q_arr for cgroup[%llu] q_idx[%u]", -+ cgid, *q_idx); -+ return; -+ } -+ -+ if (bpf_map_push_elem(cgq, &pid, 0)) { -+ scx_bpf_error("cgroup[%llu] queue overflow", cgid); -+ return; -+ } -+ -+ /* bump q len, if going 0 -> 1, queue cgroup into the top_q */ -+ cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); -+ if (!cgq_len) { -+ scx_bpf_error("MEMBER_VTPR malfunction"); -+ return; -+ } -+ -+ if (!__sync_fetch_and_add(cgq_len, 1) && -+ bpf_map_push_elem(&top_q, &cgid, 0)) { -+ scx_bpf_error("top_q overflow"); -+ return; -+ } -+} -+ -+static int lookup_pairc_and_mask(s32 cpu, struct pair_ctx **pairc, u32 *mask) -+{ -+ u32 *vptr; -+ -+ vptr = (u32 *)ARRAY_ELEM_PTR(pair_id, cpu, nr_cpu_ids); -+ if (!vptr) -+ return -EINVAL; -+ -+ *pairc = bpf_map_lookup_elem(&pair_ctx, vptr); -+ if (!(*pairc)) -+ return -EINVAL; -+ -+ vptr = (u32 *)ARRAY_ELEM_PTR(in_pair_idx, cpu, nr_cpu_ids); -+ if (!vptr) -+ return -EINVAL; -+ -+ *mask = 1U << *vptr; -+ -+ return 0; -+} -+ -+static int try_dispatch(s32 cpu) -+{ -+ struct pair_ctx *pairc; -+ struct bpf_map *cgq_map; -+ struct task_struct *p; -+ u64 now = bpf_ktime_get_ns(); -+ bool kick_pair = false; -+ bool expired, pair_preempted; -+ u32 *vptr, in_pair_mask; -+ s32 pid, q_idx; -+ u64 cgid; -+ int ret; -+ -+ ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); -+ if (ret) { -+ scx_bpf_error("failed to lookup pairc and in_pair_mask for cpu[%d]", -+ cpu); -+ return -ENOENT; -+ } -+ -+ bpf_spin_lock(&pairc->lock); -+ pairc->active_mask &= ~in_pair_mask; -+ -+ expired = time_before(pairc->started_at + pair_batch_dur_ns, now); -+ if (expired || pairc->draining) { -+ u64 new_cgid = 0; -+ -+ 
__sync_fetch_and_add(&nr_exps, 1); -+ -+ /* -+ * We're done with the current cgid. An obvious optimization -+ * would be not draining if the next cgroup is the current one. -+ * For now, be dumb and always expire. -+ */ -+ pairc->draining = true; -+ -+ pair_preempted = pairc->preempted_mask; -+ if (pairc->active_mask || pair_preempted) { -+ /* -+ * The other CPU is still active, or is no longer under -+ * our control due to e.g. being preempted by a higher -+ * priority sched_class. We want to wait until this -+ * cgroup expires, or until control of our pair CPU has -+ * been returned to us. -+ * -+ * If the pair controls its CPU, and the time already -+ * expired, kick. When the other CPU arrives at -+ * dispatch and clears its active mask, it'll push the -+ * pair to the next cgroup and kick this CPU. -+ */ -+ __sync_fetch_and_add(&nr_exp_waits, 1); -+ bpf_spin_unlock(&pairc->lock); -+ if (expired && !pair_preempted) -+ kick_pair = true; -+ goto out_maybe_kick; -+ } -+ -+ bpf_spin_unlock(&pairc->lock); -+ -+ /* -+ * Pick the next cgroup. It'd be easier / cleaner to not drop -+ * pairc->lock and use stronger synchronization here especially -+ * given that we'll be switching cgroups significantly less -+ * frequently than tasks. Unfortunately, bpf_spin_lock can't -+ * really protect anything non-trivial. Let's do opportunistic -+ * operations instead. -+ */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ u32 *q_idx; -+ u64 *cgq_len; -+ -+ if (bpf_map_pop_elem(&top_q, &new_cgid)) { -+ /* no active cgroup, go idle */ -+ __sync_fetch_and_add(&nr_exp_empty, 1); -+ return 0; -+ } -+ -+ q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &new_cgid); -+ if (!q_idx) -+ continue; -+ -+ /* -+ * This is the only place where empty cgroups are taken -+ * off the top_q. -+ */ -+ cgq_len = MEMBER_VPTR(cgrp_q_len, [*q_idx]); -+ if (!cgq_len || !*cgq_len) -+ continue; -+ -+ /* -+ * If it has any tasks, requeue as we may race and not -+ * execute it. -+ */ -+ bpf_map_push_elem(&top_q, &new_cgid, 0); -+ break; -+ } -+ -+ bpf_spin_lock(&pairc->lock); -+ -+ /* -+ * The other CPU may already have started on a new cgroup while -+ * we dropped the lock. Make sure that we're still draining and -+ * start on the new cgroup. 
-+ */ -+ if (pairc->draining && !pairc->active_mask) { -+ __sync_fetch_and_add(&nr_cgrp_next, 1); -+ pairc->cgid = new_cgid; -+ pairc->started_at = now; -+ pairc->draining = false; -+ kick_pair = true; -+ } else { -+ __sync_fetch_and_add(&nr_cgrp_coll, 1); -+ } -+ } -+ -+ cgid = pairc->cgid; -+ pairc->active_mask |= in_pair_mask; -+ bpf_spin_unlock(&pairc->lock); -+ -+ /* again, it'd be better to do all these with the lock held, oh well */ -+ vptr = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); -+ if (!vptr) { -+ scx_bpf_error("failed to lookup q_idx for cgroup[%llu]", cgid); -+ return -ENOENT; -+ } -+ q_idx = *vptr; -+ -+ /* claim one task from cgrp_q w/ q_idx */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ u64 *cgq_len, len; -+ -+ cgq_len = MEMBER_VPTR(cgrp_q_len, [q_idx]); -+ if (!cgq_len || !(len = *(volatile u64 *)cgq_len)) { -+ /* the cgroup must be empty, expire and repeat */ -+ __sync_fetch_and_add(&nr_cgrp_empty, 1); -+ bpf_spin_lock(&pairc->lock); -+ pairc->draining = true; -+ pairc->active_mask &= ~in_pair_mask; -+ bpf_spin_unlock(&pairc->lock); -+ return -EAGAIN; -+ } -+ -+ if (__sync_val_compare_and_swap(cgq_len, len, len - 1) != len) -+ continue; -+ -+ break; -+ } -+ -+ cgq_map = bpf_map_lookup_elem(&cgrp_q_arr, &q_idx); -+ if (!cgq_map) { -+ scx_bpf_error("failed to lookup cgq_map for cgroup[%llu] q_idx[%d]", -+ cgid, q_idx); -+ return -ENOENT; -+ } -+ -+ if (bpf_map_pop_elem(cgq_map, &pid)) { -+ scx_bpf_error("cgq_map is empty for cgroup[%llu] q_idx[%d]", -+ cgid, q_idx); -+ return -ENOENT; -+ } -+ -+ p = bpf_task_from_pid(pid); -+ if (p) { -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+ } else { -+ /* we don't handle dequeues, retry on lost tasks */ -+ __sync_fetch_and_add(&nr_missing, 1); -+ return -EAGAIN; -+ } -+ -+out_maybe_kick: -+ if (kick_pair) { -+ s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); -+ if (pair) { -+ __sync_fetch_and_add(&nr_kicks, 1); -+ scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); -+ } -+ } -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(pair_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (try_dispatch(cpu) != -EAGAIN) -+ break; -+ } -+} -+ -+void BPF_STRUCT_OPS(pair_cpu_acquire, s32 cpu, struct scx_cpu_acquire_args *args) -+{ -+ int ret; -+ u32 in_pair_mask; -+ struct pair_ctx *pairc; -+ bool kick_pair; -+ -+ ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); -+ if (ret) -+ return; -+ -+ bpf_spin_lock(&pairc->lock); -+ pairc->preempted_mask &= ~in_pair_mask; -+ /* Kick the pair CPU, unless it was also preempted. */ -+ kick_pair = !pairc->preempted_mask; -+ bpf_spin_unlock(&pairc->lock); -+ -+ if (kick_pair) { -+ s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); -+ -+ if (pair) { -+ __sync_fetch_and_add(&nr_kicks, 1); -+ scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT); -+ } -+ } -+} -+ -+void BPF_STRUCT_OPS(pair_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ int ret; -+ u32 in_pair_mask; -+ struct pair_ctx *pairc; -+ bool kick_pair; -+ -+ ret = lookup_pairc_and_mask(cpu, &pairc, &in_pair_mask); -+ if (ret) -+ return; -+ -+ bpf_spin_lock(&pairc->lock); -+ pairc->preempted_mask |= in_pair_mask; -+ pairc->active_mask &= ~in_pair_mask; -+ /* Kick the pair CPU if it's still running. 
*/ -+ kick_pair = pairc->active_mask; -+ pairc->draining = true; -+ bpf_spin_unlock(&pairc->lock); -+ -+ if (kick_pair) { -+ s32 *pair = (s32 *)ARRAY_ELEM_PTR(pair_cpu, cpu, nr_cpu_ids); -+ -+ if (pair) { -+ __sync_fetch_and_add(&nr_kicks, 1); -+ scx_bpf_kick_cpu(*pair, SCX_KICK_PREEMPT | SCX_KICK_WAIT); -+ } -+ } -+ __sync_fetch_and_add(&nr_preemptions, 1); -+} -+ -+s32 BPF_STRUCT_OPS(pair_cgroup_init, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ s32 i, q_idx; -+ -+ bpf_for(i, 0, MAX_CGRPS) { -+ q_idx = __sync_fetch_and_add(&cgrp_q_idx_cursor, 1) % MAX_CGRPS; -+ if (!__sync_val_compare_and_swap(&cgrp_q_idx_busy[q_idx], 0, 1)) -+ break; -+ } -+ if (i == MAX_CGRPS) -+ return -EBUSY; -+ -+ if (bpf_map_update_elem(&cgrp_q_idx_hash, &cgid, &q_idx, BPF_ANY)) { -+ u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [q_idx]); -+ if (busy) -+ *busy = 0; -+ return -EBUSY; -+ } -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(pair_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ s32 *q_idx; -+ -+ q_idx = bpf_map_lookup_elem(&cgrp_q_idx_hash, &cgid); -+ if (q_idx) { -+ u64 *busy = MEMBER_VPTR(cgrp_q_idx_busy, [*q_idx]); -+ if (busy) -+ *busy = 0; -+ bpf_map_delete_elem(&cgrp_q_idx_hash, &cgid); -+ } -+} -+ -+s32 BPF_STRUCT_OPS(pair_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(pair_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops pair_ops = { -+ .enqueue = (void *)pair_enqueue, -+ .dispatch = (void *)pair_dispatch, -+ .cpu_acquire = (void *)pair_cpu_acquire, -+ .cpu_release = (void *)pair_cpu_release, -+ .cgroup_init = (void *)pair_cgroup_init, -+ .cgroup_exit = (void *)pair_cgroup_exit, -+ .init = (void *)pair_init, -+ .exit = (void *)pair_exit, -+ .name = "pair", -+}; -diff --git a/tools/sched_ext/scx_pair.c b/tools/sched_ext/scx_pair.c -new file mode 100644 -index 000000000..48344af03 ---- /dev/null -+++ b/tools/sched_ext/scx_pair.c -@@ -0,0 +1,168 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_pair.h" -+#include "scx_pair.skel.h" -+ -+const char help_fmt[] = -+"A demo sched_ext core-scheduler which always makes every sibling CPU pair\n" -+"execute from the same CPU cgroup.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-S STRIDE] [-p]\n" -+"\n" -+" -S STRIDE Override CPU pair stride (default: nr_cpus_ids / 2)\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_pair *skel; -+ struct bpf_link *link; -+ __u64 seq = 0; -+ __s32 stride, i, opt, outer_fd; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_pair__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ /* pair up the earlier half to the latter by default, override with -s */ -+ stride = skel->rodata->nr_cpu_ids / 2; -+ -+ while ((opt = getopt(argc, argv, "S:ph")) != -1) { -+ switch (opt) { -+ case 'S': -+ stride = strtoul(optarg, NULL, 0); -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ bpf_map__set_max_entries(skel->maps.pair_ctx, skel->rodata->nr_cpu_ids / 2); -+ -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(rodata, pair_cpu, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(rodata, pair_id, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(rodata, in_pair_idx, skel->rodata->nr_cpu_ids); -+ -+ for (i = 0; i < skel->rodata->nr_cpu_ids; i++) -+ skel->rodata_pair_cpu->pair_cpu[i] = -1; -+ -+ printf("Pairs: "); -+ for (i = 0; i < skel->rodata->nr_cpu_ids; i++) { -+ int j = (i + stride) % skel->rodata->nr_cpu_ids; -+ -+ if (skel->rodata_pair_cpu->pair_cpu[i] >= 0) -+ continue; -+ -+ SCX_BUG_ON(i == j, -+ "Invalid stride %d - CPU%d wants to be its own pair", -+ stride, i); -+ -+ SCX_BUG_ON(skel->rodata_pair_cpu->pair_cpu[j] >= 0, -+ "Invalid stride %d - three CPUs (%d, %d, %d) want to be a pair", -+ stride, i, j, skel->rodata_pair_cpu->pair_cpu[j]); -+ -+ skel->rodata_pair_cpu->pair_cpu[i] = j; -+ skel->rodata_pair_cpu->pair_cpu[j] = i; -+ skel->rodata_pair_id->pair_id[i] = i; -+ skel->rodata_pair_id->pair_id[j] = i; -+ skel->rodata_in_pair_idx->in_pair_idx[i] = 0; -+ skel->rodata_in_pair_idx->in_pair_idx[j] = 1; -+ -+ printf("[%d, %d] ", i, j); -+ } -+ printf("\n"); -+ -+ SCX_BUG_ON(scx_pair__load(skel), "Failed to load skel"); -+ -+ /* -+ * Populate the cgrp_q_arr map which is an array containing per-cgroup -+ * queues. It'd probably be better to do this from BPF but there are too -+ * many to initialize statically and there's no way to dynamically -+ * populate from BPF. 
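-+ * (The loop below creates each inner BPF_MAP_TYPE_QUEUE with bpf_map_create()
-+ * and installs it into the array-of-maps via bpf_map_update_elem(); the inner
-+ * fd can be closed once it has been installed.)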
-+ */ -+ outer_fd = bpf_map__fd(skel->maps.cgrp_q_arr); -+ SCX_BUG_ON(outer_fd < 0, "Failed to get outer_fd: %d", outer_fd); -+ -+ printf("Initializing"); -+ for (i = 0; i < MAX_CGRPS; i++) { -+ __s32 inner_fd; -+ -+ if (exit_req) -+ break; -+ -+ inner_fd = bpf_map_create(BPF_MAP_TYPE_QUEUE, NULL, 0, -+ sizeof(__u32), MAX_QUEUED, NULL); -+ SCX_BUG_ON(inner_fd < 0, "Failed to get inner_fd: %d", -+ inner_fd); -+ SCX_BUG_ON(bpf_map_update_elem(outer_fd, &i, &inner_fd, BPF_ANY), -+ "Failed to set inner map"); -+ close(inner_fd); -+ -+ if (!(i % 10)) -+ printf("."); -+ fflush(stdout); -+ } -+ printf("\n"); -+ -+ /* -+ * Fully initialized, attach and run. -+ */ -+ link = bpf_map__attach_struct_ops(skel->maps.pair_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf(" total:%10lu dispatch:%10lu missing:%10lu\n", -+ skel->bss->nr_total, -+ skel->bss->nr_dispatched, -+ skel->bss->nr_missing); -+ printf(" kicks:%10lu preemptions:%7lu\n", -+ skel->bss->nr_kicks, -+ skel->bss->nr_preemptions); -+ printf(" exp:%10lu exp_wait:%10lu exp_empty:%10lu\n", -+ skel->bss->nr_exps, -+ skel->bss->nr_exp_waits, -+ skel->bss->nr_exp_empty); -+ printf("cgnext:%10lu cgcoll:%10lu cgempty:%10lu\n", -+ skel->bss->nr_cgrp_next, -+ skel->bss->nr_cgrp_coll, -+ skel->bss->nr_cgrp_empty); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_pair__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_pair.h b/tools/sched_ext/scx_pair.h -new file mode 100644 -index 000000000..d9666a447 ---- /dev/null -+++ b/tools/sched_ext/scx_pair.h -@@ -0,0 +1,9 @@ -+#ifndef __SCX_EXAMPLE_PAIR_H -+#define __SCX_EXAMPLE_PAIR_H -+ -+enum { -+ MAX_QUEUED = 4096, -+ MAX_CGRPS = 4096, -+}; -+ -+#endif /* __SCX_EXAMPLE_PAIR_H */ -diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c -new file mode 100644 -index 000000000..b6365df0f ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,401 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple five-level FIFO queue scheduler. -+ * -+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets -+ * assigned to one depending on its compound weight. Each CPU round robins -+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from -+ * queue0, 2 from queue1, 4 from queue2 and so on. -+ * -+ * This scheduler demonstrates: -+ * -+ * - BPF-side queueing using PIDs. -+ * - Sleepable per-task storage allocation using ops.prep_enable(). -+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking -+ * the CPU away. -+ * - Core-sched support. -+ * -+ * This scheduler is primarily for demonstration and testing of sched_ext -+ * features and unlikely to be useful for actual workloads. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile bool switch_partial; -+const volatile u32 stall_user_nth; -+const volatile u32 stall_kernel_nth; -+const volatile u32 dsp_inf_loop_after; -+const volatile s32 disallow_tgid; -+ -+u32 test_error_cnt; -+ -+struct user_exit_info uei; -+ -+struct qmap { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, u32); -+} queue0 SEC(".maps"), -+ queue1 SEC(".maps"), -+ queue2 SEC(".maps"), -+ queue3 SEC(".maps"), -+ queue4 SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, 5); -+ __type(key, int); -+ __array(values, struct qmap); -+} queue_arr SEC(".maps") = { -+ .values = { -+ [0] = &queue0, -+ [1] = &queue1, -+ [2] = &queue2, -+ [3] = &queue3, -+ [4] = &queue4, -+ }, -+}; -+ -+/* -+ * Per-queue sequence numbers to implement core-sched ordering. -+ * -+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the -+ * sequence number of the latest dispatched task. The distance between the a -+ * task's seq and the associated queue's head seq is called the queue distance -+ * and used when comparing two tasks for ordering. See qmap_core_sched_before(). -+ */ -+static u64 core_sched_head_seqs[5]; -+static u64 core_sched_tail_seqs[5]; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local_dsq */ -+ u64 core_sched_seq; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+/* Per-cpu dispatch index and remaining count */ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(max_entries, 2); -+ __type(key, u32); -+ __type(value, u64); -+} dispatch_idx_cnt SEC(".maps"); -+ -+/* Statistics */ -+unsigned long nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -+unsigned long nr_core_sched_execed; -+ -+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ tctx->force_local = true; -+ return prev_cpu; -+ } -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ return cpu; -+ -+ return prev_cpu; -+} -+ -+static int weight_to_idx(u32 weight) -+{ -+ /* Coarsely map the compound weight to a FIFO. 
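-+ * For example, a task with the default weight of 100 falls through to the
-+ * "< 200" branch below and lands on queue2.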
*/ -+ if (weight <= 25) -+ return 0; -+ else if (weight <= 50) -+ return 1; -+ else if (weight < 200) -+ return 2; -+ else if (weight < 400) -+ return 3; -+ else -+ return 4; -+} -+ -+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ static u32 user_cnt, kernel_cnt; -+ struct task_ctx *tctx; -+ u32 pid = p->pid; -+ int idx = weight_to_idx(p->scx.weight); -+ void *ring; -+ -+ if (p->flags & PF_KTHREAD) { -+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) -+ return; -+ } else { -+ if (stall_user_nth && !(++user_cnt % stall_user_nth)) -+ return; -+ } -+ -+ if (test_error_cnt && !--test_error_cnt) -+ scx_bpf_error("test triggering error"); -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * All enqueued tasks must have their core_sched_seq updated for correct -+ * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in -+ * qmap_ops.flags. -+ */ -+ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; -+ -+ /* -+ * If qmap_select_cpu() is telling us to or this is the last runnable -+ * task on the CPU, enqueue locally. -+ */ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * If the task was re-enqueued due to the CPU being preempted by a -+ * higher priority scheduling class, just re-enqueue the task directly -+ * on the global DSQ. As we want another CPU to pick it up, find and -+ * kick an idle CPU. -+ */ -+ if (enq_flags & SCX_ENQ_REENQ) { -+ s32 cpu; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, 0, enq_flags); -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, 0); -+ return; -+ } -+ -+ ring = bpf_map_lookup_elem(&queue_arr, &idx); -+ if (!ring) { -+ scx_bpf_error("failed to find ring %d", idx); -+ return; -+ } -+ -+ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ -+ if (bpf_map_push_elem(ring, &pid, 0)) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_enqueued, 1); -+} -+ -+/* -+ * The BPF queue map doesn't support removal and sched_ext can handle spurious -+ * dispatches. qmap_dequeue() is only used to collect statistics. -+ */ -+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) -+{ -+ __sync_fetch_and_add(&nr_dequeued, 1); -+ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) -+ __sync_fetch_and_add(&nr_core_sched_execed, 1); -+} -+ -+static void update_core_sched_head_seq(struct task_struct *p) -+{ -+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ int idx = weight_to_idx(p->scx.weight); -+ -+ if (tctx) -+ core_sched_head_seqs[idx] = tctx->core_sched_seq; -+ else -+ scx_bpf_error("task_ctx lookup failed"); -+} -+ -+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ u32 zero = 0, one = 1; -+ u64 *idx = bpf_map_lookup_elem(&dispatch_idx_cnt, &zero); -+ u64 *cnt = bpf_map_lookup_elem(&dispatch_idx_cnt, &one); -+ void *fifo; -+ s32 pid; -+ int i; -+ -+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { -+ struct task_struct *p; -+ -+ /* -+ * PID 2 should be kthreadd which should mostly be idle and off -+ * the scheduler. Let's keep dispatching it to force the kernel -+ * to call this function over and over again. 
-+ */ -+ p = bpf_task_from_pid(2); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ if (!idx || !cnt) { -+ scx_bpf_error("failed to lookup idx[%p], cnt[%p]", idx, cnt); -+ return; -+ } -+ -+ for (i = 0; i < 5; i++) { -+ /* Advance the dispatch cursor and pick the fifo. */ -+ if (!*cnt) { -+ *idx = (*idx + 1) % 5; -+ *cnt = 1 << *idx; -+ } -+ (*cnt)--; -+ -+ fifo = bpf_map_lookup_elem(&queue_arr, idx); -+ if (!fifo) { -+ scx_bpf_error("failed to find ring %llu", *idx); -+ return; -+ } -+ -+ /* Dispatch or advance. */ -+ if (!bpf_map_pop_elem(fifo, &pid)) { -+ struct task_struct *p; -+ -+ p = bpf_task_from_pid(pid); -+ if (p) { -+ update_core_sched_head_seq(p); -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ *cnt = 0; -+ } -+} -+ -+/* -+ * The distance from the head of the queue scaled by the weight of the queue. -+ * The lower the number, the older the task and the higher the priority. -+ */ -+static s64 task_qdist(struct task_struct *p) -+{ -+ int idx = weight_to_idx(p->scx.weight); -+ struct task_ctx *tctx; -+ s64 qdist; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return 0; -+ } -+ -+ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; -+ -+ /* -+ * As queue index increments, the priority doubles. The queue w/ index 3 -+ * is dispatched twice more frequently than 2. Reflect the difference by -+ * scaling qdists accordingly. Note that the shift amount needs to be -+ * flipped depending on the sign to avoid flipping priority direction. -+ */ -+ if (qdist >= 0) -+ return qdist << (4 - idx); -+ else -+ return qdist << idx; -+} -+ -+/* -+ * This is called to determine the task ordering when core-sched is picking -+ * tasks to execute on SMT siblings and should encode about the same ordering as -+ * the regular scheduling path. Use the priority-scaled distances from the head -+ * of the queues to compare the two tasks which should be consistent with the -+ * dispatch path behavior. -+ */ -+bool BPF_STRUCT_OPS(qmap_core_sched_before, -+ struct task_struct *a, struct task_struct *b) -+{ -+ return task_qdist(a) > task_qdist(b); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ u32 cnt; -+ -+ /* -+ * Called when @cpu is taken by a higher priority scheduling class. This -+ * makes @cpu no longer available for executing sched_ext tasks. As we -+ * don't want the tasks in @cpu's local dsq to sit there until @cpu -+ * becomes available again, re-enqueue them into the global dsq. See -+ * %SCX_ENQ_REENQ handling in qmap_enqueue(). -+ */ -+ cnt = scx_bpf_reenqueue_local(); -+ if (cnt) -+ __sync_fetch_and_add(&nr_reenqueued, cnt); -+} -+ -+s32 BPF_STRUCT_OPS(qmap_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ if (p->tgid == disallow_tgid) -+ p->scx.disallow = true; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. 
-+ */ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+s32 BPF_STRUCT_OPS(qmap_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops qmap_ops = { -+ .select_cpu = (void *)qmap_select_cpu, -+ .enqueue = (void *)qmap_enqueue, -+ .dequeue = (void *)qmap_dequeue, -+ .dispatch = (void *)qmap_dispatch, -+ .core_sched_before = (void *)qmap_core_sched_before, -+ .cpu_release = (void *)qmap_cpu_release, -+ .prep_enable = (void *)qmap_prep_enable, -+ .init = (void *)qmap_init, -+ .exit = (void *)qmap_exit, -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 5000U, -+ .name = "qmap", -+}; -diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c -new file mode 100644 -index 000000000..edc3d0a4e ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,105 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_qmap.skel.h" -+ -+const char help_fmt[] = -+"A simple five-level FIFO queue sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-d PID] [-p]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" -+" -t COUNT Stall every COUNT'th user thread\n" -+" -T COUNT Stall every COUNT'th kernel thread\n" -+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" -+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_qmap *skel; -+ struct bpf_link *link; -+ int opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_qmap__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:d:ph")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'e': -+ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); -+ break; -+ case 't': -+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'T': -+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'l': -+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); -+ break; -+ case 'd': -+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); -+ if (skel->rodata->disallow_tgid < 0) -+ skel->rodata->disallow_tgid = getpid(); -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_BUG_ON(scx_qmap__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.qmap_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && !uei_exited(&skel->bss->uei)) { -+ long nr_enqueued = skel->bss->nr_enqueued; -+ 
long nr_dispatched = skel->bss->nr_dispatched; -+ -+ printf("enq=%lu, dsp=%lu, delta=%ld, reenq=%lu, deq=%lu, core=%lu\n", -+ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, -+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued, -+ skel->bss->nr_core_sched_execed); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_qmap__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_rusty/.gitignore b/tools/sched_ext/scx_rusty/.gitignore -new file mode 100644 -index 000000000..186dba259 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/.gitignore -@@ -0,0 +1,3 @@ -+src/bpf/.output -+Cargo.lock -+target -diff --git a/tools/sched_ext/scx_rusty/Cargo.toml b/tools/sched_ext/scx_rusty/Cargo.toml -new file mode 100644 -index 000000000..b0edd3b93 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/Cargo.toml -@@ -0,0 +1,28 @@ -+[package] -+name = "scx_rusty" -+version = "0.5.0" -+authors = ["Dan Schatzberg ", "Meta"] -+edition = "2021" -+description = "Userspace scheduling with BPF" -+license = "GPL-2.0-only" -+ -+[dependencies] -+anyhow = "1.0.65" -+bitvec = { version = "1.0", features = ["serde"] } -+clap = { version = "4.1", features = ["derive", "env", "unicode", "wrap_help"] } -+ctrlc = { version = "3.1", features = ["termination"] } -+fb_procfs = "0.7.0" -+hex = "0.4.3" -+libbpf-rs = "0.21.0" -+libbpf-sys = { version = "1.2.0", features = ["novendor", "static"] } -+libc = "0.2.137" -+log = "0.4.17" -+ordered-float = "3.4.0" -+simplelog = "0.12.0" -+ -+[build-dependencies] -+bindgen = { version = "0.61.0" } -+libbpf-cargo = "0.21.0" -+ -+[features] -+enable_backtrace = [] -diff --git a/tools/sched_ext/scx_rusty/build.rs b/tools/sched_ext/scx_rusty/build.rs -new file mode 100644 -index 000000000..c54b8f33c ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/build.rs -@@ -0,0 +1,72 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+extern crate bindgen; -+ -+use std::env; -+use std::fs::create_dir_all; -+use std::path::Path; -+use std::path::PathBuf; -+ -+use libbpf_cargo::SkeletonBuilder; -+ -+const HEADER_PATH: &str = "src/bpf/rusty.h"; -+ -+fn bindgen_rusty() { -+ // Tell cargo to invalidate the built crate whenever the wrapper changes -+ println!("cargo:rerun-if-changed={}", HEADER_PATH); -+ -+ // The bindgen::Builder is the main entry point -+ // to bindgen, and lets you build up options for -+ // the resulting bindings. -+ let bindings = bindgen::Builder::default() -+ // The input header we would like to generate -+ // bindings for. -+ .header(HEADER_PATH) -+ // Tell cargo to invalidate the built crate whenever any of the -+ // included header files changed. -+ .parse_callbacks(Box::new(bindgen::CargoCallbacks)) -+ // Finish the builder and generate the bindings. -+ .generate() -+ // Unwrap the Result and panic on failure. -+ .expect("Unable to generate bindings"); -+ -+ // Write the bindings to the $OUT_DIR/bindings.rs file. 
-+ let out_path = PathBuf::from(env::var("OUT_DIR").unwrap()); -+ bindings -+ .write_to_file(out_path.join("rusty_sys.rs")) -+ .expect("Couldn't write bindings!"); -+} -+ -+fn gen_bpf_sched(name: &str) { -+ let bpf_cflags = env::var("SCX_RUST_BPF_CFLAGS").unwrap(); -+ let clang = env::var("SCX_RUST_CLANG").unwrap(); -+ eprintln!("{}", clang); -+ let outpath = format!("./src/bpf/.output/{}.skel.rs", name); -+ let skel = Path::new(&outpath); -+ let src = format!("./src/bpf/{}.bpf.c", name); -+ let obj = format!("./src/bpf/.output/{}.bpf.o", name); -+ SkeletonBuilder::new() -+ .source(src.clone()) -+ .obj(obj) -+ .clang(clang) -+ .clang_args(bpf_cflags) -+ .build_and_generate(skel) -+ .unwrap(); -+ println!("cargo:rerun-if-changed={}", src); -+} -+ -+fn main() { -+ bindgen_rusty(); -+ // It's unfortunate we cannot use `OUT_DIR` to store the generated skeleton. -+ // Reasons are because the generated skeleton contains compiler attributes -+ // that cannot be `include!()`ed via macro. And we cannot use the `#[path = "..."]` -+ // trick either because you cannot yet `concat!(env!("OUT_DIR"), "/skel.rs")` inside -+ // the path attribute either (see https://github.com/rust-lang/rust/pull/83366). -+ // -+ // However, there is hope! When the above feature stabilizes we can clean this -+ // all up. -+ create_dir_all("./src/bpf/.output").unwrap(); -+ gen_bpf_sched("rusty"); -+} -diff --git a/tools/sched_ext/scx_rusty/rustfmt.toml b/tools/sched_ext/scx_rusty/rustfmt.toml -new file mode 100644 -index 000000000..b7258ed0a ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/rustfmt.toml -@@ -0,0 +1,8 @@ -+# Get help on options with `rustfmt --help=config` -+# Please keep these in alphabetical order. -+edition = "2021" -+group_imports = "StdExternalCrate" -+imports_granularity = "Item" -+merge_derives = false -+use_field_init_shorthand = true -+version = "Two" -diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c -new file mode 100644 -index 000000000..7a8b27cea ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.bpf.c -@@ -0,0 +1,1153 @@ -+/* Copyright (c) Meta Platforms, Inc. and affiliates. */ -+/* -+ * This software may be used and distributed according to the terms of the -+ * GNU General Public License version 2. -+ * -+ * scx_rusty is a multi-domain BPF / userspace hybrid scheduler where the BPF -+ * part does simple round robin in each domain and the userspace part -+ * calculates the load factor of each domain and tells the BPF part how to load -+ * balance the domains. -+ * -+ * Every task has an entry in the task_data map which lists which domain the -+ * task belongs to. When a task first enters the system (rusty_prep_enable), -+ * they are round-robined to a domain. -+ * -+ * rusty_select_cpu is the primary scheduling logic, invoked when a task -+ * becomes runnable. The lb_data map is populated by userspace to inform the BPF -+ * scheduler that a task should be migrated to a new domain. Otherwise, the task -+ * is scheduled in priority order as follows: -+ * * The current core if the task was woken up synchronously and there are idle -+ * cpus in the system -+ * * The previous core, if idle -+ * * The pinned-to core if the task is pinned to a specific core -+ * * Any idle cpu in the domain -+ * -+ * If none of the above conditions are met, then the task is enqueued to a -+ * dispatch queue corresponding to the domain (rusty_enqueue). 
-+ * -+ * rusty_dispatch will attempt to consume a task from its domain's -+ * corresponding dispatch queue (this occurs after scheduling any tasks directly -+ * assigned to it due to the logic in rusty_select_cpu). If no task is found, -+ * then greedy load stealing will attempt to find a task on another dispatch -+ * queue to run. -+ * -+ * Load balancing is almost entirely handled by userspace. BPF populates the -+ * task weight, dom mask and current dom in the task_data map and executes the -+ * load balance based on userspace populating the lb_data map. -+ */ -+#include "../../../scx_common.bpf.h" -+#include "../../../ravg_impl.bpf.h" -+#include "rusty.h" -+ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+/* -+ * const volatiles are set during initialization and treated as consts by the -+ * jit compiler. -+ */ -+ -+/* -+ * Domains and cpus -+ */ -+const volatile u32 nr_doms = 32; /* !0 for veristat, set during init */ -+const volatile u32 nr_cpus = 64; /* !0 for veristat, set during init */ -+const volatile u32 cpu_dom_id_map[MAX_CPUS]; -+const volatile u64 dom_cpumasks[MAX_DOMS][MAX_CPUS / 64]; -+const volatile u32 load_half_life = 1000000000 /* 1s */; -+ -+const volatile bool kthreads_local; -+const volatile bool fifo_sched; -+const volatile bool switch_partial; -+const volatile u32 greedy_threshold; -+const volatile u32 debug; -+ -+/* base slice duration */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+/* -+ * Exit info -+ */ -+int exit_kind = SCX_EXIT_NONE; -+char exit_msg[SCX_EXIT_MSG_LEN]; -+ -+/* -+ * Per-CPU context -+ */ -+struct pcpu_ctx { -+ u32 dom_rr_cur; /* used when scanning other doms */ -+ -+ /* libbpf-rs does not respect the alignment, so pad out the struct explicitly */ -+ u8 _padding[CACHELINE_SIZE - sizeof(u32)]; -+} __attribute__((aligned(CACHELINE_SIZE))); -+ -+struct pcpu_ctx pcpu_ctx[MAX_CPUS]; -+ -+/* -+ * Domain context -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct dom_ctx); -+ __uint(max_entries, MAX_DOMS); -+ __uint(map_flags, 0); -+} dom_data SEC(".maps"); -+ -+struct lock_wrapper { -+ struct bpf_spin_lock lock; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __type(key, u32); -+ __type(value, struct lock_wrapper); -+ __uint(max_entries, MAX_DOMS); -+ __uint(map_flags, 0); -+} dom_load_locks SEC(".maps"); -+ -+struct dom_active_pids { -+ u64 gen; -+ u64 read_idx; -+ u64 write_idx; -+ s32 pids[MAX_DOM_ACTIVE_PIDS]; -+}; -+ -+struct dom_active_pids dom_active_pids[MAX_DOMS]; -+ -+const u64 ravg_1 = 1 << RAVG_FRAC_BITS; -+ -+static void dom_load_adj(u32 dom_id, s64 adj, u64 now) -+{ -+ struct dom_ctx *domc; -+ struct lock_wrapper *lockw; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ lockw = bpf_map_lookup_elem(&dom_load_locks, &dom_id); -+ -+ if (!domc || !lockw) { -+ scx_bpf_error("dom_ctx / lock lookup failed"); -+ return; -+ } -+ -+ bpf_spin_lock(&lockw->lock); -+ domc->load += adj; -+ ravg_accumulate(&domc->load_rd, domc->load, now, load_half_life); -+ bpf_spin_unlock(&lockw->lock); -+ -+ if (adj < 0 && (s64)domc->load < 0) -+ scx_bpf_error("cpu%d dom%u load underflow (load=%lld adj=%lld)", -+ bpf_get_smp_processor_id(), dom_id, domc->load, adj); -+ -+ if (debug >=2 && -+ (!domc->dbg_load_printed_at || now - domc->dbg_load_printed_at >= 1000000000)) { -+ bpf_printk("LOAD ADJ dom=%u adj=%lld load=%llu", -+ dom_id, -+ adj, -+ ravg_read(&domc->load_rd, now, load_half_life) >> RAVG_FRAC_BITS); -+ 
domc->dbg_load_printed_at = now; -+ } -+} -+ -+static void dom_load_xfer_task(struct task_struct *p, struct task_ctx *taskc, -+ u32 from_dom_id, u32 to_dom_id, u64 now) -+{ -+ struct dom_ctx *from_domc, *to_domc; -+ struct lock_wrapper *from_lockw, *to_lockw; -+ struct ravg_data task_load_rd; -+ u64 from_load[2], to_load[2], task_load; -+ -+ from_domc = bpf_map_lookup_elem(&dom_data, &from_dom_id); -+ from_lockw = bpf_map_lookup_elem(&dom_load_locks, &from_dom_id); -+ to_domc = bpf_map_lookup_elem(&dom_data, &to_dom_id); -+ to_lockw = bpf_map_lookup_elem(&dom_load_locks, &to_dom_id); -+ if (!from_domc || !from_lockw || !to_domc || !to_lockw) { -+ scx_bpf_error("dom_ctx / lock lookup failed"); -+ return; -+ } -+ -+ /* -+ * @p is moving from @from_dom_id to @to_dom_id. Its load contribution -+ * should be moved together. We only track duty cycle for tasks. Scale -+ * it by weight to get load_rd. -+ */ -+ ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); -+ task_load_rd = taskc->dcyc_rd; -+ ravg_scale(&task_load_rd, p->scx.weight, 0); -+ -+ if (debug >= 2) -+ task_load = ravg_read(&task_load_rd, now, load_half_life); -+ -+ /* transfer out of @from_dom_id */ -+ bpf_spin_lock(&from_lockw->lock); -+ if (taskc->runnable) -+ from_domc->load -= p->scx.weight; -+ -+ if (debug >= 2) -+ from_load[0] = ravg_read(&from_domc->load_rd, now, load_half_life); -+ -+ ravg_transfer(&from_domc->load_rd, from_domc->load, -+ &task_load_rd, taskc->runnable, load_half_life, false); -+ -+ if (debug >= 2) -+ from_load[1] = ravg_read(&from_domc->load_rd, now, load_half_life); -+ -+ bpf_spin_unlock(&from_lockw->lock); -+ -+ /* transfer into @to_dom_id */ -+ bpf_spin_lock(&to_lockw->lock); -+ if (taskc->runnable) -+ to_domc->load += p->scx.weight; -+ -+ if (debug >= 2) -+ to_load[0] = ravg_read(&to_domc->load_rd, now, load_half_life); -+ -+ ravg_transfer(&to_domc->load_rd, to_domc->load, -+ &task_load_rd, taskc->runnable, load_half_life, true); -+ -+ if (debug >= 2) -+ to_load[1] = ravg_read(&to_domc->load_rd, now, load_half_life); -+ -+ bpf_spin_unlock(&to_lockw->lock); -+ -+ if (debug >= 2) -+ bpf_printk("XFER dom%u->%u task=%lu from=%lu->%lu to=%lu->%lu", -+ from_dom_id, to_dom_id, -+ task_load >> RAVG_FRAC_BITS, -+ from_load[0] >> RAVG_FRAC_BITS, -+ from_load[1] >> RAVG_FRAC_BITS, -+ to_load[0] >> RAVG_FRAC_BITS, -+ to_load[1] >> RAVG_FRAC_BITS); -+} -+ -+/* -+ * Statistics -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, RUSTY_NR_STATS); -+} stats SEC(".maps"); -+ -+static inline void stat_add(enum stat_idx idx, u64 addend) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p) += addend; -+} -+ -+/* Map pid -> task_ctx */ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __type(key, pid_t); -+ __type(value, struct task_ctx); -+ __uint(max_entries, 1000000); -+ __uint(map_flags, 0); -+} task_data SEC(".maps"); -+ -+struct task_ctx *lookup_task_ctx(struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ s32 pid = p->pid; -+ -+ if ((taskc = bpf_map_lookup_elem(&task_data, &pid))) { -+ return taskc; -+ } else { -+ scx_bpf_error("task_ctx lookup failed for pid %d", p->pid); -+ return NULL; -+ } -+} -+ -+/* -+ * This is populated from userspace to indicate which pids should be reassigned -+ * to new doms. 
-+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __type(key, pid_t); -+ __type(value, u32); -+ __uint(max_entries, 1000); -+ __uint(map_flags, 0); -+} lb_data SEC(".maps"); -+ -+/* -+ * Userspace tuner will frequently update the following struct with tuning -+ * parameters and bump its gen. refresh_tune_params() converts them into forms -+ * that can be used directly in the scheduling paths. -+ */ -+struct tune_input{ -+ u64 gen; -+ u64 direct_greedy_cpumask[MAX_CPUS / 64]; -+ u64 kick_greedy_cpumask[MAX_CPUS / 64]; -+} tune_input; -+ -+u64 tune_params_gen; -+private(A) struct bpf_cpumask __kptr *all_cpumask; -+private(A) struct bpf_cpumask __kptr *direct_greedy_cpumask; -+private(A) struct bpf_cpumask __kptr *kick_greedy_cpumask; -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static u32 cpu_to_dom_id(s32 cpu) -+{ -+ const volatile u32 *dom_idp; -+ -+ if (nr_doms <= 1) -+ return 0; -+ -+ dom_idp = MEMBER_VPTR(cpu_dom_id_map, [cpu]); -+ if (!dom_idp) -+ return MAX_DOMS; -+ -+ return *dom_idp; -+} -+ -+static void refresh_tune_params(void) -+{ -+ s32 cpu; -+ -+ if (tune_params_gen == tune_input.gen) -+ return; -+ -+ tune_params_gen = tune_input.gen; -+ -+ bpf_for(cpu, 0, nr_cpus) { -+ u32 dom_id = cpu_to_dom_id(cpu); -+ struct dom_ctx *domc; -+ -+ if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ return; -+ } -+ -+ if (tune_input.direct_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { -+ if (direct_greedy_cpumask) -+ bpf_cpumask_set_cpu(cpu, direct_greedy_cpumask); -+ if (domc->direct_greedy_cpumask) -+ bpf_cpumask_set_cpu(cpu, domc->direct_greedy_cpumask); -+ } else { -+ if (direct_greedy_cpumask) -+ bpf_cpumask_clear_cpu(cpu, direct_greedy_cpumask); -+ if (domc->direct_greedy_cpumask) -+ bpf_cpumask_clear_cpu(cpu, domc->direct_greedy_cpumask); -+ } -+ -+ if (tune_input.kick_greedy_cpumask[cpu / 64] & (1LLU << (cpu % 64))) { -+ if (kick_greedy_cpumask) -+ bpf_cpumask_set_cpu(cpu, kick_greedy_cpumask); -+ } else { -+ if (kick_greedy_cpumask) -+ bpf_cpumask_clear_cpu(cpu, kick_greedy_cpumask); -+ } -+ } -+} -+ -+static bool task_set_domain(struct task_ctx *taskc, struct task_struct *p, -+ u32 new_dom_id, bool init_dsq_vtime) -+{ -+ struct dom_ctx *old_domc, *new_domc; -+ struct bpf_cpumask *d_cpumask, *t_cpumask; -+ u32 old_dom_id = taskc->dom_id; -+ s64 vtime_delta; -+ -+ old_domc = bpf_map_lookup_elem(&dom_data, &old_dom_id); -+ if (!old_domc) { -+ scx_bpf_error("Failed to lookup old dom%u", old_dom_id); -+ return false; -+ } -+ -+ if (init_dsq_vtime) -+ vtime_delta = 0; -+ else -+ vtime_delta = p->scx.dsq_vtime - old_domc->vtime_now; -+ -+ new_domc = bpf_map_lookup_elem(&dom_data, &new_dom_id); -+ if (!new_domc) { -+ scx_bpf_error("Failed to lookup new dom%u", new_dom_id); -+ return false; -+ } -+ -+ d_cpumask = new_domc->cpumask; -+ if (!d_cpumask) { -+ scx_bpf_error("Failed to get dom%u cpumask kptr", -+ new_dom_id); -+ return false; -+ } -+ -+ t_cpumask = taskc->cpumask; -+ if (!t_cpumask) { -+ scx_bpf_error("Failed to look up task cpumask"); -+ return false; -+ } -+ -+ /* -+ * set_cpumask might have happened between userspace requesting LB and -+ * here and @p might not be able to run in @dom_id anymore. Verify. 
-+ */ -+ if (bpf_cpumask_intersects((const struct cpumask *)d_cpumask, -+ p->cpus_ptr)) { -+ u64 now = bpf_ktime_get_ns(); -+ -+ dom_load_xfer_task(p, taskc, taskc->dom_id, new_dom_id, now); -+ -+ p->scx.dsq_vtime = new_domc->vtime_now + vtime_delta; -+ taskc->dom_id = new_dom_id; -+ bpf_cpumask_and(t_cpumask, (const struct cpumask *)d_cpumask, -+ p->cpus_ptr); -+ } -+ -+ return taskc->dom_id == new_dom_id; -+} -+ -+s32 BPF_STRUCT_OPS(rusty_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ const struct cpumask *idle_smtmask = scx_bpf_get_idle_smtmask(); -+ struct task_ctx *taskc; -+ struct bpf_cpumask *p_cpumask; -+ bool prev_domestic, has_idle_cores; -+ s32 cpu; -+ -+ refresh_tune_params(); -+ -+ if (!(taskc = lookup_task_ctx(p)) || !(p_cpumask = taskc->cpumask)) -+ goto enoent; -+ -+ if (kthreads_local && -+ (p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ cpu = prev_cpu; -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ goto direct; -+ } -+ -+ /* -+ * If WAKE_SYNC and the machine isn't fully saturated, wake up @p to the -+ * local dsq of the waker. -+ */ -+ if (p->nr_cpus_allowed > 1 && (wake_flags & SCX_WAKE_SYNC)) { -+ struct task_struct *current = (void *)bpf_get_current_task(); -+ -+ if (!(BPF_CORE_READ(current, flags) & PF_EXITING) && -+ taskc->dom_id < MAX_DOMS) { -+ struct dom_ctx *domc; -+ struct bpf_cpumask *d_cpumask; -+ const struct cpumask *idle_cpumask; -+ bool has_idle; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &taskc->dom_id); -+ if (!domc) { -+ scx_bpf_error("Failed to find dom%u", taskc->dom_id); -+ goto enoent; -+ } -+ d_cpumask = domc->cpumask; -+ if (!d_cpumask) { -+ scx_bpf_error("Failed to acquire dom%u cpumask kptr", -+ taskc->dom_id); -+ goto enoent; -+ } -+ -+ idle_cpumask = scx_bpf_get_idle_cpumask(); -+ -+ has_idle = bpf_cpumask_intersects((const struct cpumask *)d_cpumask, -+ idle_cpumask); -+ -+ scx_bpf_put_idle_cpumask(idle_cpumask); -+ -+ if (has_idle) { -+ cpu = bpf_get_smp_processor_id(); -+ if (bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ stat_add(RUSTY_STAT_WAKE_SYNC, 1); -+ goto direct; -+ } -+ } -+ } -+ } -+ -+ /* If only one CPU is allowed, dispatch */ -+ if (p->nr_cpus_allowed == 1) { -+ stat_add(RUSTY_STAT_PINNED, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ -+ has_idle_cores = !bpf_cpumask_empty(idle_smtmask); -+ -+ /* did @p get pulled out to a foreign domain by e.g. greedy execution? */ -+ prev_domestic = bpf_cpumask_test_cpu(prev_cpu, -+ (const struct cpumask *)p_cpumask); -+ -+ /* -+ * See if we want to keep @prev_cpu. We want to keep @prev_cpu if the -+ * whole physical core is idle. If the sibling[s] are busy, it's likely -+ * more advantageous to look for wholly idle cores first. -+ */ -+ if (prev_domestic) { -+ if (bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ stat_add(RUSTY_STAT_PREV_IDLE, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ } else { -+ /* -+ * @prev_cpu is foreign. Linger iff the domain isn't too busy as -+ * indicated by direct_greedy_cpumask. There may also be an idle -+ * CPU in the domestic domain -+ */ -+ if (direct_greedy_cpumask && -+ bpf_cpumask_test_cpu(prev_cpu, (const struct cpumask *) -+ direct_greedy_cpumask) && -+ bpf_cpumask_test_cpu(prev_cpu, idle_smtmask) && -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ stat_add(RUSTY_STAT_GREEDY_IDLE, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ } -+ -+ /* -+ * @prev_cpu didn't work out. Let's see whether there's an idle CPU @p -+ * can be directly dispatched to. 
We'll first try to find the best idle -+ * domestic CPU and then move onto foreign. -+ */ -+ -+ /* If there is a domestic idle core, dispatch directly */ -+ if (has_idle_cores) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ goto direct; -+ } -+ } -+ -+ /* -+ * If @prev_cpu was domestic and is idle itself even though the core -+ * isn't, picking @prev_cpu may improve L1/2 locality. -+ */ -+ if (prev_domestic && scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ cpu = prev_cpu; -+ goto direct; -+ } -+ -+ /* If there is any domestic idle CPU, dispatch directly */ -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *)p_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_DISPATCH, 1); -+ goto direct; -+ } -+ -+ /* -+ * Domestic domain is fully booked. If there are CPUs which are idle and -+ * under-utilized, ignore domain boundaries and push the task there. Try -+ * to find an idle core first. -+ */ -+ if (taskc->all_cpus && direct_greedy_cpumask && -+ !bpf_cpumask_empty((const struct cpumask *)direct_greedy_cpumask)) { -+ u32 dom_id = cpu_to_dom_id(prev_cpu); -+ struct dom_ctx *domc; -+ -+ if (!(domc = bpf_map_lookup_elem(&dom_data, &dom_id))) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ goto enoent; -+ } -+ -+ /* Try to find an idle core in the previous and then any domain */ -+ if (has_idle_cores) { -+ if (domc->direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ domc->direct_greedy_cpumask, -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); -+ goto direct; -+ } -+ } -+ -+ if (direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ direct_greedy_cpumask, -+ SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); -+ goto direct; -+ } -+ } -+ } -+ -+ /* -+ * No idle core. Is there any idle CPU? -+ */ -+ if (domc->direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ domc->direct_greedy_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY, 1); -+ goto direct; -+ } -+ } -+ -+ if (direct_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ direct_greedy_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_DIRECT_GREEDY_FAR, 1); -+ goto direct; -+ } -+ } -+ } -+ -+ /* -+ * We're going to queue on the domestic domain's DSQ. @prev_cpu may be -+ * in a different domain. Returning an out-of-domain CPU can lead to -+ * stalls as all in-domain CPUs may be idle by the time @p gets -+ * enqueued. -+ */ -+ if (prev_domestic) -+ cpu = prev_cpu; -+ else -+ cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); -+ -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+ return cpu; -+ -+direct: -+ taskc->dispatch_local = true; -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+ return cpu; -+ -+enoent: -+ scx_bpf_put_idle_cpumask(idle_smtmask); -+ return -ENOENT; -+} -+ -+void BPF_STRUCT_OPS(rusty_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct task_ctx *taskc; -+ struct bpf_cpumask *p_cpumask; -+ pid_t pid = p->pid; -+ u32 *new_dom; -+ s32 cpu; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ if (!(p_cpumask = taskc->cpumask)) { -+ scx_bpf_error("NULL cpmask"); -+ return; -+ } -+ -+ /* -+ * Migrate @p to a new domain if requested by userland through lb_data. 
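-+ * lb_data maps pid -> target dom_id and is written by the userspace load
-+ * balancer; task_set_domain() re-checks that the target domain still
-+ * intersects @p's cpumask before the move is committed.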
-+ */ -+ new_dom = bpf_map_lookup_elem(&lb_data, &pid); -+ if (new_dom && *new_dom != taskc->dom_id && -+ task_set_domain(taskc, p, *new_dom, false)) { -+ stat_add(RUSTY_STAT_LOAD_BALANCE, 1); -+ taskc->dispatch_local = false; -+ cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, 0); -+ goto dom_queue; -+ } -+ -+ if (taskc->dispatch_local) { -+ taskc->dispatch_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * @p is about to be queued on its domain's dsq. However, @p may be on a -+ * foreign CPU due to a greedy execution and not have gone through -+ * ->select_cpu() if it's being enqueued e.g. after slice exhaustion. If -+ * so, @p would be queued on its domain's dsq but none of the CPUs in -+ * the domain would be woken up which can induce temporary execution -+ * stalls. Kick a domestic CPU if @p is on a foreign domain. -+ */ -+ if (!bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), (const struct cpumask *)p_cpumask)) { -+ cpu = scx_bpf_pick_any_cpu((const struct cpumask *)p_cpumask, 0); -+ scx_bpf_kick_cpu(cpu, 0); -+ stat_add(RUSTY_STAT_REPATRIATE, 1); -+ } -+ -+dom_queue: -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, taskc->dom_id, slice_ns, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ u32 dom_id = taskc->dom_id; -+ struct dom_ctx *domc; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ if (!domc) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ return; -+ } -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, domc->vtime_now - slice_ns)) -+ vtime = domc->vtime_now - slice_ns; -+ -+ scx_bpf_dispatch_vtime(p, taskc->dom_id, slice_ns, vtime, enq_flags); -+ } -+ -+ /* -+ * If there are CPUs which are idle and not saturated, wake them up to -+ * see whether they'd be able to steal the just queued task. This path -+ * is taken only if DIRECT_GREEDY didn't trigger in select_cpu(). -+ * -+ * While both mechanisms serve very similar purposes, DIRECT_GREEDY -+ * emplaces the task in a foreign CPU directly while KICK_GREEDY just -+ * wakes up a foreign CPU which will then first try to execute from its -+ * domestic domain first before snooping foreign ones. -+ * -+ * While KICK_GREEDY is a more expensive way of accelerating greedy -+ * execution, DIRECT_GREEDY shows negative performance impacts when the -+ * CPUs are highly loaded while KICK_GREEDY doesn't. Even under fairly -+ * high utilization, KICK_GREEDY can slightly improve work-conservation. 
-+ */ -+ if (taskc->all_cpus && kick_greedy_cpumask) { -+ cpu = scx_bpf_pick_idle_cpu((const struct cpumask *) -+ kick_greedy_cpumask, 0); -+ if (cpu >= 0) { -+ stat_add(RUSTY_STAT_KICK_GREEDY, 1); -+ scx_bpf_kick_cpu(cpu, 0); -+ } -+ } -+} -+ -+static bool cpumask_intersects_domain(const struct cpumask *cpumask, u32 dom_id) -+{ -+ s32 cpu; -+ -+ if (dom_id >= MAX_DOMS) -+ return false; -+ -+ bpf_for(cpu, 0, nr_cpus) { -+ if (bpf_cpumask_test_cpu(cpu, cpumask) && -+ (dom_cpumasks[dom_id][cpu / 64] & (1LLU << (cpu % 64)))) -+ return true; -+ } -+ return false; -+} -+ -+static u32 dom_rr_next(s32 cpu) -+{ -+ struct pcpu_ctx *pcpuc; -+ u32 dom_id; -+ -+ pcpuc = MEMBER_VPTR(pcpu_ctx, [cpu]); -+ if (!pcpuc) -+ return 0; -+ -+ dom_id = (pcpuc->dom_rr_cur + 1) % nr_doms; -+ -+ if (dom_id == cpu_to_dom_id(cpu)) -+ dom_id = (dom_id + 1) % nr_doms; -+ -+ pcpuc->dom_rr_cur = dom_id; -+ return dom_id; -+} -+ -+void BPF_STRUCT_OPS(rusty_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ u32 dom = cpu_to_dom_id(cpu); -+ -+ if (scx_bpf_consume(dom)) { -+ stat_add(RUSTY_STAT_DSQ_DISPATCH, 1); -+ return; -+ } -+ -+ if (!greedy_threshold) -+ return; -+ -+ bpf_repeat(nr_doms - 1) { -+ u32 dom_id = dom_rr_next(cpu); -+ -+ if (scx_bpf_dsq_nr_queued(dom_id) >= greedy_threshold && -+ scx_bpf_consume(dom_id)) { -+ stat_add(RUSTY_STAT_GREEDY, 1); -+ break; -+ } -+ } -+} -+ -+void BPF_STRUCT_OPS(rusty_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->runnable = true; -+ taskc->is_kworker = p->flags & PF_WQ_WORKER; -+ -+ ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); -+ dom_load_adj(taskc->dom_id, p->scx.weight, now); -+} -+ -+void BPF_STRUCT_OPS(rusty_running, struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ struct dom_ctx *domc; -+ u32 dom_id, dap_gen; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->running_at = bpf_ktime_get_ns(); -+ dom_id = taskc->dom_id; -+ if (dom_id >= MAX_DOMS) { -+ scx_bpf_error("Invalid dom ID"); -+ return; -+ } -+ -+ /* -+ * Record that @p has been active in @domc. Load balancer will only -+ * consider recently active tasks. Access synchronization rules aren't -+ * strict. We just need to be right most of the time. -+ */ -+ dap_gen = dom_active_pids[dom_id].gen; -+ if (taskc->dom_active_pids_gen != dap_gen) { -+ u64 idx = __sync_fetch_and_add(&dom_active_pids[dom_id].write_idx, 1) % -+ MAX_DOM_ACTIVE_PIDS; -+ s32 *pidp; -+ -+ pidp = MEMBER_VPTR(dom_active_pids, [dom_id].pids[idx]); -+ if (!pidp) { -+ scx_bpf_error("dom_active_pids[%u][%llu] indexing failed", -+ dom_id, idx); -+ return; -+ } -+ -+ *pidp = p->pid; -+ taskc->dom_active_pids_gen = dap_gen; -+ } -+ -+ if (fifo_sched) -+ return; -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ if (!domc) { -+ scx_bpf_error("Failed to lookup dom[%u]", dom_id); -+ return; -+ } -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. 
-+ */ -+ if (vtime_before(domc->vtime_now, p->scx.dsq_vtime)) -+ domc->vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(rusty_stopping, struct task_struct *p, bool runnable) -+{ -+ struct task_ctx *taskc; -+ -+ if (fifo_sched) -+ return; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ /* scale the execution time by the inverse of the weight and charge */ -+ p->scx.dsq_vtime += -+ (bpf_ktime_get_ns() - taskc->running_at) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(rusty_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->runnable = false; -+ -+ ravg_accumulate(&taskc->dcyc_rd, taskc->runnable, now, load_half_life); -+ dom_load_adj(taskc->dom_id, -(s64)p->scx.weight, now); -+} -+ -+void BPF_STRUCT_OPS(rusty_set_weight, struct task_struct *p, u32 weight) -+{ -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ taskc->weight = weight; -+} -+ -+static u32 task_pick_domain(struct task_ctx *taskc, struct task_struct *p, -+ const struct cpumask *cpumask) -+{ -+ s32 cpu = bpf_get_smp_processor_id(); -+ u32 first_dom = MAX_DOMS, dom; -+ -+ if (cpu < 0 || cpu >= MAX_CPUS) -+ return MAX_DOMS; -+ -+ taskc->dom_mask = 0; -+ -+ dom = pcpu_ctx[cpu].dom_rr_cur++; -+ bpf_repeat(nr_doms) { -+ dom = (dom + 1) % nr_doms; -+ if (cpumask_intersects_domain(cpumask, dom)) { -+ taskc->dom_mask |= 1LLU << dom; -+ /* -+ * AsThe starting point is round-robin'd and the first -+ * match should be spread across all the domains. -+ */ -+ if (first_dom == MAX_DOMS) -+ first_dom = dom; -+ } -+ } -+ -+ return first_dom; -+} -+ -+static void task_pick_and_set_domain(struct task_ctx *taskc, -+ struct task_struct *p, -+ const struct cpumask *cpumask, -+ bool init_dsq_vtime) -+{ -+ u32 dom_id = 0; -+ -+ if (nr_doms > 1) -+ dom_id = task_pick_domain(taskc, p, cpumask); -+ -+ if (!task_set_domain(taskc, p, dom_id, init_dsq_vtime)) -+ scx_bpf_error("Failed to set dom%d for %s[%d]", -+ dom_id, p->comm, p->pid); -+} -+ -+void BPF_STRUCT_OPS(rusty_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{ -+ struct task_ctx *taskc; -+ -+ if (!(taskc = lookup_task_ctx(p))) -+ return; -+ -+ task_pick_and_set_domain(taskc, p, cpumask, false); -+ if (all_cpumask) -+ taskc->all_cpus = -+ bpf_cpumask_subset((const struct cpumask *)all_cpumask, cpumask); -+} -+ -+s32 BPF_STRUCT_OPS(rusty_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ struct bpf_cpumask *cpumask; -+ struct task_ctx taskc = { .dom_active_pids_gen = -1 }; -+ struct task_ctx *map_value; -+ long ret; -+ pid_t pid; -+ -+ pid = p->pid; -+ ret = bpf_map_update_elem(&task_data, &pid, &taskc, BPF_NOEXIST); -+ if (ret) { -+ stat_add(RUSTY_STAT_TASK_GET_ERR, 1); -+ return ret; -+ } -+ -+ /* -+ * Read the entry from the map immediately so we can add the cpumask -+ * with bpf_kptr_xchg(). -+ */ -+ map_value = bpf_map_lookup_elem(&task_data, &pid); -+ if (!map_value) -+ /* Should never happen -- it was just inserted above. */ -+ return -EINVAL; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ bpf_map_delete_elem(&task_data, &pid); -+ return -ENOMEM; -+ } -+ -+ cpumask = bpf_kptr_xchg(&map_value->cpumask, cpumask); -+ if (cpumask) { -+ /* Should never happen as we just inserted it above. 
*/ -+ bpf_cpumask_release(cpumask); -+ bpf_map_delete_elem(&task_data, &pid); -+ return -EINVAL; -+ } -+ -+ task_pick_and_set_domain(map_value, p, p->cpus_ptr, true); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(rusty_disable, struct task_struct *p) -+{ -+ pid_t pid = p->pid; -+ long ret = bpf_map_delete_elem(&task_data, &pid); -+ if (ret) { -+ stat_add(RUSTY_STAT_TASK_GET_ERR, 1); -+ return; -+ } -+} -+ -+static s32 create_dom(u32 dom_id) -+{ -+ struct dom_ctx domc_init = {}, *domc; -+ struct bpf_cpumask *cpumask; -+ u32 cpu; -+ s32 ret; -+ -+ ret = scx_bpf_create_dsq(dom_id, -1); -+ if (ret < 0) { -+ scx_bpf_error("Failed to create dsq %u (%d)", dom_id, ret); -+ return ret; -+ } -+ -+ ret = bpf_map_update_elem(&dom_data, &dom_id, &domc_init, 0); -+ if (ret) { -+ scx_bpf_error("Failed to add dom_ctx entry %u (%d)", dom_id, ret); -+ return ret; -+ } -+ -+ domc = bpf_map_lookup_elem(&dom_data, &dom_id); -+ if (!domc) { -+ /* Should never happen, we just inserted it above. */ -+ scx_bpf_error("No dom%u", dom_id); -+ return -ENOENT; -+ } -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ scx_bpf_error("Failed to create BPF cpumask for domain %u", dom_id); -+ return -ENOMEM; -+ } -+ -+ for (cpu = 0; cpu < MAX_CPUS; cpu++) { -+ const volatile u64 *dmask; -+ -+ dmask = MEMBER_VPTR(dom_cpumasks, [dom_id][cpu / 64]); -+ if (!dmask) { -+ scx_bpf_error("array index error"); -+ bpf_cpumask_release(cpumask); -+ return -ENOENT; -+ } -+ -+ if (*dmask & (1LLU << (cpu % 64))) { -+ bpf_cpumask_set_cpu(cpu, cpumask); -+ -+ bpf_rcu_read_lock(); -+ if (all_cpumask) -+ bpf_cpumask_set_cpu(cpu, all_cpumask); -+ bpf_rcu_read_unlock(); -+ } -+ } -+ -+ cpumask = bpf_kptr_xchg(&domc->cpumask, cpumask); -+ if (cpumask) { -+ scx_bpf_error("Domain %u cpumask already present", dom_id); -+ bpf_cpumask_release(cpumask); -+ return -EEXIST; -+ } -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) { -+ scx_bpf_error("Failed to create BPF cpumask for domain %u", -+ dom_id); -+ return -ENOMEM; -+ } -+ -+ cpumask = bpf_kptr_xchg(&domc->direct_greedy_cpumask, cpumask); -+ if (cpumask) { -+ scx_bpf_error("Domain %u direct_greedy_cpumask already present", -+ dom_id); -+ bpf_cpumask_release(cpumask); -+ return -EEXIST; -+ } -+ -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(rusty_init) -+{ -+ struct bpf_cpumask *cpumask; -+ s32 i, ret; -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ cpumask = bpf_kptr_xchg(&all_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ cpumask = bpf_kptr_xchg(&direct_greedy_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ cpumask = bpf_cpumask_create(); -+ if (!cpumask) -+ return -ENOMEM; -+ cpumask = bpf_kptr_xchg(&kick_greedy_cpumask, cpumask); -+ if (cpumask) -+ bpf_cpumask_release(cpumask); -+ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ -+ bpf_for(i, 0, nr_doms) { -+ ret = create_dom(i); -+ if (ret) -+ return ret; -+ } -+ -+ bpf_for(i, 0, nr_cpus) -+ pcpu_ctx[i].dom_rr_cur = i; -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(rusty_exit, struct scx_exit_info *ei) -+{ -+ bpf_probe_read_kernel_str(exit_msg, sizeof(exit_msg), ei->msg); -+ exit_kind = ei->kind; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops rusty = { -+ .select_cpu = (void *)rusty_select_cpu, -+ .enqueue = (void *)rusty_enqueue, -+ .dispatch = (void *)rusty_dispatch, -+ .runnable = (void *)rusty_runnable, -+ .running = (void *)rusty_running, -+ .stopping = (void 
*)rusty_stopping, -+ .quiescent = (void *)rusty_quiescent, -+ .set_weight = (void *)rusty_set_weight, -+ .set_cpumask = (void *)rusty_set_cpumask, -+ .prep_enable = (void *)rusty_prep_enable, -+ .disable = (void *)rusty_disable, -+ .init = (void *)rusty_init, -+ .exit = (void *)rusty_exit, -+ .name = "rusty", -+}; -diff --git a/tools/sched_ext/scx_rusty/src/bpf/rusty.h b/tools/sched_ext/scx_rusty/src/bpf/rusty.h -new file mode 100644 -index 000000000..8a7487cf4 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/bpf/rusty.h -@@ -0,0 +1,97 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. -+#ifndef __RUSTY_H -+#define __RUSTY_H -+ -+#include -+#ifndef __kptr -+#ifdef __KERNEL__ -+#error "__kptr_ref not defined in the kernel" -+#endif -+#define __kptr -+#endif -+ -+#ifndef __KERNEL__ -+typedef unsigned char u8; -+typedef unsigned int u32; -+typedef unsigned long long u64; -+#endif -+ -+#include "../../../ravg.bpf.h" -+ -+enum consts { -+ MAX_CPUS = 512, -+ MAX_DOMS = 64, /* limited to avoid complex bitmask ops */ -+ CACHELINE_SIZE = 64, -+ -+ /* -+ * When userspace load balancer is trying to determine the tasks to push -+ * out from an overloaded domain, it looks at the the following number -+ * of recently active tasks of the domain. While this may lead to -+ * spurious migration victim selection failures in pathological cases, -+ * this isn't a practical problem as the LB rounds are best-effort -+ * anyway and will be retried until loads are balanced. -+ */ -+ MAX_DOM_ACTIVE_PIDS = 1024, -+}; -+ -+/* Statistics */ -+enum stat_idx { -+ /* The following fields add up to all dispatched tasks */ -+ RUSTY_STAT_WAKE_SYNC, -+ RUSTY_STAT_PREV_IDLE, -+ RUSTY_STAT_GREEDY_IDLE, -+ RUSTY_STAT_PINNED, -+ RUSTY_STAT_DIRECT_DISPATCH, -+ RUSTY_STAT_DIRECT_GREEDY, -+ RUSTY_STAT_DIRECT_GREEDY_FAR, -+ RUSTY_STAT_DSQ_DISPATCH, -+ RUSTY_STAT_GREEDY, -+ -+ /* Extra stats that don't contribute to total */ -+ RUSTY_STAT_REPATRIATE, -+ RUSTY_STAT_KICK_GREEDY, -+ RUSTY_STAT_LOAD_BALANCE, -+ -+ /* Errors */ -+ RUSTY_STAT_TASK_GET_ERR, -+ -+ RUSTY_NR_STATS, -+}; -+ -+struct task_ctx { -+ /* The domains this task can run on */ -+ u64 dom_mask; -+ -+ struct bpf_cpumask __kptr *cpumask; -+ u32 dom_id; -+ u32 weight; -+ bool runnable; -+ u64 dom_active_pids_gen; -+ u64 running_at; -+ -+ /* The task is a workqueue worker thread */ -+ bool is_kworker; -+ -+ /* Allowed on all CPUs and eligible for DIRECT_GREEDY optimization */ -+ bool all_cpus; -+ -+ /* select_cpu() telling enqueue() to queue directly on the DSQ */ -+ bool dispatch_local; -+ -+ struct ravg_data dcyc_rd; -+}; -+ -+struct dom_ctx { -+ u64 vtime_now; -+ struct bpf_cpumask __kptr *cpumask; -+ struct bpf_cpumask __kptr *direct_greedy_cpumask; -+ -+ u64 load; -+ struct ravg_data load_rd; -+ u64 dbg_load_printed_at; -+}; -+ -+#endif /* __RUSTY_H */ -diff --git a/tools/sched_ext/scx_rusty/src/main.rs b/tools/sched_ext/scx_rusty/src/main.rs -new file mode 100644 -index 000000000..3b0bcd742 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/main.rs -@@ -0,0 +1,1265 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. 
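[Editor's aside, not part of the patch] The dom_active_pids ring declared in rusty.h above (MAX_DOM_ACTIVE_PIDS = 1024) is filled by the BPF side in rusty_running() and drained by populate_tasks_by_load() further down in this file. The following standalone Rust sketch shows the overwrite-ring contract those two sides rely on -- the writer only advances write_idx and overwrites slots modulo the capacity, while the reader clamps itself to the newest MAX_DOM_ACTIVE_PIDS entries before draining. The names and the 1024 limit mirror the patch, but the code itself is illustrative only and is not part of the scheduler.

    // Standalone sketch (not part of the patch) of the "recently active PIDs"
    // ring described in rusty.h. Writer: unconditional overwrite, monotonic
    // write_idx. Reader: skip ahead if lapped, then drain everything up to
    // write_idx.
    const MAX_DOM_ACTIVE_PIDS: u64 = 1024;

    struct ActivePids {
        read_idx: u64,
        write_idx: u64,
        pids: [i32; MAX_DOM_ACTIVE_PIDS as usize],
    }

    impl ActivePids {
        fn record(&mut self, pid: i32) {
            // Mirrors rusty_running(): always overwrite the next slot.
            let idx = (self.write_idx % MAX_DOM_ACTIVE_PIDS) as usize;
            self.pids[idx] = pid;
            self.write_idx += 1;
        }

        fn drain(&mut self) -> Vec<i32> {
            // Mirrors populate_tasks_by_load(): if the writer lapped us,
            // only look at the most recent MAX_DOM_ACTIVE_PIDS entries.
            let mut ridx = self.read_idx;
            if self.write_idx - ridx > MAX_DOM_ACTIVE_PIDS {
                ridx = self.write_idx - MAX_DOM_ACTIVE_PIDS;
            }
            let out = (ridx..self.write_idx)
                .map(|i| self.pids[(i % MAX_DOM_ACTIVE_PIDS) as usize])
                .collect();
            self.read_idx = self.write_idx;
            out
        }
    }

    fn main() {
        let mut ap = ActivePids {
            read_idx: 0,
            write_idx: 0,
            pids: [0; MAX_DOM_ACTIVE_PIDS as usize],
        };
        for pid in 0..1500 {
            ap.record(pid);
        }
        let recent = ap.drain();
        assert_eq!(recent.len(), 1024);
        assert_eq!(recent[0], 476); // oldest surviving entry: 1500 - 1024
        println!("drained {} pids, oldest={}", recent.len(), recent[0]);
    }

Dropping older entries is acceptable in this scheme because, as the header comment above notes, load-balancing rounds are best-effort and simply retried until loads converge.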
-+#[path = "bpf/.output/rusty.skel.rs"] -+mod rusty; -+pub use rusty::*; -+pub mod rusty_sys; -+ -+use std::cell::Cell; -+use std::collections::BTreeMap; -+use std::collections::BTreeSet; -+use std::ffi::CStr; -+use std::ops::Bound::Included; -+use std::ops::Bound::Unbounded; -+use std::sync::atomic::AtomicBool; -+use std::sync::atomic::Ordering; -+use std::sync::Arc; -+use std::time::Duration; -+use std::time::Instant; -+ -+use ::fb_procfs as procfs; -+use anyhow::anyhow; -+use anyhow::bail; -+use anyhow::Context; -+use anyhow::Result; -+use bitvec::prelude::*; -+use clap::Parser; -+use libbpf_rs::skel::OpenSkel as _; -+use libbpf_rs::skel::Skel as _; -+use libbpf_rs::skel::SkelBuilder as _; -+use log::debug; -+use log::info; -+use log::trace; -+use log::warn; -+use ordered_float::OrderedFloat; -+ -+const RAVG_FRAC_BITS: u32 = rusty_sys::ravg_consts_RAVG_FRAC_BITS; -+const MAX_DOMS: usize = rusty_sys::consts_MAX_DOMS as usize; -+const MAX_CPUS: usize = rusty_sys::consts_MAX_CPUS as usize; -+ -+include!("../../ravg_read.rs.h"); -+ -+/// scx_rusty: A multi-domain BPF / userspace hybrid scheduler -+/// -+/// The BPF part does simple vtime or round robin scheduling in each domain -+/// while tracking average load of each domain and duty cycle of each task. -+/// -+/// The userspace part performs two roles. First, it makes higher frequency -+/// (100ms) tuning decisions. It identifies CPUs which are not too heavily -+/// loaded and mark them so that they can pull tasks from other overloaded -+/// domains on the fly. -+/// -+/// Second, it drives lower frequency (2s) load balancing. It determines -+/// whether load balancing is necessary by comparing domain load averages. -+/// If there are large enough load differences, it examines upto 1024 -+/// recently active tasks on the domain to determine which should be -+/// migrated. -+/// -+/// The overhead of userspace operations is low. Load balancing is not -+/// performed frequently but work-conservation is still maintained through -+/// tuning and greedy execution. Load balancing itself is not that expensive -+/// either. It only accesses per-domain load metrics to determine the -+/// domains that need load balancing and limited number of per-task metrics -+/// for each pushing domain. -+/// -+/// An earlier variant of this scheduler was used to balance across six -+/// domains, each representing a chiplet in a six-chiplet AMD processor, and -+/// could match the performance of production setup using CFS. -+/// -+/// WARNING: Very high weight (low nice value) tasks can throw off load -+/// balancing due to infeasible weight problem. This problem will be solved -+/// in the near future. -+/// -+/// WARNING: scx_rusty currently assumes that all domains have equal -+/// processing power and at similar distances from each other. This -+/// limitation will be removed in the future. -+#[derive(Debug, Parser)] -+struct Opts { -+ /// Scheduling slice duration in microseconds. -+ #[clap(short = 's', long, default_value = "20000")] -+ slice_us: u64, -+ -+ /// Monitoring and load balance interval in seconds. -+ #[clap(short = 'i', long, default_value = "2.0")] -+ interval: f64, -+ -+ /// Tuner runs at higher frequency than the load balancer to dynamically -+ /// tune scheduling behavior. Tuning interval in seconds. -+ #[clap(short = 'I', long, default_value = "0.1")] -+ tune_interval: f64, -+ -+ /// The half-life of task and domain load running averages in seconds. 
-+ #[clap(short = 'l', long, default_value = "1.0")] -+ load_half_life: f64, -+ -+ /// Build domains according to how CPUs are grouped at this cache level -+ /// as determined by /sys/devices/system/cpu/cpuX/cache/indexI/id. -+ #[clap(short = 'c', long, default_value = "3")] -+ cache_level: u32, -+ -+ /// Instead of using cache locality, set the cpumask for each domain -+ /// manually, provide multiple --cpumasks, one for each domain. E.g. -+ /// --cpumasks 0xff_00ff --cpumasks 0xff00 will create two domains with -+ /// the corresponding CPUs belonging to each domain. Each CPU must -+ /// belong to precisely one domain. -+ #[clap(short = 'C', long, num_args = 1.., conflicts_with = "cache_level")] -+ cpumasks: Vec, -+ -+ /// When non-zero, enable greedy task stealing. When a domain is idle, a -+ /// cpu will attempt to steal tasks from a domain with at least -+ /// greedy_threshold tasks enqueued. These tasks aren't permanently -+ /// stolen from the domain. -+ #[clap(short = 'g', long, default_value = "1")] -+ greedy_threshold: u32, -+ -+ /// Disable load balancing. Unless disabled, periodically userspace will -+ /// calculate the load factor of each domain and instruct BPF which -+ /// processes to move. -+ #[clap(long, action = clap::ArgAction::SetTrue)] -+ no_load_balance: bool, -+ -+ /// Put per-cpu kthreads directly into local dsq's. -+ #[clap(short = 'k', long, action = clap::ArgAction::SetTrue)] -+ kthreads_local: bool, -+ -+ /// In recent kernels (>=v6.6), the kernel is responsible for balancing -+ /// kworkers across L3 cache domains. Exclude them from load-balancing -+ /// to avoid conflicting operations. Greedy executions still apply. -+ #[clap(short = 'b', long, action = clap::ArgAction::SetTrue)] -+ balanced_kworkers: bool, -+ -+ /// Use FIFO scheduling instead of weighted vtime scheduling. -+ #[clap(short = 'f', long, action = clap::ArgAction::SetTrue)] -+ fifo_sched: bool, -+ -+ /// Idle CPUs with utilization lower than this will get remote tasks -+ /// directly pushed on them. 0 disables, 100 enables always. -+ #[clap(short = 'D', long, default_value = "90.0")] -+ direct_greedy_under: f64, -+ -+ /// Idle CPUs with utilization lower than this may get kicked to -+ /// accelerate stealing when a task is queued on a saturated remote -+ /// domain. 0 disables, 100 enables always. -+ #[clap(short = 'K', long, default_value = "100.0")] -+ kick_greedy_under: f64, -+ -+ /// If specified, only tasks which have their scheduling policy set to -+ /// SCHED_EXT using sched_setscheduler(2) are switched. Otherwise, all -+ /// tasks are switched. -+ #[clap(short = 'p', long, action = clap::ArgAction::SetTrue)] -+ partial: bool, -+ -+ /// Enable verbose output including libbpf details. Specify multiple -+ /// times to increase verbosity. 
-+ #[clap(short = 'v', long, action = clap::ArgAction::Count)] -+ verbose: u8, -+} -+ -+fn now_monotonic() -> u64 { -+ let mut time = libc::timespec { -+ tv_sec: 0, -+ tv_nsec: 0, -+ }; -+ let ret = unsafe { libc::clock_gettime(libc::CLOCK_MONOTONIC, &mut time) }; -+ assert!(ret == 0); -+ time.tv_sec as u64 * 1_000_000_000 + time.tv_nsec as u64 -+} -+ -+fn clear_map(map: &libbpf_rs::Map) { -+ for key in map.keys() { -+ let _ = map.delete(&key); -+ } -+} -+ -+fn format_cpumask(cpumask: &[u64], nr_cpus: usize) -> String { -+ cpumask -+ .iter() -+ .take((nr_cpus + 64) / 64) -+ .rev() -+ .fold(String::new(), |acc, x| format!("{} {:016X}", acc, x)) -+} -+ -+fn read_total_cpu(reader: &procfs::ProcReader) -> Result { -+ reader -+ .read_stat() -+ .context("Failed to read procfs")? -+ .total_cpu -+ .ok_or_else(|| anyhow!("Could not read total cpu stat in proc")) -+} -+ -+fn calc_util(curr: &procfs::CpuStat, prev: &procfs::CpuStat) -> Result { -+ match (curr, prev) { -+ ( -+ procfs::CpuStat { -+ user_usec: Some(prev_user), -+ nice_usec: Some(prev_nice), -+ system_usec: Some(prev_system), -+ idle_usec: Some(prev_idle), -+ iowait_usec: Some(prev_iowait), -+ irq_usec: Some(prev_irq), -+ softirq_usec: Some(prev_softirq), -+ stolen_usec: Some(prev_stolen), -+ .. -+ }, -+ procfs::CpuStat { -+ user_usec: Some(curr_user), -+ nice_usec: Some(curr_nice), -+ system_usec: Some(curr_system), -+ idle_usec: Some(curr_idle), -+ iowait_usec: Some(curr_iowait), -+ irq_usec: Some(curr_irq), -+ softirq_usec: Some(curr_softirq), -+ stolen_usec: Some(curr_stolen), -+ .. -+ }, -+ ) => { -+ let idle_usec = curr_idle - prev_idle; -+ let iowait_usec = curr_iowait - prev_iowait; -+ let user_usec = curr_user - prev_user; -+ let system_usec = curr_system - prev_system; -+ let nice_usec = curr_nice - prev_nice; -+ let irq_usec = curr_irq - prev_irq; -+ let softirq_usec = curr_softirq - prev_softirq; -+ let stolen_usec = curr_stolen - prev_stolen; -+ -+ let busy_usec = -+ user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; -+ let total_usec = idle_usec + busy_usec + iowait_usec; -+ if total_usec > 0 { -+ Ok(((busy_usec as f64) / (total_usec as f64)).clamp(0.0, 1.0)) -+ } else { -+ Ok(1.0) -+ } -+ } -+ _ => { -+ bail!("Missing stats in cpustat"); -+ } -+ } -+} -+ -+#[derive(Debug)] -+struct Topology { -+ nr_cpus: usize, -+ nr_doms: usize, -+ dom_cpus: Vec>, -+ cpu_dom: Vec>, -+} -+ -+impl Topology { -+ fn from_cpumasks(cpumasks: &[String], nr_cpus: usize) -> Result { -+ if cpumasks.len() > MAX_DOMS { -+ bail!( -+ "Number of requested domains ({}) is greater than MAX_DOMS ({})", -+ cpumasks.len(), -+ MAX_DOMS -+ ); -+ } -+ let mut cpu_dom = vec![None; nr_cpus]; -+ let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; cpumasks.len()]; -+ for (dom, cpumask) in cpumasks.iter().enumerate() { -+ let hex_str = { -+ let mut tmp_str = cpumask -+ .strip_prefix("0x") -+ .unwrap_or(cpumask) -+ .replace('_', ""); -+ if tmp_str.len() % 2 != 0 { -+ tmp_str = "0".to_string() + &tmp_str; -+ } -+ tmp_str -+ }; -+ let byte_vec = hex::decode(&hex_str) -+ .with_context(|| format!("Failed to parse cpumask: {}", cpumask))?; -+ -+ for (index, &val) in byte_vec.iter().rev().enumerate() { -+ let mut v = val; -+ while v != 0 { -+ let lsb = v.trailing_zeros() as usize; -+ v &= !(1 << lsb); -+ let cpu = index * 8 + lsb; -+ if cpu > nr_cpus { -+ bail!( -+ concat!( -+ "Found cpu ({}) in cpumask ({}) which is larger", -+ " than the number of cpus on the machine ({})" -+ ), -+ cpu, -+ cpumask, -+ nr_cpus -+ ); -+ } -+ if let 
Some(other_dom) = cpu_dom[cpu] { -+ bail!( -+ "Found cpu ({}) with domain ({}) but also in cpumask ({})", -+ cpu, -+ other_dom, -+ cpumask -+ ); -+ } -+ cpu_dom[cpu] = Some(dom); -+ dom_cpus[dom].set(cpu, true); -+ } -+ } -+ dom_cpus[dom].set_uninitialized(false); -+ } -+ -+ for (cpu, dom) in cpu_dom.iter().enumerate() { -+ if dom.is_none() { -+ bail!( -+ "CPU {} not assigned to any domain. Make sure it is covered by some --cpumasks argument.", -+ cpu -+ ); -+ } -+ } -+ -+ Ok(Self { -+ nr_cpus, -+ nr_doms: dom_cpus.len(), -+ dom_cpus, -+ cpu_dom, -+ }) -+ } -+ -+ fn from_cache_level(level: u32, nr_cpus: usize) -> Result { -+ let mut cpu_to_cache = vec![]; // (cpu_id, Option) -+ let mut cache_ids = BTreeSet::::new(); -+ let mut nr_offline = 0; -+ -+ // Build cpu -> cache ID mapping. -+ for cpu in 0..nr_cpus { -+ let path = format!("/sys/devices/system/cpu/cpu{}/cache/index{}/id", cpu, level); -+ let id = match std::fs::read_to_string(&path) { -+ Ok(val) => Some(val.trim().parse::().with_context(|| { -+ format!("Failed to parse {:?}'s content {:?}", &path, &val) -+ })?), -+ Err(e) if e.kind() == std::io::ErrorKind::NotFound => { -+ nr_offline += 1; -+ None -+ } -+ Err(e) => return Err(e).with_context(|| format!("Failed to open {:?}", &path)), -+ }; -+ -+ cpu_to_cache.push(id); -+ if let Some(id) = id { -+ cache_ids.insert(id); -+ } -+ } -+ -+ info!( -+ "CPUs: online/possible = {}/{}", -+ nr_cpus - nr_offline, -+ nr_cpus -+ ); -+ -+ // Cache IDs may have holes. Assign consecutive domain IDs to -+ // existing cache IDs. -+ let mut cache_to_dom = BTreeMap::::new(); -+ let mut nr_doms = 0; -+ for cache_id in cache_ids.iter() { -+ cache_to_dom.insert(*cache_id, nr_doms); -+ nr_doms += 1; -+ } -+ -+ if nr_doms > MAX_DOMS { -+ bail!( -+ "Total number of doms {} is greater than MAX_DOMS ({})", -+ nr_doms, -+ MAX_DOMS -+ ); -+ } -+ -+ // Build and return dom -> cpumask and cpu -> dom mappings. -+ let mut dom_cpus = vec![bitvec![u64, Lsb0; 0; MAX_CPUS]; nr_doms]; -+ let mut cpu_dom = vec![]; -+ -+ for (cpu, cache) in cpu_to_cache.iter().enumerate().take(nr_cpus) { -+ match cache { -+ Some(cache_id) => { -+ let dom_id = cache_to_dom[cache_id]; -+ dom_cpus[dom_id].set(cpu, true); -+ cpu_dom.push(Some(dom_id)); -+ } -+ None => { -+ dom_cpus[0].set(cpu, true); -+ cpu_dom.push(None); -+ } -+ } -+ } -+ -+ Ok(Self { -+ nr_cpus, -+ nr_doms: dom_cpus.len(), -+ dom_cpus, -+ cpu_dom, -+ }) -+ } -+} -+ -+struct Tuner { -+ top: Arc, -+ direct_greedy_under: f64, -+ kick_greedy_under: f64, -+ proc_reader: procfs::ProcReader, -+ prev_cpu_stats: BTreeMap, -+ dom_utils: Vec, -+} -+ -+impl Tuner { -+ fn new(top: Arc, opts: &Opts) -> Result { -+ let proc_reader = procfs::ProcReader::new(); -+ let prev_cpu_stats = proc_reader -+ .read_stat()? -+ .cpus_map -+ .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; -+ Ok(Self { -+ direct_greedy_under: opts.direct_greedy_under / 100.0, -+ kick_greedy_under: opts.kick_greedy_under / 100.0, -+ proc_reader, -+ prev_cpu_stats, -+ dom_utils: vec![0.0; top.nr_doms], -+ top, -+ }) -+ } -+ -+ fn step(&mut self, skel: &mut RustySkel) -> Result<()> { -+ let curr_cpu_stats = self -+ .proc_reader -+ .read_stat()? 
-+ .cpus_map -+ .ok_or_else(|| anyhow!("Expected cpus_map to exist"))?; -+ let ti = &mut skel.bss().tune_input; -+ let mut dom_nr_cpus = vec![0; self.top.nr_doms]; -+ let mut dom_util_sum = vec![0.0; self.top.nr_doms]; -+ -+ for cpu in 0..self.top.nr_cpus { -+ let cpu32 = cpu as u32; -+ // None domain indicates the CPU was offline during -+ // initialization and None CpuStat indicates the CPU has gone -+ // down since then. Ignore both. -+ if let (Some(dom), Some(curr), Some(prev)) = ( -+ self.top.cpu_dom[cpu], -+ curr_cpu_stats.get(&cpu32), -+ self.prev_cpu_stats.get(&cpu32), -+ ) { -+ dom_nr_cpus[dom] += 1; -+ dom_util_sum[dom] += calc_util(curr, prev)?; -+ } -+ } -+ -+ for dom in 0..self.top.nr_doms { -+ // Calculate the domain avg util. If there are no active CPUs, -+ // it doesn't really matter. Go with 0.0 as that's less likely -+ // to confuse users. -+ let util = match dom_nr_cpus[dom] { -+ 0 => 0.0, -+ nr => dom_util_sum[dom] / nr as f64, -+ }; -+ -+ self.dom_utils[dom] = util; -+ -+ // This could be implemented better. -+ let update_dom_bits = |target: &mut [u64; 8], val: bool| { -+ for cpu in 0..self.top.nr_cpus { -+ if let Some(cdom) = self.top.cpu_dom[cpu] { -+ if cdom == dom { -+ if val { -+ target[cpu / 64] |= 1u64 << (cpu % 64); -+ } else { -+ target[cpu / 64] &= !(1u64 << (cpu % 64)); -+ } -+ } -+ } -+ } -+ }; -+ -+ update_dom_bits( -+ &mut ti.direct_greedy_cpumask, -+ self.direct_greedy_under > 0.99999 || util < self.direct_greedy_under, -+ ); -+ update_dom_bits( -+ &mut ti.kick_greedy_cpumask, -+ self.kick_greedy_under > 0.99999 || util < self.kick_greedy_under, -+ ); -+ } -+ -+ ti.gen += 1; -+ self.prev_cpu_stats = curr_cpu_stats; -+ Ok(()) -+ } -+} -+ -+#[derive(Debug)] -+struct TaskInfo { -+ pid: i32, -+ dom_mask: u64, -+ migrated: Cell, -+ is_kworker: bool, -+} -+ -+struct LoadBalancer<'a, 'b, 'c> { -+ skel: &'a mut RustySkel<'b>, -+ top: Arc, -+ skip_kworkers: bool, -+ -+ tasks_by_load: Vec, TaskInfo>>>, -+ load_avg: f64, -+ dom_loads: Vec, -+ -+ imbal: Vec, -+ doms_to_push: BTreeMap, u32>, -+ doms_to_pull: BTreeMap, u32>, -+ -+ nr_lb_data_errors: &'c mut u64, -+} -+ -+impl<'a, 'b, 'c> LoadBalancer<'a, 'b, 'c> { -+ // If imbalance gets higher than this ratio, try to balance the loads. -+ const LOAD_IMBAL_HIGH_RATIO: f64 = 0.10; -+ -+ // Aim to transfer this fraction of the imbalance on each round. We want -+ // to be gradual to avoid unnecessary oscillations. While this can delay -+ // convergence, greedy execution should be able to bridge the temporary -+ // gap. -+ const LOAD_IMBAL_XFER_TARGET_RATIO: f64 = 0.50; -+ -+ // Don't push out more than this ratio of load on each round. While this -+ // overlaps with XFER_TARGET_RATIO, XFER_TARGET_RATIO only defines the -+ // target and doesn't limit the total load. As long as the transfer -+ // reduces load imbalance between the two involved domains, it'd happily -+ // transfer whatever amount that can be transferred. This limit is used -+ // as the safety cap to avoid draining a given domain too much in a -+ // single round. 
-+ const LOAD_IMBAL_PUSH_MAX_RATIO: f64 = 0.50; -+ -+ fn new( -+ skel: &'a mut RustySkel<'b>, -+ top: Arc, -+ skip_kworkers: bool, -+ nr_lb_data_errors: &'c mut u64, -+ ) -> Self { -+ Self { -+ skel, -+ skip_kworkers, -+ -+ tasks_by_load: (0..top.nr_doms).map(|_| None).collect(), -+ load_avg: 0f64, -+ dom_loads: vec![0.0; top.nr_doms], -+ -+ imbal: vec![0.0; top.nr_doms], -+ doms_to_pull: BTreeMap::new(), -+ doms_to_push: BTreeMap::new(), -+ -+ nr_lb_data_errors, -+ -+ top, -+ } -+ } -+ -+ fn read_dom_loads(&mut self) -> Result<()> { -+ let now_mono = now_monotonic(); -+ let load_half_life = self.skel.rodata().load_half_life; -+ let maps = self.skel.maps(); -+ let dom_data = maps.dom_data(); -+ let mut load_sum = 0.0f64; -+ -+ for i in 0..self.top.nr_doms { -+ let key = unsafe { std::mem::transmute::(i as u32) }; -+ -+ if let Some(dom_ctx_map_elem) = dom_data -+ .lookup(&key, libbpf_rs::MapFlags::ANY) -+ .context("Failed to lookup dom_ctx")? -+ { -+ let dom_ctx = unsafe { -+ &*(dom_ctx_map_elem.as_slice().as_ptr() as *const rusty_sys::dom_ctx) -+ }; -+ -+ let rd = &dom_ctx.load_rd; -+ self.dom_loads[i] = ravg_read( -+ rd.val, -+ rd.val_at, -+ rd.old, -+ rd.cur, -+ now_mono, -+ load_half_life, -+ RAVG_FRAC_BITS, -+ ); -+ -+ load_sum += self.dom_loads[i]; -+ } -+ } -+ -+ self.load_avg = load_sum / self.top.nr_doms as f64; -+ -+ Ok(()) -+ } -+ -+ /// To balance dom loads, identify doms with lower and higher load than -+ /// average. -+ fn calculate_dom_load_balance(&mut self) -> Result<()> { -+ for (dom, dom_load) in self.dom_loads.iter().enumerate() { -+ let imbal = dom_load - self.load_avg; -+ if imbal.abs() >= self.load_avg * Self::LOAD_IMBAL_HIGH_RATIO { -+ if imbal > 0f64 { -+ self.doms_to_push.insert(OrderedFloat(imbal), dom as u32); -+ } else { -+ self.doms_to_pull.insert(OrderedFloat(-imbal), dom as u32); -+ } -+ self.imbal[dom] = imbal; -+ } -+ } -+ Ok(()) -+ } -+ -+ /// @dom needs to push out tasks to balance loads. Make sure its -+ /// tasks_by_load is populated so that the victim tasks can be picked. -+ fn populate_tasks_by_load(&mut self, dom: u32) -> Result<()> { -+ if self.tasks_by_load[dom as usize].is_some() { -+ return Ok(()); -+ } -+ -+ // Read active_pids and update write_idx and gen. -+ // -+ // XXX - We can't read task_ctx inline because self.skel.bss() -+ // borrows mutably and thus conflicts with self.skel.maps(). -+ const MAX_PIDS: u64 = rusty_sys::consts_MAX_DOM_ACTIVE_PIDS as u64; -+ let active_pids = &mut self.skel.bss().dom_active_pids[dom as usize]; -+ let mut pids = vec![]; -+ -+ let (mut ridx, widx) = (active_pids.read_idx, active_pids.write_idx); -+ if widx - ridx > MAX_PIDS { -+ ridx = widx - MAX_PIDS; -+ } -+ -+ for idx in ridx..widx { -+ let pid = active_pids.pids[(idx % MAX_PIDS) as usize]; -+ pids.push(pid); -+ } -+ -+ active_pids.read_idx = active_pids.write_idx; -+ active_pids.gen += 1; -+ -+ // Read task_ctx and load. -+ let load_half_life = self.skel.rodata().load_half_life; -+ let maps = self.skel.maps(); -+ let task_data = maps.task_data(); -+ let now_mono = now_monotonic(); -+ let mut tasks_by_load = BTreeMap::new(); -+ -+ for pid in pids.iter() { -+ let key = unsafe { std::mem::transmute::(*pid) }; -+ -+ if let Some(task_data_elem) = task_data.lookup(&key, libbpf_rs::MapFlags::ANY)? 
{ -+ let task_ctx = -+ unsafe { &*(task_data_elem.as_slice().as_ptr() as *const rusty_sys::task_ctx) }; -+ -+ if task_ctx.dom_id != dom { -+ continue; -+ } -+ -+ let rd = &task_ctx.dcyc_rd; -+ let load = task_ctx.weight as f64 -+ * ravg_read( -+ rd.val, -+ rd.val_at, -+ rd.old, -+ rd.cur, -+ now_mono, -+ load_half_life, -+ RAVG_FRAC_BITS, -+ ); -+ -+ tasks_by_load.insert( -+ OrderedFloat(load), -+ TaskInfo { -+ pid: *pid, -+ dom_mask: task_ctx.dom_mask, -+ migrated: Cell::new(false), -+ is_kworker: task_ctx.is_kworker, -+ }, -+ ); -+ } -+ } -+ -+ debug!( -+ "DOM[{:02}] read load for {} tasks", -+ dom, -+ &tasks_by_load.len(), -+ ); -+ trace!("DOM[{:02}] tasks_by_load={:?}", dom, &tasks_by_load); -+ -+ self.tasks_by_load[dom as usize] = Some(tasks_by_load); -+ Ok(()) -+ } -+ -+ // Find the first candidate pid which hasn't already been migrated and -+ // can run in @pull_dom. -+ fn find_first_candidate<'d, I>( -+ tasks_by_load: I, -+ pull_dom: u32, -+ skip_kworkers: bool, -+ ) -> Option<(f64, &'d TaskInfo)> -+ where -+ I: IntoIterator, &'d TaskInfo)>, -+ { -+ match tasks_by_load -+ .into_iter() -+ .skip_while(|(_, task)| { -+ task.migrated.get() -+ || (task.dom_mask & (1 << pull_dom) == 0) -+ || (skip_kworkers && task.is_kworker) -+ }) -+ .next() -+ { -+ Some((OrderedFloat(load), task)) => Some((*load, task)), -+ None => None, -+ } -+ } -+ -+ fn pick_victim( -+ &mut self, -+ (push_dom, to_push): (u32, f64), -+ (pull_dom, to_pull): (u32, f64), -+ ) -> Result> { -+ let to_xfer = to_pull.min(to_push) * Self::LOAD_IMBAL_XFER_TARGET_RATIO; -+ -+ debug!( -+ "considering dom {}@{:.2} -> {}@{:.2}", -+ push_dom, to_push, pull_dom, to_pull -+ ); -+ -+ let calc_new_imbal = |xfer: f64| (to_push - xfer).abs() + (to_pull - xfer).abs(); -+ -+ self.populate_tasks_by_load(push_dom)?; -+ -+ // We want to pick a task to transfer from push_dom to pull_dom to -+ // reduce the load imbalance between the two closest to $to_xfer. -+ // IOW, pick a task which has the closest load value to $to_xfer -+ // that can be migrated. Find such task by locating the first -+ // migratable task while scanning left from $to_xfer and the -+ // counterpart while scanning right and picking the better of the -+ // two. -+ let (load, task, new_imbal) = match ( -+ Self::find_first_candidate( -+ self.tasks_by_load[push_dom as usize] -+ .as_ref() -+ .unwrap() -+ .range((Unbounded, Included(&OrderedFloat(to_xfer)))) -+ .rev(), -+ pull_dom, -+ self.skip_kworkers, -+ ), -+ Self::find_first_candidate( -+ self.tasks_by_load[push_dom as usize] -+ .as_ref() -+ .unwrap() -+ .range((Included(&OrderedFloat(to_xfer)), Unbounded)), -+ pull_dom, -+ self.skip_kworkers, -+ ), -+ ) { -+ (None, None) => return Ok(None), -+ (Some((load, task)), None) | (None, Some((load, task))) => { -+ (load, task, calc_new_imbal(load)) -+ } -+ (Some((load0, task0)), Some((load1, task1))) => { -+ let (new_imbal0, new_imbal1) = (calc_new_imbal(load0), calc_new_imbal(load1)); -+ if new_imbal0 <= new_imbal1 { -+ (load0, task0, new_imbal0) -+ } else { -+ (load1, task1, new_imbal1) -+ } -+ } -+ }; -+ -+ // If the best candidate can't reduce the imbalance, there's nothing -+ // to do for this pair. 
-+ let old_imbal = to_push + to_pull; -+ if old_imbal < new_imbal { -+ debug!( -+ "skipping pid {}, dom {} -> {} won't improve imbal {:.2} -> {:.2}", -+ task.pid, push_dom, pull_dom, old_imbal, new_imbal -+ ); -+ return Ok(None); -+ } -+ -+ debug!( -+ "migrating pid {}, dom {} -> {}, imbal={:.2} -> {:.2}", -+ task.pid, push_dom, pull_dom, old_imbal, new_imbal, -+ ); -+ -+ Ok(Some((task, load))) -+ } -+ -+ // Actually execute the load balancing. Concretely this writes pid -> dom -+ // entries into the lb_data map for bpf side to consume. -+ fn load_balance(&mut self) -> Result<()> { -+ clear_map(self.skel.maps().lb_data()); -+ -+ debug!("imbal={:?}", &self.imbal); -+ debug!("doms_to_push={:?}", &self.doms_to_push); -+ debug!("doms_to_pull={:?}", &self.doms_to_pull); -+ -+ // Push from the most imbalanced to least. -+ while let Some((OrderedFloat(mut to_push), push_dom)) = self.doms_to_push.pop_last() { -+ let push_max = self.dom_loads[push_dom as usize] * Self::LOAD_IMBAL_PUSH_MAX_RATIO; -+ let mut pushed = 0f64; -+ -+ // Transfer tasks from push_dom to reduce imbalance. -+ loop { -+ let last_pushed = pushed; -+ -+ // Pull from the most imbalaned to least. -+ let mut doms_to_pull = BTreeMap::<_, _>::new(); -+ std::mem::swap(&mut self.doms_to_pull, &mut doms_to_pull); -+ let mut pull_doms = doms_to_pull.into_iter().rev().collect::>(); -+ -+ for (to_pull, pull_dom) in pull_doms.iter_mut() { -+ if let Some((task, load)) = -+ self.pick_victim((push_dom, to_push), (*pull_dom, f64::from(*to_pull)))? -+ { -+ // Execute migration. -+ task.migrated.set(true); -+ to_push -= load; -+ *to_pull -= load; -+ pushed += load; -+ -+ // Ask BPF code to execute the migration. -+ let pid = task.pid; -+ let cpid = (pid as libc::pid_t).to_ne_bytes(); -+ if let Err(e) = self.skel.maps_mut().lb_data().update( -+ &cpid, -+ &pull_dom.to_ne_bytes(), -+ libbpf_rs::MapFlags::NO_EXIST, -+ ) { -+ warn!( -+ "Failed to update lb_data map for pid={} error={:?}", -+ pid, &e -+ ); -+ *self.nr_lb_data_errors += 1; -+ } -+ -+ // Always break after a successful migration so that -+ // the pulling domains are always considered in the -+ // descending imbalance order. -+ break; -+ } -+ } -+ -+ pull_doms -+ .into_iter() -+ .map(|(k, v)| self.doms_to_pull.insert(k, v)) -+ .count(); -+ -+ // Stop repeating if nothing got transferred or pushed enough. -+ if pushed == last_pushed || pushed >= push_max { -+ break; -+ } -+ } -+ } -+ Ok(()) -+ } -+} -+ -+struct Scheduler<'a> { -+ skel: RustySkel<'a>, -+ struct_ops: Option, -+ -+ sched_interval: Duration, -+ tune_interval: Duration, -+ balance_load: bool, -+ balanced_kworkers: bool, -+ -+ top: Arc, -+ proc_reader: procfs::ProcReader, -+ -+ prev_at: Instant, -+ prev_total_cpu: procfs::CpuStat, -+ -+ nr_lb_data_errors: u64, -+ -+ tuner: Tuner, -+} -+ -+impl<'a> Scheduler<'a> { -+ fn init(opts: &Opts) -> Result { -+ // Open the BPF prog first for verification. -+ let mut skel_builder = RustySkelBuilder::default(); -+ skel_builder.obj_builder.debug(opts.verbose > 0); -+ let mut skel = skel_builder.open().context("Failed to open BPF program")?; -+ -+ let nr_cpus = libbpf_rs::num_possible_cpus().unwrap(); -+ if nr_cpus > MAX_CPUS { -+ bail!( -+ "nr_cpus ({}) is greater than MAX_CPUS ({})", -+ nr_cpus, -+ MAX_CPUS -+ ); -+ } -+ -+ // Initialize skel according to @opts. -+ let top = Arc::new(if !opts.cpumasks.is_empty() { -+ Topology::from_cpumasks(&opts.cpumasks, nr_cpus)? -+ } else { -+ Topology::from_cache_level(opts.cache_level, nr_cpus)? 
-+ }); -+ -+ skel.rodata().nr_doms = top.nr_doms as u32; -+ skel.rodata().nr_cpus = top.nr_cpus as u32; -+ -+ for (cpu, dom) in top.cpu_dom.iter().enumerate() { -+ skel.rodata().cpu_dom_id_map[cpu] = dom.unwrap_or(0) as u32; -+ } -+ -+ for (dom, cpus) in top.dom_cpus.iter().enumerate() { -+ let raw_cpus_slice = cpus.as_raw_slice(); -+ let dom_cpumask_slice = &mut skel.rodata().dom_cpumasks[dom]; -+ let (left, _) = dom_cpumask_slice.split_at_mut(raw_cpus_slice.len()); -+ left.clone_from_slice(cpus.as_raw_slice()); -+ info!( -+ "DOM[{:02}] cpumask{} ({} cpus)", -+ dom, -+ &format_cpumask(dom_cpumask_slice, nr_cpus), -+ cpus.count_ones() -+ ); -+ } -+ -+ skel.rodata().slice_ns = opts.slice_us * 1000; -+ skel.rodata().load_half_life = (opts.load_half_life * 1000000000.0) as u32; -+ skel.rodata().kthreads_local = opts.kthreads_local; -+ skel.rodata().fifo_sched = opts.fifo_sched; -+ skel.rodata().switch_partial = opts.partial; -+ skel.rodata().greedy_threshold = opts.greedy_threshold; -+ skel.rodata().debug = opts.verbose as u32; -+ -+ // Attach. -+ let mut skel = skel.load().context("Failed to load BPF program")?; -+ skel.attach().context("Failed to attach BPF program")?; -+ let struct_ops = Some( -+ skel.maps_mut() -+ .rusty() -+ .attach_struct_ops() -+ .context("Failed to attach rusty struct ops")?, -+ ); -+ info!("Rusty Scheduler Attached"); -+ -+ // Other stuff. -+ let proc_reader = procfs::ProcReader::new(); -+ let prev_total_cpu = read_total_cpu(&proc_reader)?; -+ -+ Ok(Self { -+ skel, -+ struct_ops, // should be held to keep it attached -+ -+ sched_interval: Duration::from_secs_f64(opts.interval), -+ tune_interval: Duration::from_secs_f64(opts.tune_interval), -+ balance_load: !opts.no_load_balance, -+ balanced_kworkers: opts.balanced_kworkers, -+ -+ top: top.clone(), -+ proc_reader, -+ -+ prev_at: Instant::now(), -+ prev_total_cpu, -+ -+ nr_lb_data_errors: 0, -+ -+ tuner: Tuner::new(top, opts)?, -+ }) -+ } -+ -+ fn get_cpu_busy(&mut self) -> Result { -+ let total_cpu = read_total_cpu(&self.proc_reader)?; -+ let busy = match (&self.prev_total_cpu, &total_cpu) { -+ ( -+ procfs::CpuStat { -+ user_usec: Some(prev_user), -+ nice_usec: Some(prev_nice), -+ system_usec: Some(prev_system), -+ idle_usec: Some(prev_idle), -+ iowait_usec: Some(prev_iowait), -+ irq_usec: Some(prev_irq), -+ softirq_usec: Some(prev_softirq), -+ stolen_usec: Some(prev_stolen), -+ guest_usec: _, -+ guest_nice_usec: _, -+ }, -+ procfs::CpuStat { -+ user_usec: Some(curr_user), -+ nice_usec: Some(curr_nice), -+ system_usec: Some(curr_system), -+ idle_usec: Some(curr_idle), -+ iowait_usec: Some(curr_iowait), -+ irq_usec: Some(curr_irq), -+ softirq_usec: Some(curr_softirq), -+ stolen_usec: Some(curr_stolen), -+ guest_usec: _, -+ guest_nice_usec: _, -+ }, -+ ) => { -+ let idle_usec = curr_idle - prev_idle; -+ let iowait_usec = curr_iowait - prev_iowait; -+ let user_usec = curr_user - prev_user; -+ let system_usec = curr_system - prev_system; -+ let nice_usec = curr_nice - prev_nice; -+ let irq_usec = curr_irq - prev_irq; -+ let softirq_usec = curr_softirq - prev_softirq; -+ let stolen_usec = curr_stolen - prev_stolen; -+ -+ let busy_usec = -+ user_usec + system_usec + nice_usec + irq_usec + softirq_usec + stolen_usec; -+ let total_usec = idle_usec + busy_usec + iowait_usec; -+ busy_usec as f64 / total_usec as f64 -+ } -+ _ => { -+ bail!("Some procfs stats are not populated!"); -+ } -+ }; -+ -+ self.prev_total_cpu = total_cpu; -+ Ok(busy) -+ } -+ -+ fn read_bpf_stats(&mut self) -> Result> { -+ let mut maps = 
self.skel.maps_mut(); -+ let stats_map = maps.stats(); -+ let mut stats: Vec = Vec::new(); -+ let zero_vec = vec![vec![0u8; stats_map.value_size() as usize]; self.top.nr_cpus]; -+ -+ for stat in 0..rusty_sys::stat_idx_RUSTY_NR_STATS { -+ let cpu_stat_vec = stats_map -+ .lookup_percpu(&stat.to_ne_bytes(), libbpf_rs::MapFlags::ANY) -+ .with_context(|| format!("Failed to lookup stat {}", stat))? -+ .expect("per-cpu stat should exist"); -+ let sum = cpu_stat_vec -+ .iter() -+ .map(|val| { -+ u64::from_ne_bytes( -+ val.as_slice() -+ .try_into() -+ .expect("Invalid value length in stat map"), -+ ) -+ }) -+ .sum(); -+ stats_map -+ .update_percpu(&stat.to_ne_bytes(), &zero_vec, libbpf_rs::MapFlags::ANY) -+ .context("Failed to zero stat")?; -+ stats.push(sum); -+ } -+ Ok(stats) -+ } -+ -+ fn report( -+ &mut self, -+ stats: &[u64], -+ cpu_busy: f64, -+ processing_dur: Duration, -+ load_avg: f64, -+ dom_loads: &[f64], -+ imbal: &[f64], -+ ) { -+ let stat = |idx| stats[idx as usize]; -+ let total = stat(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_PINNED) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH) -+ + stat(rusty_sys::stat_idx_RUSTY_STAT_GREEDY); -+ -+ info!( -+ "cpu={:7.2} bal={} load_avg={:8.2} task_err={} lb_data_err={} proc={:?}ms", -+ cpu_busy * 100.0, -+ stats[rusty_sys::stat_idx_RUSTY_STAT_LOAD_BALANCE as usize], -+ load_avg, -+ stats[rusty_sys::stat_idx_RUSTY_STAT_TASK_GET_ERR as usize], -+ self.nr_lb_data_errors, -+ processing_dur.as_millis(), -+ ); -+ -+ let stat_pct = |idx| stat(idx) as f64 / total as f64 * 100.0; -+ -+ info!( -+ "tot={:7} wsync={:5.2} prev_idle={:5.2} greedy_idle={:5.2} pin={:5.2}", -+ total, -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_WAKE_SYNC), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PREV_IDLE), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY_IDLE), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_PINNED), -+ ); -+ -+ info!( -+ "dir={:5.2} dir_greedy={:5.2} dir_greedy_far={:5.2}", -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_DISPATCH), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DIRECT_GREEDY_FAR), -+ ); -+ -+ info!( -+ "dsq={:5.2} greedy={:5.2} kick_greedy={:5.2} rep={:5.2}", -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_DSQ_DISPATCH), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_GREEDY), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_KICK_GREEDY), -+ stat_pct(rusty_sys::stat_idx_RUSTY_STAT_REPATRIATE), -+ ); -+ -+ let ti = &self.skel.bss().tune_input; -+ info!( -+ "direct_greedy_cpumask={}", -+ format_cpumask(&ti.direct_greedy_cpumask, self.top.nr_cpus) -+ ); -+ info!( -+ " kick_greedy_cpumask={}", -+ format_cpumask(&ti.kick_greedy_cpumask, self.top.nr_cpus) -+ ); -+ -+ for i in 0..self.top.nr_doms { -+ info!( -+ "DOM[{:02}] util={:6.2} load={:8.2} imbal={}", -+ i, -+ self.tuner.dom_utils[i] * 100.0, -+ dom_loads[i], -+ if imbal[i] == 0.0 { -+ format!("{:9.2}", 0.0) -+ } else { -+ format!("{:+9.2}", imbal[i]) -+ }, -+ ); -+ } -+ } -+ -+ fn lb_step(&mut self) -> Result<()> { -+ let started_at = Instant::now(); -+ let bpf_stats = self.read_bpf_stats()?; -+ let cpu_busy = self.get_cpu_busy()?; -+ -+ let mut lb = LoadBalancer::new( -+ &mut self.skel, -+ self.top.clone(), -+ self.balanced_kworkers, 
-+ &mut self.nr_lb_data_errors, -+ ); -+ -+ lb.read_dom_loads()?; -+ lb.calculate_dom_load_balance()?; -+ -+ if self.balance_load { -+ lb.load_balance()?; -+ } -+ -+ // Extract fields needed for reporting and drop lb to release -+ // mutable borrows. -+ let (load_avg, dom_loads, imbal) = (lb.load_avg, lb.dom_loads, lb.imbal); -+ -+ self.report( -+ &bpf_stats, -+ cpu_busy, -+ Instant::now().duration_since(started_at), -+ load_avg, -+ &dom_loads, -+ &imbal, -+ ); -+ -+ self.prev_at = started_at; -+ Ok(()) -+ } -+ -+ fn read_bpf_exit_kind(&mut self) -> i32 { -+ unsafe { std::ptr::read_volatile(&self.skel.bss().exit_kind as *const _) } -+ } -+ -+ fn report_bpf_exit_kind(&mut self) -> Result<()> { -+ // Report msg if EXT_OPS_EXIT_ERROR. -+ match self.read_bpf_exit_kind() { -+ 0 => Ok(()), -+ etype if etype == 2 => { -+ let cstr = unsafe { CStr::from_ptr(self.skel.bss().exit_msg.as_ptr() as *const _) }; -+ let msg = cstr -+ .to_str() -+ .context("Failed to convert exit msg to string") -+ .unwrap(); -+ bail!("BPF exit_kind={} msg={}", etype, msg); -+ } -+ etype => { -+ info!("BPF exit_kind={}", etype); -+ Ok(()) -+ } -+ } -+ } -+ -+ fn run(&mut self, shutdown: Arc) -> Result<()> { -+ let now = Instant::now(); -+ let mut next_tune_at = now + self.tune_interval; -+ let mut next_sched_at = now + self.sched_interval; -+ -+ while !shutdown.load(Ordering::Relaxed) && self.read_bpf_exit_kind() == 0 { -+ let now = Instant::now(); -+ -+ if now >= next_tune_at { -+ self.tuner.step(&mut self.skel)?; -+ next_tune_at += self.tune_interval; -+ if next_tune_at < now { -+ next_tune_at = now + self.tune_interval; -+ } -+ } -+ -+ if now >= next_sched_at { -+ self.lb_step()?; -+ next_sched_at += self.sched_interval; -+ if next_sched_at < now { -+ next_sched_at = now + self.sched_interval; -+ } -+ } -+ -+ std::thread::sleep( -+ next_sched_at -+ .min(next_tune_at) -+ .duration_since(Instant::now()), -+ ); -+ } -+ -+ self.report_bpf_exit_kind() -+ } -+} -+ -+impl<'a> Drop for Scheduler<'a> { -+ fn drop(&mut self) { -+ if let Some(struct_ops) = self.struct_ops.take() { -+ drop(struct_ops); -+ } -+ } -+} -+ -+fn main() -> Result<()> { -+ let opts = Opts::parse(); -+ -+ let llv = match opts.verbose { -+ 0 => simplelog::LevelFilter::Info, -+ 1 => simplelog::LevelFilter::Debug, -+ _ => simplelog::LevelFilter::Trace, -+ }; -+ let mut lcfg = simplelog::ConfigBuilder::new(); -+ lcfg.set_time_level(simplelog::LevelFilter::Error) -+ .set_location_level(simplelog::LevelFilter::Off) -+ .set_target_level(simplelog::LevelFilter::Off) -+ .set_thread_level(simplelog::LevelFilter::Off); -+ simplelog::TermLogger::init( -+ llv, -+ lcfg.build(), -+ simplelog::TerminalMode::Stderr, -+ simplelog::ColorChoice::Auto, -+ )?; -+ -+ let mut sched = Scheduler::init(&opts)?; -+ -+ let shutdown = Arc::new(AtomicBool::new(false)); -+ let shutdown_clone = shutdown.clone(); -+ ctrlc::set_handler(move || { -+ shutdown_clone.store(true, Ordering::Relaxed); -+ }) -+ .context("Error setting Ctrl-C handler")?; -+ -+ sched.run(shutdown) -+} -diff --git a/tools/sched_ext/scx_rusty/src/rusty_sys.rs b/tools/sched_ext/scx_rusty/src/rusty_sys.rs -new file mode 100644 -index 000000000..e948d81e7 ---- /dev/null -+++ b/tools/sched_ext/scx_rusty/src/rusty_sys.rs -@@ -0,0 +1,10 @@ -+// Copyright (c) Meta Platforms, Inc. and affiliates. -+ -+// This software may be used and distributed according to the terms of the -+// GNU General Public License version 2. 
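[Editor's aside, not part of the patch] Scheduler::run() above interleaves two periodic jobs -- the high-frequency tuner and the slower load balancer -- by keeping a separate next-deadline for each, running whichever is due, and sleeping until the earlier of the two deadlines; a deadline is resynced if a step overran it so the loop does not fire repeatedly to "catch up". A minimal standalone Rust sketch of that pattern follows; the intervals and the iteration bound are illustrative values chosen here, not the patch's defaults.

    use std::time::{Duration, Instant};

    // Standalone sketch (not part of the patch) of the two-timer loop used by
    // Scheduler::run(): one deadline per job, run what is due, sleep until the
    // nearer deadline.
    fn main() {
        let tune_interval = Duration::from_millis(100);
        let sched_interval = Duration::from_millis(500);

        let now = Instant::now();
        let mut next_tune_at = now + tune_interval;
        let mut next_sched_at = now + sched_interval;

        for _ in 0..20 {
            let now = Instant::now();

            if now >= next_tune_at {
                println!("tuner step");
                next_tune_at += tune_interval;
                // If a long step pushed us past the deadline, resync instead
                // of firing back-to-back.
                if next_tune_at < now {
                    next_tune_at = now + tune_interval;
                }
            }

            if now >= next_sched_at {
                println!("load-balance step");
                next_sched_at += sched_interval;
                if next_sched_at < now {
                    next_sched_at = now + sched_interval;
                }
            }

            // Sleep until whichever job is due first.
            let wake = next_sched_at.min(next_tune_at);
            std::thread::sleep(wake.saturating_duration_since(Instant::now()));
        }
    }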
-+#![allow(non_upper_case_globals)] -+#![allow(non_camel_case_types)] -+#![allow(non_snake_case)] -+#![allow(dead_code)] -+ -+include!(concat!(env!("OUT_DIR"), "/rusty_sys.rs")); -diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c -new file mode 100644 -index 000000000..56b589d7f ---- /dev/null -+++ b/tools/sched_ext/scx_simple.bpf.c -@@ -0,0 +1,143 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple scheduler. -+ * -+ * By default, it operates as a simple global weighted vtime scheduler and can -+ * be switched to FIFO scheduling. It also demonstrates the following niceties. -+ * -+ * - Statistics tracking how many tasks are queued to local and global dsq's. -+ * - Termination notification for userspace. -+ * -+ * While very simple, this scheduler should work reasonably well on CPUs with a -+ * uniform L3 cache topology. While preemption is not implemented, the fact that -+ * the scheduling queue is shared across all CPUs means that whatever is at the -+ * front of the queue is likely to be executed fairly quickly given enough -+ * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads -+ * but comes with the usual problems with FIFO scheduling where saturating -+ * threads can easily drown out interactive ones. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include "scx_common.bpf.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool fifo_sched; -+const volatile bool switch_partial; -+ -+static u64 vtime_now; -+struct user_exit_info uei; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, 2); /* [local, global] */ -+} stats SEC(".maps"); -+ -+static void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ /* -+ * If scx_select_cpu_dfl() is setting %SCX_ENQ_LOCAL, it indicates that -+ * running @p on its CPU directly shouldn't affect fairness. Just queue -+ * it on the local FIFO. -+ */ -+ if (enq_flags & SCX_ENQ_LOCAL) { -+ stat_inc(0); /* count local queueing */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ return; -+ } -+ -+ stat_inc(1); /* count global queueing */ -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ vtime = vtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, vtime, -+ enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(simple_running, struct task_struct *p) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. 
-+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS(simple_init) -+{ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops simple_ops = { -+ .enqueue = (void *)simple_enqueue, -+ .running = (void *)simple_running, -+ .stopping = (void *)simple_stopping, -+ .enable = (void *)simple_enable, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+}; -diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c -new file mode 100644 -index 000000000..900f1c3e7 ---- /dev/null -+++ b/tools/sched_ext/scx_simple.c -@@ -0,0 +1,99 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_common.h" -+#include "scx_simple.skel.h" -+ -+const char help_fmt[] = -+"A simple sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-f] [-p]\n" -+"\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void read_stats(struct scx_simple *skel, __u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ __u64 cnts[2][nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * 2); -+ -+ for (idx = 0; idx < 2; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_simple *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ skel = scx_simple__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ while ((opt = getopt(argc, argv, "fph")) != -1) { -+ switch (opt) { -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_BUG_ON(scx_simple__load(skel), "Failed to load skel"); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.simple_ops); -+ SCX_BUG_ON(!link, "Failed to attach struct_ops"); -+ -+ while (!exit_req && 
!uei_exited(&skel->bss->uei)) { -+ __u64 stats[2]; -+ -+ read_stats(skel, stats); -+ printf("local=%llu global=%llu\n", stats[0], stats[1]); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ uei_print(&skel->bss->uei); -+ scx_simple__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_userland.bpf.c b/tools/sched_ext/scx_userland.bpf.c -new file mode 100644 -index 000000000..9e107a874 ---- /dev/null -+++ b/tools/sched_ext/scx_userland.bpf.c -@@ -0,0 +1,262 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A minimal userland scheduler. -+ * -+ * In terms of scheduling, this provides two different types of behaviors: -+ * 1. A global FIFO scheduling order for _any_ tasks that have CPU affinity. -+ * All such tasks are direct-dispatched from the kernel, and are never -+ * enqueued in user space. -+ * 2. A primitive vruntime scheduler that is implemented in user space, for all -+ * other tasks. -+ * -+ * Some parts of this example user space scheduler could be implemented more -+ * efficiently using more complex and sophisticated data structures. For -+ * example, rather than using BPF_MAP_TYPE_QUEUE's, -+ * BPF_MAP_TYPE_{USER_}RINGBUF's could be used for exchanging messages between -+ * user space and kernel space. Similarly, we use a simple vruntime-sorted list -+ * in user space, but an rbtree could be used instead. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include "scx_common.bpf.h" -+#include "scx_userland.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool switch_partial; -+const volatile s32 usersched_pid; -+ -+/* !0 for veristat, set during init */ -+const volatile u32 num_possible_cpus = 64; -+ -+/* Stats that are printed by user space. */ -+u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues; -+ -+struct user_exit_info uei; -+ -+/* -+ * Whether the user space scheduler needs to be scheduled due to a task being -+ * enqueued in user space. -+ */ -+static bool usersched_needed; -+ -+/* -+ * The map containing tasks that are enqueued in user space from the kernel. -+ * -+ * This map is drained by the user space scheduler. -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, USERLAND_MAX_TASKS); -+ __type(value, struct scx_userland_enqueued_task); -+} enqueued SEC(".maps"); -+ -+/* -+ * The map containing tasks that are dispatched to the kernel from user space. -+ * -+ * Drained by the kernel in userland_dispatch(). -+ */ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, USERLAND_MAX_TASKS); -+ __type(value, s32); -+} dispatched SEC(".maps"); -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local DSQ */ -+}; -+ -+/* Map that contains task-local storage. */ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+static bool is_usersched_task(const struct task_struct *p) -+{ -+ return p->pid == usersched_pid; -+} -+ -+static bool keep_in_kernel(const struct task_struct *p) -+{ -+ return p->nr_cpus_allowed < num_possible_cpus; -+} -+ -+static struct task_struct *usersched_task(void) -+{ -+ struct task_struct *p; -+ -+ p = bpf_task_from_pid(usersched_pid); -+ /* -+ * Should never happen -- the usersched task should always be managed -+ * by sched_ext. 
-+ */ -+ if (!p) -+ scx_bpf_error("Failed to find usersched task %d", usersched_pid); -+ -+ return p; -+} -+ -+s32 BPF_STRUCT_OPS(userland_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ if (keep_in_kernel(p)) { -+ s32 cpu; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("Failed to look up task-local storage for %s", p->comm); -+ return -ESRCH; -+ } -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ tctx->force_local = true; -+ return prev_cpu; -+ } -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) { -+ tctx->force_local = true; -+ return cpu; -+ } -+ } -+ -+ return prev_cpu; -+} -+ -+static void dispatch_user_scheduler(void) -+{ -+ struct task_struct *p; -+ -+ usersched_needed = false; -+ p = usersched_task(); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+ } -+} -+ -+static void enqueue_task_in_user_space(struct task_struct *p, u64 enq_flags) -+{ -+ struct scx_userland_enqueued_task task; -+ -+ memset(&task, 0, sizeof(task)); -+ task.pid = p->pid; -+ task.sum_exec_runtime = p->se.sum_exec_runtime; -+ task.weight = p->scx.weight; -+ -+ if (bpf_map_push_elem(&enqueued, &task, 0)) { -+ /* -+ * If we fail to enqueue the task in user space, put it -+ * directly on the global DSQ. -+ */ -+ __sync_fetch_and_add(&nr_failed_enqueues, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ __sync_fetch_and_add(&nr_user_enqueues, 1); -+ usersched_needed = true; -+ } -+} -+ -+void BPF_STRUCT_OPS(userland_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ if (keep_in_kernel(p)) { -+ u64 dsq_id = SCX_DSQ_GLOBAL; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("Failed to lookup task ctx for %s", p->comm); -+ return; -+ } -+ -+ if (tctx->force_local) -+ dsq_id = SCX_DSQ_LOCAL; -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); -+ __sync_fetch_and_add(&nr_kernel_enqueues, 1); -+ return; -+ } else if (!is_usersched_task(p)) { -+ enqueue_task_in_user_space(p, enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(userland_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (usersched_needed) -+ dispatch_user_scheduler(); -+ -+ bpf_repeat(4096) { -+ s32 pid; -+ struct task_struct *p; -+ -+ if (bpf_map_pop_elem(&dispatched, &pid)) -+ break; -+ -+ /* -+ * The task could have exited by the time we get around to -+ * dispatching it. Treat this as a normal occurrence, and simply -+ * move onto the next iteration. 
-+ */ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ continue; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+ } -+} -+ -+s32 BPF_STRUCT_OPS(userland_prep_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+s32 BPF_STRUCT_OPS(userland_init) -+{ -+ if (num_possible_cpus == 0) { -+ scx_bpf_error("User scheduler # CPUs uninitialized (%d)", -+ num_possible_cpus); -+ return -EINVAL; -+ } -+ -+ if (usersched_pid <= 0) { -+ scx_bpf_error("User scheduler pid uninitialized (%d)", -+ usersched_pid); -+ return -EINVAL; -+ } -+ -+ if (!switch_partial) -+ scx_bpf_switch_all(); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(userland_exit, struct scx_exit_info *ei) -+{ -+ uei_record(&uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops userland_ops = { -+ .select_cpu = (void *)userland_select_cpu, -+ .enqueue = (void *)userland_enqueue, -+ .dispatch = (void *)userland_dispatch, -+ .prep_enable = (void *)userland_prep_enable, -+ .init = (void *)userland_init, -+ .exit = (void *)userland_exit, -+ .timeout_ms = 3000, -+ .name = "userland", -+}; -diff --git a/tools/sched_ext/scx_userland.c b/tools/sched_ext/scx_userland.c -new file mode 100644 -index 000000000..a750f10df ---- /dev/null -+++ b/tools/sched_ext/scx_userland.c -@@ -0,0 +1,366 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext user space scheduler which provides vruntime semantics -+ * using a simple ordered-list implementation. -+ * -+ * Each CPU in the system resides in a single, global domain. This precludes -+ * the need to do any load balancing between domains. The scheduler could -+ * easily be extended to support multiple domains, with load balancing -+ * happening in user space. -+ * -+ * Any task which has any CPU affinity is scheduled entirely in BPF. This -+ * program only schedules tasks which may run on any CPU. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "scx_common.h" -+#include "scx_userland.h" -+#include "scx_userland.skel.h" -+ -+const char help_fmt[] = -+"A minimal userland sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-b BATCH] [-p]\n" -+"\n" -+" -b BATCH The number of tasks to batch when dispatching (default: 8)\n" -+" -p Don't switch all, switch only tasks on SCHED_EXT policy\n" -+" -h Display this help and exit\n"; -+ -+/* Defined in UAPI */ -+#define SCHED_EXT 7 -+ -+/* Number of tasks to batch when dispatching to user space. */ -+static __u32 batch_size = 8; -+ -+static volatile int exit_req; -+static int enqueued_fd, dispatched_fd; -+ -+static struct scx_userland *skel; -+static struct bpf_link *ops_link; -+ -+/* Stats collected in user space. */ -+static __u64 nr_vruntime_enqueues, nr_vruntime_dispatches; -+ -+/* The data structure containing tasks that are enqueued in user space. */ -+struct enqueued_task { -+ LIST_ENTRY(enqueued_task) entries; -+ __u64 sum_exec_runtime; -+ double vruntime; -+}; -+ -+/* -+ * Use a vruntime-sorted list to store tasks. This could easily be extended to -+ * a more optimal data structure, such as an rbtree as is done in CFS. 
We -+ * currently elect to use a sorted list to simplify the example for -+ * illustrative purposes. -+ */ -+LIST_HEAD(listhead, enqueued_task); -+ -+/* -+ * A vruntime-sorted list of tasks. The head of the list contains the task with -+ * the lowest vruntime. That is, the task that has the "highest" claim to be -+ * scheduled. -+ */ -+static struct listhead vruntime_head = LIST_HEAD_INITIALIZER(vruntime_head); -+ -+/* -+ * The statically allocated array of tasks. We use a statically allocated list -+ * here to avoid having to allocate on the enqueue path, which could cause a -+ * deadlock. A more substantive user space scheduler could e.g. provide a hook -+ * for newly enabled tasks that are passed to the scheduler from the -+ * .prep_enable() callback to allows the scheduler to allocate on safe paths. -+ */ -+struct enqueued_task tasks[USERLAND_MAX_TASKS]; -+ -+static double min_vruntime; -+ -+static void sigint_handler(int userland) -+{ -+ exit_req = 1; -+} -+ -+static __u32 task_pid(const struct enqueued_task *task) -+{ -+ return ((uintptr_t)task - (uintptr_t)tasks) / sizeof(*task); -+} -+ -+static int dispatch_task(__s32 pid) -+{ -+ int err; -+ -+ err = bpf_map_update_elem(dispatched_fd, NULL, &pid, 0); -+ if (err) { -+ fprintf(stderr, "Failed to dispatch task %d\n", pid); -+ exit_req = 1; -+ } else { -+ nr_vruntime_dispatches++; -+ } -+ -+ return err; -+} -+ -+static struct enqueued_task *get_enqueued_task(__s32 pid) -+{ -+ if (pid >= USERLAND_MAX_TASKS) -+ return NULL; -+ -+ return &tasks[pid]; -+} -+ -+static double calc_vruntime_delta(__u64 weight, __u64 delta) -+{ -+ double weight_f = (double)weight / 100.0; -+ double delta_f = (double)delta; -+ -+ return delta_f / weight_f; -+} -+ -+static void update_enqueued(struct enqueued_task *enqueued, const struct scx_userland_enqueued_task *bpf_task) -+{ -+ __u64 delta; -+ -+ delta = bpf_task->sum_exec_runtime - enqueued->sum_exec_runtime; -+ -+ enqueued->vruntime += calc_vruntime_delta(bpf_task->weight, delta); -+ if (min_vruntime > enqueued->vruntime) -+ enqueued->vruntime = min_vruntime; -+ enqueued->sum_exec_runtime = bpf_task->sum_exec_runtime; -+} -+ -+static int vruntime_enqueue(const struct scx_userland_enqueued_task *bpf_task) -+{ -+ struct enqueued_task *curr, *enqueued, *prev; -+ -+ curr = get_enqueued_task(bpf_task->pid); -+ if (!curr) -+ return ENOENT; -+ -+ update_enqueued(curr, bpf_task); -+ nr_vruntime_enqueues++; -+ -+ /* -+ * Enqueue the task in a vruntime-sorted list. A more optimal data -+ * structure such as an rbtree could easily be used as well. We elect -+ * to use a list here simply because it's less code, and thus the -+ * example is less convoluted and better serves to illustrate what a -+ * user space scheduler could look like. 
-+ */ -+ -+ if (LIST_EMPTY(&vruntime_head)) { -+ LIST_INSERT_HEAD(&vruntime_head, curr, entries); -+ return 0; -+ } -+ -+ LIST_FOREACH(enqueued, &vruntime_head, entries) { -+ if (curr->vruntime <= enqueued->vruntime) { -+ LIST_INSERT_BEFORE(enqueued, curr, entries); -+ return 0; -+ } -+ prev = enqueued; -+ } -+ -+ LIST_INSERT_AFTER(prev, curr, entries); -+ -+ return 0; -+} -+ -+static void drain_enqueued_map(void) -+{ -+ while (1) { -+ struct scx_userland_enqueued_task task; -+ int err; -+ -+ if (bpf_map_lookup_and_delete_elem(enqueued_fd, NULL, &task)) -+ return; -+ -+ err = vruntime_enqueue(&task); -+ if (err) { -+ fprintf(stderr, "Failed to enqueue task %d: %s\n", -+ task.pid, strerror(err)); -+ exit_req = 1; -+ return; -+ } -+ } -+} -+ -+static void dispatch_batch(void) -+{ -+ __u32 i; -+ -+ for (i = 0; i < batch_size; i++) { -+ struct enqueued_task *task; -+ int err; -+ __s32 pid; -+ -+ task = LIST_FIRST(&vruntime_head); -+ if (!task) -+ return; -+ -+ min_vruntime = task->vruntime; -+ pid = task_pid(task); -+ LIST_REMOVE(task, entries); -+ err = dispatch_task(pid); -+ if (err) { -+ fprintf(stderr, "Failed to dispatch task %d in %u\n", -+ pid, i); -+ return; -+ } -+ } -+} -+ -+static void *run_stats_printer(void *arg) -+{ -+ while (!exit_req) { -+ __u64 nr_failed_enqueues, nr_kernel_enqueues, nr_user_enqueues, total; -+ -+ nr_failed_enqueues = skel->bss->nr_failed_enqueues; -+ nr_kernel_enqueues = skel->bss->nr_kernel_enqueues; -+ nr_user_enqueues = skel->bss->nr_user_enqueues; -+ total = nr_failed_enqueues + nr_kernel_enqueues + nr_user_enqueues; -+ -+ printf("o-----------------------o\n"); -+ printf("| BPF ENQUEUES |\n"); -+ printf("|-----------------------|\n"); -+ printf("| kern: %10llu |\n", nr_kernel_enqueues); -+ printf("| user: %10llu |\n", nr_user_enqueues); -+ printf("| failed: %10llu |\n", nr_failed_enqueues); -+ printf("| -------------------- |\n"); -+ printf("| total: %10llu |\n", total); -+ printf("| |\n"); -+ printf("|-----------------------|\n"); -+ printf("| VRUNTIME / USER |\n"); -+ printf("|-----------------------|\n"); -+ printf("| enq: %10llu |\n", nr_vruntime_enqueues); -+ printf("| disp: %10llu |\n", nr_vruntime_dispatches); -+ printf("o-----------------------o\n"); -+ printf("\n\n"); -+ sleep(1); -+ } -+ -+ return NULL; -+} -+ -+static int spawn_stats_thread(void) -+{ -+ pthread_t stats_printer; -+ -+ return pthread_create(&stats_printer, NULL, run_stats_printer, NULL); -+} -+ -+static void bootstrap(int argc, char **argv) -+{ -+ int err; -+ __u32 opt; -+ struct sched_param sched_param = { -+ .sched_priority = sched_get_priority_max(SCHED_EXT), -+ }; -+ bool switch_partial = false; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ /* -+ * Enforce that the user scheduler task is managed by sched_ext. The -+ * task eagerly drains the list of enqueued tasks in its main work -+ * loop, and then yields the CPU. The BPF scheduler only schedules the -+ * user space scheduler task when at least one other task in the system -+ * needs to be scheduled. 
-+ */ -+ err = syscall(__NR_sched_setscheduler, getpid(), SCHED_EXT, &sched_param); -+ SCX_BUG_ON(err, "Failed to set scheduler to SCHED_EXT"); -+ -+ while ((opt = getopt(argc, argv, "b:ph")) != -1) { -+ switch (opt) { -+ case 'b': -+ batch_size = strtoul(optarg, NULL, 0); -+ break; -+ case 'p': -+ switch_partial = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ exit(opt != 'h'); -+ } -+ } -+ -+ /* -+ * It's not always safe to allocate in a user space scheduler, as an -+ * enqueued task could hold a lock that we require in order to be able -+ * to allocate. -+ */ -+ err = mlockall(MCL_CURRENT | MCL_FUTURE); -+ SCX_BUG_ON(err, "Failed to prefault and lock address space"); -+ -+ skel = scx_userland__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ skel->rodata->num_possible_cpus = libbpf_num_possible_cpus(); -+ assert(skel->rodata->num_possible_cpus > 0); -+ skel->rodata->usersched_pid = getpid(); -+ assert(skel->rodata->usersched_pid > 0); -+ skel->rodata->switch_partial = switch_partial; -+ -+ SCX_BUG_ON(scx_userland__load(skel), "Failed to load skel"); -+ -+ enqueued_fd = bpf_map__fd(skel->maps.enqueued); -+ dispatched_fd = bpf_map__fd(skel->maps.dispatched); -+ assert(enqueued_fd > 0); -+ assert(dispatched_fd > 0); -+ -+ SCX_BUG_ON(spawn_stats_thread(), "Failed to spawn stats thread"); -+ -+ ops_link = bpf_map__attach_struct_ops(skel->maps.userland_ops); -+ SCX_BUG_ON(!ops_link, "Failed to attach struct_ops"); -+} -+ -+static void sched_main_loop(void) -+{ -+ while (!exit_req) { -+ /* -+ * Perform the following work in the main user space scheduler -+ * loop: -+ * -+ * 1. Drain all tasks from the enqueued map, and enqueue them -+ * to the vruntime sorted list. -+ * -+ * 2. Dispatch a batch of tasks from the vruntime sorted list -+ * down to the kernel. -+ * -+ * 3. Yield the CPU back to the system. The BPF scheduler will -+ * reschedule the user space scheduler once another task has -+ * been enqueued to user space. -+ */ -+ drain_enqueued_map(); -+ dispatch_batch(); -+ sched_yield(); -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ bootstrap(argc, argv); -+ sched_main_loop(); -+ -+ exit_req = 1; -+ bpf_link__destroy(ops_link); -+ uei_print(&skel->bss->uei); -+ scx_userland__destroy(skel); -+ return 0; -+} -diff --git a/tools/sched_ext/scx_userland.h b/tools/sched_ext/scx_userland.h -new file mode 100644 -index 000000000..639c6809c ---- /dev/null -+++ b/tools/sched_ext/scx_userland.h -@@ -0,0 +1,19 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* Copyright (c) 2022 Meta, Inc */ -+ -+#ifndef __SCX_USERLAND_COMMON_H -+#define __SCX_USERLAND_COMMON_H -+ -+#define USERLAND_MAX_TASKS 8192 -+ -+/* -+ * An instance of a task that has been enqueued by the kernel for consumption -+ * by a user space global scheduler thread. -+ */ -+struct scx_userland_enqueued_task { -+ __s32 pid; -+ u64 sum_exec_runtime; -+ u64 weight; -+}; -+ -+#endif // __SCX_USERLAND_COMMON_H -diff --git a/tools/sched_ext/user_exit_info.h b/tools/sched_ext/user_exit_info.h -new file mode 100644 -index 000000000..f0e45bf3c ---- /dev/null -+++ b/tools/sched_ext/user_exit_info.h -@@ -0,0 +1,50 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Define struct user_exit_info which is shared between BPF and userspace parts -+ * to communicate exit status and other information. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __USER_EXIT_INFO_H -+#define __USER_EXIT_INFO_H -+ -+struct user_exit_info { -+ int kind; -+ char reason[128]; -+ char msg[1024]; -+}; -+ -+#ifdef __bpf__ -+ -+#include "vmlinux.h" -+#include -+ -+static inline void uei_record(struct user_exit_info *uei, -+ const struct scx_exit_info *ei) -+{ -+ bpf_probe_read_kernel_str(uei->reason, sizeof(uei->reason), ei->reason); -+ bpf_probe_read_kernel_str(uei->msg, sizeof(uei->msg), ei->msg); -+ /* use __sync to force memory barrier */ -+ __sync_val_compare_and_swap(&uei->kind, uei->kind, ei->kind); -+} -+ -+#else /* !__bpf__ */ -+ -+static inline bool uei_exited(struct user_exit_info *uei) -+{ -+ /* use __sync to force memory barrier */ -+ return __sync_val_compare_and_swap(&uei->kind, -1, -1); -+} -+ -+static inline void uei_print(const struct user_exit_info *uei) -+{ -+ fprintf(stderr, "EXIT: %s", uei->reason); -+ if (uei->msg[0] != '\0') -+ fprintf(stderr, " (%s)", uei->msg); -+ fputs("\n", stderr); -+} -+ -+#endif /* __bpf__ */ -+#endif /* __USER_EXIT_INFO_H */ --- -2.43.0.rc2 - diff --git a/sys-kernel/git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch b/sys-kernel/gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch similarity index 61% rename from sys-kernel/git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch rename to sys-kernel/gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch index 75c48bf..50f27db 100644 --- a/sys-kernel/git-sources/0001-more-uarches-for-kernel-6.8-rc4+.patch +++ b/sys-kernel/gentoo-sources-6.6/0100-more-ISA-levels-and-uarches-for-kernel-6.1.79-6.8-rc3.patch @@ -1,26 +1,37 @@ -From 71dd30c3e2ab2852b0290ae1f34ce1c7f8655040 Mon Sep 17 00:00:00 2001 -From: graysky -Date: Wed, 21 Feb 2024 08:38:13 -0500 +From a4ebe91654460da51b0327f3d0a051aaeab2d423 Mon Sep 17 00:00:00 2001 +From: graysky +Date: Mon, 16 Sep 2024 05:55:58 -0400 FEATURES -This patch adds additional CPU options to the Linux kernel accessible under: - Processor type and features ---> - Processor family ---> +This patch adds additional tunings via new x86-64 ISA levels and +more micro-architecture options to the Linux kernel in three classes. -With the release of gcc 11.1 and clang 12.0, several generic 64-bit levels are -offered which are good for supported Intel or AMD CPUs: -• x86-64-v2 -• x86-64-v3 -• x86-64-v4 +1. New generic x86-64 ISA levels + +These are selectable under: + Processor type and features ---> x86-64 compiler ISA level + +• x86-64 A value of (1) is the default +• x86-64-v2 A value of (2) brings support for vector + instructions up to Streaming SIMD Extensions 4.2 (SSE4.2) + and Supplemental Streaming SIMD Extensions 3 (SSSE3), the + POPCNT instruction, and CMPXCHG16B. +• x86-64-v3 A value of (3) adds vector instructions up to AVX2, MOVBE, + and additional bit-manipulation instructions. + +There is also x86-64-v4 but including this makes little sense as +the kernel does not use any of the AVX512 instructions anyway. Users of glibc 2.33 and above can see which level is supported by running: - /lib/ld-linux-x86-64.so.2 --help | grep supported + /lib/ld-linux-x86-64.so.2 --help | grep supported Or - /lib64/ld-linux-x86-64.so.2 --help | grep supported + /lib64/ld-linux-x86-64.so.2 --help | grep supported + +2. 
New micro-architectures -Alternatively, compare the flags from /proc/cpuinfo to this list.[1] +These are selectable under: + Processor type and features ---> Processor family -CPU-specific microarchitectures include: • AMD Improved K8-family • AMD K10-family • AMD Family 10h (Barcelona) @@ -32,8 +43,9 @@ CPU-specific microarchitectures include: • AMD Family 15h (Excavator) • AMD Family 17h (Zen) • AMD Family 17h (Zen 2) -• AMD Family 19h (Zen 3)† -• AMD Family 19h (Zen 4)§ +• AMD Family 19h (Zen 3)** +• AMD Family 19h (Zen 4)‡ +• AMD Family 1Ah (Zen 5)§ • Intel Silvermont low-power processors • Intel Goldmont low-power processors (Apollo Lake and Denverton) • Intel Goldmont Plus low-power processors (Gemini Lake) @@ -50,24 +62,27 @@ CPU-specific microarchitectures include: • Intel Xeon (Cascade Lake) • Intel Xeon (Cooper Lake)* • Intel 3rd Gen 10nm++ i3/i5/i7/i9-family (Tiger Lake)* -• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)‡ -• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)‡ -• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)‡ -• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)§ -• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)§ -• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)§ +• Intel 4th Gen 10nm++ Xeon (Sapphire Rapids)† +• Intel 11th Gen i3/i5/i7/i9-family (Rocket Lake)† +• Intel 12th Gen i3/i5/i7/i9-family (Alder Lake)† +• Intel 13th Gen i3/i5/i7/i9-family (Raptor Lake)‡ +• Intel 14th Gen i3/i5/i7/i9-family (Meteor Lake)‡ +• Intel 5th Gen 10nm++ Xeon (Emerald Rapids)‡ Notes: If not otherwise noted, gcc >=9.1 is required for support. *Requires gcc >=10.1 or clang >=10.0 - †Required gcc >=10.3 or clang >=12.0 - ‡Required gcc >=11.1 or clang >=12.0 - §Required gcc >=13.0 or clang >=15.0.5 + **Required gcc >=10.3 or clang >=12.0 + †Required gcc >=11.1 or clang >=12.0 + ‡Required gcc >=13.0 or clang >=15.0.5 + §Required gcc >14.0 or clang >=19.0? -It also offers to compile passing the 'native' option which, "selects the CPU +3. Auto-detected micro-architecture levels + +Compile by passing the '-march=native' option which, "selects the CPU to generate code for at compilation time by determining the processor type of the compiling machine. Using -march=native enables all instruction subsets supported by the local machine and will produce code optimized for the local -machine under the constraints of the selected instruction set."[2] +machine under the constraints of the selected instruction set."[1] Users of Intel CPUs should select the 'Intel-Native' option and users of AMD CPUs should select the 'AMD-Native' option. @@ -75,9 +90,9 @@ CPUs should select the 'AMD-Native' option. MINOR NOTES RELATING TO INTEL ATOM PROCESSORS This patch also changes -march=atom to -march=bonnell in accordance with the gcc v4.9 changes. Upstream is using the deprecated -match=atom flags when I -believe it should use the newer -march=bonnell flag for atom processors.[3] +believe it should use the newer -march=bonnell flag for atom processors.[2] -It is not recommended to compile on Atom-CPUs with the 'native' option.[4] The +It is not recommended to compile on Atom-CPUs with the 'native' option.[3] The recommendation is to use the 'atom' option instead. BENEFITS @@ -85,41 +100,43 @@ Small but real speed increases are measurable using a make endpoint comparing a generic kernel to one built with one of the respective microarchs. 
See the following experimental evidence supporting this statement: -https://github.com/graysky2/kernel_gcc_patch +https://github.com/graysky2/kernel_compiler_patch?tab=readme-ov-file#benchmarks REQUIREMENTS -linux version 5.17+ +linux version 6.8-rc3+ gcc version >=9.0 or clang version >=9.0 ACKNOWLEDGMENTS -This patch builds on the seminal work by Jeroen.[5] +This patch builds on the seminal work by Jeroen.[4] REFERENCES -1. https://gitlab.com/x86-psABIs/x86-64-ABI/-/commit/77566eb03bc6a326811cb7e9 -2. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options -3. https://bugzilla.kernel.org/show_bug.cgi?id=77461 -4. https://github.com/graysky2/kernel_gcc_patch/issues/15 -5. http://www.linuxforge.net/docs/linux/linux-gcc.php +1. https://gcc.gnu.org/onlinedocs/gcc/x86-Options.html#index-x86-Options +2. https://bugzilla.kernel.org/show_bug.cgi?id=77461 +3. https://github.com/graysky2/kernel_gcc_patch/issues/15 +4. http://www.linuxforge.net/docs/linux/linux-gcc.php + --- - arch/x86/Kconfig.cpu | 424 ++++++++++++++++++++++++++++++-- - arch/x86/Makefile | 44 +++- - arch/x86/include/asm/vermagic.h | 74 ++++++ - 3 files changed, 526 insertions(+), 16 deletions(-) + arch/x86/Kconfig.cpu | 359 ++++++++++++++++++++++++++++++-- + arch/x86/Makefile | 89 +++++++- + arch/x86/include/asm/vermagic.h | 70 +++++++ + 3 files changed, 500 insertions(+), 18 deletions(-) diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu -index 2a7279d80460a..6924a0f5f1c26 100644 +index ce5ed2c2db0c..1cd49fac2ac9 100644 --- a/arch/x86/Kconfig.cpu +++ b/arch/x86/Kconfig.cpu -@@ -157,7 +157,7 @@ config MPENTIUM4 - +@@ -155,9 +155,8 @@ config MPENTIUM4 + -Paxville + -Dempsey +- config MK6 - bool "K6/K6-II/K6-III" + bool "AMD K6/K6-II/K6-III" depends on X86_32 help Select this for an AMD K6-family processor. Enables use of -@@ -165,7 +165,7 @@ config MK6 +@@ -165,7 +164,7 @@ config MK6 flags to GCC. config MK7 @@ -128,7 +145,7 @@ index 2a7279d80460a..6924a0f5f1c26 100644 depends on X86_32 help Select this for an AMD Athlon K7-family processor. Enables use of -@@ -173,12 +173,106 @@ config MK7 +@@ -173,12 +172,114 @@ config MK7 flags to GCC. config MK8 @@ -232,44 +249,59 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + Select this for AMD Family 19h Zen 4 processors. + + Enables -march=znver4 ++ ++config MZEN5 ++ bool "AMD Zen 5" ++ depends on (CC_IS_GCC && GCC_VERSION > 140000) || (CC_IS_CLANG && CLANG_VERSION >= 191000) ++ help ++ Select this for AMD Family 19h Zen 5 processors. ++ ++ Enables -march=znver5 + config MCRUSOE bool "Crusoe" depends on X86_32 -@@ -270,7 +364,7 @@ config MPSC +@@ -269,8 +370,17 @@ config MPSC + using the cpu family field in /proc/cpuinfo. Family 15 is an older Xeon, Family 6 a newer one. ++config MATOM ++ bool "Intel Atom" ++ help ++ ++ Select this for the Intel Atom platform. Intel Atom CPUs have an ++ in-order pipelining architecture and thus can benefit from ++ accordingly optimized code. Use a recent GCC with specific Atom ++ support in order to fully benefit from selecting this option. ++ config MCORE2 - bool "Core 2/newer Xeon" + bool "Intel Core 2" help Select this for Intel Core 2 and newer Core 2 Xeons (Xeon 51xx and -@@ -278,6 +372,8 @@ config MCORE2 +@@ -278,14 +388,191 @@ config MCORE2 family in /proc/cpuinfo. Newer ones have 6 and older ones 15 (not a typo) +-config MATOM +- bool "Intel Atom" + Enables -march=core2 + - config MATOM - bool "Intel Atom" - help -@@ -287,6 +383,212 @@ config MATOM - accordingly optimized code. 
Use a recent GCC with specific Atom - support in order to fully benefit from selecting this option. - +config MNEHALEM + bool "Intel Nehalem" -+ select X86_P6_NOP -+ help -+ + help + +- Select this for the Intel Atom platform. Intel Atom CPUs have an +- in-order pipelining architecture and thus can benefit from +- accordingly optimized code. Use a recent GCC with specific Atom +- support in order to fully benefit from selecting this option. + Select this for 1st Gen Core processors in the Nehalem family. + + Enables -march=nehalem + +config MWESTMERE + bool "Intel Westmere" -+ select X86_P6_NOP + help + + Select this for the Intel Westmere formerly Nehalem-C family. @@ -278,7 +310,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSILVERMONT + bool "Intel Silvermont" -+ select X86_P6_NOP + help + + Select this for the Intel Silvermont platform. @@ -287,7 +318,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MGOLDMONT + bool "Intel Goldmont" -+ select X86_P6_NOP + help + + Select this for the Intel Goldmont platform including Apollo Lake and Denverton. @@ -296,7 +326,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MGOLDMONTPLUS + bool "Intel Goldmont Plus" -+ select X86_P6_NOP + help + + Select this for the Intel Goldmont Plus platform including Gemini Lake. @@ -305,7 +334,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSANDYBRIDGE + bool "Intel Sandy Bridge" -+ select X86_P6_NOP + help + + Select this for 2nd Gen Core processors in the Sandy Bridge family. @@ -314,7 +342,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MIVYBRIDGE + bool "Intel Ivy Bridge" -+ select X86_P6_NOP + help + + Select this for 3rd Gen Core processors in the Ivy Bridge family. @@ -323,7 +350,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MHASWELL + bool "Intel Haswell" -+ select X86_P6_NOP + help + + Select this for 4th Gen Core processors in the Haswell family. @@ -332,7 +358,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MBROADWELL + bool "Intel Broadwell" -+ select X86_P6_NOP + help + + Select this for 5th Gen Core processors in the Broadwell family. @@ -341,7 +366,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSKYLAKE + bool "Intel Skylake" -+ select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake family. @@ -350,7 +374,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MSKYLAKEX + bool "Intel Skylake X" -+ select X86_P6_NOP + help + + Select this for 6th Gen Core processors in the Skylake X family. @@ -359,7 +382,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MCANNONLAKE + bool "Intel Cannon Lake" -+ select X86_P6_NOP + help + + Select this for 8th Gen Core processors @@ -368,7 +390,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MICELAKE + bool "Intel Ice Lake" -+ select X86_P6_NOP + help + + Select this for 10th Gen Core processors in the Ice Lake family. @@ -377,7 +398,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 + +config MCASCADELAKE + bool "Intel Cascade Lake" -+ select X86_P6_NOP + help + + Select this for Xeon processors in the Cascade Lake family. @@ -387,7 +407,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MCOOPERLAKE + bool "Intel Cooper Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) -+ select X86_P6_NOP + help + + Select this for Xeon processors in the Cooper Lake family. 
@@ -397,7 +416,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MTIGERLAKE + bool "Intel Tiger Lake" + depends on (CC_IS_GCC && GCC_VERSION > 100100) || (CC_IS_CLANG && CLANG_VERSION >= 100000) -+ select X86_P6_NOP + help + + Select this for third-generation 10 nm process processors in the Tiger Lake family. @@ -407,7 +425,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MSAPPHIRERAPIDS + bool "Intel Sapphire Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP + help + + Select this for fourth-generation 10 nm process processors in the Sapphire Rapids family. @@ -417,7 +434,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MROCKETLAKE + bool "Intel Rocket Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP + help + + Select this for eleventh-generation processors in the Rocket Lake family. @@ -427,7 +443,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MALDERLAKE + bool "Intel Alder Lake" + depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ select X86_P6_NOP + help + + Select this for twelfth-generation processors in the Alder Lake family. @@ -437,7 +452,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MRAPTORLAKE + bool "Intel Raptor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) -+ select X86_P6_NOP + help + + Select this for thirteenth-generation processors in the Raptor Lake family. @@ -447,7 +461,6 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MMETEORLAKE + bool "Intel Meteor Lake" + depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) -+ select X86_P6_NOP + help + + Select this for fourteenth-generation processors in the Meteor Lake family. @@ -457,44 +470,18 @@ index 2a7279d80460a..6924a0f5f1c26 100644 +config MEMERALDRAPIDS + bool "Intel Emerald Rapids" + depends on (CC_IS_GCC && GCC_VERSION > 130000) || (CC_IS_CLANG && CLANG_VERSION >= 150500) -+ select X86_P6_NOP + help + + Select this for fifth-generation 10 nm process processors in the Emerald Rapids family. + + Enables -march=emeraldrapids -+ + config GENERIC_CPU bool "Generic-x86-64" - depends on X86_64 -@@ -294,6 +596,50 @@ config GENERIC_CPU +@@ -294,6 +581,26 @@ config GENERIC_CPU Generic x86-64 CPU. Run equally well on all x86-64 CPUs. -+config GENERIC_CPU2 -+ bool "Generic-x86-64-v2" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64 CPU. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v2. -+ -+config GENERIC_CPU3 -+ bool "Generic-x86-64-v3" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64-v3 CPU with v3 instructions. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v3. -+ -+config GENERIC_CPU4 -+ bool "Generic-x86-64-v4" -+ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) -+ depends on X86_64 -+ help -+ Generic x86-64 CPU with v4 instructions. -+ Run equally well on all x86-64 CPUs with min support of x86-64-v4. 
-+ +config MNATIVE_INTEL + bool "Intel-Native optimizations autodetected by the compiler" + help @@ -518,133 +505,78 @@ index 2a7279d80460a..6924a0f5f1c26 100644 endchoice config X86_GENERIC -@@ -318,9 +664,17 @@ config X86_INTERNODE_CACHE_SHIFT +@@ -308,6 +615,30 @@ config X86_GENERIC + This is really intended for distributors who need more + generic optimizations. + ++config X86_64_VERSION ++ int "x86-64 compiler ISA level" ++ range 1 3 ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 && GENERIC_CPU ++ help ++ Specify a specific x86-64 compiler ISA level. ++ ++ There are three x86-64 ISA levels that work on top of ++ the x86-64 baseline, namely: x86-64-v2, x86-64-v3, and x86-64-v4. ++ ++ x86-64-v2 brings support for vector instructions up to Streaming SIMD ++ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3 ++ (SSSE3), the POPCNT instruction, and CMPXCHG16B. ++ ++ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional ++ bit-manipulation instructions. ++ ++ x86-64-v4 is not included since the kernel does not use AVX512 instructions ++ ++ You can find the best version for your CPU by running one of the following: ++ /lib/ld-linux-x86-64.so.2 --help | grep supported ++ /lib64/ld-linux-x86-64.so.2 --help | grep supported ++ + # + # Define implied options from the CPU selection here + config X86_INTERNODE_CACHE_SHIFT +@@ -318,7 +649,7 @@ config X86_INTERNODE_CACHE_SHIFT config X86_L1_CACHE_SHIFT int default "7" if MPENTIUM4 || MPSC - default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU -+ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || MK8SSE3 || MK10 \ -+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ -+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ -+ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ -+ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ -+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE \ -+ || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD || X86_GENERIC || GENERIC_CPU || GENERIC_CPU2 \ -+ || GENERIC_CPU3 || GENERIC_CPU4 ++ default "6" if MK7 || MK8 || MPENTIUMM || MCORE2 || MATOM || MVIAC7 || X86_GENERIC || GENERIC_CPU || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD default "4" if MELAN || M486SX || M486 || MGEODEGX1 -- default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX -+ default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII \ -+ || MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - - config X86_F00F_BUG - def_bool y -@@ -332,15 +686,27 @@ config X86_INVD_BUG + default "5" if MWINCHIP3D || MWINCHIPC6 || MCRUSOE || MEFFICEON || MCYRIXIII || MK6 || MPENTIUMIII || 
MPENTIUMII || M686 || M586MMX || M586TSC || M586 || MVIAC3_2 || MGEODE_LX - config X86_ALIGNMENT_16 - def_bool y -- depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MELAN || MK6 || M586MMX || M586TSC \ -+ || M586 || M486SX || M486 || MVIAC3_2 || MGEODEGX1 +@@ -336,11 +667,11 @@ config X86_ALIGNMENT_16 config X86_INTEL_USERCOPY def_bool y - depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 -+ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC \ -+ || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ -+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ -+ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ -+ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL ++ depends on MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M586MMX || X86_GENERIC || MK8 || MK7 || MEFFICEON || MCORE2 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL config X86_USE_PPRO_CHECKSUM def_bool y - depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM -+ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ -+ || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX \ -+ || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ -+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ -+ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE \ -+ || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE \ -+ || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ -+ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD ++ depends on MWINCHIP3D || MWINCHIPC6 || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MK8 || MVIAC3_2 || MVIAC7 || MEFFICEON || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MZEN5 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD # # P6_NOPs are a relatively minor optimization that require a family >= -@@ -356,11 +722,22 @@ config X86_USE_PPRO_CHECKSUM - config X86_P6_NOP - def_bool y - depends on X86_64 -- depends on (MCORE2 || MPENTIUM4 || MPSC) -+ depends on (MCORE2 || MPENTIUM4 || MPSC || MNEHALEM || MWESTMERE || 
MSILVERMONT || MGOLDMONT \ -+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE \ -+ || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE \ -+ || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \ -+ || MNATIVE_INTEL) - - config X86_TSC - def_bool y -- depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 || MGEODE_LX || MCORE2 || MATOM) || X86_64 -+ depends on (MWINCHIP3D || MCRUSOE || MEFFICEON || MCYRIXIII || MK7 || MK6 || MPENTIUM4 || MPENTIUMM \ -+ || MPENTIUMIII || MPENTIUMII || M686 || M586MMX || M586TSC || MK8 || MVIAC3_2 || MVIAC7 || MGEODEGX1 \ -+ || MGEODE_LX || MCORE2 || MATOM || MK8SSE3 || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER \ -+ || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM \ -+ || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL \ -+ || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ -+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS \ -+ || MNATIVE_INTEL || MNATIVE_AMD) || X86_64 - - config X86_HAVE_PAE - def_bool y -@@ -368,18 +745,37 @@ config X86_HAVE_PAE - - config X86_CMPXCHG64 - def_bool y -- depends on X86_HAVE_PAE || M586TSC || M586MMX || MK6 || MK7 -+ depends on X86_PAE || X86_64 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ -+ || M586TSC || M586MMX || MATOM || MGEODE_LX || MGEODEGX1 || MK6 || MK7 || MK8 || MK8SSE3 || MK10 \ -+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR || MZEN \ -+ || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT || MGOLDMONTPLUS \ -+ || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX || MCANNONLAKE \ -+ || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE \ -+ || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD - - # this should be set for all -march=.. options where the compiler - # generates cmov. 
- config X86_CMOV - def_bool y -- depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX) -+ depends on (MK8 || MK7 || MCORE2 || MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ -+ || MVIAC3_2 || MVIAC7 || MCRUSOE || MEFFICEON || X86_64 || MATOM || MGEODE_LX || MK8SSE3 || MK10 \ -+ || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER || MEXCAVATOR \ -+ || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT || MGOLDMONT \ -+ || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL || MSKYLAKE || MSKYLAKEX \ -+ || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE || MTIGERLAKE || MSAPPHIRERAPIDS \ -+ || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MMETEORLAKE || MEMERALDRAPIDS || MNATIVE_INTEL || MNATIVE_AMD) - - config X86_MINIMUM_CPU_FAMILY - int - default "64" if X86_64 -- default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8) -+ default "6" if X86_32 && (MPENTIUM4 || MPENTIUMM || MPENTIUMIII || MPENTIUMII || M686 \ -+ || MVIAC3_2 || MVIAC7 || MEFFICEON || MATOM || MCORE2 || MK7 || MK8 || MK8SSE3 \ -+ || MK10 || MBARCELONA || MBOBCAT || MJAGUAR || MBULLDOZER || MPILEDRIVER || MSTEAMROLLER \ -+ || MEXCAVATOR || MZEN || MZEN2 || MZEN3 || MZEN4 || MNEHALEM || MWESTMERE || MSILVERMONT \ -+ || MGOLDMONT || MGOLDMONTPLUS || MSANDYBRIDGE || MIVYBRIDGE || MHASWELL || MBROADWELL \ -+ || MSKYLAKE || MSKYLAKEX || MCANNONLAKE || MICELAKE || MCASCADELAKE || MCOOPERLAKE \ -+ || MTIGERLAKE || MSAPPHIRERAPIDS || MROCKETLAKE || MALDERLAKE || MRAPTORLAKE || MRAPTORLAKE \ -+ || MNATIVE_INTEL || MNATIVE_AMD) - default "5" if X86_32 && X86_CMPXCHG64 - default "4" - diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index da8f3caf27815..c873d10df15d0 100644 +index 3419ffa2a350..c778e8a006e2 100644 --- a/arch/x86/Makefile +++ b/arch/x86/Makefile -@@ -152,8 +152,48 @@ else - # FIXME - should be integrated in Makefile.cpu (Makefile_32.cpu) +@@ -152,15 +152,96 @@ else cflags-$(CONFIG_MK8) += -march=k8 cflags-$(CONFIG_MPSC) += -march=nocona -- cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MCORE2) += -march=core2 - cflags-$(CONFIG_MATOM) += -march=atom +- cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ cflags-$(CONFIG_MATOM) += -march=bonnell ++ ifeq ($(CONFIG_X86_64_VERSION),1) ++ cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ else ++ cflags-$(CONFIG_GENERIC_CPU) += -march=x86-64-v$(CONFIG_X86_64_VERSION) ++ rustflags-$(CONFIG_GENERIC_CPU) += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) ++ endif + cflags-$(CONFIG_MK8SSE3) += -march=k8-sse3 + cflags-$(CONFIG_MK10) += -march=amdfam10 + cflags-$(CONFIG_MBARCELONA) += -march=barcelona @@ -658,10 +590,10 @@ index da8f3caf27815..c873d10df15d0 100644 + cflags-$(CONFIG_MZEN2) += -march=znver2 + cflags-$(CONFIG_MZEN3) += -march=znver3 + cflags-$(CONFIG_MZEN4) += -march=znver4 ++ cflags-$(CONFIG_MZEN5) += -march=znver5 + cflags-$(CONFIG_MNATIVE_INTEL) += -march=native -+ cflags-$(CONFIG_MNATIVE_AMD) += -march=native ++ cflags-$(CONFIG_MNATIVE_AMD) += -march=native -mno-tbm + cflags-$(CONFIG_MATOM) += -march=bonnell -+ cflags-$(CONFIG_MCORE2) += -march=core2 + cflags-$(CONFIG_MNEHALEM) += -march=nehalem + cflags-$(CONFIG_MWESTMERE) += -march=westmere + cflags-$(CONFIG_MSILVERMONT) += -march=silvermont @@ 
-684,14 +616,56 @@ index da8f3caf27815..c873d10df15d0 100644 + cflags-$(CONFIG_MRAPTORLAKE) += -march=raptorlake + cflags-$(CONFIG_MMETEORLAKE) += -march=meteorlake + cflags-$(CONFIG_MEMERALDRAPIDS) += -march=emeraldrapids -+ cflags-$(CONFIG_GENERIC_CPU2) += -march=x86-64-v2 -+ cflags-$(CONFIG_GENERIC_CPU3) += -march=x86-64-v3 -+ cflags-$(CONFIG_GENERIC_CPU4) += -march=x86-64-v4 - cflags-$(CONFIG_GENERIC_CPU) += -mtune=generic KBUILD_CFLAGS += $(cflags-y) + rustflags-$(CONFIG_MK8) += -Ctarget-cpu=k8 + rustflags-$(CONFIG_MPSC) += -Ctarget-cpu=nocona + rustflags-$(CONFIG_MCORE2) += -Ctarget-cpu=core2 +- rustflags-$(CONFIG_MATOM) += -Ctarget-cpu=atom +- rustflags-$(CONFIG_GENERIC_CPU) += -Ztune-cpu=generic ++ rustflags-$(CONFIG_MK8SSE3) += -Ctarget-cpu=k8-sse3 ++ rustflags-$(CONFIG_MK10) += -Ctarget-cpu=amdfam10 ++ rustflags-$(CONFIG_MBARCELONA) += -Ctarget-cpu=barcelona ++ rustflags-$(CONFIG_MBOBCAT) += -Ctarget-cpu=btver1 ++ rustflags-$(CONFIG_MJAGUAR) += -Ctarget-cpu=btver2 ++ rustflags-$(CONFIG_MBULLDOZER) += -Ctarget-cpu=bdver1 ++ rustflags-$(CONFIG_MPILEDRIVER) += -Ctarget-cpu=bdver2 ++ rustflags-$(CONFIG_MSTEAMROLLER) += -Ctarget-cpu=bdver3 ++ rustflags-$(CONFIG_MEXCAVATOR) += -Ctarget-cpu=bdver4 ++ rustflags-$(CONFIG_MZEN) += -Ctarget-cpu=znver1 ++ rustflags-$(CONFIG_MZEN2) += -Ctarget-cpu=znver2 ++ rustflags-$(CONFIG_MZEN3) += -Ctarget-cpu=znver3 ++ rustflags-$(CONFIG_MZEN4) += -Ctarget-cpu=znver4 ++ rustflags-$(CONFIG_MZEN5) += -Ctarget-cpu=znver5 ++ rustflags-$(CONFIG_MNATIVE_INTEL) += -Ctarget-cpu=native ++ rustflags-$(CONFIG_MNATIVE_AMD) += -Ctarget-cpu=native -mno-tbm ++ rustflags-$(CONFIG_MNEHALEM) += -Ctarget-cpu=nehalem ++ rustflags-$(CONFIG_MWESTMERE) += -Ctarget-cpu=westmere ++ rustflags-$(CONFIG_MSILVERMONT) += -Ctarget-cpu=silvermont ++ rustflags-$(CONFIG_MGOLDMONT) += -Ctarget-cpu=goldmont ++ rustflags-$(CONFIG_MGOLDMONTPLUS) += -Ctarget-cpu=goldmont-plus ++ rustflags-$(CONFIG_MSANDYBRIDGE) += -Ctarget-cpu=sandybridge ++ rustflags-$(CONFIG_MIVYBRIDGE) += -Ctarget-cpu=ivybridge ++ rustflags-$(CONFIG_MHASWELL) += -Ctarget-cpu=haswell ++ rustflags-$(CONFIG_MBROADWELL) += -Ctarget-cpu=broadwell ++ rustflags-$(CONFIG_MSKYLAKE) += -Ctarget-cpu=skylake ++ rustflags-$(CONFIG_MSKYLAKEX) += -Ctarget-cpu=skylake-avx512 ++ rustflags-$(CONFIG_MCANNONLAKE) += -Ctarget-cpu=cannonlake ++ rustflags-$(CONFIG_MICELAKE) += -Ctarget-cpu=icelake-client ++ rustflags-$(CONFIG_MCASCADELAKE) += -Ctarget-cpu=cascadelake ++ rustflags-$(CONFIG_MCOOPERLAKE) += -Ctarget-cpu=cooperlake ++ rustflags-$(CONFIG_MTIGERLAKE) += -Ctarget-cpu=tigerlake ++ rustflags-$(CONFIG_MSAPPHIRERAPIDS) += -Ctarget-cpu=sapphirerapids ++ rustflags-$(CONFIG_MROCKETLAKE) += -Ctarget-cpu=rocketlake ++ rustflags-$(CONFIG_MALDERLAKE) += -Ctarget-cpu=alderlake ++ rustflags-$(CONFIG_MRAPTORLAKE) += -Ctarget-cpu=raptorlake ++ rustflags-$(CONFIG_MMETEORLAKE) += -Ctarget-cpu=meteorlake ++ rustflags-$(CONFIG_MEMERALDRAPIDS) += -Ctarget-cpu=emeraldrapids + KBUILD_RUSTFLAGS += $(rustflags-y) + + KBUILD_CFLAGS += -mno-red-zone diff --git a/arch/x86/include/asm/vermagic.h b/arch/x86/include/asm/vermagic.h -index 75884d2cdec37..02c1386eb653e 100644 +index 75884d2cdec3..f4e29563473d 100644 --- a/arch/x86/include/asm/vermagic.h +++ b/arch/x86/include/asm/vermagic.h @@ -17,6 +17,54 @@ @@ -749,7 +723,7 @@ index 75884d2cdec37..02c1386eb653e 100644 #elif defined CONFIG_MATOM #define MODULE_PROC_FAMILY "ATOM " #elif defined CONFIG_M686 -@@ -35,6 +83,32 @@ +@@ -35,6 +83,28 @@ #define MODULE_PROC_FAMILY "K7 " #elif defined CONFIG_MK8 #define 
MODULE_PROC_FAMILY "K8 " @@ -775,13 +749,9 @@ index 75884d2cdec37..02c1386eb653e 100644 +#define MODULE_PROC_FAMILY "ZEN " +#elif defined CONFIG_MZEN2 +#define MODULE_PROC_FAMILY "ZEN2 " -+#elif defined CONFIG_MZEN3 -+#define MODULE_PROC_FAMILY "ZEN3 " -+#elif defined CONFIG_MZEN4 -+#define MODULE_PROC_FAMILY "ZEN4 " #elif defined CONFIG_MELAN #define MODULE_PROC_FAMILY "ELAN " #elif defined CONFIG_MCRUSOE -- -2.43.2 +2.46.2 diff --git a/sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip b/sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip deleted file mode 100644 index 63038a4..0000000 --- a/sys-kernel/gentoo-sources-6.6/0100_sched-fair_multi-llc_select_idle_sibling.patch.skip +++ /dev/null @@ -1,94 +0,0 @@ -From c5214e13ad60bd0022bab45cbac2c9db6bc1e0d4 Mon Sep 17 00:00:00 2001 -From: Peter Zijlstra -Date: Tue, 30 May 2023 13:20:46 +0200 -Subject: sched/fair: Multi-LLC select_idle_sibling() - -Tejun reported that when he targets workqueues towards a specific LLC -on his Zen2 machine with 3 cores / LLC and 4 LLCs in total, he gets -significant idle time. - -This is, of course, because of how select_idle_sibling() will not -consider anything outside of the local LLC, and since all these tasks -are short running the periodic idle load balancer is ineffective. - -And while it is good to keep work cache local, it is better to not -have significant idle time. Therefore, have select_idle_sibling() try -other LLCs inside the same node when the local one comes up empty. - -Reported-by: Tejun Heo -Signed-off-by: Peter Zijlstra (Intel) ---- - kernel/sched/fair.c | 37 +++++++++++++++++++++++++++++++++++++ - kernel/sched/features.h | 1 + - 2 files changed, 38 insertions(+) - -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index 48b6f0ca13acc..cd80e30b9d679 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -7027,6 +7027,37 @@ static int select_idle_cpu(struct task_struct *p, struct sched_domain *sd, bool - return idle_cpu; - } - -+/* -+ * For the multiple-LLC per node case, make sure to try the other LLC's if the -+ * local LLC comes up empty. -+ */ -+static int -+select_idle_node(struct task_struct *p, struct sched_domain *sd, int target) -+{ -+ struct sched_domain *parent = sd->parent; -+ struct sched_group *sg; -+ -+ /* Make sure to not cross nodes. */ -+ if (!parent || parent->flags & SD_NUMA) -+ return -1; -+ -+ sg = parent->groups; -+ do { -+ int cpu = cpumask_first(sched_group_span(sg)); -+ -+ if (!cpus_share_cache(cpu, target)) { -+ int i = select_idle_cpu(p, per_cpu(sd_llc, cpu), -+ test_idle_cores(cpu), cpu); -+ if ((unsigned)i < nr_cpumask_bits) -+ return i; -+ } -+ -+ sg = sg->next; -+ } while (sg != parent->groups); -+ -+ return -1; -+} -+ - /* - * Scan the asym_capacity domain for idle CPUs; pick the first idle one on which - * the task fits. 
If no CPU is big enough, but there are idle ones, try to -@@ -7199,6 +7230,12 @@ static int select_idle_sibling(struct task_struct *p, int prev, int target) - if ((unsigned)i < nr_cpumask_bits) - return i; - -+ if (sched_feat(SIS_NODE)) { -+ i = select_idle_node(p, sd, target); -+ if ((unsigned)i < nr_cpumask_bits) -+ return i; -+ } -+ - return target; - } - -diff --git a/kernel/sched/features.h b/kernel/sched/features.h -index ee7f23c76bd33..9e390eb82e384 100644 ---- a/kernel/sched/features.h -+++ b/kernel/sched/features.h -@@ -62,6 +62,7 @@ SCHED_FEAT(TTWU_QUEUE, true) - */ - SCHED_FEAT(SIS_PROP, false) - SCHED_FEAT(SIS_UTIL, true) -+SCHED_FEAT(SIS_NODE, true) - - /* - * Issue a WARN when we do multiple update_rq_clock() calls --- -cgit - diff --git a/sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch b/sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch deleted file mode 100644 index e0fb4ec..0000000 --- a/sys-kernel/gentoo-sources-6.6/0202-ZEN-INTERACTIVE-Tune-ondemand-governor-for-interacti.patch +++ /dev/null @@ -1,75 +0,0 @@ -From 76f1df5c1a512d1f459678d17c4b78a74d304cc9 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Mon, 20 Mar 2023 18:39:46 +0100 -Subject: ZEN: INTERACTIVE: Tune ondemand governor for interactivity - -4.10: -During some personal testing with the Dolphin emulator, MuQSS has -serious problems scaling its frequencies causing poor performance where -boosting the CPU frequencies would have fixed them. Reducing the -up_threshold to 45 with MuQSS appears to fix the issue, letting the -introduction to "Star Wars: Rogue Leader" run at 100% speed versus about -80% on my test system. - -Also, lets refactor the definitions and include some indentation to help -the reader discern what the scope of all the macros are. - -5.4: -On the last custom kernel benchmark from Phoronix with Xanmod, Michael -configured all the kernels to run using ondemand instead of the kernel's -[default selection][1]. This reminded me that another option outside of -the kernels control is the user's choice to change the cpufreq governor, -for better or for worse. - -In Liquorix, performance is the default governor whether you're running -acpi-cpufreq or intel-pstate. I expect laptop users to install TLP or -LMT to control the power balance on their system, especially when -they're plugged in or on battery. However, it's pretty clear to me a -lot of people would choose ondemand over performance since it's not -obvious it has huge performance ramifications with MuQSS, and ondemand -otherwise is "good enough" for most people. - -Lets codify lower up thresholds for MuQSS to more closely synergize with -its aggressive thread migration behavior. This way when ondemand is -configured, you get sort of a "performance-lite" type of result but with -the power savings you expect when leaving the running system idle. - -[1]: https://www.phoronix.com/scan.php?page=article&item=xanmod-2020-kernel - -5.14: -Although CFS and similar schedulers (BMQ, PDS, and CacULE), reuse a lot -more of mainline scheduling and do a good job of pinning single threaded -tasks to their respective core, there's still applications that -confusingly run steady near 50% and benefit from going full speed or -turbo when they need to run (emulators for more recent consoles come to -mind). - -Drop the up threshold for all non-MuQSS schedulers from 80/95 to 55/60. - -5.15: -Remove MuQSS cpufreq configuration. 
---- - drivers/cpufreq/cpufreq_ondemand.c | 6 +++--- - 1 file changed, 3 insertions(+), 3 deletions(-) - -diff --git a/drivers/cpufreq/cpufreq_ondemand.c b/drivers/cpufreq/cpufreq_ondemand.c -index c52d19d67557..38d3d2fc9c4e 100644 ---- a/drivers/cpufreq/cpufreq_ondemand.c -+++ b/drivers/cpufreq/cpufreq_ondemand.c -@@ -18,10 +18,10 @@ - #include "cpufreq_ondemand.h" - - /* On-demand governor macros */ --#define DEF_FREQUENCY_UP_THRESHOLD (80) --#define DEF_SAMPLING_DOWN_FACTOR (1) -+#define DEF_FREQUENCY_UP_THRESHOLD (55) -+#define DEF_SAMPLING_DOWN_FACTOR (5) - #define MAX_SAMPLING_DOWN_FACTOR (100000) --#define MICRO_FREQUENCY_UP_THRESHOLD (95) -+#define MICRO_FREQUENCY_UP_THRESHOLD (60) - #define MICRO_FREQUENCY_MIN_SAMPLE_RATE (10000) - #define MIN_FREQUENCY_UP_THRESHOLD (1) - #define MAX_FREQUENCY_UP_THRESHOLD (100) --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch b/sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch deleted file mode 100644 index c78aa89..0000000 --- a/sys-kernel/gentoo-sources-6.6/0205-ZEN-cpufreq-Remove-schedutil-dependency-on-Intel-AMD.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 915bbf3cf328160cb27c7b6f98ec4958f0e537e7 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Mon, 20 Mar 2023 18:45:37 +0100 -Subject: ZEN: cpufreq: Remove schedutil dependency on Intel/AMD P-State - drivers - -Although both P-State drivers depend on schedutil in Kconfig, both code -bases do not use any schedutil code. This arbitrarily enables schedutil -when unwanted in some configurations. ---- - drivers/cpufreq/Kconfig.x86 | 2 -- - 1 file changed, 2 deletions(-) - -diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 -index 00476e94db90..c3a219218fac 100644 ---- a/drivers/cpufreq/Kconfig.x86 -+++ b/drivers/cpufreq/Kconfig.x86 -@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE - select ACPI_PROCESSOR if ACPI - select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO - select CPU_FREQ_GOV_PERFORMANCE -- select CPU_FREQ_GOV_SCHEDUTIL if SMP - help - This driver provides a P state for Intel core processors. 
- The driver implements an internal governor and will become -@@ -39,7 +38,6 @@ config X86_AMD_PSTATE - depends on X86 && ACPI - select ACPI_PROCESSOR - select ACPI_CPPC_LIB if X86_64 -- select CPU_FREQ_GOV_SCHEDUTIL if SMP - help - This driver adds a CPUFreq driver which utilizes a fine grain - processor performance frequency control range instead of legacy --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch b/sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch deleted file mode 100644 index 100bbd9..0000000 --- a/sys-kernel/gentoo-sources-6.6/0210-ZEN-Restore-CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3.patch +++ /dev/null @@ -1,47 +0,0 @@ -From 015323f4b5e73a7076b5c60bd79c7cc480f65f37 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:19:25 +0100 -Subject: ZEN: Restore CONFIG_OPTIMIZE_FOR_PERFORMANCE_O3 - -This reverts a6036a4 (kbuild: drop -support for CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3), removes the -dependency on CONFIG_ARC and adds RUSTFLAGS ---- - Makefile | 3 +++ - init/Kconfig | 6 ++++++ - 2 files changed, 9 insertions(+) - -diff --git a/Makefile b/Makefile -index 3f6628780eb2..64c2842330db 100644 ---- a/Makefile -+++ b/Makefile -@@ -834,6 +834,9 @@ KBUILD_CFLAGS += $(call cc-disable-warning, address-of-packed-member) - ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE - KBUILD_CFLAGS += -O2 - KBUILD_RUSTFLAGS += -Copt-level=2 -+else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 -+KBUILD_CFLAGS += -O3 -+KBUILD_RUSTFLAGS += -Copt-level=3 - else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE - KBUILD_CFLAGS += -Os - KBUILD_RUSTFLAGS += -Copt-level=s -diff --git a/init/Kconfig b/init/Kconfig -index 44e90b28a30f..6731063983ec 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1420,6 +1420,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE - with the "-O2" compiler flag for best performance and most - helpful compile-time warnings. - -+config CC_OPTIMIZE_FOR_PERFORMANCE_O3 -+ bool "Optimize more for performance (-O3)" -+ help -+ Choosing this option will pass "-O3" to your compiler to optimize -+ the kernel yet more for performance. -+ - config CC_OPTIMIZE_FOR_SIZE - bool "Optimize for size (-Os)" - help --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch b/sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch deleted file mode 100644 index 357ed7b..0000000 --- a/sys-kernel/gentoo-sources-6.6/0211-ZEN-arch-x86-Disable-AVX2-and-tree-vectorization.patch +++ /dev/null @@ -1,29 +0,0 @@ -From ced477387463f385e2a0e01824ae4d512fe5b323 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:36:58 +0100 -Subject: ZEN: arch/x86: Disable AVX2 and tree vectorization - -From ClearLinux's own patches, disable both AVX2 and tree vectorization -when using O3 and higher than generic amd64 architectures. 
- -Source: https://github.com/clearlinux-pkgs/linux/blob/main/0133-novector.patch ---- - arch/x86/Makefile | 2 +- - 1 file changed, 1 insertion(+), 1 deletion(-) - -diff --git a/arch/x86/Makefile b/arch/x86/Makefile -index 73ed982d4100..5d687a64d710 100644 ---- a/arch/x86/Makefile -+++ b/arch/x86/Makefile -@@ -67,7 +67,7 @@ export BITS - # - # https://gcc.gnu.org/bugzilla/show_bug.cgi?id=53383 - # --KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -+KBUILD_CFLAGS += -mno-sse -mno-mmx -mno-sse2 -mno-3dnow -mno-avx -mno-avx2 -fno-tree-vectorize - KBUILD_RUSTFLAGS += -Ctarget-feature=-sse,-sse2,-sse3,-ssse3,-sse4.1,-sse4.2,-avx,-avx2 - - ifeq ($(CONFIG_X86_KERNEL_IBT),y) --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch b/sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch deleted file mode 100644 index ddee93a..0000000 --- a/sys-kernel/gentoo-sources-6.6/0214-ZEN-INTERACTIVE-dm-crypt-Disable-workqueues-for-cryp.patch +++ /dev/null @@ -1,36 +0,0 @@ -From 19847222d00356eb18a22008b1e9c42237bef979 Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:35:39 +0100 -Subject: ZEN: INTERACTIVE: dm-crypt: Disable workqueues for crypto ops - -Queueing in dm-crypt for crypto operations reduces performance on modern -systems. As discussed in an article from Cloudflare, they discovered -that queuing was introduced because the crypto subsystem used to be -synchronous. Since it's now asynchronous, we get double queueing when -using the subsystem through dm-crypt. This is obviously undesirable and -reduces throughput and increases latency. - -Disable queueing when using our Zen Interactive configuration. - -Fixes: zen-kernel#282 ---- - drivers/md/dm-crypt.c | 3 +++ - 1 file changed, 3 insertions(+) - -diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c -index 2653516bcdef..7fac0f569cef 100644 ---- a/drivers/md/dm-crypt.c -+++ b/drivers/md/dm-crypt.c -@@ -3207,6 +3207,9 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) - goto bad; - } - -+ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); -+ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); -+ - ret = crypt_ctr_cipher(ti, argv[0], argv[1]); - if (ret < 0) - goto bad; --- -2.39.2 - diff --git a/sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch b/sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch deleted file mode 100644 index 0ae59fa..0000000 --- a/sys-kernel/gentoo-sources-6.6/0215-ZEN-Add-VHBA-driver.patch +++ /dev/null @@ -1,1199 +0,0 @@ -From e707ce895085656b53783187aaacbb89867090de Mon Sep 17 00:00:00 2001 -From: Andre Ramnitz -Date: Tue, 21 Mar 2023 00:38:15 +0100 -Subject: ZEN: Add VHBA driver - -remote https://github.com/cdemu/cdemu -tag vhba-module-20211218 ---- - drivers/scsi/Kconfig | 2 + - drivers/scsi/Makefile | 1 + - drivers/scsi/vhba/Kconfig | 9 + - drivers/scsi/vhba/Makefile | 4 + - drivers/scsi/vhba/vhba.c | 1124 ++++++++++++++++++++++++++++++++++++ - 5 files changed, 1140 insertions(+) - create mode 100644 drivers/scsi/vhba/Kconfig - create mode 100644 drivers/scsi/vhba/Makefile - create mode 100644 drivers/scsi/vhba/vhba.c - -diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig -index 03e71e3d5e5b..d4c6603e238b 100644 ---- a/drivers/scsi/Kconfig -+++ b/drivers/scsi/Kconfig -@@ -1524,4 +1524,6 @@ endif # SCSI_LOWLEVEL - - source "drivers/scsi/device_handler/Kconfig" - -+source "drivers/scsi/vhba/Kconfig" -+ - endmenu -diff --git 
a/drivers/scsi/Makefile b/drivers/scsi/Makefile -index f055bfd54a68..e16e95f2c3de 100644 ---- a/drivers/scsi/Makefile -+++ b/drivers/scsi/Makefile -@@ -151,6 +151,7 @@ obj-$(CONFIG_CHR_DEV_SCH) += ch.o - obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o - - obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ -+obj-$(CONFIG_VHBA) += vhba/ - - # This goes last, so that "real" scsi devices probe earlier - obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o -diff --git a/drivers/scsi/vhba/Kconfig b/drivers/scsi/vhba/Kconfig -new file mode 100644 -index 000000000000..e70a381fe3df ---- /dev/null -+++ b/drivers/scsi/vhba/Kconfig -@@ -0,0 +1,9 @@ -+config VHBA -+ tristate "Virtual (SCSI) Host Bus Adapter" -+ depends on SCSI -+ help -+ This is the in-kernel part of CDEmu, a CD/DVD-ROM device -+ emulator. -+ -+ This driver can also be built as a module. If so, the module -+ will be called vhba. -diff --git a/drivers/scsi/vhba/Makefile b/drivers/scsi/vhba/Makefile -new file mode 100644 -index 000000000000..ad8b7c6442af ---- /dev/null -+++ b/drivers/scsi/vhba/Makefile -@@ -0,0 +1,4 @@ -+VHBA_VERSION := 20211218 -+ -+obj-$(CONFIG_VHBA) += vhba.o -+ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror -diff --git a/drivers/scsi/vhba/vhba.c b/drivers/scsi/vhba/vhba.c -new file mode 100644 -index 000000000000..676af31c33ad ---- /dev/null -+++ b/drivers/scsi/vhba/vhba.c -@@ -0,0 +1,1124 @@ -+/* -+ * vhba.c -+ * -+ * Copyright (C) 2007-2012 Chia-I Wu -+ * -+ * This program is free software; you can redistribute it and/or modify -+ * it under the terms of the GNU General Public License as published by -+ * the Free Software Foundation; either version 2 of the License, or -+ * (at your option) any later version. -+ * -+ * This program is distributed in the hope that it will be useful, -+ * but WITHOUT ANY WARRANTY; without even the implied warranty of -+ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the -+ * GNU General Public License for more details. -+ * -+ * You should have received a copy of the GNU General Public License along -+ * with this program; if not, write to the Free Software Foundation, Inc., -+ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. -+ */ -+ -+#define pr_fmt(fmt) "vhba: " fmt -+ -+#include -+ -+#include -+#include -+#include -+#include -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) -+#include -+#else -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+#ifdef CONFIG_COMPAT -+#include -+#endif -+#include -+#include -+#include -+#include -+#include -+#include -+ -+ -+MODULE_AUTHOR("Chia-I Wu"); -+MODULE_VERSION(VHBA_VERSION); -+MODULE_DESCRIPTION("Virtual SCSI HBA"); -+MODULE_LICENSE("GPL"); -+ -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) -+#define sdev_dbg(sdev, fmt, a...) \ -+ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) -+#define scmd_dbg(scmd, fmt, a...) 
\ -+ dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) -+#endif -+ -+#define VHBA_MAX_SECTORS_PER_IO 256 -+#define VHBA_MAX_BUS 16 -+#define VHBA_MAX_ID 16 -+#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1)) -+#define VHBA_KBUF_SIZE PAGE_SIZE -+ -+#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) -+#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) -+ -+ -+static int vhba_can_queue = 32; -+module_param_named(can_queue, vhba_can_queue, int, 0); -+ -+ -+enum vhba_req_state { -+ VHBA_REQ_FREE, -+ VHBA_REQ_PENDING, -+ VHBA_REQ_READING, -+ VHBA_REQ_SENT, -+ VHBA_REQ_WRITING, -+}; -+ -+struct vhba_command { -+ struct scsi_cmnd *cmd; -+ /* metatags are per-host. not to be confused with -+ queue tags that are usually per-lun */ -+ unsigned long metatag; -+ int status; -+ struct list_head entry; -+}; -+ -+struct vhba_device { -+ unsigned int num; -+ spinlock_t cmd_lock; -+ struct list_head cmd_list; -+ wait_queue_head_t cmd_wq; -+ atomic_t refcnt; -+ -+ unsigned char *kbuf; -+ size_t kbuf_size; -+}; -+ -+struct vhba_host { -+ struct Scsi_Host *shost; -+ spinlock_t cmd_lock; -+ int cmd_next; -+ struct vhba_command *commands; -+ spinlock_t dev_lock; -+ struct vhba_device *devices[VHBA_MAX_DEVICES]; -+ int num_devices; -+ DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES); -+ int chgtype[VHBA_MAX_DEVICES]; -+ struct work_struct scan_devices; -+}; -+ -+#define MAX_COMMAND_SIZE 16 -+ -+struct vhba_request { -+ __u32 metatag; -+ __u32 lun; -+ __u8 cdb[MAX_COMMAND_SIZE]; -+ __u8 cdb_len; -+ __u32 data_len; -+}; -+ -+struct vhba_response { -+ __u32 metatag; -+ __u32 status; -+ __u32 data_len; -+}; -+ -+ -+ -+struct vhba_command *vhba_alloc_command (void); -+void vhba_free_command (struct vhba_command *vcmd); -+ -+static struct platform_device vhba_platform_device; -+ -+ -+ -+/* These functions define a symmetric 1:1 mapping between device numbers and -+ the bus and id. We have reserved the last id per bus for the host itself. 
*/ -+void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id) -+{ -+ *bus = devnum / (VHBA_MAX_ID-1); -+ *id = devnum % (VHBA_MAX_ID-1); -+} -+ -+unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id) -+{ -+ return (bus * (VHBA_MAX_ID-1)) + id; -+} -+ -+struct vhba_device *vhba_device_alloc (void) -+{ -+ struct vhba_device *vdev; -+ -+ vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); -+ if (!vdev) { -+ return NULL; -+ } -+ -+ spin_lock_init(&vdev->cmd_lock); -+ INIT_LIST_HEAD(&vdev->cmd_list); -+ init_waitqueue_head(&vdev->cmd_wq); -+ atomic_set(&vdev->refcnt, 1); -+ -+ vdev->kbuf = NULL; -+ vdev->kbuf_size = 0; -+ -+ return vdev; -+} -+ -+void vhba_device_put (struct vhba_device *vdev) -+{ -+ if (atomic_dec_and_test(&vdev->refcnt)) { -+ kfree(vdev); -+ } -+} -+ -+struct vhba_device *vhba_device_get (struct vhba_device *vdev) -+{ -+ atomic_inc(&vdev->refcnt); -+ -+ return vdev; -+} -+ -+int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) -+{ -+ struct vhba_host *vhost; -+ struct vhba_command *vcmd; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ vcmd = vhba_alloc_command(); -+ if (!vcmd) { -+ return SCSI_MLQUEUE_HOST_BUSY; -+ } -+ -+ vcmd->cmd = cmd; -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag; -+#else -+ vcmd->metatag = vcmd->cmd->request->tag; -+#endif -+ list_add_tail(&vcmd->entry, &vdev->cmd_list); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ wake_up_interruptible(&vdev->cmd_wq); -+ -+ return 0; -+} -+ -+int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) -+{ -+ struct vhba_command *vcmd; -+ int retval; -+ unsigned long flags; -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ if (vcmd->cmd == cmd) { -+ list_del_init(&vcmd->entry); -+ break; -+ } -+ } -+ -+ /* command not found */ -+ if (&vcmd->entry == &vdev->cmd_list) { -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ return SUCCESS; -+ } -+ -+ while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ scmd_dbg(cmd, "wait for I/O before aborting\n"); -+ schedule_timeout(1); -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ } -+ -+ retval = (vcmd->status == VHBA_REQ_SENT) ? 
FAILED : SUCCESS; -+ -+ vhba_free_command(vcmd); -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return retval; -+} -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) -+int vhba_slave_alloc(struct scsi_device *sdev) -+{ -+ struct Scsi_Host *shost = sdev->host; -+ -+ sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth); -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) -+ if (!shost_use_blk_mq(shost) && shost->bqt) { -+#else -+ if (shost->bqt) { -+#endif -+ blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt); -+ } -+ scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth); -+ -+ return 0; -+} -+#endif -+ -+void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id) -+{ -+ struct scsi_device *sdev; -+ -+ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); -+ if (!sdev) { -+ scsi_add_device(vhost->shost, bus, id, 0); -+ } else { -+ dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id); -+ scsi_device_put(sdev); -+ } -+} -+ -+void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id) -+{ -+ struct scsi_device *sdev; -+ -+ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); -+ if (sdev) { -+ scsi_remove_device(sdev); -+ scsi_device_put(sdev); -+ } else { -+ dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id); -+ } -+} -+ -+void vhba_scan_devices (struct work_struct *work) -+{ -+ struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); -+ unsigned long flags; -+ int change, exists; -+ unsigned int devnum; -+ unsigned int bus, id; -+ -+ for (;;) { -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ -+ devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES); -+ if (devnum >= VHBA_MAX_DEVICES) { -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ break; -+ } -+ change = vhost->chgtype[devnum]; -+ exists = vhost->devices[devnum] != NULL; -+ -+ vhost->chgtype[devnum] = 0; -+ clear_bit(devnum, vhost->chgmap); -+ -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ -+ devnum_to_bus_and_id(devnum, &bus, &id); -+ -+ if (change < 0) { -+ dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id); -+ vhba_scan_devices_remove(vhost, bus, id); -+ } else if (change > 0) { -+ dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id); -+ vhba_scan_devices_add(vhost, bus, id); -+ } else { -+ /* quick sequence of add/remove or remove/add; we determine -+ which one it was by checking if device structure exists */ -+ if (exists) { -+ /* remove followed by add: remove and (re)add */ -+ dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id); -+ vhba_scan_devices_remove(vhost, bus, id); -+ vhba_scan_devices_add(vhost, bus, id); -+ } else { -+ /* add followed by remove: no-op */ -+ dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id); -+ } -+ } -+ } -+} -+ -+int vhba_add_device (struct vhba_device *vdev) -+{ -+ struct vhba_host *vhost; -+ unsigned int devnum; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ vhba_device_get(vdev); -+ -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ if (vhost->num_devices >= VHBA_MAX_DEVICES) { -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ vhba_device_put(vdev); -+ return -EBUSY; -+ } -+ -+ for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) { -+ if (vhost->devices[devnum] == NULL) { -+ vdev->num = devnum; -+ vhost->devices[devnum] = vdev; -+ 
vhost->num_devices++; -+ set_bit(devnum, vhost->chgmap); -+ vhost->chgtype[devnum]++; -+ break; -+ } -+ } -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ -+ schedule_work(&vhost->scan_devices); -+ -+ return 0; -+} -+ -+int vhba_remove_device (struct vhba_device *vdev) -+{ -+ struct vhba_host *vhost; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ set_bit(vdev->num, vhost->chgmap); -+ vhost->chgtype[vdev->num]--; -+ vhost->devices[vdev->num] = NULL; -+ vhost->num_devices--; -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ -+ vhba_device_put(vdev); -+ -+ schedule_work(&vhost->scan_devices); -+ -+ return 0; -+} -+ -+struct vhba_device *vhba_lookup_device (int devnum) -+{ -+ struct vhba_host *vhost; -+ struct vhba_device *vdev = NULL; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ if (likely(devnum < VHBA_MAX_DEVICES)) { -+ spin_lock_irqsave(&vhost->dev_lock, flags); -+ vdev = vhost->devices[devnum]; -+ if (vdev) { -+ vdev = vhba_device_get(vdev); -+ } -+ -+ spin_unlock_irqrestore(&vhost->dev_lock, flags); -+ } -+ -+ return vdev; -+} -+ -+struct vhba_command *vhba_alloc_command (void) -+{ -+ struct vhba_host *vhost; -+ struct vhba_command *vcmd; -+ unsigned long flags; -+ int i; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ spin_lock_irqsave(&vhost->cmd_lock, flags); -+ -+ vcmd = vhost->commands + vhost->cmd_next++; -+ if (vcmd->status != VHBA_REQ_FREE) { -+ for (i = 0; i < vhba_can_queue; i++) { -+ vcmd = vhost->commands + i; -+ -+ if (vcmd->status == VHBA_REQ_FREE) { -+ vhost->cmd_next = i + 1; -+ break; -+ } -+ } -+ -+ if (i == vhba_can_queue) { -+ vcmd = NULL; -+ } -+ } -+ -+ if (vcmd) { -+ vcmd->status = VHBA_REQ_PENDING; -+ } -+ -+ vhost->cmd_next %= vhba_can_queue; -+ -+ spin_unlock_irqrestore(&vhost->cmd_lock, flags); -+ -+ return vcmd; -+} -+ -+void vhba_free_command (struct vhba_command *vcmd) -+{ -+ struct vhba_host *vhost; -+ unsigned long flags; -+ -+ vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ spin_lock_irqsave(&vhost->cmd_lock, flags); -+ vcmd->status = VHBA_REQ_FREE; -+ spin_unlock_irqrestore(&vhost->cmd_lock, flags); -+} -+ -+int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd) -+{ -+ struct vhba_device *vdev; -+ int retval; -+ unsigned int devnum; -+ -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) -+ scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag); -+#else -+ scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag); -+#endif -+ -+ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); -+ vdev = vhba_lookup_device(devnum); -+ if (!vdev) { -+ scmd_dbg(cmd, "no such device\n"); -+ -+ cmd->result = DID_NO_CONNECT << 16; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -+ scsi_done(cmd); -+#else -+ cmd->scsi_done(cmd); -+#endif -+ -+ return 0; -+ } -+ -+ retval = vhba_device_queue(vdev, cmd); -+ -+ vhba_device_put(vdev); -+ -+ return retval; -+} -+ -+int vhba_abort (struct scsi_cmnd *cmd) -+{ -+ struct vhba_device *vdev; -+ int retval = SUCCESS; -+ unsigned int devnum; -+ -+ scmd_dbg(cmd, "abort %p\n", cmd); -+ -+ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); -+ vdev = vhba_lookup_device(devnum); -+ if (vdev) { -+ retval = vhba_device_dequeue(vdev, cmd); -+ vhba_device_put(vdev); -+ } else { -+ cmd->result = DID_NO_CONNECT << 16; -+ } -+ -+ return retval; -+} -+ -+static struct scsi_host_template vhba_template = { -+ 
.module = THIS_MODULE, -+ .name = "vhba", -+ .proc_name = "vhba", -+ .queuecommand = vhba_queuecommand, -+ .eh_abort_handler = vhba_abort, -+ .this_id = -1, -+ .max_sectors = VHBA_MAX_SECTORS_PER_IO, -+ .sg_tablesize = 256, -+#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) -+ .slave_alloc = vhba_slave_alloc, -+#endif -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) -+ .tag_alloc_policy = BLK_TAG_ALLOC_RR, -+#endif -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) -+ .use_blk_tags = 1, -+#endif -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) -+ .max_segment_size = VHBA_KBUF_SIZE, -+#endif -+}; -+ -+ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) -+{ -+ struct vhba_request vreq; -+ ssize_t ret; -+ -+ scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n", -+ metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); -+ -+ ret = sizeof(vreq); -+ if (DATA_TO_DEVICE(cmd->sc_data_direction)) { -+ ret += scsi_bufflen(cmd); -+ } -+ -+ if (ret > buf_len) { -+ scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); -+ return -EIO; -+ } -+ -+ vreq.metatag = metatag; -+ vreq.lun = cmd->device->lun; -+ memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); -+ vreq.cdb_len = cmd->cmd_len; -+ vreq.data_len = scsi_bufflen(cmd); -+ -+ if (copy_to_user(buf, &vreq, sizeof(vreq))) { -+ return -EFAULT; -+ } -+ -+ if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { -+ buf += sizeof(vreq); -+ -+ if (scsi_sg_count(cmd)) { -+ unsigned char *kaddr, *uaddr; -+ struct scatterlist *sglist = scsi_sglist(cmd); -+ struct scatterlist *sg; -+ int i; -+ -+ uaddr = (unsigned char *) buf; -+ -+ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { -+ size_t len = sg->length; -+ -+ if (len > vdev->kbuf_size) { -+ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); -+ len = vdev->kbuf_size; -+ } -+ -+ kaddr = kmap_atomic(sg_page(sg)); -+ memcpy(vdev->kbuf, kaddr + sg->offset, len); -+ kunmap_atomic(kaddr); -+ -+ if (copy_to_user(uaddr, vdev->kbuf, len)) { -+ return -EFAULT; -+ } -+ uaddr += len; -+ } -+ } else { -+ if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { -+ return -EFAULT; -+ } -+ } -+ } -+ -+ return ret; -+} -+ -+ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) -+{ -+ ssize_t ret = 0; -+ -+ scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n", -+ metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd)); -+ -+ if (res->status) { -+ if (res->data_len > SCSI_SENSE_BUFFERSIZE) { -+ scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); -+ res->data_len = SCSI_SENSE_BUFFERSIZE; -+ } -+ -+ if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) { -+ return -EFAULT; -+ } -+ -+ cmd->result = res->status; -+ -+ ret += res->data_len; -+ } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { -+ size_t to_read; -+ -+ if (res->data_len > scsi_bufflen(cmd)) { -+ scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); -+ res->data_len = scsi_bufflen(cmd); -+ } -+ -+ to_read = res->data_len; -+ -+ if (scsi_sg_count(cmd)) { -+ unsigned char *kaddr, *uaddr; -+ struct scatterlist *sglist = scsi_sglist(cmd); -+ struct scatterlist *sg; -+ int i; -+ -+ uaddr = (unsigned char *)buf; -+ -+ for_each_sg(sglist, sg, 
scsi_sg_count(cmd), i) { -+ size_t len = (sg->length < to_read) ? sg->length : to_read; -+ -+ if (len > vdev->kbuf_size) { -+ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); -+ len = vdev->kbuf_size; -+ } -+ -+ if (copy_from_user(vdev->kbuf, uaddr, len)) { -+ return -EFAULT; -+ } -+ uaddr += len; -+ -+ kaddr = kmap_atomic(sg_page(sg)); -+ memcpy(kaddr + sg->offset, vdev->kbuf, len); -+ kunmap_atomic(kaddr); -+ -+ to_read -= len; -+ if (to_read == 0) { -+ break; -+ } -+ } -+ } else { -+ if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { -+ return -EFAULT; -+ } -+ -+ to_read -= res->data_len; -+ } -+ -+ scsi_set_resid(cmd, to_read); -+ -+ ret += res->data_len - to_read; -+ } -+ -+ return ret; -+} -+ -+struct vhba_command *next_command (struct vhba_device *vdev) -+{ -+ struct vhba_command *vcmd; -+ -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ if (vcmd->status == VHBA_REQ_PENDING) { -+ break; -+ } -+ } -+ -+ if (&vcmd->entry == &vdev->cmd_list) { -+ vcmd = NULL; -+ } -+ -+ return vcmd; -+} -+ -+struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag) -+{ -+ struct vhba_command *vcmd; -+ -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ if (vcmd->metatag == metatag) { -+ break; -+ } -+ } -+ -+ if (&vcmd->entry == &vdev->cmd_list) { -+ vcmd = NULL; -+ } -+ -+ return vcmd; -+} -+ -+struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) -+{ -+ struct vhba_command *vcmd; -+ DEFINE_WAIT(wait); -+ -+ while (!(vcmd = next_command(vdev))) { -+ if (signal_pending(current)) { -+ break; -+ } -+ -+ prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ schedule(); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ } -+ -+ finish_wait(&vdev->cmd_wq, &wait); -+ if (vcmd) { -+ vcmd->status = VHBA_REQ_READING; -+ } -+ -+ return vcmd; -+} -+ -+ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) -+{ -+ struct vhba_device *vdev; -+ struct vhba_command *vcmd; -+ ssize_t ret; -+ unsigned long flags; -+ -+ vdev = file->private_data; -+ -+ /* Get next command */ -+ if (file->f_flags & O_NONBLOCK) { -+ /* Non-blocking variant */ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ vcmd = next_command(vdev); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ if (!vcmd) { -+ return -EWOULDBLOCK; -+ } -+ } else { -+ /* Blocking variant */ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ vcmd = wait_command(vdev, flags); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ if (!vcmd) { -+ return -ERESTARTSYS; -+ } -+ } -+ -+ ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ if (ret >= 0) { -+ vcmd->status = VHBA_REQ_SENT; -+ *offset += ret; -+ } else { -+ vcmd->status = VHBA_REQ_PENDING; -+ } -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return ret; -+} -+ -+ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) -+{ -+ struct vhba_device *vdev; -+ struct vhba_command *vcmd; -+ struct vhba_response res; -+ ssize_t ret; -+ unsigned long flags; -+ -+ if (buf_len < sizeof(res)) { -+ return -EIO; -+ } -+ -+ if (copy_from_user(&res, buf, sizeof(res))) { -+ return -EFAULT; -+ } -+ -+ vdev = file->private_data; -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ vcmd = match_command(vdev, res.metatag); -+ if (!vcmd || vcmd->status != VHBA_REQ_SENT) { -+ 
spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ pr_debug("ctl dev #%u not expecting response\n", vdev->num); -+ return -EIO; -+ } -+ vcmd->status = VHBA_REQ_WRITING; -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ if (ret >= 0) { -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -+ scsi_done(vcmd->cmd); -+#else -+ vcmd->cmd->scsi_done(vcmd->cmd); -+#endif -+ ret += sizeof(res); -+ -+ /* don't compete with vhba_device_dequeue */ -+ if (!list_empty(&vcmd->entry)) { -+ list_del_init(&vcmd->entry); -+ vhba_free_command(vcmd); -+ } -+ } else { -+ vcmd->status = VHBA_REQ_SENT; -+ } -+ -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return ret; -+} -+ -+long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ struct vhba_device *vdev = file->private_data; -+ struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device); -+ -+ switch (cmd) { -+ case 0xBEEF001: { -+ unsigned int ident[4]; /* host, channel, id, lun */ -+ -+ ident[0] = vhost->shost->host_no; -+ devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]); -+ ident[3] = 0; /* lun */ -+ -+ if (copy_to_user((void *) arg, ident, sizeof(ident))) { -+ return -EFAULT; -+ } -+ -+ return 0; -+ } -+ case 0xBEEF002: { -+ unsigned int devnum = vdev->num; -+ -+ if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) { -+ return -EFAULT; -+ } -+ -+ return 0; -+ } -+ } -+ -+ return -ENOTTY; -+} -+ -+#ifdef CONFIG_COMPAT -+long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) -+{ -+ unsigned long compat_arg = (unsigned long)compat_ptr(arg); -+ return vhba_ctl_ioctl(file, cmd, compat_arg); -+} -+#endif -+ -+unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) -+{ -+ struct vhba_device *vdev = file->private_data; -+ unsigned int mask = 0; -+ unsigned long flags; -+ -+ poll_wait(file, &vdev->cmd_wq, wait); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ if (next_command(vdev)) { -+ mask |= POLLIN | POLLRDNORM; -+ } -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ return mask; -+} -+ -+int vhba_ctl_open (struct inode *inode, struct file *file) -+{ -+ struct vhba_device *vdev; -+ int retval; -+ -+ pr_debug("ctl dev open\n"); -+ -+ /* check if vhba is probed */ -+ if (!platform_get_drvdata(&vhba_platform_device)) { -+ return -ENODEV; -+ } -+ -+ vdev = vhba_device_alloc(); -+ if (!vdev) { -+ return -ENOMEM; -+ } -+ -+ vdev->kbuf_size = VHBA_KBUF_SIZE; -+ vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL); -+ if (!vdev->kbuf) { -+ return -ENOMEM; -+ } -+ -+ if (!(retval = vhba_add_device(vdev))) { -+ file->private_data = vdev; -+ } -+ -+ vhba_device_put(vdev); -+ -+ return retval; -+} -+ -+int vhba_ctl_release (struct inode *inode, struct file *file) -+{ -+ struct vhba_device *vdev; -+ struct vhba_command *vcmd; -+ unsigned long flags; -+ -+ vdev = file->private_data; -+ -+ pr_debug("ctl dev release\n"); -+ -+ vhba_device_get(vdev); -+ vhba_remove_device(vdev); -+ -+ spin_lock_irqsave(&vdev->cmd_lock, flags); -+ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { -+ WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); -+ -+ scmd_dbg(vcmd->cmd, "device released with command %lu (%p)\n", vcmd->metatag, vcmd->cmd); -+ vcmd->cmd->result = DID_NO_CONNECT << 16; -+#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) -+ scsi_done(vcmd->cmd); -+#else -+ vcmd->cmd->scsi_done(vcmd->cmd); 
-+#endif -+ vhba_free_command(vcmd); -+ } -+ INIT_LIST_HEAD(&vdev->cmd_list); -+ spin_unlock_irqrestore(&vdev->cmd_lock, flags); -+ -+ kfree(vdev->kbuf); -+ vdev->kbuf = NULL; -+ -+ vhba_device_put(vdev); -+ -+ return 0; -+} -+ -+static struct file_operations vhba_ctl_fops = { -+ .owner = THIS_MODULE, -+ .open = vhba_ctl_open, -+ .release = vhba_ctl_release, -+ .read = vhba_ctl_read, -+ .write = vhba_ctl_write, -+ .poll = vhba_ctl_poll, -+ .unlocked_ioctl = vhba_ctl_ioctl, -+#ifdef CONFIG_COMPAT -+ .compat_ioctl = vhba_ctl_compat_ioctl, -+#endif -+}; -+ -+static struct miscdevice vhba_miscdev = { -+ .minor = MISC_DYNAMIC_MINOR, -+ .name = "vhba_ctl", -+ .fops = &vhba_ctl_fops, -+}; -+ -+int vhba_probe (struct platform_device *pdev) -+{ -+ struct Scsi_Host *shost; -+ struct vhba_host *vhost; -+ int i; -+ -+ vhba_can_queue = clamp(vhba_can_queue, 1, 256); -+ -+ shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); -+ if (!shost) { -+ return -ENOMEM; -+ } -+ -+ shost->max_channel = VHBA_MAX_BUS-1; -+ shost->max_id = VHBA_MAX_ID; -+ /* we don't support lun > 0 */ -+ shost->max_lun = 1; -+ shost->max_cmd_len = MAX_COMMAND_SIZE; -+ shost->can_queue = vhba_can_queue; -+ shost->cmd_per_lun = vhba_can_queue; -+ -+ vhost = (struct vhba_host *)shost->hostdata; -+ memset(vhost, 0, sizeof(struct vhba_host)); -+ -+ vhost->shost = shost; -+ vhost->num_devices = 0; -+ spin_lock_init(&vhost->dev_lock); -+ spin_lock_init(&vhost->cmd_lock); -+ INIT_WORK(&vhost->scan_devices, vhba_scan_devices); -+ vhost->cmd_next = 0; -+ vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL); -+ if (!vhost->commands) { -+ return -ENOMEM; -+ } -+ -+ for (i = 0; i < vhba_can_queue; i++) { -+ vhost->commands[i].status = VHBA_REQ_FREE; -+ } -+ -+ platform_set_drvdata(pdev, vhost); -+ -+#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) -+ i = scsi_init_shared_tag_map(shost, vhba_can_queue); -+ if (i) return i; -+#endif -+ -+ if (scsi_add_host(shost, &pdev->dev)) { -+ scsi_host_put(shost); -+ return -ENOMEM; -+ } -+ -+ return 0; -+} -+ -+int vhba_remove (struct platform_device *pdev) -+{ -+ struct vhba_host *vhost; -+ struct Scsi_Host *shost; -+ -+ vhost = platform_get_drvdata(pdev); -+ shost = vhost->shost; -+ -+ scsi_remove_host(shost); -+ scsi_host_put(shost); -+ -+ kfree(vhost->commands); -+ -+ return 0; -+} -+ -+void vhba_release (struct device * dev) -+{ -+ return; -+} -+ -+static struct platform_device vhba_platform_device = { -+ .name = "vhba", -+ .id = -1, -+ .dev = { -+ .release = vhba_release, -+ }, -+}; -+ -+static struct platform_driver vhba_platform_driver = { -+ .driver = { -+ .owner = THIS_MODULE, -+ .name = "vhba", -+ }, -+ .probe = vhba_probe, -+ .remove = vhba_remove, -+}; -+ -+int __init vhba_init (void) -+{ -+ int ret; -+ -+ ret = platform_device_register(&vhba_platform_device); -+ if (ret < 0) { -+ return ret; -+ } -+ -+ ret = platform_driver_register(&vhba_platform_driver); -+ if (ret < 0) { -+ platform_device_unregister(&vhba_platform_device); -+ return ret; -+ } -+ -+ ret = misc_register(&vhba_miscdev); -+ if (ret < 0) { -+ platform_driver_unregister(&vhba_platform_driver); -+ platform_device_unregister(&vhba_platform_device); -+ return ret; -+ } -+ -+ return 0; -+} -+ -+void __exit vhba_exit(void) -+{ -+ misc_deregister(&vhba_miscdev); -+ platform_driver_unregister(&vhba_platform_driver); -+ platform_device_unregister(&vhba_platform_device); -+} -+ -+module_init(vhba_init); -+module_exit(vhba_exit); -+ --- -2.39.2 - diff --git 
a/sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index de7b51d..0000000 --- a/sys-kernel/gentoo-sources-6.6/0301-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,54 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Perry Yuan -Subject: [PATCH V12 1/7] x86: Drop CPU_SUP_INTEL from SCHED_MC_PRIO for the expansion. -Date: Tue, 5 Dec 2023 14:35:31 +0800 [thread overview] -Message-ID: <20231205063537.872834-2-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -amd-pstate driver also uses SCHED_MC_PRIO, so decouple the requirement -of CPU_SUP_INTEL from the dependencies to allow compilation in kernels -without Intel CPU support. - -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li ---- - arch/x86/Kconfig | 5 +++-- - 1 file changed, 3 insertions(+), 2 deletions(-) - -diff --git a/arch/x86/Kconfig b/arch/x86/Kconfig -index 3762f41bb092..3e57773f946a 100644 ---- a/arch/x86/Kconfig -+++ b/arch/x86/Kconfig -@@ -1054,8 +1054,9 @@ config SCHED_MC - - config SCHED_MC_PRIO - bool "CPU core priorities scheduler support" -- depends on SCHED_MC && CPU_SUP_INTEL -- select X86_INTEL_PSTATE -+ depends on SCHED_MC -+ select X86_INTEL_PSTATE if CPU_SUP_INTEL -+ select X86_AMD_PSTATE if CPU_SUP_AMD && ACPI - select CPU_FREQ - default y - help --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index fe399e3..0000000 --- a/sys-kernel/gentoo-sources-6.6/0302-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,92 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 2/7] acpi: cppc: Add get the highest performance cppc control -Date: Tue, 5 Dec 2023 14:35:32 +0800 [thread overview] -Message-ID: <20231205063537.872834-3-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -Add support for getting the highest performance to the -generic CPPC driver. This enables downstream drivers -such as amd-pstate to discover and use these values. - -Please refer to the ACPI_Spec for details on continuous -performance control of CPPC. 
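As a rough usage sketch, a downstream driver would pull the value through the new helper along these lines; the surrounding function is hypothetical, and only cppc_get_highest_perf() and its 0 / -EIO contract come from this patch.

    #include <linux/types.h>
    #include <acpi/cppc_acpi.h>

    /* hypothetical helper in a downstream cpufreq driver */
    static int example_read_highest_perf(int cpu, u32 *out)
    {
        u64 highest_perf;
        int ret;

        /* 0 on success, -EIO otherwise (-ENOTSUPP when CPPC_LIB is not built in) */
        ret = cppc_get_highest_perf(cpu, &highest_perf);
        if (ret)
            return ret;

        /* the CPPC register is 64-bit, but the ranking itself fits in 0..255 */
        *out = (u32)highest_perf;
        return 0;
    }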
- -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Wyes Karny -Reviewed-by: Perry Yuan -Acked-by: Huang Rui -Signed-off-by: Meng Li -Link: https://uefi.org/specs/ACPI/6.5/08_Processor_Configuration_and_Control.html?highlight=cppc#highest-performance ---- - drivers/acpi/cppc_acpi.c | 13 +++++++++++++ - include/acpi/cppc_acpi.h | 5 +++++ - 2 files changed, 18 insertions(+) - -diff --git a/drivers/acpi/cppc_acpi.c b/drivers/acpi/cppc_acpi.c -index 7ff269a78c20..ad388a0e8484 100644 ---- a/drivers/acpi/cppc_acpi.c -+++ b/drivers/acpi/cppc_acpi.c -@@ -1154,6 +1154,19 @@ int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - return cppc_get_perf(cpunum, NOMINAL_PERF, nominal_perf); - } - -+/** -+ * cppc_get_highest_perf - Get the highest performance register value. -+ * @cpunum: CPU from which to get highest performance. -+ * @highest_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. -+ */ -+int cppc_get_highest_perf(int cpunum, u64 *highest_perf) -+{ -+ return cppc_get_perf(cpunum, HIGHEST_PERF, highest_perf); -+} -+EXPORT_SYMBOL_GPL(cppc_get_highest_perf); -+ - /** - * cppc_get_epp_perf - Get the epp register value. - * @cpunum: CPU from which to get epp preference value. -diff --git a/include/acpi/cppc_acpi.h b/include/acpi/cppc_acpi.h -index 6126c977ece0..c0b69ffe7bdb 100644 ---- a/include/acpi/cppc_acpi.h -+++ b/include/acpi/cppc_acpi.h -@@ -139,6 +139,7 @@ struct cppc_cpudata { - #ifdef CONFIG_ACPI_CPPC_LIB - extern int cppc_get_desired_perf(int cpunum, u64 *desired_perf); - extern int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf); -+extern int cppc_get_highest_perf(int cpunum, u64 *highest_perf); - extern int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs); - extern int cppc_set_perf(int cpu, struct cppc_perf_ctrls *perf_ctrls); - extern int cppc_set_enable(int cpu, bool enable); -@@ -165,6 +166,10 @@ static inline int cppc_get_nominal_perf(int cpunum, u64 *nominal_perf) - { - return -ENOTSUPP; - } -+static inline int cppc_get_highest_perf(int cpunum, u64 *highest_perf) -+{ -+ return -ENOTSUPP; -+} - static inline int cppc_get_perf_ctrs(int cpu, struct cppc_perf_fb_ctrs *perf_fb_ctrs) - { - return -ENOTSUPP; --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index e891fcd..0000000 --- a/sys-kernel/gentoo-sources-6.6/0303-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,322 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny -Subject: [PATCH V12 3/7] cpufreq: amd-pstate: Enable amd-pstate preferred core supporting. -Date: Tue, 5 Dec 2023 14:35:33 +0800 [thread overview] -Message-ID: <20231205063537.872834-4-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -amd-pstate driver utilizes the functions and data structures -provided by the ITMT architecture to enable the scheduler to -favor scheduling on cores which can be get a higher frequency -with lower voltage. We call it amd-pstate preferrred core. - -Here sched_set_itmt_core_prio() is called to set priorities and -sched_set_itmt_support() is called to enable ITMT feature. 
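In driver terms that pairing looks roughly like the sketch below; the example_ names are placeholders, and the deferral to a work item mirrors the driver's own approach, since sched_set_itmt_support() cannot be called directly from the cpufreq callback path because of locking.

    #include <linux/types.h>
    #include <linux/workqueue.h>
    #include <asm/topology.h>

    /* sched_set_itmt_support() takes scheduler locks, so defer it to a
       work item rather than calling it from the cpufreq callbacks */
    static void example_enable_itmt(struct work_struct *work)
    {
        sched_set_itmt_support();
    }
    static DECLARE_WORK(example_itmt_work, example_enable_itmt);

    static void example_rank_core(int cpu, u32 highest_perf)
    {
        /* a higher CPPC highest-performance value means a higher priority */
        sched_set_itmt_core_prio((int)highest_perf, cpu);
        schedule_work(&example_itmt_work);
    }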
-amd-pstate driver uses the highest performance value to indicate -the priority of CPU. The higher value has a higher priority. - -The initial core rankings are set up by amd-pstate when the -system boots. - -Add a variable hw_prefcore in cpudata structure. It will check -if the processor and power firmware support preferred core -feature. - -Add one new early parameter `disable` to allow user to disable -the preferred core. - -Only when hardware supports preferred core and user set `enabled` -in early parameter, amd pstate driver supports preferred core featue. - -Tested-by: Oleksandr Natalenko -Reviewed-by: Huang Rui -Reviewed-by: Wyes Karny -Reviewed-by: Mario Limonciello -Co-developed-by: Perry Yuan -Signed-off-by: Perry Yuan -Signed-off-by: Meng Li ---- - drivers/cpufreq/amd-pstate.c | 131 ++++++++++++++++++++++++++++++++--- - include/linux/amd-pstate.h | 4 ++ - 2 files changed, 127 insertions(+), 8 deletions(-) - -diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 1f6186475715..9c2790753f99 100644 ---- a/drivers/cpufreq/amd-pstate.c -+++ b/drivers/cpufreq/amd-pstate.c -@@ -37,6 +37,7 @@ - #include - #include - #include -+#include - - #include - #include -@@ -49,6 +50,7 @@ - - #define AMD_PSTATE_TRANSITION_LATENCY 20000 - #define AMD_PSTATE_TRANSITION_DELAY 1000 -+#define AMD_PSTATE_PREFCORE_THRESHOLD 166 - - /* - * TODO: We need more time to fine tune processors with shared memory solution -@@ -64,6 +66,7 @@ static struct cpufreq_driver amd_pstate_driver; - static struct cpufreq_driver amd_pstate_epp_driver; - static int cppc_state = AMD_PSTATE_UNDEFINED; - static bool cppc_enabled; -+static bool amd_pstate_prefcore = true; - - /* - * AMD Energy Preference Performance (EPP) -@@ -297,13 +300,14 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) - if (ret) - return ret; - -- /* -- * TODO: Introduce AMD specific power feature. -- * -- * CPPC entry doesn't indicate the highest performance in some ASICs. -+ /* For platforms that do not support the preferred core feature, the -+ * highest_pef may be configured with 166 or 255, to avoid max frequency -+ * calculated wrongly. we take the AMD_CPPC_HIGHEST_PERF(cap1) value as -+ * the default max perf. - */ -- highest_perf = amd_get_highest_perf(); -- if (highest_perf > AMD_CPPC_HIGHEST_PERF(cap1)) -+ if (cpudata->hw_prefcore) -+ highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; -+ else - highest_perf = AMD_CPPC_HIGHEST_PERF(cap1); - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -@@ -324,8 +328,9 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) - if (ret) - return ret; - -- highest_perf = amd_get_highest_perf(); -- if (highest_perf > cppc_perf.highest_perf) -+ if (cpudata->hw_prefcore) -+ highest_perf = AMD_PSTATE_PREFCORE_THRESHOLD; -+ else - highest_perf = cppc_perf.highest_perf; - - WRITE_ONCE(cpudata->highest_perf, highest_perf); -@@ -706,6 +711,80 @@ static void amd_perf_ctl_reset(unsigned int cpu) - wrmsrl_on_cpu(cpu, MSR_AMD_PERF_CTL, 0); - } - -+/* -+ * Set amd-pstate preferred core enable can't be done directly from cpufreq callbacks -+ * due to locking, so queue the work for later. -+ */ -+static void amd_pstste_sched_prefcore_workfn(struct work_struct *work) -+{ -+ sched_set_itmt_support(); -+} -+static DECLARE_WORK(sched_prefcore_work, amd_pstste_sched_prefcore_workfn); -+ -+/* -+ * Get the highest performance register value. -+ * @cpu: CPU from which to get highest performance. -+ * @highest_perf: Return address. -+ * -+ * Return: 0 for success, -EIO otherwise. 
-+ */ -+static int amd_pstate_get_highest_perf(int cpu, u32 *highest_perf) -+{ -+ int ret; -+ -+ if (boot_cpu_has(X86_FEATURE_CPPC)) { -+ u64 cap1; -+ -+ ret = rdmsrl_safe_on_cpu(cpu, MSR_AMD_CPPC_CAP1, &cap1); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, AMD_CPPC_HIGHEST_PERF(cap1)); -+ } else { -+ u64 cppc_highest_perf; -+ -+ ret = cppc_get_highest_perf(cpu, &cppc_highest_perf); -+ if (ret) -+ return ret; -+ WRITE_ONCE(*highest_perf, cppc_highest_perf); -+ } -+ -+ return (ret); -+} -+ -+#define CPPC_MAX_PERF U8_MAX -+ -+static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) -+{ -+ int ret, prio; -+ u32 highest_perf; -+ -+ ret = amd_pstate_get_highest_perf(cpudata->cpu, &highest_perf); -+ if (ret) -+ return; -+ -+ cpudata->hw_prefcore = true; -+ /* check if CPPC preferred core feature is enabled*/ -+ if (highest_perf < CPPC_MAX_PERF) -+ prio = (int)highest_perf; -+ else { -+ pr_debug("AMD CPPC preferred core is unsupported!\n"); -+ cpudata->hw_prefcore = false; -+ return; -+ } -+ -+ if (!amd_pstate_prefcore) -+ return; -+ -+ /* -+ * The priorities can be set regardless of whether or not -+ * sched_set_itmt_support(true) has been called and it is valid to -+ * update them at any time after it has been called. -+ */ -+ sched_set_itmt_core_prio(prio, cpudata->cpu); -+ -+ schedule_work(&sched_prefcore_work); -+} -+ - static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - { - int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -@@ -727,6 +806,8 @@ static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - - cpudata->cpu = policy->cpu; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; -@@ -877,6 +958,17 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - return sysfs_emit(buf, "%u\n", perf); - } - -+static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, -+ char *buf) -+{ -+ bool hw_prefcore; -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ hw_prefcore = READ_ONCE(cpudata->hw_prefcore); -+ -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(hw_prefcore)); -+} -+ - static ssize_t show_energy_performance_available_preferences( - struct cpufreq_policy *policy, char *buf) - { -@@ -1074,18 +1166,27 @@ static ssize_t status_store(struct device *a, struct device_attribute *b, - return ret < 0 ? 
ret : count; - } - -+static ssize_t prefcore_show(struct device *dev, -+ struct device_attribute *attr, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", str_enabled_disabled(amd_pstate_prefcore)); -+} -+ - cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); - cpufreq_freq_attr_rw(energy_performance_preference); - cpufreq_freq_attr_ro(energy_performance_available_preferences); - static DEVICE_ATTR_RW(status); -+static DEVICE_ATTR_RO(prefcore); - - static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_hw_prefcore, - NULL, - }; - -@@ -1093,6 +1194,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, - NULL, -@@ -1100,6 +1202,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - - static struct attribute *pstate_global_attributes[] = { - &dev_attr_status.attr, -+ &dev_attr_prefcore.attr, - NULL - }; - -@@ -1151,6 +1254,8 @@ static int amd_pstate_epp_cpu_init(struct cpufreq_policy *policy) - cpudata->cpu = policy->cpu; - cpudata->epp_policy = 0; - -+ amd_pstate_init_prefcore(cpudata); -+ - ret = amd_pstate_init_perf(cpudata); - if (ret) - goto free_cpudata1; -@@ -1568,7 +1673,17 @@ static int __init amd_pstate_param(char *str) - - return amd_pstate_set_driver(mode_idx); - } -+ -+static int __init amd_prefcore_param(char *str) -+{ -+ if (!strcmp(str, "disable")) -+ amd_pstate_prefcore = false; -+ -+ return 0; -+} -+ - early_param("amd_pstate", amd_pstate_param); -+early_param("amd_prefcore", amd_prefcore_param); - - MODULE_AUTHOR("Huang Rui "); - MODULE_DESCRIPTION("AMD Processor P-state Frequency Driver"); -diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 6ad02ad9c7b4..68fc1bd8d851 100644 ---- a/include/linux/amd-pstate.h -+++ b/include/linux/amd-pstate.h -@@ -52,6 +52,9 @@ struct amd_aperf_mperf { - * @prev: Last Aperf/Mperf/tsc count value read from register - * @freq: current cpu frequency value - * @boost_supported: check whether the Processor or SBIOS supports boost mode -+ * @hw_prefcore: check whether HW supports preferred core featue. -+ * Only when hw_prefcore and early prefcore param are true, -+ * AMD P-State driver supports preferred core featue. - * @epp_policy: Last saved policy used to set energy-performance preference - * @epp_cached: Cached CPPC energy-performance preference value - * @policy: Cpufreq policy value -@@ -85,6 +88,7 @@ struct amd_cpudata { - - u64 freq; - bool boost_supported; -+ bool hw_prefcore; - - /* EPP feature related attributes*/ - s16 epp_policy; --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index 912e49f..0000000 --- a/sys-kernel/gentoo-sources-6.6/0304-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,120 +0,0 @@ -From: Meng Li -To: "Rafael J . 
Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Perry Yuan -Subject: [PATCH V12 4/7] cpufreq: Add a notification message that the highest perf has changed -Date: Tue, 5 Dec 2023 14:35:34 +0800 [thread overview] -Message-ID: <20231205063537.872834-5-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -ACPI 6.5 section 8.4.6.1.1.1 specifies that Notify event 0x85 can be -emmitted to cause the the OSPM to re-evaluate the highest performance -register. Add support for this event. - -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li -Link: https://uefi.org/specs/ACPI/6.5/05_ACPI_Software_Programming_Model.html#processor-device-notification-values ---- - drivers/acpi/processor_driver.c | 6 ++++++ - drivers/cpufreq/cpufreq.c | 13 +++++++++++++ - include/linux/cpufreq.h | 5 +++++ - 3 files changed, 24 insertions(+) - -diff --git a/drivers/acpi/processor_driver.c b/drivers/acpi/processor_driver.c -index 4bd16b3f0781..29b2fb68a35d 100644 ---- a/drivers/acpi/processor_driver.c -+++ b/drivers/acpi/processor_driver.c -@@ -27,6 +27,7 @@ - #define ACPI_PROCESSOR_NOTIFY_PERFORMANCE 0x80 - #define ACPI_PROCESSOR_NOTIFY_POWER 0x81 - #define ACPI_PROCESSOR_NOTIFY_THROTTLING 0x82 -+#define ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED 0x85 - - MODULE_AUTHOR("Paul Diefenbaugh"); - MODULE_DESCRIPTION("ACPI Processor Driver"); -@@ -83,6 +84,11 @@ static void acpi_processor_notify(acpi_handle handle, u32 event, void *data) - acpi_bus_generate_netlink_event(device->pnp.device_class, - dev_name(&device->dev), event, 0); - break; -+ case ACPI_PROCESSOR_NOTIFY_HIGEST_PERF_CHANGED: -+ cpufreq_update_highest_perf(pr->id); -+ acpi_bus_generate_netlink_event(device->pnp.device_class, -+ dev_name(&device->dev), event, 0); -+ break; - default: - acpi_handle_debug(handle, "Unsupported event [0x%x]\n", event); - break; -diff --git a/drivers/cpufreq/cpufreq.c b/drivers/cpufreq/cpufreq.c -index 934d35f570b7..14a4cbc6dd05 100644 ---- a/drivers/cpufreq/cpufreq.c -+++ b/drivers/cpufreq/cpufreq.c -@@ -2717,6 +2717,19 @@ void cpufreq_update_limits(unsigned int cpu) - } - EXPORT_SYMBOL_GPL(cpufreq_update_limits); - -+/** -+ * cpufreq_update_highest_perf - Update highest performance for a given CPU. -+ * @cpu: CPU to update the highest performance for. 
-+ * -+ * Invoke the driver's ->update_highest_perf callback if present -+ */ -+void cpufreq_update_highest_perf(unsigned int cpu) -+{ -+ if (cpufreq_driver->update_highest_perf) -+ cpufreq_driver->update_highest_perf(cpu); -+} -+EXPORT_SYMBOL_GPL(cpufreq_update_highest_perf); -+ - /********************************************************************* - * BOOST * - *********************************************************************/ -diff --git a/include/linux/cpufreq.h b/include/linux/cpufreq.h -index 1c5ca92a0555..f62257b2a42f 100644 ---- a/include/linux/cpufreq.h -+++ b/include/linux/cpufreq.h -@@ -235,6 +235,7 @@ int cpufreq_get_policy(struct cpufreq_policy *policy, unsigned int cpu); - void refresh_frequency_limits(struct cpufreq_policy *policy); - void cpufreq_update_policy(unsigned int cpu); - void cpufreq_update_limits(unsigned int cpu); -+void cpufreq_update_highest_perf(unsigned int cpu); - bool have_governor_per_policy(void); - bool cpufreq_supports_freq_invariance(void); - struct kobject *get_governor_parent_kobj(struct cpufreq_policy *policy); -@@ -263,6 +264,7 @@ static inline bool cpufreq_supports_freq_invariance(void) - return false; - } - static inline void disable_cpufreq(void) { } -+static inline void cpufreq_update_highest_perf(unsigned int cpu) { } - #endif - - #ifdef CONFIG_CPU_FREQ_STAT -@@ -380,6 +382,9 @@ struct cpufreq_driver { - /* Called to update policy limits on firmware notifications. */ - void (*update_limits)(unsigned int cpu); - -+ /* Called to update highest performance on firmware notifications. */ -+ void (*update_highest_perf)(unsigned int cpu); -+ - /* optional */ - int (*bios_limit)(int cpu, unsigned int *limit); - --- -2.34.1 - diff --git a/sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index 12e3a68..0000000 --- a/sys-kernel/gentoo-sources-6.6/0305-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,182 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 5/7] cpufreq: amd-pstate: Update amd-pstate preferred core ranking dynamically -Date: Tue, 5 Dec 2023 14:35:35 +0800 [thread overview] -Message-ID: <20231205063537.872834-6-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -Preferred core rankings can be changed dynamically by the -platform based on the workload and platform conditions and -accounting for thermals and aging. -When this occurs, cpu priority need to be set. 
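In short: firmware raises the notification, the cpufreq core invokes the driver's ->update_highest_perf() hook, and the driver refreshes the ITMT priority only when the cached ranking really changed. A minimal sketch of that last step, with placeholder names (the real driver keeps the cached ranking per CPU in its cpudata):

    #include <linux/types.h>
    #include <asm/topology.h>

    static u32 example_cached_ranking;    /* per-CPU in the real driver */

    static void example_update_ranking(int cpu, u32 new_highest_perf)
    {
        /* only poke the scheduler when firmware actually changed the ranking */
        if (new_highest_perf == example_cached_ranking)
            return;

        example_cached_ranking = new_highest_perf;
        sched_set_itmt_core_prio((int)new_highest_perf, cpu);
    }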
- -Tested-by: Oleksandr Natalenko -Reviewed-by: Mario Limonciello -Reviewed-by: Wyes Karny -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li ---- - drivers/cpufreq/amd-pstate.c | 44 ++++++++++++++++++++++++++++++++++++ - include/linux/amd-pstate.h | 6 +++++ - 2 files changed, 50 insertions(+) - -diff --git a/drivers/cpufreq/amd-pstate.c b/drivers/cpufreq/amd-pstate.c -index 9c2790753f99..25f0fb53d320 100644 ---- a/drivers/cpufreq/amd-pstate.c -+++ b/drivers/cpufreq/amd-pstate.c -@@ -315,6 +315,7 @@ static int pstate_init_perf(struct amd_cpudata *cpudata) - WRITE_ONCE(cpudata->nominal_perf, AMD_CPPC_NOMINAL_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_nonlinear_perf, AMD_CPPC_LOWNONLIN_PERF(cap1)); - WRITE_ONCE(cpudata->lowest_perf, AMD_CPPC_LOWEST_PERF(cap1)); -+ WRITE_ONCE(cpudata->prefcore_ranking, AMD_CPPC_HIGHEST_PERF(cap1)); - WRITE_ONCE(cpudata->min_limit_perf, AMD_CPPC_LOWEST_PERF(cap1)); - return 0; - } -@@ -339,6 +340,7 @@ static int cppc_init_perf(struct amd_cpudata *cpudata) - WRITE_ONCE(cpudata->lowest_nonlinear_perf, - cppc_perf.lowest_nonlinear_perf); - WRITE_ONCE(cpudata->lowest_perf, cppc_perf.lowest_perf); -+ WRITE_ONCE(cpudata->prefcore_ranking, cppc_perf.highest_perf); - WRITE_ONCE(cpudata->min_limit_perf, cppc_perf.lowest_perf); - - if (cppc_state == AMD_PSTATE_ACTIVE) -@@ -785,6 +787,32 @@ static void amd_pstate_init_prefcore(struct amd_cpudata *cpudata) - schedule_work(&sched_prefcore_work); - } - -+static void amd_pstate_update_highest_perf(unsigned int cpu) -+{ -+ struct cpufreq_policy *policy = cpufreq_cpu_get(cpu); -+ struct amd_cpudata *cpudata = policy->driver_data; -+ u32 prev_high = 0, cur_high = 0; -+ int ret; -+ -+ if ((!amd_pstate_prefcore) || (!cpudata->hw_prefcore)) -+ goto free_cpufreq_put; -+ -+ ret = amd_pstate_get_highest_perf(cpu, &cur_high); -+ if (ret) -+ goto free_cpufreq_put; -+ -+ prev_high = READ_ONCE(cpudata->prefcore_ranking); -+ if (prev_high != cur_high) { -+ WRITE_ONCE(cpudata->prefcore_ranking, cur_high); -+ -+ if (cur_high < CPPC_MAX_PERF) -+ sched_set_itmt_core_prio((int)cur_high, cpu); -+ } -+ -+free_cpufreq_put: -+ cpufreq_cpu_put(policy); -+} -+ - static int amd_pstate_cpu_init(struct cpufreq_policy *policy) - { - int min_freq, max_freq, nominal_freq, lowest_nonlinear_freq, ret; -@@ -958,6 +986,17 @@ static ssize_t show_amd_pstate_highest_perf(struct cpufreq_policy *policy, - return sysfs_emit(buf, "%u\n", perf); - } - -+static ssize_t show_amd_pstate_prefcore_ranking(struct cpufreq_policy *policy, -+ char *buf) -+{ -+ u32 perf; -+ struct amd_cpudata *cpudata = policy->driver_data; -+ -+ perf = READ_ONCE(cpudata->prefcore_ranking); -+ -+ return sysfs_emit(buf, "%u\n", perf); -+} -+ - static ssize_t show_amd_pstate_hw_prefcore(struct cpufreq_policy *policy, - char *buf) - { -@@ -1176,6 +1215,7 @@ cpufreq_freq_attr_ro(amd_pstate_max_freq); - cpufreq_freq_attr_ro(amd_pstate_lowest_nonlinear_freq); - - cpufreq_freq_attr_ro(amd_pstate_highest_perf); -+cpufreq_freq_attr_ro(amd_pstate_prefcore_ranking); - cpufreq_freq_attr_ro(amd_pstate_hw_prefcore); - cpufreq_freq_attr_rw(energy_performance_preference); - cpufreq_freq_attr_ro(energy_performance_available_preferences); -@@ -1186,6 +1226,7 @@ static struct freq_attr *amd_pstate_attr[] = { - &amd_pstate_max_freq, - &amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - NULL, - }; -@@ -1194,6 +1235,7 @@ static struct freq_attr *amd_pstate_epp_attr[] = { - &amd_pstate_max_freq, - 
&amd_pstate_lowest_nonlinear_freq, - &amd_pstate_highest_perf, -+ &amd_pstate_prefcore_ranking, - &amd_pstate_hw_prefcore, - &energy_performance_preference, - &energy_performance_available_preferences, -@@ -1538,6 +1580,7 @@ static struct cpufreq_driver amd_pstate_driver = { - .suspend = amd_pstate_cpu_suspend, - .resume = amd_pstate_cpu_resume, - .set_boost = amd_pstate_set_boost, -+ .update_highest_perf = amd_pstate_update_highest_perf, - .name = "amd-pstate", - .attr = amd_pstate_attr, - }; -@@ -1552,6 +1595,7 @@ static struct cpufreq_driver amd_pstate_epp_driver = { - .online = amd_pstate_epp_cpu_online, - .suspend = amd_pstate_epp_suspend, - .resume = amd_pstate_epp_resume, -+ .update_highest_perf = amd_pstate_update_highest_perf, - .name = "amd-pstate-epp", - .attr = amd_pstate_epp_attr, - }; -diff --git a/include/linux/amd-pstate.h b/include/linux/amd-pstate.h -index 68fc1bd8d851..d21838835abd 100644 ---- a/include/linux/amd-pstate.h -+++ b/include/linux/amd-pstate.h -@@ -39,11 +39,16 @@ struct amd_aperf_mperf { - * @cppc_req_cached: cached performance request hints - * @highest_perf: the maximum performance an individual processor may reach, - * assuming ideal conditions -+ * For platforms that do not support the preferred core feature, the -+ * highest_pef may be configured with 166 or 255, to avoid max frequency -+ * calculated wrongly. we take the fixed value as the highest_perf. - * @nominal_perf: the maximum sustained performance level of the processor, - * assuming ideal operating conditions - * @lowest_nonlinear_perf: the lowest performance level at which nonlinear power - * savings are achieved - * @lowest_perf: the absolute lowest performance level of the processor -+ * @prefcore_ranking: the preferred core ranking, the higher value indicates a higher -+ * priority. - * @max_freq: the frequency that mapped to highest_perf - * @min_freq: the frequency that mapped to lowest_perf - * @nominal_freq: the frequency that mapped to nominal_perf -@@ -73,6 +78,7 @@ struct amd_cpudata { - u32 nominal_perf; - u32 lowest_nonlinear_perf; - u32 lowest_perf; -+ u32 prefcore_ranking; - u32 min_limit_perf; - u32 max_limit_perf; - u32 min_limit_freq; --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index d5a3807..0000000 --- a/sys-kernel/gentoo-sources-6.6/0306-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,125 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 6/7] Documentation: amd-pstate: introduce amd-pstate preferred core -Date: Tue, 5 Dec 2023 14:35:36 +0800 [thread overview] -Message-ID: <20231205063537.872834-7-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -Introduce amd-pstate preferred core. 
- -check preferred core state set by the kernel parameter: -$ cat /sys/devices/system/cpu/amd-pstate/prefcore - -Tested-by: Oleksandr Natalenko -Reviewed-by: Wyes Karny -Reviewed-by: Mario Limonciello -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Signed-off-by: Meng Li ---- - Documentation/admin-guide/pm/amd-pstate.rst | 59 ++++++++++++++++++++- - 1 file changed, 57 insertions(+), 2 deletions(-) - -diff --git a/Documentation/admin-guide/pm/amd-pstate.rst b/Documentation/admin-guide/pm/amd-pstate.rst -index 1cf40f69278c..0b832ff529db 100644 ---- a/Documentation/admin-guide/pm/amd-pstate.rst -+++ b/Documentation/admin-guide/pm/amd-pstate.rst -@@ -300,8 +300,8 @@ platforms. The AMD P-States mechanism is the more performance and energy - efficiency frequency management method on AMD processors. - - --AMD Pstate Driver Operation Modes --================================= -+``amd-pstate`` Driver Operation Modes -+====================================== - - ``amd_pstate`` CPPC has 3 operation modes: autonomous (active) mode, - non-autonomous (passive) mode and guided autonomous (guided) mode. -@@ -353,6 +353,48 @@ is activated. In this mode, driver requests minimum and maximum performance - level and the platform autonomously selects a performance level in this range - and appropriate to the current workload. - -+``amd-pstate`` Preferred Core -+================================= -+ -+The core frequency is subjected to the process variation in semiconductors. -+Not all cores are able to reach the maximum frequency respecting the -+infrastructure limits. Consequently, AMD has redefined the concept of -+maximum frequency of a part. This means that a fraction of cores can reach -+maximum frequency. To find the best process scheduling policy for a given -+scenario, OS needs to know the core ordering informed by the platform through -+highest performance capability register of the CPPC interface. -+ -+``amd-pstate`` preferred core enables the scheduler to prefer scheduling on -+cores that can achieve a higher frequency with lower voltage. The preferred -+core rankings can dynamically change based on the workload, platform conditions, -+thermals and ageing. -+ -+The priority metric will be initialized by the ``amd-pstate`` driver. The ``amd-pstate`` -+driver will also determine whether or not ``amd-pstate`` preferred core is -+supported by the platform. -+ -+``amd-pstate`` driver will provide an initial core ordering when the system boots. -+The platform uses the CPPC interfaces to communicate the core ranking to the -+operating system and scheduler to make sure that OS is choosing the cores -+with highest performance firstly for scheduling the process. When ``amd-pstate`` -+driver receives a message with the highest performance change, it will -+update the core ranking and set the cpu's priority. -+ -+``amd-pstate`` Preferred Core Switch -+================================= -+Kernel Parameters -+----------------- -+ -+``amd-pstate`` peferred core`` has two states: enable and disable. -+Enable/disable states can be chosen by different kernel parameters. -+Default enable ``amd-pstate`` preferred core. -+ -+``amd_prefcore=disable`` -+ -+For systems that support ``amd-pstate`` preferred core, the core rankings will -+always be advertised by the platform. But OS can choose to ignore that via the -+kernel parameter ``amd_prefcore=disable``. -+ - User Space Interface in ``sysfs`` - General - =========================================== - -@@ -385,6 +427,19 @@ control its functionality at the system level. 
They are located in the - to the operation mode represented by that string - or to be - unregistered in the "disable" case. - -+``prefcore`` -+ Preferred core state of the driver: "enabled" or "disabled". -+ -+ "enabled" -+ Enable the ``amd-pstate`` preferred core. -+ -+ "disabled" -+ Disable the ``amd-pstate`` preferred core -+ -+ -+ This attribute is read-only to check the state of preferred core set -+ by the kernel parameter. -+ - ``cpupower`` tool support for ``amd-pstate`` - =============================================== - --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch b/sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch deleted file mode 100644 index 40153f4..0000000 --- a/sys-kernel/gentoo-sources-6.6/0307-amd-pstate_preferred_core_V12.patch +++ /dev/null @@ -1,57 +0,0 @@ -From: Meng Li -To: "Rafael J . Wysocki" , - Huang Rui -Cc: , , - , , - Shuah Khan , - , - "Nathan Fontenot" , - Deepak Sharma , - Alex Deucher , - Mario Limonciello , - Shimmer Huang , - "Perry Yuan" , - Xiaojian Du , - Viresh Kumar , - Borislav Petkov , - "Oleksandr Natalenko" , - Meng Li , Wyes Karny , - Perry Yuan -Subject: [PATCH V12 7/7] Documentation: introduce amd-pstate preferrd core mode kernel command line options -Date: Tue, 5 Dec 2023 14:35:37 +0800 [thread overview] -Message-ID: <20231205063537.872834-8-li.meng@amd.com> (raw) -In-Reply-To: <20231205063537.872834-1-li.meng@amd.com> - -amd-pstate driver support enable/disable preferred core. -Default enabled on platforms supporting amd-pstate preferred core. -Disable amd-pstate preferred core with -"amd_prefcore=disable" added to the kernel command line. - -Signed-off-by: Meng Li -Reviewed-by: Mario Limonciello -Reviewed-by: Wyes Karny -Reviewed-by: Huang Rui -Reviewed-by: Perry Yuan -Tested-by: Oleksandr Natalenko ---- - Documentation/admin-guide/kernel-parameters.txt | 5 +++++ - 1 file changed, 5 insertions(+) - -diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt -index 65731b060e3f..cbfa63a87e4a 100644 ---- a/Documentation/admin-guide/kernel-parameters.txt -+++ b/Documentation/admin-guide/kernel-parameters.txt -@@ -363,6 +363,11 @@ - selects a performance level in this range and appropriate - to the current workload. - -+ amd_prefcore= -+ [X86] -+ disable -+ Disable amd-pstate preferred core. 
-+ - amijoy.map= [HW,JOY] Amiga joystick support - Map of devices attached to JOY0DAT and JOY1DAT - Format: , --- -2.34.1 diff --git a/sys-kernel/gentoo-sources-7.0/0001-bore.patch b/sys-kernel/gentoo-sources-7.0/0001-bore.patch new file mode 100644 index 0000000..51617f0 --- /dev/null +++ b/sys-kernel/gentoo-sources-7.0/0001-bore.patch @@ -0,0 +1,1217 @@ +From 187d3236f77a721f684e3211dc50585973b04ab4 Mon Sep 17 00:00:00 2001 +From: Piotr Gorski +Date: Fri, 10 Apr 2026 08:27:29 +0200 +Subject: [PATCH] bore + +Signed-off-by: Piotr Gorski +--- + include/linux/sched.h | 34 +++ + include/linux/sched/bore.h | 41 ++++ + init/Kconfig | 17 ++ + kernel/Kconfig.hz | 17 ++ + kernel/exit.c | 4 + + kernel/fork.c | 13 ++ + kernel/futex/waitwake.c | 11 + + kernel/sched/Makefile | 1 + + kernel/sched/bore.c | 434 +++++++++++++++++++++++++++++++++++++ + kernel/sched/core.c | 12 + + kernel/sched/debug.c | 61 ++++++ + kernel/sched/fair.c | 126 ++++++++++- + kernel/sched/sched.h | 9 + + 13 files changed, 769 insertions(+), 11 deletions(-) + create mode 100644 include/linux/sched/bore.h + create mode 100644 kernel/sched/bore.c + +diff --git a/include/linux/sched.h b/include/linux/sched.h +index 5a5d3dbc9..b2b2d8c66 100644 +--- a/include/linux/sched.h ++++ b/include/linux/sched.h +@@ -817,6 +817,37 @@ struct kmap_ctrl { + #endif + }; + ++#ifdef CONFIG_SCHED_BORE ++#define BORE_BC_TIMESTAMP_SHIFT 16 ++ ++struct bore_bc { ++ union { ++ struct { ++ u64 timestamp: 48; ++ u64 penalty: 16; ++ }; ++ u64 value; ++ }; ++}; ++ ++struct bore_ctx { ++ u64 burst_time; ++ u16 prev_penalty; ++ u16 curr_penalty; ++ union { ++ u16 penalty; ++ struct { ++ u8 _; ++ u8 score; ++ }; ++ }; ++ bool stop_update; ++ bool futex_waiting; ++ struct bore_bc subtree; ++ struct bore_bc group; ++}; ++#endif /* CONFIG_SCHED_BORE */ ++ + struct task_struct { + #ifdef CONFIG_THREAD_INFO_IN_TASK + /* +@@ -875,6 +906,9 @@ struct task_struct { + #ifdef CONFIG_SCHED_CLASS_EXT + struct sched_ext_entity scx; + #endif ++#ifdef CONFIG_SCHED_BORE ++ struct bore_ctx bore; ++#endif /* CONFIG_SCHED_BORE */ + const struct sched_class *sched_class; + + #ifdef CONFIG_SCHED_CORE +diff --git a/include/linux/sched/bore.h b/include/linux/sched/bore.h +new file mode 100644 +index 000000000..9215c13a9 +--- /dev/null ++++ b/include/linux/sched/bore.h +@@ -0,0 +1,41 @@ ++#ifndef _KERNEL_SCHED_BORE_H ++#define _KERNEL_SCHED_BORE_H ++ ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#define SCHED_BORE_AUTHOR "Masahito Suzuki" ++#define SCHED_BORE_PROGNAME "BORE CPU Scheduler modification" ++ ++#define SCHED_BORE_VERSION "6.6.3" ++ ++extern u8 __read_mostly sched_bore; ++DECLARE_STATIC_KEY_TRUE(sched_bore_key); ++extern u8 __read_mostly sched_burst_inherit_type; ++extern u8 __read_mostly sched_burst_smoothness; ++extern u8 __read_mostly sched_burst_penalty_offset; ++extern uint __read_mostly sched_burst_penalty_scale; ++extern uint __read_mostly sched_burst_cache_lifetime; ++ ++extern u8 effective_prio_bore(struct task_struct *p); ++extern void update_curr_bore(struct task_struct *p, u64 delta_exec); ++extern void restart_burst_bore(struct task_struct *p); ++extern void restart_burst_rescale_deadline_bore(struct task_struct *p); ++extern void task_fork_bore(struct task_struct *p, struct task_struct *parent, ++ u64 clone_flags, u64 now); ++extern void sched_init_bore(void); ++extern void reset_task_bore(struct task_struct *p); ++ ++extern int sched_bore_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, 
loff_t *ppos); ++extern int sched_burst_inherit_type_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, loff_t *ppos); ++ ++extern void reweight_entity( ++ struct cfs_rq *cfs_rq, struct sched_entity *se, unsigned long weight); ++ ++#endif /* _KERNEL_SCHED_BORE_H */ +diff --git a/init/Kconfig b/init/Kconfig +index 7484cd703..4cf628106 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -1446,6 +1446,23 @@ config CHECKPOINT_RESTORE + + If unsure, say N here. + ++config SCHED_BORE ++ bool "Burst-Oriented Response Enhancer" ++ default y ++ help ++ In Desktop and Mobile computing, one might prefer interactive ++ tasks to keep responsive no matter what they run in the background. ++ ++ Enabling this kernel feature modifies the scheduler to discriminate ++ tasks by their burst time (runtime since it last went sleeping or ++ yielding state) and prioritize those that run less bursty. ++ Such tasks usually include window compositor, widgets backend, ++ terminal emulator, video playback, games and so on. ++ With a little impact to scheduling fairness, it may improve ++ responsiveness especially under heavy background workload. ++ ++ If unsure, say Y here. ++ + config SCHED_AUTOGROUP + bool "Automatic process group scheduling" + select CGROUPS +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index ce1435cb0..9eee2005e 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -57,3 +57,20 @@ config HZ + + config SCHED_HRTICK + def_bool HIGH_RES_TIMERS ++ ++config MIN_BASE_SLICE_NS ++ int "Default value for min_base_slice_ns" ++ default 2000000 ++ help ++ The BORE Scheduler automatically calculates the optimal base ++ slice for the configured HZ using the following equation: ++ ++ base_slice_ns = ++ 1000000000/HZ * DIV_ROUNDUP(min_base_slice_ns, 1000000000/HZ) ++ ++ This option sets the default lower bound limit of the base slice ++ to prevent the loss of task throughput due to overscheduling. ++ ++ Setting this value too high can cause the system to boot with ++ an unnecessarily large base slice, resulting in high scheduling ++ latency and poor system responsiveness. +diff --git a/kernel/exit.c b/kernel/exit.c +index ede3117fa..3f3af470d 100644 +--- a/kernel/exit.c ++++ b/kernel/exit.c +@@ -147,7 +147,11 @@ static void __unhash_process(struct release_task_post *post, struct task_struct + detach_pid(post->pids, p, PIDTYPE_SID); + + list_del_rcu(&p->tasks); ++#ifdef CONFIG_SCHED_BORE ++ list_del_rcu(&p->sibling); ++#else /* !CONFIG_SCHED_BORE */ + list_del_init(&p->sibling); ++#endif /* CONFIG_SCHED_BORE */ + __this_cpu_dec(process_counts); + } + list_del_rcu(&p->thread_node); +diff --git a/kernel/fork.c b/kernel/fork.c +index bc2bf58b9..207276c30 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -117,6 +117,10 @@ + /* For dup_mmap(). */ + #include "../mm/internal.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ ++ + #include + + #define CREATE_TRACE_POINTS +@@ -2362,6 +2366,11 @@ __latent_entropy struct task_struct *copy_process( + p->start_time = ktime_get_ns(); + p->start_boottime = ktime_get_boottime_ns(); + ++#ifdef CONFIG_SCHED_BORE ++ if (likely(p->pid)) ++ task_fork_bore(p, current, clone_flags, p->start_time); ++#endif /* CONFIG_SCHED_BORE */ ++ + /* + * Make it visible to the rest of the system, but dont wake it up yet. + * Need tasklist lock for parent etc handling! 
+@@ -2435,7 +2444,11 @@ __latent_entropy struct task_struct *copy_process( + */ + p->signal->has_child_subreaper = p->real_parent->signal->has_child_subreaper || + p->real_parent->signal->is_child_subreaper; ++#ifdef CONFIG_SCHED_BORE ++ list_add_tail_rcu(&p->sibling, &p->real_parent->children); ++#else /* !CONFIG_SCHED_BORE */ + list_add_tail(&p->sibling, &p->real_parent->children); ++#endif /* CONFIG_SCHED_BORE */ + list_add_tail_rcu(&p->tasks, &init_task.tasks); + attach_pid(p, PIDTYPE_TGID); + attach_pid(p, PIDTYPE_PGID); +diff --git a/kernel/futex/waitwake.c b/kernel/futex/waitwake.c +index 1c2dd03f1..de57e2d54 100644 +--- a/kernel/futex/waitwake.c ++++ b/kernel/futex/waitwake.c +@@ -4,6 +4,9 @@ + #include + #include + #include ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ + + #include "futex.h" + +@@ -355,7 +358,15 @@ void futex_do_wait(struct futex_q *q, struct hrtimer_sleeper *timeout) + * is no timeout, or if it has yet to expire. + */ + if (!timeout || timeout->task) ++#ifdef CONFIG_SCHED_BORE ++ { ++ current->bore.futex_waiting = true; ++#endif /* CONFIG_SCHED_BORE */ + schedule(); ++#ifdef CONFIG_SCHED_BORE ++ current->bore.futex_waiting = false; ++ } ++#endif /* CONFIG_SCHED_BORE */ + } + __set_current_state(TASK_RUNNING); + } +diff --git a/kernel/sched/Makefile b/kernel/sched/Makefile +index b1f1a3670..f95a7b3d5 100644 +--- a/kernel/sched/Makefile ++++ b/kernel/sched/Makefile +@@ -40,3 +40,4 @@ obj-y += core.o + obj-y += fair.o + obj-y += build_policy.o + obj-y += build_utility.o ++obj-$(CONFIG_SCHED_BORE) += bore.o +diff --git a/kernel/sched/bore.c b/kernel/sched/bore.c +new file mode 100644 +index 000000000..c27a22cd6 +--- /dev/null ++++ b/kernel/sched/bore.c +@@ -0,0 +1,434 @@ ++/* ++ * Burst-Oriented Response Enhancer (BORE) CPU Scheduler ++ * Copyright (C) 2021-2025 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include "sched.h" ++ ++#ifdef CONFIG_SCHED_BORE ++DEFINE_STATIC_KEY_TRUE(sched_bore_key); ++u8 __read_mostly sched_bore = 1; ++u8 __read_mostly sched_burst_inherit_type = 2; ++u8 __read_mostly sched_burst_smoothness = 1; ++u8 __read_mostly sched_burst_penalty_offset = 24; ++uint __read_mostly sched_burst_penalty_scale = 1536; ++uint __read_mostly sched_burst_cache_lifetime = 75000000; ++static int __maybe_unused maxval_prio = 39; ++static int __maybe_unused maxval_6_bits = 63; ++static int __maybe_unused maxval_8_bits = 255; ++static int __maybe_unused maxval_12_bits = 4095; ++ ++#define MAX_BURST_PENALTY ((40U << 8) - 1) ++#define BURST_CACHE_SAMPLE_LIMIT 63 ++#define BURST_CACHE_SCAN_LIMIT (BURST_CACHE_SAMPLE_LIMIT * 2) ++ ++static u32 bore_reciprocal_lut[BURST_CACHE_SAMPLE_LIMIT + 1]; ++ ++DEFINE_STATIC_KEY_TRUE(sched_burst_inherit_key); ++DEFINE_STATIC_KEY_TRUE(sched_burst_ancestor_key); ++ ++static inline u32 log2p1_u64_u32fp(u64 v, u8 fp) { ++ if (unlikely(!v)) return 0; ++ int clz = __builtin_clzll(v); ++ int exponent = 64 - clz; ++ u32 mantissa = (u32)((v << clz) << 1 >> (64 - fp)); ++ return exponent << fp | mantissa; ++} ++ ++static inline u32 calc_burst_penalty(u64 burst_time) { ++ u32 greed = log2p1_u64_u32fp(burst_time, 8), ++ tolerance = sched_burst_penalty_offset << 8; ++ s32 diff = (s32)(greed - tolerance); ++ u32 penalty = diff & ~(diff >> 31); ++ u32 scaled_penalty = penalty * sched_burst_penalty_scale >> 10; ++ s32 overflow = scaled_penalty - MAX_BURST_PENALTY; ++ return scaled_penalty - (overflow & ~(overflow >> 31)); ++} ++ ++static inline u64 rescale_slice(u64 delta, u8 old_prio, u8 new_prio) { ++ u64 
unscaled, rescaled; ++ unscaled = mul_u64_u32_shr(delta , sched_prio_to_weight[old_prio], 10); ++ rescaled = mul_u64_u32_shr(unscaled, sched_prio_to_wmult [new_prio], 22); ++ return rescaled; ++} ++ ++static inline u32 binary_smooth(u32 new, u32 old) { ++ u32 is_growing = (new > old); ++ u32 increment = (new - old) * is_growing; ++ u32 shift = sched_burst_smoothness; ++ u32 smoothed = old + ((increment + (1U << shift) - 1) >> shift); ++ return (new & ~(-is_growing)) | (smoothed & (-is_growing)); ++} ++ ++static void reweight_task_by_prio(struct task_struct *p, int prio) { ++ if (task_has_idle_policy(p)) return; ++ ++ struct sched_entity *se = &p->se; ++ unsigned long weight = scale_load(sched_prio_to_weight[prio]); ++ ++ if (se->on_rq) { ++ p->bore.stop_update = true; ++ reweight_entity(cfs_rq_of(se), se, weight); ++ p->bore.stop_update = false; ++ } else ++ se->load.weight = weight; ++ se->load.inv_weight = sched_prio_to_wmult[prio]; ++} ++ ++u8 effective_prio_bore(struct task_struct *p) { ++ int prio = p->static_prio - MAX_RT_PRIO; ++ if (static_branch_likely(&sched_bore_key)) ++ prio += p->bore.score; ++ prio &= ~(prio >> 31); ++ s32 diff = prio - maxval_prio; ++ prio -= (diff & ~(diff >> 31)); ++ return (u8)prio; ++} ++ ++static void update_penalty(struct task_struct *p) { ++ struct bore_ctx *ctx = &p->bore; ++ ++ u8 prev_prio = effective_prio_bore(p); ++ ++ s32 diff = (s32)ctx->curr_penalty - (s32)ctx->prev_penalty; ++ u16 max_val = ctx->curr_penalty - (diff & (diff >> 31)); ++ u32 is_kthread = !!(p->flags & PF_KTHREAD); ++ ctx->penalty = max_val & -(s32)(!is_kthread); ++ ++ u8 new_prio = effective_prio_bore(p); ++ if (new_prio != prev_prio) ++ reweight_task_by_prio(p, new_prio); ++} ++ ++void update_curr_bore(struct task_struct *p, u64 delta_exec) { ++ struct bore_ctx *ctx = &p->bore; ++ if (ctx->stop_update) return; ++ ++ ctx->burst_time += delta_exec; ++ u32 curr_penalty = ctx->curr_penalty = calc_burst_penalty(ctx->burst_time); ++ ++ if (curr_penalty <= ctx->prev_penalty) return; ++ update_penalty(p); ++} ++ ++void restart_burst_bore(struct task_struct *p) { ++ struct bore_ctx *ctx = &p->bore; ++ u32 new_penalty = binary_smooth(ctx->curr_penalty, ctx->prev_penalty); ++ ctx->prev_penalty = new_penalty; ++ ctx->curr_penalty = 0; ++ ctx->burst_time = 0; ++ update_penalty(p); ++} ++ ++void restart_burst_rescale_deadline_bore(struct task_struct *p) { ++ struct sched_entity *se = &p->se; ++ s64 vscaled, vremain = se->deadline - se->vruntime; ++ ++ u8 old_prio = effective_prio_bore(p); ++ restart_burst_bore(p); ++ u8 new_prio = effective_prio_bore(p); ++ ++ if (old_prio > new_prio) { ++ vscaled = rescale_slice(abs(vremain), old_prio, new_prio); ++ if (unlikely(vremain < 0)) ++ vscaled = -vscaled; ++ se->deadline = se->vruntime + vscaled; ++ } ++} ++ ++static inline bool task_is_bore_eligible(struct task_struct *p) ++{return p && p->sched_class == &fair_sched_class && !p->exit_state;} ++ ++#ifndef for_each_child_task ++#define for_each_child_task(p, t) \ ++ list_for_each_entry_rcu(t, &(p)->children, sibling) ++#endif ++ ++static inline u32 count_children_upto2(struct task_struct *p) { ++ struct list_head *head = &p->children; ++ struct list_head *first = READ_ONCE(head->next); ++ struct list_head *second = READ_ONCE(first->next); ++ return (first != head) + (second != head); ++} ++ ++static inline bool burst_cache_expired(struct bore_bc *bc, u64 now) { ++ struct bore_bc bc_val = { .value = READ_ONCE(bc->value) }; ++ u64 timestamp = (u64)bc_val.timestamp << BORE_BC_TIMESTAMP_SHIFT; ++ 
return now - timestamp > (u64)sched_burst_cache_lifetime; ++} ++ ++static void update_burst_cache(struct bore_bc *bc, ++ struct task_struct *p, u32 count, u32 total, u64 now) { ++ u32 average = (count == 1) ? total : ++ (u32)(((u64)total * bore_reciprocal_lut[count]) >> 32); ++ ++ struct bore_bc new_bc = { ++ .penalty = max(average, p->bore.penalty), ++ .timestamp = now >> BORE_BC_TIMESTAMP_SHIFT ++ }; ++ WRITE_ONCE(bc->value, new_bc.value); ++} ++ ++static u32 inherit_from_parent(struct task_struct *parent, ++ u64 clone_flags, u64 now) { ++ struct bore_bc bc_val; ++ ++ if (clone_flags & CLONE_PARENT) ++ parent = rcu_dereference(parent->real_parent); ++ ++ struct bore_bc *bc = &parent->bore.subtree; ++ ++ if (burst_cache_expired(bc, now)) { ++ struct task_struct *child; ++ u32 count = 0, total = 0, scan_count = 0; ++ for_each_child_task(parent, child) { ++ if (count >= BURST_CACHE_SAMPLE_LIMIT) break; ++ if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; ++ ++ if (!task_is_bore_eligible(child)) continue; ++ count++; ++ total += child->bore.penalty; ++ } ++ ++ update_burst_cache(bc, parent, count, total, now); ++ } ++ ++ bc_val.value = READ_ONCE(bc->value); ++ return (u32)bc_val.penalty; ++} ++ ++static u32 inherit_from_ancestor_hub(struct task_struct *parent, ++ u64 clone_flags, u64 now) { ++ struct bore_bc bc_val; ++ struct task_struct *ancestor = parent; ++ u32 sole_child_count = 0; ++ ++ if (clone_flags & CLONE_PARENT) { ++ ancestor = rcu_dereference(ancestor->real_parent); ++ sole_child_count = 1; ++ } ++ ++ for (struct task_struct *next; ++ (next = rcu_dereference(ancestor->real_parent)) != ancestor && ++ count_children_upto2(ancestor) <= sole_child_count; ++ ancestor = next, sole_child_count = 1) {} ++ ++ struct bore_bc *bc = &ancestor->bore.subtree; ++ ++ if (burst_cache_expired(bc, now)) { ++ struct task_struct *direct_child; ++ u32 count = 0, total = 0, scan_count = 0; ++ for_each_child_task(ancestor, direct_child) { ++ if (count >= BURST_CACHE_SAMPLE_LIMIT) break; ++ if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; ++ ++ struct task_struct *descendant = direct_child; ++ while (count_children_upto2(descendant) == 1) { ++ struct task_struct *next_descendant = ++ list_first_or_null_rcu(&descendant->children, ++ struct task_struct, sibling); ++ if (!next_descendant) break; ++ descendant = next_descendant; ++ } ++ ++ if (!task_is_bore_eligible(descendant)) continue; ++ count++; ++ total += descendant->bore.penalty; ++ } ++ ++ update_burst_cache(bc, ancestor, count, total, now); ++ } ++ ++ bc_val.value = READ_ONCE(bc->value); ++ return (u32)bc_val.penalty; ++} ++ ++static u32 inherit_from_thread_group(struct task_struct *p, u64 now) { ++ struct bore_bc bc_val; ++ struct task_struct *leader = p->group_leader; ++ struct bore_bc *bc = &leader->bore.group; ++ ++ if (burst_cache_expired(bc, now)) { ++ struct task_struct *sibling; ++ u32 count = 0, total = 0, scan_count = 0; ++ ++ for_each_thread(leader, sibling) { ++ if (count >= BURST_CACHE_SAMPLE_LIMIT) break; ++ if (scan_count++ >= BURST_CACHE_SCAN_LIMIT) break; ++ ++ if (!task_is_bore_eligible(sibling)) continue; ++ count++; ++ total += sibling->bore.penalty; ++ } ++ ++ update_burst_cache(bc, leader, count, total, now); ++ } ++ ++ bc_val.value = READ_ONCE(bc->value); ++ return (u32)bc_val.penalty; ++} ++ ++void task_fork_bore(struct task_struct *p, ++ struct task_struct *parent, u64 clone_flags, u64 now) { ++ if (!static_branch_likely(&sched_bore_key) || !task_is_bore_eligible(p)) return; ++ ++ rcu_read_lock(); ++ struct bore_ctx *ctx 
= &p->bore; ++ u32 inherited_penalty; ++ if (clone_flags & CLONE_THREAD) ++ inherited_penalty = inherit_from_thread_group(parent, now); ++ else if (static_branch_likely(&sched_burst_inherit_key)) ++ inherited_penalty = static_branch_likely(&sched_burst_ancestor_key)? ++ inherit_from_ancestor_hub(parent, clone_flags, now): ++ inherit_from_parent(parent, clone_flags, now); ++ else ++ inherited_penalty = 0; ++ ++ if (ctx->prev_penalty < inherited_penalty) ++ ctx->prev_penalty = inherited_penalty; ++ ctx->curr_penalty = 0; ++ ctx->burst_time = 0; ++ ctx->stop_update = false; ++ ctx->futex_waiting = false; ++ update_penalty(p); ++ rcu_read_unlock(); ++} ++ ++void reset_task_bore(struct task_struct *p) ++{ memset(&p->bore, 0, sizeof(struct bore_ctx)); } ++ ++static void update_inherit_type(void) { ++ switch(sched_burst_inherit_type) { ++ case 1: ++ static_branch_enable(&sched_burst_inherit_key); ++ static_branch_disable(&sched_burst_ancestor_key); ++ break; ++ case 2: ++ static_branch_enable(&sched_burst_inherit_key); ++ static_branch_enable(&sched_burst_ancestor_key); ++ break; ++ default: ++ static_branch_disable(&sched_burst_inherit_key); ++ break; ++ } ++} ++ ++void __init sched_init_bore(void) { ++ printk(KERN_INFO "%s %s by %s\n", ++ SCHED_BORE_PROGNAME, SCHED_BORE_VERSION, SCHED_BORE_AUTHOR); ++ ++ for (int i = 1; i <= BURST_CACHE_SAMPLE_LIMIT; i++) ++ bore_reciprocal_lut[i] = (u32)div64_u64(0xffffffffULL + i, i); ++ ++ reset_task_bore(&init_task); ++ update_inherit_type(); ++} ++ ++static void readjust_all_task_weights(void) { ++ struct task_struct *task; ++ struct rq *rq; ++ struct rq_flags rf; ++ ++ scoped_guard(write_lock_irq, &tasklist_lock) ++ for_each_process(task) { ++ if (!task_is_bore_eligible(task)) continue; ++ rq = task_rq_lock(task, &rf); ++ update_rq_clock(rq); ++ reweight_task_by_prio(task, effective_prio_bore(task)); ++ task_rq_unlock(rq, task, &rf); ++ } ++} ++ ++int sched_bore_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ if (sched_bore) ++ static_branch_enable(&sched_bore_key); ++ else ++ static_branch_disable(&sched_bore_key); ++ ++ readjust_all_task_weights(); ++ ++ return 0; ++} ++ ++int sched_burst_inherit_type_update_handler(const struct ctl_table *table, ++ int write, void __user *buffer, size_t *lenp, loff_t *ppos) { ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ update_inherit_type(); ++ ++ return 0; ++} ++ ++#ifdef CONFIG_SYSCTL ++static struct ctl_table sched_bore_sysctls[] = { ++ { ++ .procname = "sched_bore", ++ .data = &sched_bore, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_bore_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE, ++ }, ++ { ++ .procname = "sched_burst_inherit_type", ++ .data = &sched_burst_inherit_type, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = sched_burst_inherit_type_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_TWO, ++ }, ++ { ++ .procname = "sched_burst_smoothness", ++ .data = &sched_burst_smoothness, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_THREE, ++ }, ++ { ++ .procname = "sched_burst_penalty_offset", ++ .data = &sched_burst_penalty_offset, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = proc_dou8vec_minmax, ++ .extra1 = 
SYSCTL_ZERO, ++ .extra2 = &maxval_6_bits, ++ }, ++ { ++ .procname = "sched_burst_penalty_scale", ++ .data = &sched_burst_penalty_scale, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec_minmax, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = &maxval_12_bits, ++ }, ++ { ++ .procname = "sched_burst_cache_lifetime", ++ .data = &sched_burst_cache_lifetime, ++ .maxlen = sizeof(uint), ++ .mode = 0644, ++ .proc_handler = proc_douintvec, ++ }, ++}; ++ ++static int __init sched_bore_sysctl_init(void) { ++ register_sysctl_init("kernel", sched_bore_sysctls); ++ return 0; ++} ++late_initcall(sched_bore_sysctl_init); ++ ++#endif // CONFIG_SYSCTL ++#endif /* CONFIG_SCHED_BORE */ +diff --git a/kernel/sched/core.c b/kernel/sched/core.c +index 496dff740..2bc2b943a 100644 +--- a/kernel/sched/core.c ++++ b/kernel/sched/core.c +@@ -100,6 +100,10 @@ + #include "../smpboot.h" + #include "../locking/mutex.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ ++ + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpu); + EXPORT_TRACEPOINT_SYMBOL_GPL(ipi_send_cpumask); + +@@ -1446,7 +1450,11 @@ int tg_nop(struct task_group *tg, void *data) + + void set_load_weight(struct task_struct *p, bool update_load) + { ++#ifdef CONFIG_SCHED_BORE ++ int prio = effective_prio_bore(p); ++#else /* !CONFIG_SCHED_BORE */ + int prio = p->static_prio - MAX_RT_PRIO; ++#endif /* CONFIG_SCHED_BORE */ + struct load_weight lw; + + if (task_has_idle_policy(p)) { +@@ -8611,6 +8619,10 @@ void __init sched_init(void) + BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ sched_init_bore(); ++#endif /* CONFIG_SCHED_BORE */ ++ + wait_bit_init(); + + #ifdef CONFIG_FAIR_GROUP_SCHED +diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c +index 15bf45b6f..282007725 100644 +--- a/kernel/sched/debug.c ++++ b/kernel/sched/debug.c +@@ -169,6 +169,53 @@ static const struct file_operations sched_feat_fops = { + .release = single_release, + }; + ++#ifdef CONFIG_SCHED_BORE ++#define DEFINE_SYSCTL_SCHED_FUNC(name, update_func) \ ++static ssize_t sched_##name##_write(struct file *filp, const char __user *ubuf, size_t cnt, loff_t *ppos) \ ++{ \ ++ char buf[16]; \ ++ unsigned int value; \ ++\ ++ if (cnt > 15) \ ++ cnt = 15; \ ++\ ++ if (copy_from_user(&buf, ubuf, cnt)) \ ++ return -EFAULT; \ ++ buf[cnt] = '\0'; \ ++\ ++ if (kstrtouint(buf, 10, &value)) \ ++ return -EINVAL; \ ++\ ++ sysctl_sched_##name = value; \ ++ sched_update_##update_func(); \ ++\ ++ *ppos += cnt; \ ++ return cnt; \ ++} \ ++\ ++static int sched_##name##_show(struct seq_file *m, void *v) \ ++{ \ ++ seq_printf(m, "%d\n", sysctl_sched_##name); \ ++ return 0; \ ++} \ ++\ ++static int sched_##name##_open(struct inode *inode, struct file *filp) \ ++{ \ ++ return single_open(filp, sched_##name##_show, NULL); \ ++} \ ++\ ++static const struct file_operations sched_##name##_fops = { \ ++ .open = sched_##name##_open, \ ++ .write = sched_##name##_write, \ ++ .read = seq_read, \ ++ .llseek = seq_lseek, \ ++ .release = single_release, \ ++}; ++ ++DEFINE_SYSCTL_SCHED_FUNC(min_base_slice, min_base_slice) ++ ++#undef DEFINE_SYSCTL_SCHED_FUNC ++#else /* !CONFIG_SCHED_BORE */ + static ssize_t sched_scaling_write(struct file *filp, const char __user *ubuf, + size_t cnt, loff_t *ppos) + { +@@ -208,6 +255,7 @@ static const struct file_operations sched_scaling_fops = { + .llseek = seq_lseek, + .release = single_release, + }; ++#endif /* CONFIG_SCHED_BORE */ + + #ifdef CONFIG_PREEMPT_DYNAMIC + +@@ -602,12 +650,19 @@ static 
__init int sched_init_debug(void) + debugfs_create_file("preempt", 0644, debugfs_sched, NULL, &sched_dynamic_fops); + #endif + ++#ifdef CONFIG_SCHED_BORE ++ debugfs_create_file("min_base_slice_ns", 0644, debugfs_sched, NULL, &sched_min_base_slice_fops); ++ debugfs_create_u32("base_slice_ns", 0444, debugfs_sched, &sysctl_sched_base_slice); ++#else /* !CONFIG_SCHED_BORE */ + debugfs_create_u32("base_slice_ns", 0644, debugfs_sched, &sysctl_sched_base_slice); ++#endif /* CONFIG_SCHED_BORE */ + + debugfs_create_u32("latency_warn_ms", 0644, debugfs_sched, &sysctl_resched_latency_warn_ms); + debugfs_create_u32("latency_warn_once", 0644, debugfs_sched, &sysctl_resched_latency_warn_once); + ++#if !defined(CONFIG_SCHED_BORE) + debugfs_create_file("tunable_scaling", 0644, debugfs_sched, NULL, &sched_scaling_fops); ++#endif /* CONFIG_SCHED_BORE */ + debugfs_create_u32("migration_cost_ns", 0644, debugfs_sched, &sysctl_sched_migration_cost); + debugfs_create_u32("nr_migrate", 0644, debugfs_sched, &sysctl_sched_nr_migrate); + +@@ -852,6 +907,9 @@ print_task(struct seq_file *m, struct rq *rq, struct task_struct *p) + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_sleep_runtime)), + SPLIT_NS(schedstat_val_or_zero(p->stats.sum_block_runtime))); + ++#ifdef CONFIG_SCHED_BORE ++ SEQ_printf(m, " %2d", p->bore.score); ++#endif /* CONFIG_SCHED_BORE */ + #ifdef CONFIG_NUMA_BALANCING + SEQ_printf(m, " %d %d", task_node(p), task_numa_group_id(p)); + #endif +@@ -1331,6 +1389,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, + __PS("nr_involuntary_switches", p->nivcsw); + + P(se.load.weight); ++#ifdef CONFIG_SCHED_BORE ++ P(bore.score); ++#endif /* CONFIG_SCHED_BORE */ + P(se.avg.load_sum); + P(se.avg.runnable_sum); + P(se.avg.util_sum); +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index ab4114712..630896fc0 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -58,6 +58,10 @@ + #include "stats.h" + #include "autogroup.h" + ++#ifdef CONFIG_SCHED_BORE ++#include ++#endif /* CONFIG_SCHED_BORE */ ++ + /* + * The initial- and re-scaling of tunables is configurable + * +@@ -67,17 +71,30 @@ + * SCHED_TUNABLESCALING_LOG - scaled logarithmically, *1+ilog(ncpus) + * SCHED_TUNABLESCALING_LINEAR - scaled linear, *ncpus + * +- * (default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) ++ * BORE : default SCHED_TUNABLESCALING_NONE = *1 constant ++ * EEVDF: default SCHED_TUNABLESCALING_LOG = *(1+ilog(ncpus)) + */ ++#ifdef CONFIG_SCHED_BORE ++unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_NONE; ++#else /* !CONFIG_SCHED_BORE */ + unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; ++#endif /* CONFIG_SCHED_BORE */ + + /* + * Minimal preemption granularity for CPU-bound tasks: + * +- * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) ++ * BORE : base_slice = minimum multiple of nsecs_per_tick >= min_base_slice ++ * (default min_base_slice = 2000000 constant, units: nanoseconds) ++ * EEVDF: default 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds + */ ++#ifdef CONFIG_SCHED_BORE ++static const unsigned int nsecs_per_tick = 1000000000ULL / HZ; ++unsigned int sysctl_sched_min_base_slice = CONFIG_MIN_BASE_SLICE_NS; ++__read_mostly uint sysctl_sched_base_slice = nsecs_per_tick; ++#else /* !CONFIG_SCHED_BORE */ + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif /* CONFIG_SCHED_BORE */ + + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; + +@@ -189,6 
+206,13 @@ static inline void update_load_set(struct load_weight *lw, unsigned long w) + * + * This idea comes from the SD scheduler of Con Kolivas: + */ ++#ifdef CONFIG_SCHED_BORE ++static void update_sysctl(void) { ++ sysctl_sched_base_slice = nsecs_per_tick * ++ max(1UL, DIV_ROUND_UP(sysctl_sched_min_base_slice, nsecs_per_tick)); ++} ++void sched_update_min_base_slice(void) { update_sysctl(); } ++#else /* !CONFIG_SCHED_BORE */ + static unsigned int get_update_sysctl_factor(void) + { + unsigned int cpus = min_t(unsigned int, num_online_cpus(), 8); +@@ -219,6 +243,7 @@ static void update_sysctl(void) + SET_SYSCTL(sched_base_slice); + #undef SET_SYSCTL + } ++#endif /* CONFIG_SCHED_BORE */ + + void __init sched_init_granularity(void) + { +@@ -957,7 +982,11 @@ struct sched_entity *__pick_first_entity(struct cfs_rq *cfs_rq) + */ + static inline void set_protect_slice(struct cfs_rq *cfs_rq, struct sched_entity *se) + { ++#ifdef CONFIG_SCHED_BORE ++ u64 slice = sysctl_sched_base_slice; ++#else /* CONFIG_SCHED_BORE */ + u64 slice = normalized_sysctl_sched_base_slice; ++#endif /* CONFIG_SCHED_BORE */ + u64 vprot = se->deadline; + + if (sched_feat(RUN_TO_PARITY)) +@@ -1035,6 +1064,11 @@ static struct sched_entity *__pick_eevdf(struct cfs_rq *cfs_rq, bool protect) + curr = NULL; + + if (curr && protect && protect_slice(curr)) ++#ifdef CONFIG_SCHED_BORE ++ if (!static_branch_likely(&sched_bore_key) || ++ !entity_is_task(curr) || ++ !task_of(curr)->bore.futex_waiting) ++#endif /* CONFIG_SCHED_BORE */ + return curr; + + /* Pick the leftmost entity if it's eligible */ +@@ -1096,6 +1130,7 @@ struct sched_entity *__pick_last_entity(struct cfs_rq *cfs_rq) + /************************************************************** + * Scheduling class statistics methods: + */ ++#if !defined(CONFIG_SCHED_BORE) + int sched_update_scaling(void) + { + unsigned int factor = get_update_sysctl_factor(); +@@ -1107,6 +1142,7 @@ int sched_update_scaling(void) + + return 0; + } ++#endif /* CONFIG_SCHED_BORE */ + + static void clear_buddies(struct cfs_rq *cfs_rq, struct sched_entity *se); + +@@ -1307,6 +1343,11 @@ static void update_curr(struct cfs_rq *cfs_rq) + resched = update_deadline(cfs_rq, curr); + + if (entity_is_task(curr)) { ++#ifdef CONFIG_SCHED_BORE ++ struct task_struct *p = task_of(curr); ++ update_curr_bore(p, delta_exec); ++#endif /* CONFIG_SCHED_BORE */ ++ + /* + * If the fair_server is active, we need to account for the + * fair_server time whether or not the task is running on +@@ -3843,17 +3884,23 @@ dequeue_load_avg(struct cfs_rq *cfs_rq, struct sched_entity *se) + + static void place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags); + +-static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, ++void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + unsigned long weight) + { + bool curr = cfs_rq->curr == se; + bool rel_vprot = false; + u64 vprot; ++#ifdef CONFIG_SCHED_BORE ++ s64 vlag_unscaled = 0; ++#endif /* !CONFIG_SCHED_BORE */ + + if (se->on_rq) { + /* commit outstanding execution time */ + update_curr(cfs_rq); + update_entity_lag(cfs_rq, se); ++#ifdef CONFIG_SCHED_BORE ++ vlag_unscaled = se->vlag; ++#endif /* !CONFIG_SCHED_BORE */ + se->deadline -= se->vruntime; + se->rel_deadline = 1; + if (curr && protect_slice(se)) { +@@ -3889,6 +3936,16 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, + + enqueue_load_avg(cfs_rq, se); + if (se->on_rq) { ++#ifdef CONFIG_SCHED_BORE ++ if (curr) { ++ se->vruntime += vlag_unscaled - 
se->vlag; ++ if (se->rel_deadline) { ++ se->deadline += se->vruntime; ++ se->rel_deadline = 0; ++ } ++ } ++ else ++#endif /* !CONFIG_SCHED_BORE */ + place_entity(cfs_rq, se, 0); + if (rel_vprot) + se->vprot = se->vruntime + vprot; +@@ -5164,12 +5221,11 @@ void __setparam_fair(struct task_struct *p, const struct sched_attr *attr) + static void + place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + { +- u64 vslice, vruntime = avg_vruntime(cfs_rq); ++ u64 vslice = 0, vruntime = avg_vruntime(cfs_rq); + s64 lag = 0; + + if (!se->custom_slice) + se->slice = sysctl_sched_base_slice; +- vslice = calc_delta_fair(se->slice, se); + + /* + * Due to how V is constructed as the weighted average of entities, +@@ -5254,7 +5310,18 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + se->rel_deadline = 0; + return; + } +- ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key) && ++ entity_is_task(se) && ++ task_of(se)->bore.futex_waiting) ++ goto vslice_found; ++#endif /* !CONFIG_SCHED_BORE */ ++ vslice = calc_delta_fair(se->slice, se); ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key)) ++ vslice >>= !!(flags & (ENQUEUE_INITIAL | ENQUEUE_WAKEUP)); ++ else ++#endif /* CONFIG_SCHED_BORE */ + /* + * When joining the competition; the existing tasks will be, + * on average, halfway through their slice, as such start tasks +@@ -5263,6 +5330,9 @@ place_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if (sched_feat(PLACE_DEADLINE_INITIAL) && (flags & ENQUEUE_INITIAL)) + vslice /= 2; + ++#ifdef CONFIG_SCHED_BORE ++vslice_found: ++#endif /* CONFIG_SCHED_BORE */ + /* + * EEVDF: vd_i = ve_i + r_i/w_i + */ +@@ -5273,7 +5343,7 @@ static void check_enqueue_throttle(struct cfs_rq *cfs_rq); + static inline int cfs_rq_throttled(struct cfs_rq *cfs_rq); + + static void +-requeue_delayed_entity(struct sched_entity *se); ++requeue_delayed_entity(struct sched_entity *se, int flags); + + static void + enqueue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) +@@ -5431,6 +5501,10 @@ dequeue_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, int flags) + if (sched_feat(DELAY_DEQUEUE) && delay && + !entity_eligible(cfs_rq, se)) { + update_load_avg(cfs_rq, se, 0); ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key) && sched_feat(DELAY_ZERO)) ++ update_entity_lag(cfs_rq, se); ++#endif /* CONFIG_SCHED_BORE */ + set_delayed(se); + return false; + } +@@ -6902,7 +6976,7 @@ static int sched_idle_cpu(int cpu) + } + + static void +-requeue_delayed_entity(struct sched_entity *se) ++requeue_delayed_entity(struct sched_entity *se, int flags) + { + struct cfs_rq *cfs_rq = cfs_rq_of(se); + +@@ -6915,13 +6989,22 @@ requeue_delayed_entity(struct sched_entity *se) + WARN_ON_ONCE(!se->on_rq); + + if (sched_feat(DELAY_ZERO)) { ++#ifdef CONFIG_SCHED_BORE ++ if (static_branch_likely(&sched_bore_key)) ++ flags |= ENQUEUE_WAKEUP; ++ else { ++#endif /* CONFIG_SCHED_BORE */ ++ flags = 0; + update_entity_lag(cfs_rq, se); ++#ifdef CONFIG_SCHED_BORE ++ } ++#endif /* CONFIG_SCHED_BORE */ + if (se->vlag > 0) { + cfs_rq->nr_queued--; + if (se != cfs_rq->curr) + __dequeue_entity(cfs_rq, se); + se->vlag = 0; +- place_entity(cfs_rq, se, 0); ++ place_entity(cfs_rq, se, flags); + if (se != cfs_rq->curr) + __enqueue_entity(cfs_rq, se); + cfs_rq->nr_queued++; +@@ -6961,7 +7044,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_enqueue(&rq->cfs, p); + + if (flags & ENQUEUE_DELAYED) { +- 
requeue_delayed_entity(se); ++ requeue_delayed_entity(se, flags); + return; + } + +@@ -6979,7 +7062,7 @@ enqueue_task_fair(struct rq *rq, struct task_struct *p, int flags) + for_each_sched_entity(se) { + if (se->on_rq) { + if (se->sched_delayed) +- requeue_delayed_entity(se); ++ requeue_delayed_entity(se, flags); + break; + } + cfs_rq = cfs_rq_of(se); +@@ -7186,6 +7269,15 @@ static bool dequeue_task_fair(struct rq *rq, struct task_struct *p, int flags) + util_est_dequeue(&rq->cfs, p); + + util_est_update(&rq->cfs, p, flags & DEQUEUE_SLEEP); ++#ifdef CONFIG_SCHED_BORE ++ struct cfs_rq *cfs_rq = cfs_rq_of(&p->se); ++ struct sched_entity *se = &p->se; ++ if ((flags & DEQUEUE_SLEEP) && entity_is_task(se)) { ++ if (cfs_rq->curr == se) ++ update_curr(cfs_rq); ++ restart_burst_bore(p); ++ } ++#endif /* CONFIG_SCHED_BORE */ + if (dequeue_entities(rq, &p->se, flags) < 0) + return false; + +@@ -9097,16 +9189,25 @@ static void yield_task_fair(struct rq *rq) + /* + * Are we the only task in the tree? + */ ++#if !defined(CONFIG_SCHED_BORE) + if (unlikely(rq->nr_running == 1)) + return; + + clear_buddies(cfs_rq, se); ++#endif /* CONFIG_SCHED_BORE */ + + update_rq_clock(rq); + /* + * Update run-time statistics of the 'current'. + */ + update_curr(cfs_rq); ++#ifdef CONFIG_SCHED_BORE ++ restart_burst_rescale_deadline_bore(curr); ++ if (unlikely(rq->nr_running == 1)) ++ return; ++ ++ clear_buddies(cfs_rq, se); ++#endif /* CONFIG_SCHED_BORE */ + /* + * Tell update_rq_clock() that we've just updated, + * so we don't do microscopic update in schedule() +@@ -13586,6 +13687,9 @@ static void switched_to_fair(struct rq *rq, struct task_struct *p) + WARN_ON_ONCE(p->se.sched_delayed); + + attach_task_cfs_rq(p); ++#ifdef CONFIG_SCHED_BORE ++ reset_task_bore(p); ++#endif /* CONFIG_SCHED_BORE */ + + set_task_max_allowed_capacity(p); + +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index 1ef9ba480..4b5bbf708 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2219,7 +2219,11 @@ extern int group_balance_cpu(struct sched_group *sg); + extern void update_sched_domain_debugfs(void); + extern void dirty_sched_domain_sysctl(int cpu); + ++#ifdef CONFIG_SCHED_BORE ++extern void sched_update_min_base_slice(void); ++#else /* !CONFIG_SCHED_BORE */ + extern int sched_update_scaling(void); ++#endif /* CONFIG_SCHED_BORE */ + + static inline const struct cpumask *task_user_cpus(struct task_struct *p) + { +@@ -3013,7 +3017,12 @@ extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + extern __read_mostly unsigned int sysctl_sched_nr_migrate; + extern __read_mostly unsigned int sysctl_sched_migration_cost; + ++#ifdef CONFIG_SCHED_BORE ++extern unsigned int sysctl_sched_min_base_slice; ++extern __read_mostly uint sysctl_sched_base_slice; ++#else /* !CONFIG_SCHED_BORE */ + extern unsigned int sysctl_sched_base_slice; ++#endif /* CONFIG_SCHED_BORE */ + + extern int sysctl_resched_latency_warn_ms; + extern int sysctl_resched_latency_warn_once; +-- +2.53.0 + diff --git a/sys-kernel/git-sources/0002-glitched-additional-timer-tick-frequencies.patch b/sys-kernel/gentoo-sources-7.0/0101-glitched-additional-timer-tick-frequencies.patch similarity index 100% rename from sys-kernel/git-sources/0002-glitched-additional-timer-tick-frequencies.patch rename to sys-kernel/gentoo-sources-7.0/0101-glitched-additional-timer-tick-frequencies.patch diff --git a/sys-kernel/git-sources/0001-asus.patch b/sys-kernel/git-sources/0001-asus.patch new file mode 100644 index 0000000..75ef225 --- /dev/null +++ 
b/sys-kernel/git-sources/0001-asus.patch @@ -0,0 +1,6038 @@ +From b5b4f8345dc0d81e7922485af45f5384008db8bf Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:53 +0800 +Subject: [PATCH 1/4] asus + +Signed-off-by: Eric Naim +--- + .../ABI/testing/sysfs-platform-asus-wmi | 17 + + drivers/hid/Kconfig | 9 + + drivers/hid/Makefile | 1 + + drivers/hid/hid-asus-ally.c | 2197 +++++++++++++++++ + drivers/hid/hid-asus-ally.h | 398 +++ + drivers/hid/hid-asus.c | 29 +- + drivers/hid/hid-asus.h | 13 + + drivers/hid/hid-ids.h | 1 + + drivers/platform/x86/Kconfig | 23 + + drivers/platform/x86/Makefile | 1 + + drivers/platform/x86/asus-armoury.c | 1174 +++++++++ + drivers/platform/x86/asus-armoury.h | 1278 ++++++++++ + drivers/platform/x86/asus-wmi.c | 171 +- + include/linux/platform_data/x86/asus-wmi.h | 22 + + 14 files changed, 5293 insertions(+), 41 deletions(-) + create mode 100644 drivers/hid/hid-asus-ally.c + create mode 100644 drivers/hid/hid-asus-ally.h + create mode 100644 drivers/hid/hid-asus.h + create mode 100644 drivers/platform/x86/asus-armoury.c + create mode 100644 drivers/platform/x86/asus-armoury.h + +diff --git a/Documentation/ABI/testing/sysfs-platform-asus-wmi b/Documentation/ABI/testing/sysfs-platform-asus-wmi +index 28144371a0f1..765d50b0d9df 100644 +--- a/Documentation/ABI/testing/sysfs-platform-asus-wmi ++++ b/Documentation/ABI/testing/sysfs-platform-asus-wmi +@@ -63,6 +63,7 @@ Date: Aug 2022 + KernelVersion: 6.1 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Switch the GPU hardware MUX mode. Laptops with this feature can + can be toggled to boot with only the dGPU (discrete mode) or in + standard Optimus/Hybrid mode. On switch a reboot is required: +@@ -75,6 +76,7 @@ Date: Aug 2022 + KernelVersion: 5.17 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Disable discrete GPU: + * 0 - Enable dGPU, + * 1 - Disable dGPU +@@ -84,6 +86,7 @@ Date: Aug 2022 + KernelVersion: 5.17 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Enable the external GPU paired with ROG X-Flow laptops. + Toggling this setting will also trigger ACPI to disable the dGPU: + +@@ -95,6 +98,7 @@ Date: Aug 2022 + KernelVersion: 5.17 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Enable an LCD response-time boost to reduce or remove ghosting: + * 0 - Disable, + * 1 - Enable +@@ -104,6 +108,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Get the current charging mode being used: + * 1 - Barrel connected charger, + * 2 - USB-C charging +@@ -114,6 +119,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Show if the egpu (XG Mobile) is correctly connected: + * 0 - False, + * 1 - True +@@ -123,6 +129,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Change the mini-LED mode: + * 0 - Single-zone, + * 1 - Multi-zone +@@ -133,6 +140,7 @@ Date: Apr 2024 + KernelVersion: 6.10 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + List the available mini-led modes. + + What: /sys/devices/platform//ppt_pl1_spl +@@ -140,6 +148,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the Package Power Target total of CPU: PL1 on Intel, SPL on AMD. 
+ Shown on Intel+Nvidia or AMD+Nvidia based systems: + +@@ -150,6 +159,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the Slow Package Power Tracking Limit of CPU: PL2 on Intel, SPPT, + on AMD. Shown on Intel+Nvidia or AMD+Nvidia based systems: + +@@ -160,6 +170,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the Fast Package Power Tracking Limit of CPU. AMD+Nvidia only: + * min=5, max=250 + +@@ -168,6 +179,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the APU SPPT limit. Shown on full AMD systems only: + * min=5, max=130 + +@@ -176,6 +188,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the platform SPPT limit. Shown on full AMD systems only: + * min=5, max=130 + +@@ -184,6 +197,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the dynamic boost limit of the Nvidia dGPU: + * min=5, max=25 + +@@ -192,6 +206,7 @@ Date: Jun 2023 + KernelVersion: 6.5 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set the target temperature limit of the Nvidia dGPU: + * min=75, max=87 + +@@ -200,6 +215,7 @@ Date: Apr 2024 + KernelVersion: 6.10 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set if the BIOS POST sound is played on boot. + * 0 - False, + * 1 - True +@@ -209,6 +225,7 @@ Date: Apr 2024 + KernelVersion: 6.10 + Contact: "Luke Jones" + Description: ++ DEPRECATED, WILL BE REMOVED SOON + Set if the MCU can go in to low-power mode on system sleep + * 0 - False, + * 1 - True +diff --git a/drivers/hid/Kconfig b/drivers/hid/Kconfig +index 79997553d8f9..d3147e48a8f1 100644 +--- a/drivers/hid/Kconfig ++++ b/drivers/hid/Kconfig +@@ -191,6 +191,15 @@ config HID_ASUS + - GL553V series + - GL753V series + ++config HID_ASUS_ALLY ++ tristate "Asus Ally gamepad configuration support" ++ depends on USB_HID ++ depends on LEDS_CLASS ++ depends on LEDS_CLASS_MULTICOLOR ++ select POWER_SUPPLY ++ help ++ Support for configuring the Asus ROG Ally gamepad using attributes. 
++ + config HID_AUREAL + tristate "Aureal" + help +diff --git a/drivers/hid/Makefile b/drivers/hid/Makefile +index 10ae5dedbd84..958f67193c85 100644 +--- a/drivers/hid/Makefile ++++ b/drivers/hid/Makefile +@@ -33,6 +33,7 @@ obj-$(CONFIG_HID_APPLETB_BL) += hid-appletb-bl.o + obj-$(CONFIG_HID_APPLETB_KBD) += hid-appletb-kbd.o + obj-$(CONFIG_HID_CREATIVE_SB0540) += hid-creative-sb0540.o + obj-$(CONFIG_HID_ASUS) += hid-asus.o ++obj-$(CONFIG_HID_ASUS_ALLY) += hid-asus-ally.o + obj-$(CONFIG_HID_AUREAL) += hid-aureal.o + obj-$(CONFIG_HID_BELKIN) += hid-belkin.o + obj-$(CONFIG_HID_BETOP_FF) += hid-betopff.o +diff --git a/drivers/hid/hid-asus-ally.c b/drivers/hid/hid-asus-ally.c +new file mode 100644 +index 000000000000..e78625f70c44 +--- /dev/null ++++ b/drivers/hid/hid-asus-ally.c +@@ -0,0 +1,2197 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * HID driver for Asus ROG laptops and Ally ++ * ++ * Copyright (c) 2023 Luke Jones ++ */ ++ ++#include "linux/compiler_attributes.h" ++#include "linux/device.h" ++#include ++#include ++#include "linux/pm.h" ++#include "linux/printk.h" ++#include "linux/slab.h" ++#include ++#include ++#include ++#include ++#include ++ ++#include "hid-ids.h" ++#include "hid-asus.h" ++#include "hid-asus-ally.h" ++ ++#define DEBUG ++ ++#define READY_MAX_TRIES 3 ++#define FEATURE_REPORT_ID 0x0d ++#define FEATURE_ROG_ALLY_REPORT_ID 0x5a ++#define FEATURE_ROG_ALLY_CODE_PAGE 0xD1 ++#define FEATURE_ROG_ALLY_REPORT_SIZE 64 ++#define ALLY_X_INPUT_REPORT_USB 0x0B ++#define ALLY_X_INPUT_REPORT_USB_SIZE 16 ++ ++#define ROG_ALLY_REPORT_SIZE 64 ++#define ROG_ALLY_X_MIN_MCU 313 ++#define ROG_ALLY_MIN_MCU 319 ++ ++#define FEATURE_KBD_LED_REPORT_ID1 0x5d ++#define FEATURE_KBD_LED_REPORT_ID2 0x5e ++ ++#define BTN_DATA_LEN 11; ++#define BTN_CODE_BYTES_LEN 8 ++ ++static const u8 EC_INIT_STRING[] = { 0x5A, 'A', 'S', 'U', 'S', ' ', 'T', 'e','c', 'h', '.', 'I', 'n', 'c', '.', '\0' }; ++static const u8 EC_MODE_LED_APPLY[] = { 0x5A, 0xB4, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; ++static const u8 EC_MODE_LED_SET[] = { 0x5A, 0xB5, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0 }; ++static const u8 FORCE_FEEDBACK_OFF[] = { 0x0D, 0x0F, 0x00, 0x00, 0x00, 0x00, 0xFF, 0x00, 0xEB }; ++ ++static const struct hid_device_id rog_ally_devices[] = { ++ { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY) }, ++ { HID_USB_DEVICE(USB_VENDOR_ID_ASUSTEK, USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X) }, ++ {} ++}; ++ ++struct btn_code_map { ++ u64 code; ++ const char *name; ++}; ++ ++static const struct btn_code_map ally_btn_codes[] = { ++ { 0, "NONE" }, ++ /* Gamepad button codes */ ++ { BTN_PAD_A, "PAD_A" }, ++ { BTN_PAD_B, "PAD_B" }, ++ { BTN_PAD_X, "PAD_X" }, ++ { BTN_PAD_Y, "PAD_Y" }, ++ { BTN_PAD_LB, "PAD_LB" }, ++ { BTN_PAD_RB, "PAD_RB" }, ++ { BTN_PAD_LS, "PAD_LS" }, ++ { BTN_PAD_RS, "PAD_RS" }, ++ { BTN_PAD_DPAD_UP, "PAD_DPAD_UP" }, ++ { BTN_PAD_DPAD_DOWN, "PAD_DPAD_DOWN" }, ++ { BTN_PAD_DPAD_LEFT, "PAD_DPAD_LEFT" }, ++ { BTN_PAD_DPAD_RIGHT, "PAD_DPAD_RIGHT" }, ++ { BTN_PAD_VIEW, "PAD_VIEW" }, ++ { BTN_PAD_MENU, "PAD_MENU" }, ++ { BTN_PAD_XBOX, "PAD_XBOX" }, ++ ++ /* Triggers mapped to keyboard codes */ ++ { BTN_KB_M2, "KB_M2" }, ++ { BTN_KB_M1, "KB_M1" }, ++ { BTN_KB_ESC, "KB_ESC" }, ++ { BTN_KB_F1, "KB_F1" }, ++ { BTN_KB_F2, "KB_F2" }, ++ { BTN_KB_F3, "KB_F3" }, ++ { BTN_KB_F4, "KB_F4" }, ++ { BTN_KB_F5, "KB_F5" }, ++ { BTN_KB_F6, "KB_F6" }, ++ { BTN_KB_F7, "KB_F7" }, ++ { BTN_KB_F8, "KB_F8" }, ++ { BTN_KB_F9, "KB_F9" }, ++ { BTN_KB_F10, "KB_F10" }, ++ { BTN_KB_F11, "KB_F11" }, 
++ { BTN_KB_F12, "KB_F12" }, ++ { BTN_KB_F14, "KB_F14" }, ++ { BTN_KB_F15, "KB_F15" }, ++ { BTN_KB_BACKTICK, "KB_BACKTICK" }, ++ { BTN_KB_1, "KB_1" }, ++ { BTN_KB_2, "KB_2" }, ++ { BTN_KB_3, "KB_3" }, ++ { BTN_KB_4, "KB_4" }, ++ { BTN_KB_5, "KB_5" }, ++ { BTN_KB_6, "KB_6" }, ++ { BTN_KB_7, "KB_7" }, ++ { BTN_KB_8, "KB_8" }, ++ { BTN_KB_9, "KB_9" }, ++ { BTN_KB_0, "KB_0" }, ++ { BTN_KB_HYPHEN, "KB_HYPHEN" }, ++ { BTN_KB_EQUALS, "KB_EQUALS" }, ++ { BTN_KB_BACKSPACE, "KB_BACKSPACE" }, ++ { BTN_KB_TAB, "KB_TAB" }, ++ { BTN_KB_Q, "KB_Q" }, ++ { BTN_KB_W, "KB_W" }, ++ { BTN_KB_E, "KB_E" }, ++ { BTN_KB_R, "KB_R" }, ++ { BTN_KB_T, "KB_T" }, ++ { BTN_KB_Y, "KB_Y" }, ++ { BTN_KB_U, "KB_U" }, ++ { BTN_KB_O, "KB_O" }, ++ { BTN_KB_P, "KB_P" }, ++ { BTN_KB_LBRACKET, "KB_LBRACKET" }, ++ { BTN_KB_RBRACKET, "KB_RBRACKET" }, ++ { BTN_KB_BACKSLASH, "KB_BACKSLASH" }, ++ { BTN_KB_CAPS, "KB_CAPS" }, ++ { BTN_KB_A, "KB_A" }, ++ { BTN_KB_S, "KB_S" }, ++ { BTN_KB_D, "KB_D" }, ++ { BTN_KB_F, "KB_F" }, ++ { BTN_KB_G, "KB_G" }, ++ { BTN_KB_H, "KB_H" }, ++ { BTN_KB_J, "KB_J" }, ++ { BTN_KB_K, "KB_K" }, ++ { BTN_KB_L, "KB_L" }, ++ { BTN_KB_SEMI, "KB_SEMI" }, ++ { BTN_KB_QUOTE, "KB_QUOTE" }, ++ { BTN_KB_RET, "KB_RET" }, ++ { BTN_KB_LSHIFT, "KB_LSHIFT" }, ++ { BTN_KB_Z, "KB_Z" }, ++ { BTN_KB_X, "KB_X" }, ++ { BTN_KB_C, "KB_C" }, ++ { BTN_KB_V, "KB_V" }, ++ { BTN_KB_B, "KB_B" }, ++ { BTN_KB_N, "KB_N" }, ++ { BTN_KB_M, "KB_M" }, ++ { BTN_KB_COMMA, "KB_COMMA" }, ++ { BTN_KB_PERIOD, "KB_PERIOD" }, ++ { BTN_KB_RSHIFT, "KB_RSHIFT" }, ++ { BTN_KB_LCTL, "KB_LCTL" }, ++ { BTN_KB_META, "KB_META" }, ++ { BTN_KB_LALT, "KB_LALT" }, ++ { BTN_KB_SPACE, "KB_SPACE" }, ++ { BTN_KB_RALT, "KB_RALT" }, ++ { BTN_KB_MENU, "KB_MENU" }, ++ { BTN_KB_RCTL, "KB_RCTL" }, ++ { BTN_KB_PRNTSCN, "KB_PRNTSCN" }, ++ { BTN_KB_SCRLCK, "KB_SCRLCK" }, ++ { BTN_KB_PAUSE, "KB_PAUSE" }, ++ { BTN_KB_INS, "KB_INS" }, ++ { BTN_KB_HOME, "KB_HOME" }, ++ { BTN_KB_PGUP, "KB_PGUP" }, ++ { BTN_KB_DEL, "KB_DEL" }, ++ { BTN_KB_END, "KB_END" }, ++ { BTN_KB_PGDWN, "KB_PGDWN" }, ++ { BTN_KB_UP_ARROW, "KB_UP_ARROW" }, ++ { BTN_KB_DOWN_ARROW, "KB_DOWN_ARROW" }, ++ { BTN_KB_LEFT_ARROW, "KB_LEFT_ARROW" }, ++ { BTN_KB_RIGHT_ARROW, "KB_RIGHT_ARROW" }, ++ ++ /* Numpad mappings */ ++ { BTN_NUMPAD_LOCK, "NUMPAD_LOCK" }, ++ { BTN_NUMPAD_FWDSLASH, "NUMPAD_FWDSLASH" }, ++ { BTN_NUMPAD_ASTERISK, "NUMPAD_ASTERISK" }, ++ { BTN_NUMPAD_HYPHEN, "NUMPAD_HYPHEN" }, ++ { BTN_NUMPAD_0, "NUMPAD_0" }, ++ { BTN_NUMPAD_1, "NUMPAD_1" }, ++ { BTN_NUMPAD_2, "NUMPAD_2" }, ++ { BTN_NUMPAD_3, "NUMPAD_3" }, ++ { BTN_NUMPAD_4, "NUMPAD_4" }, ++ { BTN_NUMPAD_5, "NUMPAD_5" }, ++ { BTN_NUMPAD_6, "NUMPAD_6" }, ++ { BTN_NUMPAD_7, "NUMPAD_7" }, ++ { BTN_NUMPAD_8, "NUMPAD_8" }, ++ { BTN_NUMPAD_9, "NUMPAD_9" }, ++ { BTN_NUMPAD_PLUS, "NUMPAD_PLUS" }, ++ { BTN_NUMPAD_ENTER, "NUMPAD_ENTER" }, ++ { BTN_NUMPAD_PERIOD, "NUMPAD_PERIOD" }, ++ ++ /* Mouse mappings */ ++ { BTN_MOUSE_LCLICK, "MOUSE_LCLICK" }, ++ { BTN_MOUSE_RCLICK, "MOUSE_RCLICK" }, ++ { BTN_MOUSE_MCLICK, "MOUSE_MCLICK" }, ++ { BTN_MOUSE_WHEEL_UP, "MOUSE_WHEEL_UP" }, ++ { BTN_MOUSE_WHEEL_DOWN, "MOUSE_WHEEL_DOWN" }, ++ ++ /* Media mappings */ ++ { BTN_MEDIA_SCREENSHOT, "MEDIA_SCREENSHOT" }, ++ { BTN_MEDIA_SHOW_KEYBOARD, "MEDIA_SHOW_KEYBOARD" }, ++ { BTN_MEDIA_SHOW_DESKTOP, "MEDIA_SHOW_DESKTOP" }, ++ { BTN_MEDIA_START_RECORDING, "MEDIA_START_RECORDING" }, ++ { BTN_MEDIA_MIC_OFF, "MEDIA_MIC_OFF" }, ++ { BTN_MEDIA_VOL_DOWN, "MEDIA_VOL_DOWN" }, ++ { BTN_MEDIA_VOL_UP, "MEDIA_VOL_UP" }, ++}; ++static const size_t keymap_len = ARRAY_SIZE(ally_btn_codes); ++ ++/* 
byte_array must be >= 8 in length */ ++static void btn_code_to_byte_array(u64 keycode, u8 *byte_array) ++{ ++ /* Convert the u64 to bytes[8] */ ++ for (int i = 0; i < 8; ++i) { ++ byte_array[i] = (keycode >> (56 - 8 * i)) & 0xFF; ++ } ++} ++ ++static u64 name_to_btn(const char *name) ++{ ++ int len = strcspn(name, "\n"); ++ for (size_t i = 0; i < keymap_len; ++i) { ++ if (strncmp(ally_btn_codes[i].name, name, len) == 0) { ++ return ally_btn_codes[i].code; ++ } ++ } ++ return -EINVAL; ++} ++ ++static const char* btn_to_name(u64 key) ++{ ++ for (size_t i = 0; i < keymap_len; ++i) { ++ if (ally_btn_codes[i].code == key) { ++ return ally_btn_codes[i].name; ++ } ++ } ++ return NULL; ++} ++ ++struct btn_data { ++ u64 button; ++ u64 macro; ++ bool turbo; ++}; ++ ++struct btn_mapping { ++ struct btn_data btn_a; ++ struct btn_data btn_b; ++ struct btn_data btn_x; ++ struct btn_data btn_y; ++ struct btn_data btn_lb; ++ struct btn_data btn_rb; ++ struct btn_data btn_ls; ++ struct btn_data btn_rs; ++ struct btn_data btn_lt; ++ struct btn_data btn_rt; ++ struct btn_data dpad_up; ++ struct btn_data dpad_down; ++ struct btn_data dpad_left; ++ struct btn_data dpad_right; ++ struct btn_data btn_view; ++ struct btn_data btn_menu; ++ struct btn_data btn_m1; ++ struct btn_data btn_m2; ++}; ++ ++struct deadzone { ++ u8 inner; ++ u8 outer; ++}; ++ ++struct response_curve { ++ uint8_t move_pct_1; ++ uint8_t response_pct_1; ++ uint8_t move_pct_2; ++ uint8_t response_pct_2; ++ uint8_t move_pct_3; ++ uint8_t response_pct_3; ++ uint8_t move_pct_4; ++ uint8_t response_pct_4; ++} __packed; ++ ++struct js_axis_calibrations { ++ uint16_t left_y_stable; ++ uint16_t left_y_min; ++ uint16_t left_y_max; ++ uint16_t left_x_stable; ++ uint16_t left_x_min; ++ uint16_t left_x_max; ++ uint16_t right_y_stable; ++ uint16_t right_y_min; ++ uint16_t right_y_max; ++ uint16_t right_x_stable; ++ uint16_t right_x_min; ++ uint16_t right_x_max; ++} __packed; ++ ++struct tr_axis_calibrations { ++ uint16_t left_stable; ++ uint16_t left_max; ++ uint16_t right_stable; ++ uint16_t right_max; ++} __packed; ++ ++/* ROG Ally has many settings related to the gamepad, all using the same n-key endpoint */ ++struct ally_gamepad_cfg { ++ struct hid_device *hdev; ++ struct input_dev *input; ++ ++ enum xpad_mode mode; ++ /* ++ * index: [mode] ++ */ ++ struct btn_mapping key_mapping[xpad_mode_mouse]; ++ /* ++ * index: left, right ++ * max: 64 ++ */ ++ u8 vibration_intensity[2]; ++ ++ /* deadzones */ ++ struct deadzone ls_dz; // left stick ++ struct deadzone rs_dz; // right stick ++ struct deadzone lt_dz; // left trigger ++ struct deadzone rt_dz; // right trigger ++ /* anti-deadzones */ ++ u8 ls_adz; // left stick ++ u8 rs_adz; // right stick ++ /* joystick response curves */ ++ struct response_curve ls_rc; ++ struct response_curve rs_rc; ++ ++ struct js_axis_calibrations js_cal; ++ struct tr_axis_calibrations tr_cal; ++}; ++ ++/* The hatswitch outputs integers, we use them to index this X|Y pair */ ++static const int hat_values[][2] = { ++ { 0, 0 }, { 0, -1 }, { 1, -1 }, { 1, 0 }, { 1, 1 }, ++ { 0, 1 }, { -1, 1 }, { -1, 0 }, { -1, -1 }, ++}; ++ ++/* rumble packet structure */ ++struct ff_data { ++ u8 enable; ++ u8 magnitude_left; ++ u8 magnitude_right; ++ u8 magnitude_strong; ++ u8 magnitude_weak; ++ u8 pulse_sustain_10ms; ++ u8 pulse_release_10ms; ++ u8 loop_count; ++} __packed; ++ ++struct ff_report { ++ u8 report_id; ++ struct ff_data ff; ++} __packed; ++ ++struct ally_x_input_report { ++ uint16_t x, y; ++ uint16_t rx, ry; ++ uint16_t z, rz; ++ 
uint8_t buttons[4]; ++} __packed; ++ ++struct ally_x_device { ++ struct input_dev *input; ++ struct hid_device *hdev; ++ spinlock_t lock; ++ ++ struct ff_report *ff_packet; ++ struct work_struct output_worker; ++ bool output_worker_initialized; ++ /* Prevent multiple queued event due to the enforced delay in worker */ ++ bool update_qam_btn; ++ /* Set if the QAM and AC buttons emit Xbox and Xbox+A */ ++ bool qam_btns_steam_mode; ++ bool update_ff; ++}; ++ ++struct ally_rgb_dev { ++ struct hid_device *hdev; ++ struct led_classdev_mc led_rgb_dev; ++ struct work_struct work; ++ bool output_worker_initialized; ++ spinlock_t lock; ++ ++ bool removed; ++ bool update_rgb; ++ uint8_t red[4]; ++ uint8_t green[4]; ++ uint8_t blue[4]; ++}; ++ ++struct ally_rgb_data { ++ uint8_t brightness; ++ uint8_t red[4]; ++ uint8_t green[4]; ++ uint8_t blue[4]; ++ bool initialized; ++}; ++ ++static struct ally_drvdata { ++ struct hid_device *hdev; ++ struct ally_x_device *ally_x; ++ struct ally_gamepad_cfg *gamepad_cfg; ++ struct ally_rgb_dev *led_rgb_dev; ++ struct ally_rgb_data led_rgb_data; ++ uint mcu_version; ++} drvdata; ++ ++static void reverse_bytes_in_pairs(u8 *buf, size_t size) { ++ uint16_t *word_ptr; ++ size_t i; ++ ++ for (i = 0; i < size; i += 2) { ++ if (i + 1 < size) { ++ word_ptr = (uint16_t *)&buf[i]; ++ *word_ptr = cpu_to_be16(*word_ptr); ++ } ++ } ++} ++ ++/** ++ * asus_dev_set_report - send set report request to device. ++ * ++ * @hdev: hid device ++ * @buf: in/out data to transfer ++ * @len: length of buf ++ * ++ * Return: count of data transferred, negative if error ++ * ++ * Same behavior as hid_hw_raw_request. Note that the input buffer is duplicated. ++ */ ++static int asus_dev_set_report(struct hid_device *hdev, const u8 *buf, size_t len) ++{ ++ unsigned char *dmabuf; ++ int ret; ++ ++ dmabuf = kmemdup(buf, len, GFP_KERNEL); ++ if (!dmabuf) ++ return -ENOMEM; ++ ++ ret = hid_hw_raw_request(hdev, buf[0], dmabuf, len, HID_FEATURE_REPORT, ++ HID_REQ_SET_REPORT); ++ kfree(dmabuf); ++ ++ return ret; ++} ++ ++/** ++ * asus_dev_get_report - send get report request to device. ++ * ++ * @hdev: hid device ++ * @out: buffer to write output data in to ++ * @len: length the output buffer provided ++ * ++ * Return: count of data transferred, negative if error ++ * ++ * Same behavior as hid_hw_raw_request. 
++ */ ++static int asus_dev_get_report(struct hid_device *hdev, u8 *out, size_t len) ++{ ++ return hid_hw_raw_request(hdev, FEATURE_REPORT_ID, out, len, ++ HID_FEATURE_REPORT, HID_REQ_GET_REPORT); ++} ++ ++static u8 get_endpoint_address(struct hid_device *hdev) ++{ ++ struct usb_interface *intf; ++ struct usb_host_endpoint *ep; ++ ++ intf = to_usb_interface(hdev->dev.parent); ++ ++ if (intf) { ++ ep = intf->cur_altsetting->endpoint; ++ if (ep) { ++ return ep->desc.bEndpointAddress; ++ } ++ } ++ ++ return -ENODEV; ++} ++ ++/**************************************************************************************************/ ++/* ROG Ally gamepad configuration */ ++/**************************************************************************************************/ ++ ++/* This should be called before any attempts to set device functions */ ++static int ally_gamepad_check_ready(struct hid_device *hdev) ++{ ++ int ret, count; ++ u8 *hidbuf; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ ret = 0; ++ for (count = 0; count < READY_MAX_TRIES; count++) { ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_check_ready; ++ hidbuf[3] = 01; ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ hid_dbg(hdev, "ROG Ally check failed set report: %d\n", ret); ++ ++ hidbuf[0] = hidbuf[1] = hidbuf[2] = hidbuf[3] = 0; ++ ret = asus_dev_get_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ hid_dbg(hdev, "ROG Ally check failed get report: %d\n", ret); ++ ++ ret = hidbuf[2] == xpad_cmd_check_ready; ++ if (ret) ++ break; ++ usleep_range( ++ 1000, ++ 2000); /* don't spam the entire loop in less than USB response time */ ++ } ++ ++ if (count == READY_MAX_TRIES) ++ hid_warn(hdev, "ROG Ally never responded with a ready\n"); ++ ++ kfree(hidbuf); ++ return ret; ++} ++ ++/* VIBRATION INTENSITY ****************************************************************************/ ++static ssize_t gamepad_vibration_intensity_index_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "left right\n"); ++} ++ ++ALLY_DEVICE_ATTR_RO(gamepad_vibration_intensity_index, vibration_intensity_index); ++ ++static ssize_t _gamepad_apply_intensity(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_vibe_intensity; ++ hidbuf[3] = xpad_cmd_len_vibe_intensity; ++ hidbuf[4] = ally_cfg->vibration_intensity[0]; ++ hidbuf[5] = ally_cfg->vibration_intensity[1]; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static ssize_t gamepad_vibration_intensity_show(struct device *dev, ++ struct device_attribute *attr, char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ return sysfs_emit( ++ buf, "%d %d\n", ++ ally_cfg->vibration_intensity[0], ++ ally_cfg->vibration_intensity[1]); ++} ++ ++static ssize_t gamepad_vibration_intensity_store(struct device *dev, ++ struct device_attribute *attr, const char *buf, 
++ size_t count) ++{ ++ struct hid_device *hdev = to_hid_device(dev); ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ u32 left, right; ++ int ret; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ if (sscanf(buf, "%d %d", &left, &right) != 2) ++ return -EINVAL; ++ ++ if (left > 64 || right > 64) ++ return -EINVAL; ++ ++ ally_cfg->vibration_intensity[0] = left; ++ ally_cfg->vibration_intensity[1] = right; ++ ++ ret = _gamepad_apply_intensity(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++ALLY_DEVICE_ATTR_RW(gamepad_vibration_intensity, vibration_intensity); ++ ++/* ANALOGUE DEADZONES *****************************************************************************/ ++static ssize_t _gamepad_apply_deadzones(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ return ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_js_dz; ++ hidbuf[3] = xpad_cmd_len_deadzone; ++ hidbuf[4] = ally_cfg->ls_dz.inner; ++ hidbuf[5] = ally_cfg->ls_dz.outer; ++ hidbuf[6] = ally_cfg->rs_dz.inner; ++ hidbuf[7] = ally_cfg->rs_dz.outer; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto end; ++ ++ hidbuf[2] = xpad_cmd_set_tr_dz; ++ hidbuf[4] = ally_cfg->lt_dz.inner; ++ hidbuf[5] = ally_cfg->lt_dz.outer; ++ hidbuf[6] = ally_cfg->rt_dz.inner; ++ hidbuf[7] = ally_cfg->rt_dz.outer; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto end; ++ ++end: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static void _gamepad_set_deadzones_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ ally_cfg->ls_dz.inner = 0x00; ++ ally_cfg->ls_dz.outer = 0x64; ++ ally_cfg->rs_dz.inner = 0x00; ++ ally_cfg->rs_dz.outer = 0x64; ++ ally_cfg->lt_dz.inner = 0x00; ++ ally_cfg->lt_dz.outer = 0x64; ++ ally_cfg->rt_dz.inner = 0x00; ++ ally_cfg->rt_dz.outer = 0x64; ++} ++ ++static ssize_t axis_xyz_deadzone_index_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ return sysfs_emit(buf, "inner outer\n"); ++} ++ ++ALLY_DEVICE_ATTR_RO(axis_xyz_deadzone_index, deadzone_index); ++ ++ALLY_DEADZONES(axis_xy_left, ls_dz); ++ALLY_DEADZONES(axis_xy_right, rs_dz); ++ALLY_DEADZONES(axis_z_left, lt_dz); ++ALLY_DEADZONES(axis_z_right, rt_dz); ++ ++/* ANTI-DEADZONES *********************************************************************************/ ++static ssize_t _gamepad_apply_js_ADZ(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_adz; ++ hidbuf[3] = xpad_cmd_len_adz; ++ hidbuf[4] = ally_cfg->ls_adz; ++ hidbuf[5] = ally_cfg->rs_adz; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static void _gamepad_set_anti_deadzones_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ ally_cfg->ls_adz = 0x00; ++ ally_cfg->rs_adz = 0x00; ++} ++ ++static ssize_t _gamepad_js_ADZ_store(struct 
device *dev, const char *buf, u8 *adz) ++{ ++ int ret, val; ++ ++ ret = kstrtoint(buf, 0, &val); ++ if (ret) ++ return ret; ++ ++ if (val < 0 || val > 32) ++ return -EINVAL; ++ ++ *adz = val; ++ ++ return ret; ++} ++ ++static ssize_t axis_xy_left_anti_deadzone_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ return sysfs_emit(buf, "%d\n", ally_cfg->ls_adz); ++} ++ ++static ssize_t axis_xy_left_anti_deadzone_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int ret; ++ ++ ret = _gamepad_js_ADZ_store(dev, buf, &ally_cfg->ls_adz); ++ if (ret) ++ return ret; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_RW(axis_xy_left_anti_deadzone, anti_deadzone); ++ ++static ssize_t axis_xy_right_anti_deadzone_show(struct device *dev, ++ struct device_attribute *attr, ++ char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ return sysfs_emit(buf, "%d\n", ally_cfg->rs_adz); ++} ++ ++static ssize_t axis_xy_right_anti_deadzone_store(struct device *dev, ++ struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int ret; ++ ++ ret = _gamepad_js_ADZ_store(dev, buf, &ally_cfg->rs_adz); ++ if (ret) ++ return ret; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_RW(axis_xy_right_anti_deadzone, anti_deadzone); ++ ++/* JS RESPONSE CURVES *****************************************************************************/ ++static void _gamepad_set_js_response_curves_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ struct response_curve *js1_rc = &ally_cfg->ls_rc; ++ struct response_curve *js2_rc = &ally_cfg->rs_rc; ++ js1_rc->move_pct_1 = js2_rc->move_pct_1 = 0x16; // 25% ++ js1_rc->move_pct_2 = js2_rc->move_pct_2 = 0x32; // 50% ++ js1_rc->move_pct_3 = js2_rc->move_pct_3 = 0x48; // 75% ++ js1_rc->move_pct_4 = js2_rc->move_pct_4 = 0x64; // 100% ++ js1_rc->response_pct_1 = js2_rc->response_pct_1 = 0x16; ++ js1_rc->response_pct_2 = js2_rc->response_pct_2 = 0x32; ++ js1_rc->response_pct_3 = js2_rc->response_pct_3 = 0x48; ++ js1_rc->response_pct_4 = js2_rc->response_pct_4 = 0x64; ++} ++ ++static ssize_t _gamepad_apply_response_curves(struct hid_device *hdev, ++ struct ally_gamepad_cfg *ally_cfg) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ memcpy(&hidbuf[2], &ally_cfg->ls_rc, sizeof(ally_cfg->ls_rc)); ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ hidbuf[4] = 0x02; ++ memcpy(&hidbuf[5], &ally_cfg->rs_rc, sizeof(ally_cfg->rs_rc)); ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++ALLY_JS_RC_POINT(axis_xy_left, move, 1); ++ALLY_JS_RC_POINT(axis_xy_left, move, 2); ++ALLY_JS_RC_POINT(axis_xy_left, move, 3); ++ALLY_JS_RC_POINT(axis_xy_left, move, 4); ++ALLY_JS_RC_POINT(axis_xy_left, response, 1); ++ALLY_JS_RC_POINT(axis_xy_left, response, 2); ++ALLY_JS_RC_POINT(axis_xy_left, response, 3); ++ALLY_JS_RC_POINT(axis_xy_left, response, 4); ++ ++ALLY_JS_RC_POINT(axis_xy_right, move, 1); ++ALLY_JS_RC_POINT(axis_xy_right, move, 2); 
++ALLY_JS_RC_POINT(axis_xy_right, move, 3); ++ALLY_JS_RC_POINT(axis_xy_right, move, 4); ++ALLY_JS_RC_POINT(axis_xy_right, response, 1); ++ALLY_JS_RC_POINT(axis_xy_right, response, 2); ++ALLY_JS_RC_POINT(axis_xy_right, response, 3); ++ALLY_JS_RC_POINT(axis_xy_right, response, 4); ++ ++/* CALIBRATIONS ***********************************************************************************/ ++static int gamepad_get_calibration(struct hid_device *hdev) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ u8 *hidbuf; ++ int ret, i; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ for (i = 0; i < 2; i++) { ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = 0xD0; ++ hidbuf[2] = 0x03; ++ hidbuf[3] = i + 1; // 0x01 JS, 0x02 TR ++ hidbuf[4] = 0x20; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) { ++ hid_warn(hdev, "ROG Ally check failed set report: %d\n", ret); ++ goto cleanup; ++ } ++ ++ memset(hidbuf, 0, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ret = asus_dev_get_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0 || hidbuf[5] != 1) { ++ hid_warn(hdev, "ROG Ally check failed get report: %d\n", ret); ++ goto cleanup; ++ } ++ ++ if (i == 0) { ++ /* Joystick calibration */ ++ reverse_bytes_in_pairs(&hidbuf[6], sizeof(struct js_axis_calibrations)); ++ ally_cfg->js_cal = *(struct js_axis_calibrations *)&hidbuf[6]; ++ print_hex_dump(KERN_INFO, "HID Buffer JS: ", DUMP_PREFIX_OFFSET, 16, 1, hidbuf, 32, true); ++ struct js_axis_calibrations *cal = &drvdata.gamepad_cfg->js_cal; ++ pr_err("LS_CAL: X: %d, Min: %d, Max: %d", cal->left_x_stable, cal->left_x_min, cal->left_x_max); ++ pr_err("LS_CAL: Y: %d, Min: %d, Max: %d", cal->left_y_stable, cal->left_y_min, cal->left_y_max); ++ pr_err("RS_CAL: X: %d, Min: %d, Max: %d", cal->right_x_stable, cal->right_x_min, cal->right_x_max); ++ pr_err("RS_CAL: Y: %d, Min: %d, Max: %d", cal->right_y_stable, cal->right_y_min, cal->right_y_max); ++ } else { ++ /* Trigger calibration */ ++ reverse_bytes_in_pairs(&hidbuf[6], sizeof(struct tr_axis_calibrations)); ++ ally_cfg->tr_cal = *(struct tr_axis_calibrations *)&hidbuf[6]; ++ print_hex_dump(KERN_INFO, "HID Buffer TR: ", DUMP_PREFIX_OFFSET, 16, 1, hidbuf, 32, true); ++ } ++ } ++ ++cleanup: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static struct attribute *axis_xy_left_attrs[] = { ++ &dev_attr_axis_xy_left_anti_deadzone.attr, ++ &dev_attr_axis_xy_left_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ &dev_attr_axis_xy_left_move_1.attr, ++ &dev_attr_axis_xy_left_move_2.attr, ++ &dev_attr_axis_xy_left_move_3.attr, ++ &dev_attr_axis_xy_left_move_4.attr, ++ &dev_attr_axis_xy_left_response_1.attr, ++ &dev_attr_axis_xy_left_response_2.attr, ++ &dev_attr_axis_xy_left_response_3.attr, ++ &dev_attr_axis_xy_left_response_4.attr, ++ NULL ++}; ++static const struct attribute_group axis_xy_left_attr_group = { ++ .name = "axis_xy_left", ++ .attrs = axis_xy_left_attrs, ++}; ++ ++static struct attribute *axis_xy_right_attrs[] = { ++ &dev_attr_axis_xy_right_anti_deadzone.attr, ++ &dev_attr_axis_xy_right_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ &dev_attr_axis_xy_right_move_1.attr, ++ &dev_attr_axis_xy_right_move_2.attr, ++ &dev_attr_axis_xy_right_move_3.attr, ++ &dev_attr_axis_xy_right_move_4.attr, ++ &dev_attr_axis_xy_right_response_1.attr, ++ &dev_attr_axis_xy_right_response_2.attr, ++ &dev_attr_axis_xy_right_response_3.attr, ++ 
&dev_attr_axis_xy_right_response_4.attr, ++ NULL ++}; ++static const struct attribute_group axis_xy_right_attr_group = { ++ .name = "axis_xy_right", ++ .attrs = axis_xy_right_attrs, ++}; ++ ++static struct attribute *axis_z_left_attrs[] = { ++ &dev_attr_axis_z_left_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ NULL, ++}; ++static const struct attribute_group axis_z_left_attr_group = { ++ .name = "axis_z_left", ++ .attrs = axis_z_left_attrs, ++}; ++ ++static struct attribute *axis_z_right_attrs[] = { ++ &dev_attr_axis_z_right_deadzone.attr, ++ &dev_attr_axis_xyz_deadzone_index.attr, ++ NULL, ++}; ++static const struct attribute_group axis_z_right_attr_group = { ++ .name = "axis_z_right", ++ .attrs = axis_z_right_attrs, ++}; ++ ++/* A HID packet conatins mappings for two buttons: btn1, btn1_macro, btn2, btn2_macro */ ++static void _btn_pair_to_hid_pkt(struct ally_gamepad_cfg *ally_cfg, ++ enum btn_pair_index pair, ++ struct btn_data *btn1, struct btn_data *btn2, ++ u8 *out, int out_len) ++{ ++ int start = 5; ++ ++ out[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ out[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ out[2] = xpad_cmd_set_mapping; ++ out[3] = pair; ++ out[4] = xpad_cmd_len_mapping; ++ ++ btn_code_to_byte_array(btn1->button, &out[start]); ++ start += BTN_DATA_LEN; ++ btn_code_to_byte_array(btn1->macro, &out[start]); ++ start += BTN_DATA_LEN; ++ btn_code_to_byte_array(btn2->button, &out[start]); ++ start += BTN_DATA_LEN; ++ btn_code_to_byte_array(btn2->macro, &out[start]); ++ //print_hex_dump(KERN_DEBUG, "byte_array: ", DUMP_PREFIX_OFFSET, 64, 1, out, 64, false); ++} ++ ++/* Apply the mapping pair to the device */ ++static int _gamepad_apply_btn_pair(struct hid_device *hdev, struct ally_gamepad_cfg *ally_cfg, ++ enum btn_pair_index btn_pair) ++{ ++ u8 mode = ally_cfg->mode - 1; ++ struct btn_data *btn1, *btn2; ++ u8 *hidbuf; ++ int ret; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ return ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ switch (btn_pair) { ++ case btn_pair_dpad_u_d: ++ btn1 = &ally_cfg->key_mapping[mode].dpad_up; ++ btn2 = &ally_cfg->key_mapping[mode].dpad_down; ++ break; ++ case btn_pair_dpad_l_r: ++ btn1 = &ally_cfg->key_mapping[mode].dpad_left; ++ btn2 = &ally_cfg->key_mapping[mode].dpad_right; ++ break; ++ case btn_pair_ls_rs: ++ btn1 = &ally_cfg->key_mapping[mode].btn_ls; ++ btn2 = &ally_cfg->key_mapping[mode].btn_rs; ++ break; ++ case btn_pair_lb_rb: ++ btn1 = &ally_cfg->key_mapping[mode].btn_lb; ++ btn2 = &ally_cfg->key_mapping[mode].btn_rb; ++ break; ++ case btn_pair_lt_rt: ++ btn1 = &ally_cfg->key_mapping[mode].btn_lt; ++ btn2 = &ally_cfg->key_mapping[mode].btn_rt; ++ break; ++ case btn_pair_a_b: ++ btn1 = &ally_cfg->key_mapping[mode].btn_a; ++ btn2 = &ally_cfg->key_mapping[mode].btn_b; ++ break; ++ case btn_pair_x_y: ++ btn1 = &ally_cfg->key_mapping[mode].btn_x; ++ btn2 = &ally_cfg->key_mapping[mode].btn_y; ++ break; ++ case btn_pair_view_menu: ++ btn1 = &ally_cfg->key_mapping[mode].btn_view; ++ btn2 = &ally_cfg->key_mapping[mode].btn_menu; ++ break; ++ case btn_pair_m1_m2: ++ btn1 = &ally_cfg->key_mapping[mode].btn_m1; ++ btn2 = &ally_cfg->key_mapping[mode].btn_m2; ++ break; ++ default: ++ break; ++ } ++ ++ _btn_pair_to_hid_pkt(ally_cfg, btn_pair, btn1, btn2, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ++ kfree(hidbuf); ++ ++ return ret; ++} ++ ++static int _gamepad_apply_turbo(struct hid_device *hdev, struct 
ally_gamepad_cfg *ally_cfg) ++{ ++ struct btn_mapping *map = &ally_cfg->key_mapping[ally_cfg->mode - 1]; ++ u8 *hidbuf; ++ int ret; ++ ++ /* set turbo */ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_turbo; ++ hidbuf[3] = xpad_cmd_len_turbo; ++ ++ hidbuf[4] = map->dpad_up.turbo; ++ hidbuf[6] = map->dpad_down.turbo; ++ hidbuf[8] = map->dpad_left.turbo; ++ hidbuf[10] = map->dpad_right.turbo; ++ ++ hidbuf[12] = map->btn_ls.turbo; ++ hidbuf[14] = map->btn_rs.turbo; ++ hidbuf[16] = map->btn_lb.turbo; ++ hidbuf[18] = map->btn_rb.turbo; ++ ++ hidbuf[20] = map->btn_a.turbo; ++ hidbuf[22] = map->btn_b.turbo; ++ hidbuf[24] = map->btn_x.turbo; ++ hidbuf[26] = map->btn_y.turbo; ++ ++ hidbuf[28] = map->btn_lt.turbo; ++ hidbuf[30] = map->btn_rt.turbo; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ ++ kfree(hidbuf); ++ ++ return ret; ++} ++ ++static ssize_t _gamepad_apply_all(struct hid_device *hdev, struct ally_gamepad_cfg *ally_cfg) ++{ ++ int ret; ++ ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_dpad_u_d); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_dpad_l_r); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_ls_rs); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_lb_rb); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_a_b); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_x_y); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_view_menu); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_m1_m2); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_lt_rt); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_turbo(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_deadzones(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ret = _gamepad_apply_js_ADZ(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ret =_gamepad_apply_response_curves(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ++ return 0; ++} ++ ++static ssize_t gamepad_apply_all_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ struct hid_device *hdev = to_hid_device(dev); ++ int ret; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ ret = _gamepad_apply_all(hdev, ally_cfg); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_WO(gamepad_apply_all, apply_all); ++ ++/* button map attributes, regular and macro*/ ++ALLY_BTN_MAPPING(m1, btn_m1); ++ALLY_BTN_MAPPING(m2, btn_m2); ++ALLY_BTN_MAPPING(view, btn_view); ++ALLY_BTN_MAPPING(menu, btn_menu); ++ALLY_TURBO_BTN_MAPPING(a, btn_a); ++ALLY_TURBO_BTN_MAPPING(b, btn_b); ++ALLY_TURBO_BTN_MAPPING(x, btn_x); ++ALLY_TURBO_BTN_MAPPING(y, btn_y); ++ALLY_TURBO_BTN_MAPPING(lb, btn_lb); ++ALLY_TURBO_BTN_MAPPING(rb, btn_rb); ++ALLY_TURBO_BTN_MAPPING(ls, btn_ls); ++ALLY_TURBO_BTN_MAPPING(rs, btn_rs); ++ALLY_TURBO_BTN_MAPPING(lt, btn_lt); ++ALLY_TURBO_BTN_MAPPING(rt, btn_rt); ++ALLY_TURBO_BTN_MAPPING(dpad_u, dpad_up); ++ALLY_TURBO_BTN_MAPPING(dpad_d, dpad_down); ++ALLY_TURBO_BTN_MAPPING(dpad_l, dpad_left); 
++ALLY_TURBO_BTN_MAPPING(dpad_r, dpad_right); ++ ++static void _gamepad_set_xpad_default(struct ally_gamepad_cfg *ally_cfg) ++{ ++ struct btn_mapping *map = &ally_cfg->key_mapping[ally_cfg->mode - 1]; ++ map->btn_m1.button = BTN_KB_M1; ++ map->btn_m2.button = BTN_KB_M2; ++ map->btn_a.button = BTN_PAD_A; ++ map->btn_b.button = BTN_PAD_B; ++ map->btn_x.button = BTN_PAD_X; ++ map->btn_y.button = BTN_PAD_Y; ++ map->btn_lb.button = BTN_PAD_LB; ++ map->btn_rb.button = BTN_PAD_RB; ++ map->btn_lt.button = BTN_PAD_LT; ++ map->btn_rt.button = BTN_PAD_RT; ++ map->btn_ls.button = BTN_PAD_LS; ++ map->btn_rs.button = BTN_PAD_RS; ++ map->dpad_up.button = BTN_PAD_DPAD_UP; ++ map->dpad_down.button = BTN_PAD_DPAD_DOWN; ++ map->dpad_left.button = BTN_PAD_DPAD_LEFT; ++ map->dpad_right.button = BTN_PAD_DPAD_RIGHT; ++ map->btn_view.button = BTN_PAD_VIEW; ++ map->btn_menu.button = BTN_PAD_MENU; ++} ++ ++static ssize_t btn_mapping_reset_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ switch (ally_cfg->mode) { ++ case xpad_mode_game: ++ _gamepad_set_xpad_default(ally_cfg); ++ break; ++ default: ++ _gamepad_set_xpad_default(ally_cfg); ++ break; ++ } ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_WO(btn_mapping_reset, reset_btn_mapping); ++ ++/* GAMEPAD MODE */ ++static ssize_t _gamepad_set_mode(struct hid_device *hdev, struct ally_gamepad_cfg *ally_cfg, ++ int val) ++{ ++ u8 *hidbuf; ++ int ret; ++ ++ hidbuf = kzalloc(FEATURE_ROG_ALLY_REPORT_SIZE, GFP_KERNEL); ++ if (!hidbuf) ++ return -ENOMEM; ++ ++ hidbuf[0] = FEATURE_ROG_ALLY_REPORT_ID; ++ hidbuf[1] = FEATURE_ROG_ALLY_CODE_PAGE; ++ hidbuf[2] = xpad_cmd_set_mode; ++ hidbuf[3] = xpad_cmd_len_mode; ++ hidbuf[4] = val; ++ ++ ret = ally_gamepad_check_ready(hdev); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = asus_dev_set_report(hdev, hidbuf, FEATURE_ROG_ALLY_REPORT_SIZE); ++ if (ret < 0) ++ goto report_fail; ++ ++ ret = _gamepad_apply_all(hdev, ally_cfg); ++ if (ret < 0) ++ goto report_fail; ++ ++report_fail: ++ kfree(hidbuf); ++ return ret; ++} ++ ++static ssize_t gamepad_mode_show(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ return sysfs_emit(buf, "%d\n", ally_cfg->mode); ++} ++ ++static ssize_t gamepad_mode_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct hid_device *hdev = to_hid_device(dev); ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int ret, val; ++ ++ if (!drvdata.gamepad_cfg) ++ return -ENODEV; ++ ++ ret = kstrtoint(buf, 0, &val); ++ if (ret) ++ return ret; ++ ++ if (val < xpad_mode_game || val > xpad_mode_mouse) ++ return -EINVAL; ++ ++ ally_cfg->mode = val; ++ ++ ret = _gamepad_set_mode(hdev, ally_cfg, val); ++ if (ret < 0) ++ return ret; ++ ++ return count; ++} ++ ++DEVICE_ATTR_RW(gamepad_mode); ++ ++static ssize_t mcu_version_show(struct device *dev, struct device_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", drvdata.mcu_version); ++} ++ ++DEVICE_ATTR_RO(mcu_version); ++ ++/* ROOT LEVEL ATTRS *******************************************************************************/ ++static struct attribute *gamepad_device_attrs[] = { ++ &dev_attr_btn_mapping_reset.attr, ++ &dev_attr_gamepad_mode.attr, ++ &dev_attr_gamepad_apply_all.attr, ++ 
&dev_attr_gamepad_vibration_intensity.attr, ++ &dev_attr_gamepad_vibration_intensity_index.attr, ++ &dev_attr_mcu_version.attr, ++ NULL ++}; ++ ++static const struct attribute_group ally_controller_attr_group = { ++ .attrs = gamepad_device_attrs, ++}; ++ ++static const struct attribute_group *gamepad_device_attr_groups[] = { ++ &ally_controller_attr_group, ++ &axis_xy_left_attr_group, ++ &axis_xy_right_attr_group, ++ &axis_z_left_attr_group, ++ &axis_z_right_attr_group, ++ &btn_mapping_m1_attr_group, ++ &btn_mapping_m2_attr_group, ++ &btn_mapping_a_attr_group, ++ &btn_mapping_b_attr_group, ++ &btn_mapping_x_attr_group, ++ &btn_mapping_y_attr_group, ++ &btn_mapping_lb_attr_group, ++ &btn_mapping_rb_attr_group, ++ &btn_mapping_ls_attr_group, ++ &btn_mapping_rs_attr_group, ++ &btn_mapping_lt_attr_group, ++ &btn_mapping_rt_attr_group, ++ &btn_mapping_dpad_u_attr_group, ++ &btn_mapping_dpad_d_attr_group, ++ &btn_mapping_dpad_l_attr_group, ++ &btn_mapping_dpad_r_attr_group, ++ &btn_mapping_view_attr_group, ++ &btn_mapping_menu_attr_group, ++ NULL, ++}; ++ ++static struct ally_gamepad_cfg *ally_gamepad_cfg_create(struct hid_device *hdev) ++{ ++ struct ally_gamepad_cfg *ally_cfg; ++ struct input_dev *input_dev; ++ int err; ++ ++ ally_cfg = devm_kzalloc(&hdev->dev, sizeof(*ally_cfg), GFP_KERNEL); ++ if (!ally_cfg) ++ return ERR_PTR(-ENOMEM); ++ ally_cfg->hdev = hdev; ++ // Allocate memory for each mode's `btn_mapping` ++ ally_cfg->mode = xpad_mode_game; ++ ++ input_dev = devm_input_allocate_device(&hdev->dev); ++ if (!input_dev) { ++ err = -ENOMEM; ++ goto free_ally_cfg; ++ } ++ ++ input_dev->id.bustype = hdev->bus; ++ input_dev->id.vendor = hdev->vendor; ++ input_dev->id.product = hdev->product; ++ input_dev->id.version = hdev->version; ++ input_dev->uniq = hdev->uniq; ++ input_dev->name = "ASUS ROG Ally Config"; ++ input_set_capability(input_dev, EV_KEY, KEY_PROG1); ++ input_set_capability(input_dev, EV_KEY, KEY_F16); ++ input_set_capability(input_dev, EV_KEY, KEY_F17); ++ input_set_capability(input_dev, EV_KEY, KEY_F18); ++ input_set_drvdata(input_dev, hdev); ++ ++ err = input_register_device(input_dev); ++ if (err) ++ goto free_input_dev; ++ ally_cfg->input = input_dev; ++ ++ /* ignore all errors for this as they are related to USB HID I/O */ ++ _gamepad_set_xpad_default(ally_cfg); ++ ally_cfg->key_mapping[ally_cfg->mode - 1].btn_m1.button = BTN_KB_M1; ++ ally_cfg->key_mapping[ally_cfg->mode - 1].btn_m2.button = BTN_KB_M2; ++ _gamepad_apply_btn_pair(hdev, ally_cfg, btn_pair_m1_m2); ++ gamepad_get_calibration(hdev); ++ ++ ally_cfg->vibration_intensity[0] = 0x64; ++ ally_cfg->vibration_intensity[1] = 0x64; ++ _gamepad_set_deadzones_default(ally_cfg); ++ _gamepad_set_anti_deadzones_default(ally_cfg); ++ _gamepad_set_js_response_curves_default(ally_cfg); ++ ++ drvdata.gamepad_cfg = ally_cfg; // Must asign before attr group setup ++ if (sysfs_create_groups(&hdev->dev.kobj, gamepad_device_attr_groups)) { ++ err = -ENODEV; ++ goto unregister_input_dev; ++ } ++ ++ return ally_cfg; ++ ++unregister_input_dev: ++ input_unregister_device(input_dev); ++ ally_cfg->input = NULL; // Prevent double free when kfree(ally_cfg) happens ++ ++free_input_dev: ++ devm_kfree(&hdev->dev, input_dev); ++ ++free_ally_cfg: ++ devm_kfree(&hdev->dev, ally_cfg); ++ return ERR_PTR(err); ++} ++ ++static void ally_cfg_remove(struct hid_device *hdev) ++{ ++ // __gamepad_set_mode(hdev, drvdata.gamepad_cfg, xpad_mode_mouse); ++ sysfs_remove_groups(&hdev->dev.kobj, gamepad_device_attr_groups); ++} ++ 
++/**************************************************************************************************/ ++/* ROG Ally gamepad i/o and force-feedback */ ++/**************************************************************************************************/ ++static int ally_x_raw_event(struct ally_x_device *ally_x, struct hid_report *report, u8 *data, ++ int size) ++{ ++ struct ally_x_input_report *in_report; ++ unsigned long flags; ++ u8 byte; ++ ++ if (data[0] == 0x0B) { ++ in_report = (struct ally_x_input_report *)&data[1]; ++ ++ input_report_abs(ally_x->input, ABS_X, in_report->x); ++ input_report_abs(ally_x->input, ABS_Y, in_report->y); ++ input_report_abs(ally_x->input, ABS_RX, in_report->rx); ++ input_report_abs(ally_x->input, ABS_RY, in_report->ry); ++ input_report_abs(ally_x->input, ABS_Z, in_report->z); ++ input_report_abs(ally_x->input, ABS_RZ, in_report->rz); ++ ++ byte = in_report->buttons[0]; ++ input_report_key(ally_x->input, BTN_A, byte & BIT(0)); ++ input_report_key(ally_x->input, BTN_B, byte & BIT(1)); ++ input_report_key(ally_x->input, BTN_X, byte & BIT(2)); ++ input_report_key(ally_x->input, BTN_Y, byte & BIT(3)); ++ input_report_key(ally_x->input, BTN_TL, byte & BIT(4)); ++ input_report_key(ally_x->input, BTN_TR, byte & BIT(5)); ++ input_report_key(ally_x->input, BTN_SELECT, byte & BIT(6)); ++ input_report_key(ally_x->input, BTN_START, byte & BIT(7)); ++ ++ byte = in_report->buttons[1]; ++ input_report_key(ally_x->input, BTN_THUMBL, byte & BIT(0)); ++ input_report_key(ally_x->input, BTN_THUMBR, byte & BIT(1)); ++ input_report_key(ally_x->input, BTN_MODE, byte & BIT(2)); ++ ++ byte = in_report->buttons[2]; ++ input_report_abs(ally_x->input, ABS_HAT0X, hat_values[byte][0]); ++ input_report_abs(ally_x->input, ABS_HAT0Y, hat_values[byte][1]); ++ } ++ /* ++ * The MCU used on Ally provides many devices: gamepad, keyboord, mouse, other. ++ * The AC and QAM buttons route through another interface making it difficult to ++ * use the events unless we grab those and use them here. Only works for Ally X. ++ */ ++ else if (data[0] == 0x5A) { ++ if (ally_x->qam_btns_steam_mode) { ++ spin_lock_irqsave(&ally_x->lock, flags); ++ if (data[1] == 0x38 && !ally_x->update_qam_btn) { ++ ally_x->update_qam_btn = true; ++ if (ally_x->output_worker_initialized) ++ schedule_work(&ally_x->output_worker); ++ } ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ /* Left/XBox button. 
Long press does ctrl+alt+del which we can't catch */ ++ input_report_key(ally_x->input, BTN_MODE, data[1] == 0xA6); ++ } else { ++ input_report_key(ally_x->input, KEY_F16, data[1] == 0xA6); ++ input_report_key(ally_x->input, KEY_PROG1, data[1] == 0x38); ++ } ++ /* QAM long press */ ++ input_report_key(ally_x->input, KEY_F17, data[1] == 0xA7); ++ /* QAM long press released */ ++ input_report_key(ally_x->input, KEY_F18, data[1] == 0xA8); ++ } ++ ++ input_sync(ally_x->input); ++ ++ return 0; ++} ++ ++static struct input_dev *ally_x_alloc_input_dev(struct hid_device *hdev, ++ const char *name_suffix) ++{ ++ struct input_dev *input_dev; ++ ++ input_dev = devm_input_allocate_device(&hdev->dev); ++ if (!input_dev) ++ return ERR_PTR(-ENOMEM); ++ ++ input_dev->id.bustype = hdev->bus; ++ input_dev->id.vendor = hdev->vendor; ++ input_dev->id.product = hdev->product; ++ input_dev->id.version = hdev->version; ++ input_dev->uniq = hdev->uniq; ++ input_dev->name = "ASUS ROG Ally X Gamepad"; ++ ++ input_set_drvdata(input_dev, hdev); ++ ++ return input_dev; ++} ++ ++static int ally_x_play_effect(struct input_dev *idev, void *data, struct ff_effect *effect) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ unsigned long flags; ++ ++ if (effect->type != FF_RUMBLE) ++ return 0; ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ ally_x->ff_packet->ff.magnitude_strong = effect->u.rumble.strong_magnitude / 512; ++ ally_x->ff_packet->ff.magnitude_weak = effect->u.rumble.weak_magnitude / 512; ++ ally_x->update_ff = true; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ ++ if (ally_x->output_worker_initialized) ++ schedule_work(&ally_x->output_worker); ++ ++ return 0; ++} ++ ++static void ally_x_work(struct work_struct *work) ++{ ++ struct ally_x_device *ally_x = container_of(work, struct ally_x_device, output_worker); ++ struct ff_report *ff_report = NULL; ++ bool update_qam = false; ++ bool update_ff = false; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ update_ff = ally_x->update_ff; ++ if (ally_x->update_ff) { ++ ff_report = kmemdup(ally_x->ff_packet, sizeof(*ally_x->ff_packet), GFP_KERNEL); ++ ally_x->update_ff = false; ++ } ++ update_qam = ally_x->update_qam_btn; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ ++ if (update_ff && ff_report) { ++ ff_report->ff.magnitude_left = ff_report->ff.magnitude_strong; ++ ff_report->ff.magnitude_right = ff_report->ff.magnitude_weak; ++ asus_dev_set_report(ally_x->hdev, (u8 *)ff_report, sizeof(*ff_report)); ++ } ++ kfree(ff_report); ++ ++ if (update_qam) { ++ /* ++ * The sleeps here are required to allow steam to register the button combo. 
++ */ ++ usleep_range(1000, 2000); ++ input_report_key(ally_x->input, BTN_MODE, 1); ++ input_sync(ally_x->input); ++ ++ msleep(80); ++ input_report_key(ally_x->input, BTN_A, 1); ++ input_sync(ally_x->input); ++ ++ msleep(80); ++ input_report_key(ally_x->input, BTN_A, 0); ++ input_sync(ally_x->input); ++ ++ msleep(80); ++ input_report_key(ally_x->input, BTN_MODE, 0); ++ input_sync(ally_x->input); ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ ally_x->update_qam_btn = false; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ } ++} ++ ++static struct input_dev *ally_x_setup_input(struct hid_device *hdev) ++{ ++ int ret, abs_min = 0, js_abs_max = 65535, tr_abs_max = 1023; ++ struct input_dev *input; ++ ++ input = ally_x_alloc_input_dev(hdev, NULL); ++ if (IS_ERR(input)) ++ return ERR_CAST(input); ++ ++ input_set_abs_params(input, ABS_X, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_Y, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_RX, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_RY, abs_min, js_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_Z, abs_min, tr_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_RZ, abs_min, tr_abs_max, 0, 0); ++ input_set_abs_params(input, ABS_HAT0X, -1, 1, 0, 0); ++ input_set_abs_params(input, ABS_HAT0Y, -1, 1, 0, 0); ++ input_set_capability(input, EV_KEY, BTN_A); ++ input_set_capability(input, EV_KEY, BTN_B); ++ input_set_capability(input, EV_KEY, BTN_X); ++ input_set_capability(input, EV_KEY, BTN_Y); ++ input_set_capability(input, EV_KEY, BTN_TL); ++ input_set_capability(input, EV_KEY, BTN_TR); ++ input_set_capability(input, EV_KEY, BTN_SELECT); ++ input_set_capability(input, EV_KEY, BTN_START); ++ input_set_capability(input, EV_KEY, BTN_MODE); ++ input_set_capability(input, EV_KEY, BTN_THUMBL); ++ input_set_capability(input, EV_KEY, BTN_THUMBR); ++ ++ input_set_capability(input, EV_KEY, KEY_PROG1); ++ input_set_capability(input, EV_KEY, KEY_F16); ++ input_set_capability(input, EV_KEY, KEY_F17); ++ input_set_capability(input, EV_KEY, KEY_F18); ++ ++ input_set_capability(input, EV_FF, FF_RUMBLE); ++ input_ff_create_memless(input, NULL, ally_x_play_effect); ++ ++ ret = input_register_device(input); ++ if (ret) ++ return ERR_PTR(ret); ++ ++ return input; ++} ++ ++static ssize_t ally_x_qam_mode_show(struct device *dev, struct device_attribute *attr, ++ char *buf) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ ++ return sysfs_emit(buf, "%d\n", ally_x->qam_btns_steam_mode); ++} ++ ++static ssize_t ally_x_qam_mode_store(struct device *dev, struct device_attribute *attr, ++ const char *buf, size_t count) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ bool val; ++ int ret; ++ ++ ret = kstrtobool(buf, &val); ++ if (ret < 0) ++ return ret; ++ ++ ally_x->qam_btns_steam_mode = val; ++ ++ return count; ++} ++ALLY_DEVICE_ATTR_RW(ally_x_qam_mode, qam_mode); ++ ++static struct ally_x_device *ally_x_create(struct hid_device *hdev) ++{ ++ uint8_t max_output_report_size; ++ struct ally_x_device *ally_x; ++ struct ff_report *report; ++ int ret; ++ ++ ally_x = devm_kzalloc(&hdev->dev, sizeof(*ally_x), GFP_KERNEL); ++ if (!ally_x) ++ return ERR_PTR(-ENOMEM); ++ ++ ally_x->hdev = hdev; ++ INIT_WORK(&ally_x->output_worker, ally_x_work); ++ spin_lock_init(&ally_x->lock); ++ ally_x->output_worker_initialized = true; ++ ally_x->qam_btns_steam_mode = ++ true; /* Always default to steam mode, it can be changed by userspace attr */ ++ ++ max_output_report_size = sizeof(struct ally_x_input_report); ++ report = 
devm_kzalloc(&hdev->dev, sizeof(*report), GFP_KERNEL); ++ if (!report) { ++ ret = -ENOMEM; ++ goto free_ally_x; ++ } ++ ++ /* None of these bytes will change for the FF command for now */ ++ report->report_id = 0x0D; ++ report->ff.enable = 0x0F; /* Enable all by default */ ++ report->ff.pulse_sustain_10ms = 0xFF; /* Duration */ ++ report->ff.pulse_release_10ms = 0x00; /* Start Delay */ ++ report->ff.loop_count = 0xEB; /* Loop Count */ ++ ally_x->ff_packet = report; ++ ++ ally_x->input = ally_x_setup_input(hdev); ++ if (IS_ERR(ally_x->input)) { ++ ret = PTR_ERR(ally_x->input); ++ goto free_ff_packet; ++ } ++ ++ if (sysfs_create_file(&hdev->dev.kobj, &dev_attr_ally_x_qam_mode.attr)) { ++ ret = -ENODEV; ++ goto unregister_input; ++ } ++ ++ ally_x->update_ff = true; ++ if (ally_x->output_worker_initialized) ++ schedule_work(&ally_x->output_worker); ++ ++ hid_info(hdev, "Registered Ally X controller using %s\n", ++ dev_name(&ally_x->input->dev)); ++ return ally_x; ++ ++unregister_input: ++ input_unregister_device(ally_x->input); ++free_ff_packet: ++ kfree(ally_x->ff_packet); ++free_ally_x: ++ kfree(ally_x); ++ return ERR_PTR(ret); ++} ++ ++static void ally_x_remove(struct hid_device *hdev) ++{ ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&ally_x->lock, flags); ++ ally_x->output_worker_initialized = false; ++ spin_unlock_irqrestore(&ally_x->lock, flags); ++ cancel_work_sync(&ally_x->output_worker); ++ sysfs_remove_file(&hdev->dev.kobj, &dev_attr_ally_x_qam_mode.attr); ++} ++ ++/**************************************************************************************************/ ++/* ROG Ally LED control */ ++/**************************************************************************************************/ ++static void ally_rgb_schedule_work(struct ally_rgb_dev *led) ++{ ++ unsigned long flags; ++ ++ spin_lock_irqsave(&led->lock, flags); ++ if (!led->removed) ++ schedule_work(&led->work); ++ spin_unlock_irqrestore(&led->lock, flags); ++} ++ ++/* ++ * The RGB still has the basic 0-3 level brightness. 
Since the multicolour ++ * brightness is being used in place, set this to max ++ */ ++static int ally_rgb_set_bright_base_max(struct hid_device *hdev) ++{ ++ u8 buf[] = { FEATURE_KBD_LED_REPORT_ID1, 0xba, 0xc5, 0xc4, 0x02 }; ++ ++ return asus_dev_set_report(hdev, buf, sizeof(buf)); ++} ++ ++static void ally_rgb_do_work(struct work_struct *work) ++{ ++ struct ally_rgb_dev *led = container_of(work, struct ally_rgb_dev, work); ++ int ret; ++ unsigned long flags; ++ ++ u8 buf[16] = { [0] = FEATURE_ROG_ALLY_REPORT_ID, ++ [1] = FEATURE_ROG_ALLY_CODE_PAGE, ++ [2] = xpad_cmd_set_leds, ++ [3] = xpad_cmd_len_leds }; ++ ++ spin_lock_irqsave(&led->lock, flags); ++ if (!led->update_rgb) { ++ spin_unlock_irqrestore(&led->lock, flags); ++ return; ++ } ++ ++ for (int i = 0; i < 4; i++) { ++ buf[5 + i * 3] = drvdata.led_rgb_dev->green[i]; ++ buf[6 + i * 3] = drvdata.led_rgb_dev->blue[i]; ++ buf[4 + i * 3] = drvdata.led_rgb_dev->red[i]; ++ } ++ led->update_rgb = false; ++ ++ spin_unlock_irqrestore(&led->lock, flags); ++ ++ ret = asus_dev_set_report(led->hdev, buf, sizeof(buf)); ++ if (ret < 0) ++ hid_err(led->hdev, "Ally failed to set gamepad backlight: %d\n", ret); ++} ++ ++static void ally_rgb_set(struct led_classdev *cdev, enum led_brightness brightness) ++{ ++ struct led_classdev_mc *mc_cdev = lcdev_to_mccdev(cdev); ++ struct ally_rgb_dev *led = container_of(mc_cdev, struct ally_rgb_dev, led_rgb_dev); ++ int intensity, bright; ++ unsigned long flags; ++ ++ led_mc_calc_color_components(mc_cdev, brightness); ++ spin_lock_irqsave(&led->lock, flags); ++ led->update_rgb = true; ++ bright = mc_cdev->led_cdev.brightness; ++ for (int i = 0; i < 4; i++) { ++ intensity = mc_cdev->subled_info[i].intensity; ++ drvdata.led_rgb_dev->red[i] = (((intensity >> 16) & 0xFF) * bright) / 255; ++ drvdata.led_rgb_dev->green[i] = (((intensity >> 8) & 0xFF) * bright) / 255; ++ drvdata.led_rgb_dev->blue[i] = ((intensity & 0xFF) * bright) / 255; ++ } ++ spin_unlock_irqrestore(&led->lock, flags); ++ drvdata.led_rgb_data.initialized = true; ++ ++ ally_rgb_schedule_work(led); ++} ++ ++static int ally_rgb_set_static_from_multi(struct hid_device *hdev) ++{ ++ u8 buf[17] = {FEATURE_KBD_LED_REPORT_ID1, 0xb3}; ++ int ret; ++ ++ /* ++ * Set single zone single colour based on the first LED of EC software mode. 
++ * buf[2] = zone, buf[3] = mode ++ */ ++ buf[4] = drvdata.led_rgb_data.red[0]; ++ buf[5] = drvdata.led_rgb_data.green[0]; ++ buf[6] = drvdata.led_rgb_data.blue[0]; ++ ++ ret = asus_dev_set_report(hdev, buf, sizeof(buf)); ++ if (ret < 0) ++ return ret; ++ ++ ret = asus_dev_set_report(hdev, EC_MODE_LED_APPLY, sizeof(EC_MODE_LED_APPLY)); ++ if (ret < 0) ++ return ret; ++ ++ return asus_dev_set_report(hdev, EC_MODE_LED_SET, sizeof(EC_MODE_LED_SET)); ++} ++ ++/* ++ * Store the RGB values for restoring on resume, and set the static mode to the first LED colour ++*/ ++static void ally_rgb_store_settings(void) ++{ ++ int arr_size = sizeof(drvdata.led_rgb_data.red); ++ ++ struct ally_rgb_dev *led_rgb = drvdata.led_rgb_dev; ++ ++ drvdata.led_rgb_data.brightness = led_rgb->led_rgb_dev.led_cdev.brightness; ++ ++ memcpy(drvdata.led_rgb_data.red, led_rgb->red, arr_size); ++ memcpy(drvdata.led_rgb_data.green, led_rgb->green, arr_size); ++ memcpy(drvdata.led_rgb_data.blue, led_rgb->blue, arr_size); ++ ++ ally_rgb_set_static_from_multi(led_rgb->hdev); ++} ++ ++static void ally_rgb_restore_settings(struct ally_rgb_dev *led_rgb, struct led_classdev *led_cdev, ++ struct mc_subled *mc_led_info) ++{ ++ int arr_size = sizeof(drvdata.led_rgb_data.red); ++ ++ memcpy(led_rgb->red, drvdata.led_rgb_data.red, arr_size); ++ memcpy(led_rgb->green, drvdata.led_rgb_data.green, arr_size); ++ memcpy(led_rgb->blue, drvdata.led_rgb_data.blue, arr_size); ++ for (int i = 0; i < 4; i++) { ++ mc_led_info[i].intensity = (drvdata.led_rgb_data.red[i] << 16) | ++ (drvdata.led_rgb_data.green[i] << 8) | ++ drvdata.led_rgb_data.blue[i]; ++ } ++ led_cdev->brightness = drvdata.led_rgb_data.brightness; ++} ++ ++/* Set LEDs. Call after any setup. */ ++static void ally_rgb_resume(void) ++{ ++ struct ally_rgb_dev *led_rgb = drvdata.led_rgb_dev; ++ struct led_classdev *led_cdev; ++ struct mc_subled *mc_led_info; ++ ++ if (!led_rgb) ++ return; ++ ++ led_cdev = &led_rgb->led_rgb_dev.led_cdev; ++ mc_led_info = led_rgb->led_rgb_dev.subled_info; ++ ++ if (drvdata.led_rgb_data.initialized) { ++ ally_rgb_restore_settings(led_rgb, led_cdev, mc_led_info); ++ led_rgb->update_rgb = true; ++ ally_rgb_schedule_work(led_rgb); ++ ally_rgb_set_bright_base_max(led_rgb->hdev); ++ } ++} ++ ++static int ally_rgb_register(struct hid_device *hdev, struct ally_rgb_dev *led_rgb) ++{ ++ struct mc_subled *mc_led_info; ++ struct led_classdev *led_cdev; ++ ++ mc_led_info = ++ devm_kmalloc_array(&hdev->dev, 12, sizeof(*mc_led_info), GFP_KERNEL | __GFP_ZERO); ++ if (!mc_led_info) ++ return -ENOMEM; ++ ++ mc_led_info[0].color_index = LED_COLOR_ID_RGB; ++ mc_led_info[1].color_index = LED_COLOR_ID_RGB; ++ mc_led_info[2].color_index = LED_COLOR_ID_RGB; ++ mc_led_info[3].color_index = LED_COLOR_ID_RGB; ++ ++ led_rgb->led_rgb_dev.subled_info = mc_led_info; ++ led_rgb->led_rgb_dev.num_colors = 4; ++ ++ led_cdev = &led_rgb->led_rgb_dev.led_cdev; ++ led_cdev->brightness = 128; ++ led_cdev->name = "ally:rgb:joystick_rings"; ++ led_cdev->max_brightness = 255; ++ led_cdev->brightness_set = ally_rgb_set; ++ ++ if (drvdata.led_rgb_data.initialized) { ++ ally_rgb_restore_settings(led_rgb, led_cdev, mc_led_info); ++ } ++ ++ return devm_led_classdev_multicolor_register(&hdev->dev, &led_rgb->led_rgb_dev); ++} ++ ++static struct ally_rgb_dev *ally_rgb_create(struct hid_device *hdev) ++{ ++ struct ally_rgb_dev *led_rgb; ++ int ret; ++ ++ led_rgb = devm_kzalloc(&hdev->dev, sizeof(struct ally_rgb_dev), GFP_KERNEL); ++ if (!led_rgb) ++ return ERR_PTR(-ENOMEM); ++ ++ ret = 
ally_rgb_register(hdev, led_rgb); ++ if (ret < 0) { ++ cancel_work_sync(&led_rgb->work); ++ devm_kfree(&hdev->dev, led_rgb); ++ return ERR_PTR(ret); ++ } ++ ++ led_rgb->hdev = hdev; ++ led_rgb->removed = false; ++ ++ INIT_WORK(&led_rgb->work, ally_rgb_do_work); ++ led_rgb->output_worker_initialized = true; ++ spin_lock_init(&led_rgb->lock); ++ ++ ally_rgb_set_bright_base_max(hdev); ++ ++ /* Not marked as initialized unless ally_rgb_set() is called */ ++ if (drvdata.led_rgb_data.initialized) { ++ msleep(1500); ++ led_rgb->update_rgb = true; ++ ally_rgb_schedule_work(led_rgb); ++ } ++ ++ return led_rgb; ++} ++ ++static void ally_rgb_remove(struct hid_device *hdev) ++{ ++ struct ally_rgb_dev *led_rgb = drvdata.led_rgb_dev; ++ unsigned long flags; ++ int ep; ++ ++ ep = get_endpoint_address(hdev); ++ if (ep != ROG_ALLY_CFG_INTF_IN) ++ return; ++ ++ if (!drvdata.led_rgb_dev || led_rgb->removed) ++ return; ++ ++ spin_lock_irqsave(&led_rgb->lock, flags); ++ led_rgb->removed = true; ++ led_rgb->output_worker_initialized = false; ++ spin_unlock_irqrestore(&led_rgb->lock, flags); ++ cancel_work_sync(&led_rgb->work); ++ devm_led_classdev_multicolor_unregister(&hdev->dev, &led_rgb->led_rgb_dev); ++ ++ hid_info(hdev, "Removed Ally RGB interface"); ++} ++ ++/**************************************************************************************************/ ++/* ROG Ally driver init */ ++/**************************************************************************************************/ ++ ++static int ally_raw_event(struct hid_device *hdev, struct hid_report *report, u8 *data, ++ int size) ++{ ++ struct ally_gamepad_cfg *cfg = drvdata.gamepad_cfg; ++ struct ally_x_device *ally_x = drvdata.ally_x; ++ ++ if (ally_x) { ++ if ((hdev->bus == BUS_USB && report->id == ALLY_X_INPUT_REPORT_USB && ++ size == ALLY_X_INPUT_REPORT_USB_SIZE) || ++ (data[0] == 0x5A)) { ++ ally_x_raw_event(ally_x, report, data, size); ++ } else { ++ return -1; ++ } ++ } ++ ++ if (cfg && !ally_x) { ++ input_report_key(cfg->input, KEY_PROG1, data[1] == 0x38); ++ input_report_key(cfg->input, KEY_F16, data[1] == 0xA6); ++ input_report_key(cfg->input, KEY_F17, data[1] == 0xA7); ++ input_report_key(cfg->input, KEY_F18, data[1] == 0xA8); ++ input_sync(cfg->input); ++ } ++ ++ return 0; ++} ++ ++static int ally_hid_init(struct hid_device *hdev) ++{ ++ int ret; ++ ++ ret = asus_dev_set_report(hdev, EC_INIT_STRING, sizeof(EC_INIT_STRING)); ++ if (ret < 0) { ++ hid_err(hdev, "Ally failed to send init command: %d\n", ret); ++ return ret; ++ } ++ ++ ret = asus_dev_set_report(hdev, FORCE_FEEDBACK_OFF, sizeof(FORCE_FEEDBACK_OFF)); ++ if (ret < 0) ++ hid_err(hdev, "Ally failed to send init command: %d\n", ret); ++ ++ return ret; ++} ++ ++static int ally_hid_probe(struct hid_device *hdev, const struct hid_device_id *_id) ++{ ++ struct usb_interface *intf = to_usb_interface(hdev->dev.parent); ++ struct usb_device *udev = interface_to_usbdev(intf); ++ u16 idProduct = le16_to_cpu(udev->descriptor.idProduct); ++ int ret, ep; ++ ++ ep = get_endpoint_address(hdev); ++ if (ep < 0) ++ return ep; ++ ++ if (ep != ROG_ALLY_CFG_INTF_IN && ++ ep != ROG_ALLY_X_INTF_IN) ++ return -ENODEV; ++ ++ ret = hid_parse(hdev); ++ if (ret) { ++ hid_err(hdev, "Parse failed\n"); ++ return ret; ++ } ++ ++ ret = hid_hw_start(hdev, HID_CONNECT_HIDRAW); ++ if (ret) { ++ hid_err(hdev, "Failed to start HID device\n"); ++ return ret; ++ } ++ ++ ret = hid_hw_open(hdev); ++ if (ret) { ++ hid_err(hdev, "Failed to open HID device\n"); ++ goto err_stop; ++ } ++ ++ /* Initialize MCU even before 
alloc */ ++ ret = ally_hid_init(hdev); ++ if (ret < 0) ++ return ret; ++ ++ drvdata.hdev = hdev; ++ hid_set_drvdata(hdev, &drvdata); ++ ++ /* This should almost always exist */ ++ if (ep == ROG_ALLY_CFG_INTF_IN) { ++ validate_mcu_fw_version(hdev, idProduct); ++ ++ drvdata.led_rgb_dev = ally_rgb_create(hdev); ++ if (IS_ERR(drvdata.led_rgb_dev)) ++ hid_err(hdev, "Failed to create Ally gamepad LEDs.\n"); ++ else ++ hid_info(hdev, "Created Ally RGB LED controls.\n"); ++ ++ drvdata.gamepad_cfg = ally_gamepad_cfg_create(hdev); ++ if (IS_ERR(drvdata.gamepad_cfg)) ++ hid_err(hdev, "Failed to create Ally gamepad attributes.\n"); ++ else ++ hid_info(hdev, "Created Ally gamepad attributes.\n"); ++ ++ if (IS_ERR(drvdata.led_rgb_dev) && IS_ERR(drvdata.gamepad_cfg)) ++ goto err_close; ++ } ++ ++ /* May or may not exist */ ++ if (ep == ROG_ALLY_X_INTF_IN) { ++ drvdata.ally_x = ally_x_create(hdev); ++ if (IS_ERR(drvdata.ally_x)) { ++ hid_err(hdev, "Failed to create Ally X gamepad.\n"); ++ drvdata.ally_x = NULL; ++ goto err_close; ++ } ++ hid_info(hdev, "Created Ally X controller.\n"); ++ ++ // Not required since this endpoint's input is sent through the gamepad input dev ++ if (drvdata.gamepad_cfg && drvdata.gamepad_cfg->input) { ++ input_unregister_device(drvdata.gamepad_cfg->input); ++ hid_info(hdev, "Ally X removed unneeded input dev.\n"); ++ } ++ } ++ ++ return 0; ++ ++err_close: ++ hid_hw_close(hdev); ++err_stop: ++ hid_hw_stop(hdev); ++ return ret; ++} ++ ++static void ally_hid_remove(struct hid_device *hdev) ++{ ++ if (drvdata.led_rgb_dev) ++ ally_rgb_remove(hdev); ++ ++ if (drvdata.ally_x) ++ ally_x_remove(hdev); ++ ++ if (drvdata.gamepad_cfg) ++ ally_cfg_remove(hdev); ++ ++ hid_hw_close(hdev); ++ hid_hw_stop(hdev); ++} ++ ++static int ally_hid_resume(struct hid_device *hdev) ++{ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; ++ int err; ++ ++ if (!ally_cfg) ++ return 0; ++ ++ err = _gamepad_apply_all(hdev, ally_cfg); ++ if (err) ++ return err; ++ ++ return 0; ++} ++ ++static int ally_hid_reset_resume(struct hid_device *hdev) ++{ ++ int ep = get_endpoint_address(hdev); ++ if (ep != ROG_ALLY_CFG_INTF_IN) ++ return 0; ++ ++ ally_hid_init(hdev); ++ ally_rgb_resume(); ++ ++ return ally_hid_resume(hdev); ++} ++ ++static int ally_pm_thaw(struct device *dev) ++{ ++ struct hid_device *hdev = to_hid_device(dev); ++ ++ return ally_hid_reset_resume(hdev); ++} ++ ++static int ally_pm_suspend(struct device *dev) ++{ ++ if (drvdata.led_rgb_dev) { ++ ally_rgb_store_settings(); ++ } ++ ++ return 0; ++} ++ ++static const struct dev_pm_ops ally_pm_ops = { ++ .thaw = ally_pm_thaw, ++ .suspend = ally_pm_suspend, ++ .poweroff = ally_pm_suspend, ++}; ++ ++MODULE_DEVICE_TABLE(hid, rog_ally_devices); ++ ++static struct hid_driver rog_ally_cfg = { .name = "asus_rog_ally", ++ .id_table = rog_ally_devices, ++ .probe = ally_hid_probe, ++ .remove = ally_hid_remove, ++ .raw_event = ally_raw_event, ++ /* HID is the better place for resume functions, not pm_ops */ ++ .resume = ally_hid_resume, ++ /* Ally 1 requires this to reset device state correctly */ ++ .reset_resume = ally_hid_reset_resume, ++ .driver = { ++ .pm = &ally_pm_ops, ++ } ++}; ++ ++static int __init rog_ally_init(void) ++{ ++ return hid_register_driver(&rog_ally_cfg); ++} ++ ++static void __exit rog_ally_exit(void) ++{ ++ hid_unregister_driver(&rog_ally_cfg); ++} ++ ++module_init(rog_ally_init); ++module_exit(rog_ally_exit); ++ ++MODULE_IMPORT_NS("ASUS_WMI"); ++MODULE_IMPORT_NS("HID_ASUS"); ++MODULE_AUTHOR("Luke D.
Jones"); ++MODULE_DESCRIPTION("HID Driver for ASUS ROG Ally gamepad configuration."); ++MODULE_LICENSE("GPL"); +diff --git a/drivers/hid/hid-asus-ally.h b/drivers/hid/hid-asus-ally.h +new file mode 100644 +index 000000000000..c83817589082 +--- /dev/null ++++ b/drivers/hid/hid-asus-ally.h +@@ -0,0 +1,398 @@ ++/* SPDX-License-Identifier: GPL-2.0-or-later ++ * ++ * HID driver for Asus ROG laptops and Ally ++ * ++ * Copyright (c) 2023 Luke Jones ++ */ ++ ++#include ++#include ++ ++/* ++ * the xpad_mode is used inside the mode setting packet and is used ++ * for indexing (xpad_mode - 1) ++ */ ++enum xpad_mode { ++ xpad_mode_game = 0x01, ++ xpad_mode_wasd = 0x02, ++ xpad_mode_mouse = 0x03, ++}; ++ ++/* the xpad_cmd determines which feature is set or queried */ ++enum xpad_cmd { ++ xpad_cmd_set_mode = 0x01, ++ xpad_cmd_set_mapping = 0x02, ++ xpad_cmd_set_js_dz = 0x04, /* deadzones */ ++ xpad_cmd_set_tr_dz = 0x05, /* deadzones */ ++ xpad_cmd_set_vibe_intensity = 0x06, ++ xpad_cmd_set_leds = 0x08, ++ xpad_cmd_check_ready = 0x0A, ++ xpad_cmd_set_turbo = 0x0F, ++ xpad_cmd_set_response_curve = 0x13, ++ xpad_cmd_set_adz = 0x18, ++}; ++ ++/* the xpad_cmd determines which feature is set or queried */ ++enum xpad_cmd_len { ++ xpad_cmd_len_mode = 0x01, ++ xpad_cmd_len_mapping = 0x2c, ++ xpad_cmd_len_deadzone = 0x04, ++ xpad_cmd_len_vibe_intensity = 0x02, ++ xpad_cmd_len_leds = 0x0C, ++ xpad_cmd_len_turbo = 0x20, ++ xpad_cmd_len_response_curve = 0x09, ++ xpad_cmd_len_adz = 0x02, ++}; ++ ++/* Values correspond to the actual HID byte value required */ ++enum btn_pair_index { ++ btn_pair_dpad_u_d = 0x01, ++ btn_pair_dpad_l_r = 0x02, ++ btn_pair_ls_rs = 0x03, ++ btn_pair_lb_rb = 0x04, ++ btn_pair_a_b = 0x05, ++ btn_pair_x_y = 0x06, ++ btn_pair_view_menu = 0x07, ++ btn_pair_m1_m2 = 0x08, ++ btn_pair_lt_rt = 0x09, ++}; ++ ++#define BTN_PAD_A 0x0101000000000000 ++#define BTN_PAD_B 0x0102000000000000 ++#define BTN_PAD_X 0x0103000000000000 ++#define BTN_PAD_Y 0x0104000000000000 ++#define BTN_PAD_LB 0x0105000000000000 ++#define BTN_PAD_RB 0x0106000000000000 ++#define BTN_PAD_LS 0x0107000000000000 ++#define BTN_PAD_RS 0x0108000000000000 ++#define BTN_PAD_DPAD_UP 0x0109000000000000 ++#define BTN_PAD_DPAD_DOWN 0x010A000000000000 ++#define BTN_PAD_DPAD_LEFT 0x010B000000000000 ++#define BTN_PAD_DPAD_RIGHT 0x010C000000000000 ++#define BTN_PAD_LT 0x010D000000000000 ++#define BTN_PAD_RT 0x010E000000000000 ++#define BTN_PAD_VIEW 0x0111000000000000 ++#define BTN_PAD_MENU 0x0112000000000000 ++#define BTN_PAD_XBOX 0x0113000000000000 ++ ++#define BTN_KB_M2 0x02008E0000000000 ++#define BTN_KB_M1 0x02008F0000000000 ++#define BTN_KB_ESC 0x0200760000000000 ++#define BTN_KB_F1 0x0200500000000000 ++#define BTN_KB_F2 0x0200600000000000 ++#define BTN_KB_F3 0x0200400000000000 ++#define BTN_KB_F4 0x02000C0000000000 ++#define BTN_KB_F5 0x0200030000000000 ++#define BTN_KB_F6 0x02000B0000000000 ++#define BTN_KB_F7 0x0200800000000000 ++#define BTN_KB_F8 0x02000A0000000000 ++#define BTN_KB_F9 0x0200010000000000 ++#define BTN_KB_F10 0x0200090000000000 ++#define BTN_KB_F11 0x0200780000000000 ++#define BTN_KB_F12 0x0200070000000000 ++#define BTN_KB_F14 0x0200180000000000 ++#define BTN_KB_F15 0x0200100000000000 ++#define BTN_KB_BACKTICK 0x02000E0000000000 ++#define BTN_KB_1 0x0200160000000000 ++#define BTN_KB_2 0x02001E0000000000 ++#define BTN_KB_3 0x0200260000000000 ++#define BTN_KB_4 0x0200250000000000 ++#define BTN_KB_5 0x02002E0000000000 ++#define BTN_KB_6 0x0200360000000000 ++#define BTN_KB_7 0x02003D0000000000 ++#define BTN_KB_8 
0x02003E0000000000 ++#define BTN_KB_9 0x0200460000000000 ++#define BTN_KB_0 0x0200450000000000 ++#define BTN_KB_HYPHEN 0x02004E0000000000 ++#define BTN_KB_EQUALS 0x0200550000000000 ++#define BTN_KB_BACKSPACE 0x0200660000000000 ++#define BTN_KB_TAB 0x02000D0000000000 ++#define BTN_KB_Q 0x0200150000000000 ++#define BTN_KB_W 0x02001D0000000000 ++#define BTN_KB_E 0x0200240000000000 ++#define BTN_KB_R 0x02002D0000000000 ++#define BTN_KB_T 0x02002C0000000000 ++#define BTN_KB_Y 0x0200350000000000 ++#define BTN_KB_U 0x02003C0000000000 ++#define BTN_KB_O 0x0200440000000000 ++#define BTN_KB_P 0x02004D0000000000 ++#define BTN_KB_LBRACKET 0x0200540000000000 ++#define BTN_KB_RBRACKET 0x02005B0000000000 ++#define BTN_KB_BACKSLASH 0x02005D0000000000 ++#define BTN_KB_CAPS 0x0200580000000000 ++#define BTN_KB_A 0x02001C0000000000 ++#define BTN_KB_S 0x02001B0000000000 ++#define BTN_KB_D 0x0200230000000000 ++#define BTN_KB_F 0x02002B0000000000 ++#define BTN_KB_G 0x0200340000000000 ++#define BTN_KB_H 0x0200330000000000 ++#define BTN_KB_J 0x02003B0000000000 ++#define BTN_KB_K 0x0200420000000000 ++#define BTN_KB_L 0x02004B0000000000 ++#define BTN_KB_SEMI 0x02004C0000000000 ++#define BTN_KB_QUOTE 0x0200520000000000 ++#define BTN_KB_RET 0x02005A0000000000 ++#define BTN_KB_LSHIFT 0x0200880000000000 ++#define BTN_KB_Z 0x02001A0000000000 ++#define BTN_KB_X 0x0200220000000000 ++#define BTN_KB_C 0x0200210000000000 ++#define BTN_KB_V 0x02002A0000000000 ++#define BTN_KB_B 0x0200320000000000 ++#define BTN_KB_N 0x0200310000000000 ++#define BTN_KB_M 0x02003A0000000000 ++#define BTN_KB_COMMA 0x0200410000000000 ++#define BTN_KB_PERIOD 0x0200490000000000 ++#define BTN_KB_RSHIFT 0x0200890000000000 ++#define BTN_KB_LCTL 0x02008C0000000000 ++#define BTN_KB_META 0x0200820000000000 ++#define BTN_KB_LALT 0x02008A0000000000 ++#define BTN_KB_SPACE 0x0200290000000000 ++#define BTN_KB_RALT 0x02008B0000000000 ++#define BTN_KB_MENU 0x0200840000000000 ++#define BTN_KB_RCTL 0x02008D0000000000 ++#define BTN_KB_PRNTSCN 0x0200C30000000000 ++#define BTN_KB_SCRLCK 0x02007E0000000000 ++#define BTN_KB_PAUSE 0x0200910000000000 ++#define BTN_KB_INS 0x0200C20000000000 ++#define BTN_KB_HOME 0x0200940000000000 ++#define BTN_KB_PGUP 0x0200960000000000 ++#define BTN_KB_DEL 0x0200C00000000000 ++#define BTN_KB_END 0x0200950000000000 ++#define BTN_KB_PGDWN 0x0200970000000000 ++#define BTN_KB_UP_ARROW 0x0200980000000000 ++#define BTN_KB_DOWN_ARROW 0x0200990000000000 ++#define BTN_KB_LEFT_ARROW 0x0200910000000000 ++#define BTN_KB_RIGHT_ARROW 0x02009B0000000000 ++ ++#define BTN_NUMPAD_LOCK 0x0200770000000000 ++#define BTN_NUMPAD_FWDSLASH 0x0200900000000000 ++#define BTN_NUMPAD_ASTERISK 0x02007C0000000000 ++#define BTN_NUMPAD_HYPHEN 0x02007B0000000000 ++#define BTN_NUMPAD_0 0x0200700000000000 ++#define BTN_NUMPAD_1 0x0200690000000000 ++#define BTN_NUMPAD_2 0x0200720000000000 ++#define BTN_NUMPAD_3 0x02007A0000000000 ++#define BTN_NUMPAD_4 0x02006B0000000000 ++#define BTN_NUMPAD_5 0x0200730000000000 ++#define BTN_NUMPAD_6 0x0200740000000000 ++#define BTN_NUMPAD_7 0x02006C0000000000 ++#define BTN_NUMPAD_8 0x0200750000000000 ++#define BTN_NUMPAD_9 0x02007D0000000000 ++#define BTN_NUMPAD_PLUS 0x0200790000000000 ++#define BTN_NUMPAD_ENTER 0x0200810000000000 ++#define BTN_NUMPAD_PERIOD 0x0200710000000000 ++ ++#define BTN_MOUSE_LCLICK 0x0300000001000000 ++#define BTN_MOUSE_RCLICK 0x0300000002000000 ++#define BTN_MOUSE_MCLICK 0x0300000003000000 ++#define BTN_MOUSE_WHEEL_UP 0x0300000004000000 ++#define BTN_MOUSE_WHEEL_DOWN 0x0300000005000000 ++ ++#define 
BTN_MEDIA_SCREENSHOT 0x0500001600000000 ++#define BTN_MEDIA_SHOW_KEYBOARD 0x0500001900000000 ++#define BTN_MEDIA_SHOW_DESKTOP 0x0500001C00000000 ++#define BTN_MEDIA_START_RECORDING 0x0500001E00000000 ++#define BTN_MEDIA_MIC_OFF 0x0500000100000000 ++#define BTN_MEDIA_VOL_DOWN 0x0500000200000000 ++#define BTN_MEDIA_VOL_UP 0x0500000300000000 ++ ++#define ALLY_DEVICE_ATTR_WO(_name, _sysfs_name) \ ++ struct device_attribute dev_attr_##_name = \ ++ __ATTR(_sysfs_name, 0200, NULL, _name##_store) ++ ++/* required so we can have nested attributes with same name but different functions */ ++#define ALLY_DEVICE_ATTR_RW(_name, _sysfs_name) \ ++ struct device_attribute dev_attr_##_name = \ ++ __ATTR(_sysfs_name, 0644, _name##_show, _name##_store) ++ ++#define ALLY_DEVICE_ATTR_RO(_name, _sysfs_name) \ ++ struct device_attribute dev_attr_##_name = \ ++ __ATTR(_sysfs_name, 0444, _name##_show, NULL) ++ ++/* button specific macros */ ++#define ALLY_BTN_SHOW(_fname, _btn_name, _secondary) \ ++ static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ const char* name; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ name = btn_to_name(_secondary ? btn->macro : btn->button); \ ++ return sysfs_emit(buf, "%s\n", name); \ ++ } ++ ++#define ALLY_BTN_STORE(_fname, _btn_name, _secondary) \ ++ static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ u64 code; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ code = name_to_btn(buf); \ ++ if (_secondary) \ ++ btn->macro = code; \ ++ else \ ++ btn->button = code; \ ++ return count; \ ++ } ++ ++#define ALLY_TURBO_SHOW(_fname, _btn_name) \ ++ static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ return sysfs_emit(buf, "%d\n", btn->turbo); \ ++ } ++ ++#define ALLY_TURBO_STORE(_fname, _btn_name) \ ++ static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct btn_data *btn; \ ++ bool turbo; \ ++ int ret; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ btn = &ally_cfg->key_mapping[ally_cfg->mode - 1]._btn_name; \ ++ ret = kstrtobool(buf, &turbo); \ ++ if (ret) \ ++ return ret; \ ++ btn->turbo = turbo; \ ++ return count; \ ++ } ++ ++#define ALLY_DEADZONE_SHOW(_fname, _axis_name) \ ++ static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct deadzone *dz; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ dz = &ally_cfg->_axis_name; \ ++ return sysfs_emit(buf, "%d %d\n", dz->inner, dz->outer); \ ++ } ++ ++#define ALLY_DEADZONE_STORE(_fname, _axis_name) \ ++ static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct 
ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ struct hid_device *hdev = to_hid_device(dev); \ ++ u32 inner, outer; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ if (sscanf(buf, "%d %d", &inner, &outer) != 2) \ ++ return -EINVAL; \ ++ if (inner > 64 || outer > 64 || inner > outer) \ ++ return -EINVAL; \ ++ ally_cfg->_axis_name.inner = inner; \ ++ ally_cfg->_axis_name.outer = outer; \ ++ _gamepad_apply_deadzones(hdev, ally_cfg); \ ++ return count; \ ++ } ++ ++#define ALLY_DEADZONES(_fname, _mname) \ ++ ALLY_DEADZONE_SHOW(_fname##_deadzone, _mname); \ ++ ALLY_DEADZONE_STORE(_fname##_deadzone, _mname); \ ++ ALLY_DEVICE_ATTR_RW(_fname##_deadzone, deadzone) ++ ++/* response curve macros */ ++#define ALLY_RESP_CURVE_SHOW(_fname, _mname) \ ++static ssize_t _fname##_show(struct device *dev, \ ++ struct device_attribute *attr, \ ++ char *buf) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ return sysfs_emit(buf, "%d\n", ally_cfg->ls_rc._mname); \ ++ } ++ ++#define ALLY_RESP_CURVE_STORE(_fname, _mname) \ ++static ssize_t _fname##_store(struct device *dev, \ ++ struct device_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct ally_gamepad_cfg *ally_cfg = drvdata.gamepad_cfg; \ ++ int ret, val; \ ++ if (!drvdata.gamepad_cfg) \ ++ return -ENODEV; \ ++ ret = kstrtoint(buf, 0, &val); \ ++ if (ret) \ ++ return ret; \ ++ if (val < 0 || val > 100) \ ++ return -EINVAL; \ ++ ally_cfg->ls_rc._mname = val; \ ++ return count; \ ++ } ++ ++/* _point_n must start at 1 */ ++#define ALLY_JS_RC_POINT(_fname, _mname, _num) \ ++ ALLY_RESP_CURVE_SHOW(_fname##_##_mname##_##_num, _mname##_pct_##_num); \ ++ ALLY_RESP_CURVE_STORE(_fname##_##_mname##_##_num, _mname##_pct_##_num); \ ++ ALLY_DEVICE_ATTR_RW(_fname##_##_mname##_##_num, curve_##_mname##_pct_##_num) ++ ++#define ALLY_BTN_ATTRS_GROUP(_name, _fname) \ ++ static struct attribute *_fname##_attrs[] = { \ ++ &dev_attr_##_fname.attr, \ ++ &dev_attr_##_fname##_macro.attr, \ ++ }; \ ++ static const struct attribute_group _fname##_attr_group = { \ ++ .name = __stringify(_name), \ ++ .attrs = _fname##_attrs, \ ++ } ++ ++#define _ALLY_BTN_REMAP(_fname, _btn_name) \ ++ ALLY_BTN_SHOW(btn_mapping_##_fname##_remap, _btn_name, false); \ ++ ALLY_BTN_STORE(btn_mapping_##_fname##_remap, _btn_name, false); \ ++ ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_remap, remap); ++ ++#define _ALLY_BTN_MACRO(_fname, _btn_name) \ ++ ALLY_BTN_SHOW(btn_mapping_##_fname##_macro, _btn_name, true); \ ++ ALLY_BTN_STORE(btn_mapping_##_fname##_macro, _btn_name, true); \ ++ ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_macro, macro_remap); ++ ++#define ALLY_BTN_MAPPING(_fname, _btn_name) \ ++ _ALLY_BTN_REMAP(_fname, _btn_name) \ ++ _ALLY_BTN_MACRO(_fname, _btn_name) \ ++ static struct attribute *_fname##_attrs[] = { \ ++ &dev_attr_btn_mapping_##_fname##_remap.attr, \ ++ &dev_attr_btn_mapping_##_fname##_macro.attr, \ ++ NULL, \ ++ }; \ ++ static const struct attribute_group btn_mapping_##_fname##_attr_group = { \ ++ .name = __stringify(btn_##_fname), \ ++ .attrs = _fname##_attrs, \ ++ } ++ ++#define ALLY_TURBO_BTN_MAPPING(_fname, _btn_name) \ ++ _ALLY_BTN_REMAP(_fname, _btn_name) \ ++ _ALLY_BTN_MACRO(_fname, _btn_name) \ ++ ALLY_TURBO_SHOW(btn_mapping_##_fname##_turbo, _btn_name); \ ++ ALLY_TURBO_STORE(btn_mapping_##_fname##_turbo, _btn_name); \ ++ ALLY_DEVICE_ATTR_RW(btn_mapping_##_fname##_turbo, turbo); \ ++ static struct attribute *_fname##_turbo_attrs[] = { \ ++ 
&dev_attr_btn_mapping_##_fname##_remap.attr, \ ++ &dev_attr_btn_mapping_##_fname##_macro.attr, \ ++ &dev_attr_btn_mapping_##_fname##_turbo.attr, \ ++ NULL, \ ++ }; \ ++ static const struct attribute_group btn_mapping_##_fname##_attr_group = { \ ++ .name = __stringify(btn_##_fname), \ ++ .attrs = _fname##_turbo_attrs, \ ++ } +diff --git a/drivers/hid/hid-asus.c b/drivers/hid/hid-asus.c +index d27dcfb2b9e4..188eff9d3573 100644 +--- a/drivers/hid/hid-asus.c ++++ b/drivers/hid/hid-asus.c +@@ -23,6 +23,7 @@ + /* + */ + ++#include "linux/export.h" + #include + #include + #include +@@ -33,6 +34,7 @@ + #include + + #include "hid-ids.h" ++#include "hid-asus.h" + + MODULE_AUTHOR("Yusuke Fujimaki "); + MODULE_AUTHOR("Brendan McGrath "); +@@ -601,7 +603,7 @@ static int mcu_request_version(struct hid_device *hdev) + return ret; + } + +-static void validate_mcu_fw_version(struct hid_device *hdev, int idProduct) ++void validate_mcu_fw_version(struct hid_device *hdev, int idProduct) + { + int min_version, version; + +@@ -629,12 +631,11 @@ static void validate_mcu_fw_version(struct hid_device *hdev, int idProduct) + set_ally_mcu_powersave(true); + } + } ++EXPORT_SYMBOL_NS(validate_mcu_fw_version, "HID_ASUS"); + + static int asus_kbd_register_leds(struct hid_device *hdev) + { + struct asus_drvdata *drvdata = hid_get_drvdata(hdev); +- struct usb_interface *intf; +- struct usb_device *udev; + unsigned char kbd_func; + int ret; + +@@ -659,12 +660,14 @@ static int asus_kbd_register_leds(struct hid_device *hdev) + return ret; + } + ++ #if !IS_REACHABLE(CONFIG_HID_ASUS_ALLY) + if (drvdata->quirks & QUIRK_ROG_ALLY_XPAD) { +- intf = to_usb_interface(hdev->dev.parent); +- udev = interface_to_usbdev(intf); ++ struct usb_interface *intf = to_usb_interface(hdev->dev.parent); ++ struct usb_device *udev = interface_to_usbdev(intf); + validate_mcu_fw_version(hdev, + le16_to_cpu(udev->descriptor.idProduct)); + } ++ #endif /* !IS_REACHABLE(CONFIG_HID_ASUS_ALLY) */ + + } else { + /* Initialize keyboard */ +@@ -1122,8 +1125,10 @@ static int __maybe_unused asus_reset_resume(struct hid_device *hdev) + + static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id) + { +- int ret; + struct asus_drvdata *drvdata; ++ struct usb_host_endpoint *ep; ++ struct usb_interface *intf; ++ int ret; + + drvdata = devm_kzalloc(&hdev->dev, sizeof(*drvdata), GFP_KERNEL); + if (drvdata == NULL) { +@@ -1135,6 +1140,18 @@ static int asus_probe(struct hid_device *hdev, const struct hid_device_id *id) + + drvdata->quirks = id->driver_data; + ++ /* Ignore these endpoints as they are used by hid-asus-ally */ ++ #if IS_REACHABLE(CONFIG_HID_ASUS_ALLY) ++ if (drvdata->quirks & QUIRK_ROG_ALLY_XPAD) { ++ intf = to_usb_interface(hdev->dev.parent); ++ ep = intf->cur_altsetting->endpoint; ++ if (ep->desc.bEndpointAddress == ROG_ALLY_X_INTF_IN || ++ ep->desc.bEndpointAddress == ROG_ALLY_CFG_INTF_IN || ++ ep->desc.bEndpointAddress == ROG_ALLY_CFG_INTF_OUT) ++ return -ENODEV; ++ } ++ #endif /* IS_REACHABLE(CONFIG_HID_ASUS_ALLY) */ ++ + /* + * T90CHI's keyboard dock returns same ID values as T100CHI's dock. + * Thus, identify T90CHI dock with product name string. 
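The hid-asus-ally.h remap constants above are opaque 64-bit values; from the defines themselves, the top byte selects the source group (0x01 gamepad, 0x02 keyboard/numpad, 0x03 mouse, 0x05 media) and the key or button code sits in a group-specific byte. The stand-alone sketch below is illustrative only and not part of the patch; the three constants are copied from the header above.

#include <stdint.h>
#include <stdio.h>

/* Copied from hid-asus-ally.h above */
#define BTN_PAD_A		0x0101000000000000ULL
#define BTN_KB_M1		0x02008F0000000000ULL
#define BTN_MOUSE_LCLICK	0x0300000001000000ULL

/* Byte 0 of a mapping code selects the device group */
static unsigned int btn_group(uint64_t code)
{
	return (unsigned int)(code >> 56);
}

int main(void)
{
	const uint64_t samples[] = { BTN_PAD_A, BTN_KB_M1, BTN_MOUSE_LCLICK };

	for (size_t i = 0; i < sizeof(samples) / sizeof(samples[0]); i++)
		printf("0x%016llx -> group 0x%02x\n",
		       (unsigned long long)samples[i], btn_group(samples[i]));

	return 0;
}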
+diff --git a/drivers/hid/hid-asus.h b/drivers/hid/hid-asus.h +new file mode 100644 +index 000000000000..f67dd5a3a1bc +--- /dev/null ++++ b/drivers/hid/hid-asus.h +@@ -0,0 +1,13 @@ ++/* SPDX-License-Identifier: GPL-2.0 */ ++#ifndef __HID_ASUS_H ++#define __HID_ASUS_H ++ ++#include ++ ++#define ROG_ALLY_CFG_INTF_IN 0x83 ++#define ROG_ALLY_CFG_INTF_OUT 0x04 ++#define ROG_ALLY_X_INTF_IN 0x87 ++ ++void validate_mcu_fw_version(struct hid_device *hdev, int idProduct); ++ ++#endif /* __HID_ASUS_H */ +diff --git a/drivers/hid/hid-ids.h b/drivers/hid/hid-ids.h +index 149798754570..a94b734266be 100644 +--- a/drivers/hid/hid-ids.h ++++ b/drivers/hid/hid-ids.h +@@ -225,6 +225,7 @@ + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD2 0x19b6 + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_KEYBOARD3 0x1a30 + #define USB_DEVICE_ID_ASUSTEK_ROG_Z13_LIGHTBAR 0x18c6 ++#define USB_DEVICE_ID_ASUSTEK_ROG_RAIKIRI_PAD 0x1abb + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY 0x1abe + #define USB_DEVICE_ID_ASUSTEK_ROG_NKEY_ALLY_X 0x1b4c + #define USB_DEVICE_ID_ASUSTEK_ROG_CLAYMORE_II_KEYBOARD 0x196b +diff --git a/drivers/platform/x86/Kconfig b/drivers/platform/x86/Kconfig +index 6d238e120dce..fc45a7c8c201 100644 +--- a/drivers/platform/x86/Kconfig ++++ b/drivers/platform/x86/Kconfig +@@ -250,6 +250,18 @@ config ASUS_WIRELESS + If you choose to compile this driver as a module the module will be + called asus-wireless. + ++config ASUS_ARMOURY ++ tristate "ASUS Armoury driver" ++ depends on ASUS_WMI ++ select FW_ATTR_CLASS ++ help ++ Say Y here if you have a WMI aware Asus machine and would like to use the ++ firmware_attributes API to control various settings typically exposed in ++ the ASUS Armoury Crate application available on Windows. ++ ++ To compile this driver as a module, choose M here: the module will ++ be called asus-armoury. ++ + config ASUS_WMI + tristate "ASUS WMI Driver" + depends on ACPI_WMI +@@ -272,6 +284,17 @@ config ASUS_WMI + To compile this driver as a module, choose M here: the module will + be called asus-wmi. + ++config ASUS_WMI_DEPRECATED_ATTRS ++ bool "BIOS option support in WMI platform (DEPRECATED)" ++ depends on ASUS_WMI ++ default y ++ help ++ Say Y to expose the configurable BIOS options through the asus-wmi ++ driver. ++ ++ This can be used with or without the asus-armoury driver which ++ has the same attributes, but more, and better features. ++ + config ASUS_NB_WMI + tristate "Asus Notebook WMI Driver" + depends on ASUS_WMI +diff --git a/drivers/platform/x86/Makefile b/drivers/platform/x86/Makefile +index a0c5848513e3..4279f5443f30 100644 +--- a/drivers/platform/x86/Makefile ++++ b/drivers/platform/x86/Makefile +@@ -32,6 +32,7 @@ obj-$(CONFIG_APPLE_GMUX) += apple-gmux.o + # ASUS + obj-$(CONFIG_ASUS_LAPTOP) += asus-laptop.o + obj-$(CONFIG_ASUS_WIRELESS) += asus-wireless.o ++obj-$(CONFIG_ASUS_ARMOURY) += asus-armoury.o + obj-$(CONFIG_ASUS_WMI) += asus-wmi.o + obj-$(CONFIG_ASUS_NB_WMI) += asus-nb-wmi.o + obj-$(CONFIG_ASUS_TF103C_DOCK) += asus-tf103c-dock.o +diff --git a/drivers/platform/x86/asus-armoury.c b/drivers/platform/x86/asus-armoury.c +new file mode 100644 +index 000000000000..a461be936294 +--- /dev/null ++++ b/drivers/platform/x86/asus-armoury.c +@@ -0,0 +1,1174 @@ ++// SPDX-License-Identifier: GPL-2.0-or-later ++/* ++ * Asus Armoury (WMI) attributes driver. ++ * ++ * This driver uses the fw_attributes class to expose various WMI functions ++ * that are present in many gaming and some non-gaming ASUS laptops. 
++ * ++ * These typically don't fit anywhere else in the sysfs such as under LED class, ++ * hwmon or others, and are set in Windows using the ASUS Armoury Crate tool. ++ * ++ * Copyright(C) 2024 Luke Jones ++ */ ++ ++#define pr_fmt(fmt) KBUILD_MODNAME ": " fmt ++ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "asus-armoury.h" ++#include "firmware_attributes_class.h" ++ ++#define ASUS_NB_WMI_EVENT_GUID "0B3CBB35-E3C2-45ED-91C2-4C5A6D195D1C" ++ ++#define ASUS_MINI_LED_MODE_MASK 0x03 ++/* Standard modes for devices with only on/off */ ++#define ASUS_MINI_LED_OFF 0x00 ++#define ASUS_MINI_LED_ON 0x01 ++/* Like "on" but the effect is more vibrant or brighter */ ++#define ASUS_MINI_LED_STRONG_MODE 0x02 ++/* New modes for devices with 3 mini-led mode types */ ++#define ASUS_MINI_LED_2024_WEAK 0x00 ++#define ASUS_MINI_LED_2024_STRONG 0x01 ++#define ASUS_MINI_LED_2024_OFF 0x02 ++ ++/* Power tunable attribute name defines */ ++#define ATTR_PPT_PL1_SPL "ppt_pl1_spl" ++#define ATTR_PPT_PL2_SPPT "ppt_pl2_sppt" ++#define ATTR_PPT_PL3_FPPT "ppt_pl3_fppt" ++#define ATTR_PPT_APU_SPPT "ppt_apu_sppt" ++#define ATTR_PPT_PLATFORM_SPPT "ppt_platform_sppt" ++#define ATTR_NV_DYNAMIC_BOOST "nv_dynamic_boost" ++#define ATTR_NV_TEMP_TARGET "nv_temp_target" ++#define ATTR_NV_BASE_TGP "nv_base_tgp" ++#define ATTR_NV_TGP "nv_tgp" ++ ++#define ASUS_POWER_CORE_MASK GENMASK(15, 8) ++#define ASUS_PERF_CORE_MASK GENMASK(7, 0) ++ ++enum cpu_core_type { ++ CPU_CORE_PERF = 0, ++ CPU_CORE_POWER, ++}; ++ ++enum cpu_core_value { ++ CPU_CORE_DEFAULT = 0, ++ CPU_CORE_MIN, ++ CPU_CORE_MAX, ++ CPU_CORE_CURRENT, ++}; ++ ++#define CPU_PERF_CORE_COUNT_MIN 4 ++#define CPU_POWR_CORE_COUNT_MIN 0 ++ ++/* Tunables provided by ASUS for gaming laptops */ ++struct cpu_cores { ++ u32 cur_perf_cores; ++ u32 min_perf_cores; ++ u32 max_perf_cores; ++ u32 cur_power_cores; ++ u32 min_power_cores; ++ u32 max_power_cores; ++}; ++ ++struct rog_tunables { ++ const struct power_limits *power_limits; ++ u32 ppt_pl1_spl; // cpu ++ u32 ppt_pl2_sppt; // cpu ++ u32 ppt_pl3_fppt; // cpu ++ u32 ppt_apu_sppt; // plat ++ u32 ppt_platform_sppt; // plat ++ ++ u32 nv_dynamic_boost; ++ u32 nv_temp_target; ++ u32 nv_tgp; ++}; ++ ++static struct asus_armoury_priv { ++ struct device *fw_attr_dev; ++ struct kset *fw_attr_kset; ++ ++ struct cpu_cores *cpu_cores; ++ /* Index 0 for DC, 1 for AC */ ++ struct rog_tunables *rog_tunables[2]; ++ u32 mini_led_dev_id; ++ u32 gpu_mux_dev_id; ++ /* ++ * Mutex to prevent big/little core count changes writing to same ++ * endpoint at the same time. Must lock during attr store. 
++ */ ++ struct mutex cpu_core_mutex; ++} asus_armoury = { ++ .cpu_core_mutex = __MUTEX_INITIALIZER(asus_armoury.cpu_core_mutex) ++}; ++ ++struct fw_attrs_group { ++ bool pending_reboot; ++}; ++ ++static struct fw_attrs_group fw_attrs = { ++ .pending_reboot = false, ++}; ++ ++struct asus_attr_group { ++ const struct attribute_group *attr_group; ++ u32 wmi_devid; ++}; ++ ++static bool asus_wmi_is_present(u32 dev_id) ++{ ++ u32 retval; ++ int status; ++ ++ status = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, &retval); ++ pr_debug("%s called (0x%08x), retval: 0x%08x\n", __func__, dev_id, retval); ++ ++ return status == 0 && (retval & ASUS_WMI_DSTS_PRESENCE_BIT); ++} ++ ++static void asus_set_reboot_and_signal_event(void) ++{ ++ fw_attrs.pending_reboot = true; ++ kobject_uevent(&asus_armoury.fw_attr_dev->kobj, KOBJ_CHANGE); ++} ++ ++static ssize_t pending_reboot_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf) ++{ ++ return sysfs_emit(buf, "%d\n", fw_attrs.pending_reboot); ++} ++ ++static struct kobj_attribute pending_reboot = __ATTR_RO(pending_reboot); ++ ++static bool asus_bios_requires_reboot(struct kobj_attribute *attr) ++{ ++ return !strcmp(attr->attr.name, "gpu_mux_mode") || ++ !strcmp(attr->attr.name, "cores_performance") || ++ !strcmp(attr->attr.name, "cores_efficiency") || ++ !strcmp(attr->attr.name, "panel_hd_mode"); ++} ++ ++static int armoury_wmi_set_devstate(struct kobj_attribute *attr, u32 value, u32 wmi_dev) ++{ ++ u32 result; ++ int err; ++ ++ err = asus_wmi_set_devstate(wmi_dev, value, &result); ++ if (err) { ++ pr_err("Failed to set %s: %d\n", attr->attr.name, err); ++ return err; ++ } ++ /* ++ * !1 is usually considered a fail by ASUS, but some WMI methods do use > 1 ++ * to return a status code or similar. ++ */ ++ if (result < 1) { ++ pr_err("Failed to set %s: (result): 0x%x\n", attr->attr.name, result); ++ return -EIO; ++ } ++ ++ return 0; ++} ++ ++/** ++ * attr_uint_store() - Send a uint to a WMI method, checking it is within the min/max range (inclusive). ++ * @kobj: Pointer to the kobject the attribute belongs to. ++ * @attr: Pointer to the attribute calling this function. ++ * @buf: The buffer to read from; parsed as an unsigned integer. ++ * @count: Required by sysfs attribute macros, passed through from the calling store attribute. ++ * @min: Minimum accepted value. Below this returns -EINVAL. ++ * @max: Maximum accepted value. Above this returns -EINVAL. ++ * @store_value: Pointer to where the parsed value should be stored. ++ * @wmi_dev: The WMI function ID to use. ++ * ++ * This function is intended to be generic so it can be called from any "_store" ++ * attribute which works only with integers. The integer to be sent to the WMI method ++ * is range checked and an error is returned if out of range. ++ * ++ * If the value is valid and the WMI call succeeds, the sysfs attribute is notified, ++ * and if asus_bios_requires_reboot() is true the pending_reboot attribute is also signalled. ++ * ++ * Returns: Either count, or an error.
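++ * ++ * For example, the __WMI_STORE_INT() macro in asus-armoury.h generates thin store ++ * callbacks that simply forward to this function with the attribute's min/max ++ * limits and WMI device ID filled in.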
++ */ ++static ssize_t attr_uint_store(struct kobject *kobj, struct kobj_attribute *attr, const char *buf, ++ size_t count, u32 min, u32 max, u32 *store_value, u32 wmi_dev) ++{ ++ u32 value; ++ int err; ++ ++ err = kstrtouint(buf, 10, &value); ++ if (err) ++ return err; ++ ++ if (value < min || value > max) ++ return -EINVAL; ++ ++ err = armoury_wmi_set_devstate(attr, value, wmi_dev); ++ if (err) ++ return err; ++ ++ if (store_value != NULL) ++ *store_value = value; ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ if (asus_bios_requires_reboot(attr)) ++ asus_set_reboot_and_signal_event(); ++ ++ return count; ++} ++ ++static ssize_t enum_type_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sysfs_emit(buf, "enumeration\n"); ++} ++ ++static ssize_t int_type_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return sysfs_emit(buf, "integer\n"); ++} ++ ++/* Mini-LED mode **************************************************************/ ++static ssize_t mini_led_mode_current_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ u32 value; ++ int err; ++ ++ err = asus_wmi_get_devstate_dsts(asus_armoury.mini_led_dev_id, &value); ++ if (err) ++ return err; ++ ++ value &= ASUS_MINI_LED_MODE_MASK; ++ ++ /* ++ * Remap the mode values to match previous generation mini-LED. The last gen ++ * WMI 0 == off, while on this version WMI 2 == off (flipped). ++ */ ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2) { ++ switch (value) { ++ case ASUS_MINI_LED_2024_WEAK: ++ value = ASUS_MINI_LED_ON; ++ break; ++ case ASUS_MINI_LED_2024_STRONG: ++ value = ASUS_MINI_LED_STRONG_MODE; ++ break; ++ case ASUS_MINI_LED_2024_OFF: ++ value = ASUS_MINI_LED_OFF; ++ break; ++ } ++ } ++ ++ return sysfs_emit(buf, "%u\n", value); ++} ++ ++static ssize_t mini_led_mode_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ u32 mode; ++ int err; ++ ++ err = kstrtou32(buf, 10, &mode); ++ if (err) ++ return err; ++ ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE && ++ mode > ASUS_MINI_LED_ON) ++ return -EINVAL; ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2 && ++ mode > ASUS_MINI_LED_STRONG_MODE) ++ return -EINVAL; ++ ++ /* ++ * Remap the mode values so expected behaviour is the same as the last ++ * generation of mini-LED with 0 == off, 1 == on. 
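++ * Mapping applied below: 0 (ASUS_MINI_LED_OFF) -> ASUS_MINI_LED_2024_OFF (2), ++ * 1 (ASUS_MINI_LED_ON) -> ASUS_MINI_LED_2024_WEAK (0), and 2 ++ * (ASUS_MINI_LED_STRONG_MODE) -> ASUS_MINI_LED_2024_STRONG (1).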
++ */ ++ if (asus_armoury.mini_led_dev_id == ASUS_WMI_DEVID_MINI_LED_MODE2) { ++ switch (mode) { ++ case ASUS_MINI_LED_OFF: ++ mode = ASUS_MINI_LED_2024_OFF; ++ break; ++ case ASUS_MINI_LED_ON: ++ mode = ASUS_MINI_LED_2024_WEAK; ++ break; ++ case ASUS_MINI_LED_STRONG_MODE: ++ mode = ASUS_MINI_LED_2024_STRONG; ++ break; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, mode, asus_armoury.mini_led_dev_id); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ return count; ++} ++ ++static ssize_t mini_led_mode_possible_values_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ switch (asus_armoury.mini_led_dev_id) { ++ case ASUS_WMI_DEVID_MINI_LED_MODE: ++ return sysfs_emit(buf, "0;1\n"); ++ case ASUS_WMI_DEVID_MINI_LED_MODE2: ++ return sysfs_emit(buf, "0;1;2\n"); ++ default: ++ return -ENODEV; ++ } ++} ++ ++ATTR_GROUP_ENUM_CUSTOM(mini_led_mode, "mini_led_mode", "Set the mini-LED backlight mode"); ++ ++static ssize_t gpu_mux_mode_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int result, err; ++ u32 optimus; ++ ++ err = kstrtou32(buf, 10, &optimus); ++ if (err) ++ return err; ++ ++ if (optimus > 1) ++ return -EINVAL; ++ ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_DGPU)) { ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_DGPU, &result); ++ if (err) ++ return err; ++ if (result && !optimus) { ++ pr_warn("Can not switch MUX to dGPU mode when dGPU is disabled: %02X %02X\n", ++ result, optimus); ++ return -ENODEV; ++ } ++ } ++ ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_EGPU)) { ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_EGPU, &result); ++ if (err) ++ return err; ++ if (result && !optimus) { ++ pr_warn("Can not switch MUX to dGPU mode when eGPU is enabled\n"); ++ return -EBUSY; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, optimus, asus_armoury.gpu_mux_dev_id); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ asus_set_reboot_and_signal_event(); ++ ++ return count; ++} ++WMI_SHOW_INT(gpu_mux_mode_current_value, "%u\n", asus_armoury.gpu_mux_dev_id); ++ATTR_GROUP_BOOL_CUSTOM(gpu_mux_mode, "gpu_mux_mode", "Set the GPU display MUX mode"); ++ ++/* ++ * A user may be required to store the value twice, typical store first, then ++ * rescan PCI bus to activate power, then store a second time to save correctly. 
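++ * Illustrative sequence only (exact sysfs paths are an assumption and depend on ++ * how the firmware-attributes class is exposed): write current_value, trigger a ++ * rescan via /sys/bus/pci/rescan, then write current_value again.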
++ */ ++static ssize_t dgpu_disable_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int result, err; ++ u32 disable; ++ ++ err = kstrtou32(buf, 10, &disable); ++ if (err) ++ return err; ++ ++ if (disable > 1) ++ return -EINVAL; ++ ++ if (asus_armoury.gpu_mux_dev_id) { ++ err = asus_wmi_get_devstate_dsts(asus_armoury.gpu_mux_dev_id, &result); ++ if (err) ++ return err; ++ if (!result && disable) { ++ pr_warn("Cannot disable dGPU when the MUX is in dGPU mode\n"); ++ return -EBUSY; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, disable, ASUS_WMI_DEVID_DGPU); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ return count; ++} ++WMI_SHOW_INT(dgpu_disable_current_value, "%d\n", ASUS_WMI_DEVID_DGPU); ++ATTR_GROUP_BOOL_CUSTOM(dgpu_disable, "dgpu_disable", "Disable the dGPU"); ++ ++/* The ACPI call to enable the eGPU also disables the internal dGPU */ ++static ssize_t egpu_enable_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int result, err; ++ u32 enable; ++ ++ err = kstrtou32(buf, 10, &enable); ++ if (err) ++ return err; ++ ++ if (enable > 1) ++ return -EINVAL; ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_EGPU_CONNECTED, &result); ++ if (err) { ++ pr_warn("Failed to get eGPU connection status: %d\n", err); ++ return err; ++ } ++ ++ if (asus_armoury.gpu_mux_dev_id) { ++ err = asus_wmi_get_devstate_dsts(asus_armoury.gpu_mux_dev_id, &result); ++ if (err) { ++ pr_warn("Failed to get GPU MUX status: %d\n", err); ++ return err; ++ } ++ if (!result && enable) { ++ pr_warn("Cannot enable eGPU when the MUX is in dGPU mode\n"); ++ return -ENODEV; ++ } ++ } ++ ++ err = armoury_wmi_set_devstate(attr, enable, ASUS_WMI_DEVID_EGPU); ++ if (err) ++ return err; ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ return count; ++} ++WMI_SHOW_INT(egpu_enable_current_value, "%d\n", ASUS_WMI_DEVID_EGPU); ++ATTR_GROUP_BOOL_CUSTOM(egpu_enable, "egpu_enable", "Enable the eGPU (also disables dGPU)"); ++ ++/* Device memory available to APU */ ++ ++/* Value map for APU memory: some entries look out of order but are correct */ ++static u32 apu_mem_map[] = { ++ [0] = 0x000, /* called "AUTO" in the BIOS; this is the minimum available */ ++ [1] = 0x102, ++ [2] = 0x103, ++ [3] = 0x104, ++ [4] = 0x105, ++ [5] = 0x107, ++ [6] = 0x108, ++ [7] = 0x109, ++ [8] = 0x106, ++}; ++ ++static ssize_t apu_mem_current_value_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ int err; ++ u32 mem; ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_APU_MEM, &mem); ++ if (err) ++ return err; ++ ++ if ((mem & ASUS_WMI_DSTS_PRESENCE_BIT) == 0) ++ return -ENODEV; ++ ++ mem &= ~ASUS_WMI_DSTS_PRESENCE_BIT; ++ ++ /* After 0x000 is set, a read will return 0x100 */ ++ if (mem == 0x100) ++ return sysfs_emit(buf, "0\n"); ++ ++ for (unsigned int i = 0; i < ARRAY_SIZE(apu_mem_map); i++) { ++ if (apu_mem_map[i] == mem) ++ return sysfs_emit(buf, "%u\n", i); ++ } ++ ++ pr_warn("Unrecognised value for APU mem 0x%08x\n", mem); ++ return sysfs_emit(buf, "%u\n", mem); ++} ++ ++static ssize_t apu_mem_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int result, err; ++ u32 requested, mem; ++ ++ result = kstrtou32(buf, 10, &requested); ++ if (result) ++ return result; ++ ++ if (requested >= ARRAY_SIZE(apu_mem_map)) ++ return -EINVAL; ++ ++ mem = apu_mem_map[requested]; ++ ++ err =
asus_wmi_set_devstate(ASUS_WMI_DEVID_APU_MEM, mem, &result); ++ if (err) { ++ pr_warn("Failed to set apu_mem: %d\n", err); ++ return err; ++ } ++ ++ pr_info("APU memory changed to %uGB, reboot required\n", requested+1); ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ ++ asus_set_reboot_and_signal_event(); ++ ++ return count; ++} ++ ++static ssize_t apu_mem_possible_values_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ BUILD_BUG_ON(ARRAY_SIZE(apu_mem_map) != 9); ++ return sysfs_emit(buf, "0;1;2;3;4;5;6;7;8\n"); ++} ++ATTR_GROUP_ENUM_CUSTOM(apu_mem, "apu_mem", "Set available system RAM (in GB) for the APU to use"); ++ ++static int init_max_cpu_cores(void) ++{ ++ u32 cores; ++ int err; ++ ++ asus_armoury.cpu_cores = kzalloc(sizeof(struct cpu_cores), GFP_KERNEL); ++ if (!asus_armoury.cpu_cores) ++ return -ENOMEM; ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_CORES_MAX, &cores); ++ if (err) ++ return err; ++ ++ if ((cores & ASUS_WMI_DSTS_PRESENCE_BIT) == 0) { ++ pr_err("ACPI does not support CPU core count control\n"); ++ err = -ENODEV; ++ goto init_max_cpu_cores_err; ++ } ++ ++ asus_armoury.cpu_cores->max_power_cores = FIELD_GET(ASUS_POWER_CORE_MASK, cores); ++ asus_armoury.cpu_cores->max_perf_cores = FIELD_GET(ASUS_PERF_CORE_MASK, cores); ++ ++ err = asus_wmi_get_devstate_dsts(ASUS_WMI_DEVID_CORES, &cores); ++ if (err) { ++ pr_err("Could not get CPU core count: error %d\n", err); ++ goto init_max_cpu_cores_err; ++ } ++ ++ asus_armoury.cpu_cores->cur_perf_cores = FIELD_GET(ASUS_PERF_CORE_MASK, cores); ++ asus_armoury.cpu_cores->cur_power_cores = FIELD_GET(ASUS_POWER_CORE_MASK, cores); ++ ++ asus_armoury.cpu_cores->min_perf_cores = CPU_PERF_CORE_COUNT_MIN; ++ asus_armoury.cpu_cores->min_power_cores = CPU_POWR_CORE_COUNT_MIN; ++ ++ return 0; ++ ++init_max_cpu_cores_err: ++ kfree(asus_armoury.cpu_cores); ++ return err; ++} ++ ++static ssize_t cores_value_show(struct kobject *kobj, struct kobj_attribute *attr, char *buf, ++ enum cpu_core_type core_type, enum cpu_core_value core_value) ++{ ++ u32 cores; ++ ++ switch (core_value) { ++ case CPU_CORE_DEFAULT: ++ case CPU_CORE_MAX: ++ if (core_type == CPU_CORE_PERF) ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->max_perf_cores); ++ else ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->max_power_cores); ++ case CPU_CORE_MIN: ++ if (core_type == CPU_CORE_PERF) ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->min_perf_cores); ++ else ++ return sysfs_emit(buf, "%u\n", ++ asus_armoury.cpu_cores->min_power_cores); ++ default: ++ break; ++ } ++ ++ if (core_type == CPU_CORE_PERF) ++ cores = asus_armoury.cpu_cores->cur_perf_cores; ++ else ++ cores = asus_armoury.cpu_cores->cur_power_cores; ++ ++ return sysfs_emit(buf, "%u\n", cores); ++} ++ ++static ssize_t cores_current_value_store(struct kobject *kobj, struct kobj_attribute *attr, ++ const char *buf, enum cpu_core_type core_type) ++{ ++ u32 new_cores, perf_cores, power_cores, out_val, min, max; ++ int result, err; ++ ++ result = kstrtou32(buf, 10, &new_cores); ++ if (result) ++ return result; ++ ++ scoped_guard(mutex, &asus_armoury.cpu_core_mutex) { ++ if (core_type == CPU_CORE_PERF) { ++ perf_cores = new_cores; ++ power_cores = asus_armoury.cpu_cores->cur_power_cores; ++ min = asus_armoury.cpu_cores->min_perf_cores; ++ max = asus_armoury.cpu_cores->max_perf_cores; ++ } else { ++ perf_cores = asus_armoury.cpu_cores->cur_perf_cores; ++ power_cores = new_cores; ++ min = asus_armoury.cpu_cores->min_power_cores; ++ max = 
asus_armoury.cpu_cores->max_power_cores; ++ } ++ ++ if (new_cores < min || new_cores > max) ++ return -EINVAL; ++ ++ out_val = FIELD_PREP(ASUS_PERF_CORE_MASK, perf_cores) | ++ FIELD_PREP(ASUS_POWER_CORE_MASK, power_cores); ++ ++ err = asus_wmi_set_devstate(ASUS_WMI_DEVID_CORES, out_val, &result); ++ if (err) { ++ pr_warn("Failed to set CPU core count: %d\n", err); ++ return err; ++ } ++ ++ if (result > 1) { ++ pr_warn("Failed to set CPU core count (result): 0x%x\n", result); ++ return -EIO; ++ } ++ } ++ ++ pr_info("CPU core count changed, reboot required\n"); ++ ++ sysfs_notify(kobj, NULL, attr->attr.name); ++ asus_set_reboot_and_signal_event(); ++ ++ return 0; ++} ++ ++static ssize_t cores_performance_min_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_MIN); ++} ++ ++static ssize_t cores_performance_max_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_MAX); ++} ++ ++static ssize_t cores_performance_default_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_DEFAULT); ++} ++ ++static ssize_t cores_performance_current_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_PERF, CPU_CORE_CURRENT); ++} ++ ++static ssize_t cores_performance_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, ++ const char *buf, size_t count) ++{ ++ int err; ++ ++ err = cores_current_value_store(kobj, attr, buf, CPU_CORE_PERF); ++ if (err) ++ return err; ++ ++ return count; ++} ++ATTR_GROUP_CORES_RW(cores_performance, "cores_performance", ++ "Set the max available performance cores"); ++ ++static ssize_t cores_efficiency_min_value_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_MIN); ++} ++ ++static ssize_t cores_efficiency_max_value_show(struct kobject *kobj, struct kobj_attribute *attr, ++ char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_MAX); ++} ++ ++static ssize_t cores_efficiency_default_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_DEFAULT); ++} ++ ++static ssize_t cores_efficiency_current_value_show(struct kobject *kobj, ++ struct kobj_attribute *attr, char *buf) ++{ ++ return cores_value_show(kobj, attr, buf, CPU_CORE_POWER, CPU_CORE_CURRENT); ++} ++ ++static ssize_t cores_efficiency_current_value_store(struct kobject *kobj, ++ struct kobj_attribute *attr, const char *buf, ++ size_t count) ++{ ++ int err; ++ ++ err = cores_current_value_store(kobj, attr, buf, CPU_CORE_POWER); ++ if (err) ++ return err; ++ ++ return count; ++} ++ATTR_GROUP_CORES_RW(cores_efficiency, "cores_efficiency", ++ "Set the max available efficiency cores"); ++ ++/* Define helper to access the current power mode tunable values */ ++static inline struct rog_tunables *get_current_tunables(void) ++{ ++ return asus_armoury ++ .rog_tunables[power_supply_is_system_supplied() ? 
1 : 0]; ++} ++ ++/* Simple attribute creation */ ++ATTR_GROUP_ROG_TUNABLE(ppt_pl1_spl, ATTR_PPT_PL1_SPL, ASUS_WMI_DEVID_PPT_PL1_SPL, ++ "Set the CPU slow package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_pl2_sppt, ATTR_PPT_PL2_SPPT, ASUS_WMI_DEVID_PPT_PL2_SPPT, ++ "Set the CPU fast package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_pl3_fppt, ATTR_PPT_PL3_FPPT, ASUS_WMI_DEVID_PPT_FPPT, ++ "Set the CPU fastest package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_apu_sppt, ATTR_PPT_APU_SPPT, ASUS_WMI_DEVID_PPT_APU_SPPT, ++ "Set the APU package limit"); ++ATTR_GROUP_ROG_TUNABLE(ppt_platform_sppt, ATTR_PPT_PLATFORM_SPPT, ASUS_WMI_DEVID_PPT_PLAT_SPPT, ++ "Set the platform package limit"); ++ATTR_GROUP_ROG_TUNABLE(nv_dynamic_boost, ATTR_NV_DYNAMIC_BOOST, ASUS_WMI_DEVID_NV_DYN_BOOST, ++ "Set the Nvidia dynamic boost limit"); ++ATTR_GROUP_ROG_TUNABLE(nv_temp_target, ATTR_NV_TEMP_TARGET, ASUS_WMI_DEVID_NV_THERM_TARGET, ++ "Set the Nvidia max thermal limit"); ++ATTR_GROUP_ROG_TUNABLE(nv_tgp, "nv_tgp", ASUS_WMI_DEVID_DGPU_SET_TGP, ++ "Set the additional TGP on top of the base TGP"); ++ATTR_GROUP_INT_VALUE_ONLY_RO(nv_base_tgp, ATTR_NV_BASE_TGP, ASUS_WMI_DEVID_DGPU_BASE_TGP, ++ "Read the base TGP value"); ++ ++ ++ATTR_GROUP_ENUM_INT_RO(charge_mode, "charge_mode", ASUS_WMI_DEVID_CHARGE_MODE, "0;1;2", ++ "Show the current mode of charging"); ++ ++ATTR_GROUP_BOOL_RW(boot_sound, "boot_sound", ASUS_WMI_DEVID_BOOT_SOUND, ++ "Set the boot POST sound"); ++ATTR_GROUP_BOOL_RW(mcu_powersave, "mcu_powersave", ASUS_WMI_DEVID_MCU_POWERSAVE, ++ "Set MCU powersaving mode"); ++ATTR_GROUP_BOOL_RW(panel_od, "panel_overdrive", ASUS_WMI_DEVID_PANEL_OD, ++ "Set the panel refresh overdrive"); ++ATTR_GROUP_BOOL_RW(panel_hd_mode, "panel_hd_mode", ASUS_WMI_DEVID_PANEL_HD, ++ "Set the panel HD mode to UHD<0> or FHD<1>"); ++ATTR_GROUP_BOOL_RW(screen_auto_brightness, "screen_auto_brightness", ++ ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS, ++ "Set the panel brightness to Off<0> or On<1>"); ++ATTR_GROUP_BOOL_RO(egpu_connected, "egpu_connected", ASUS_WMI_DEVID_EGPU_CONNECTED, ++ "Show the eGPU connection status"); ++ ++/* If an attribute does not require any special case handling add it here */ ++static const struct asus_attr_group armoury_attr_groups[] = { ++ { &egpu_connected_attr_group, ASUS_WMI_DEVID_EGPU_CONNECTED }, ++ { &egpu_enable_attr_group, ASUS_WMI_DEVID_EGPU }, ++ { &dgpu_disable_attr_group, ASUS_WMI_DEVID_DGPU }, ++ { &apu_mem_attr_group, ASUS_WMI_DEVID_APU_MEM }, ++ { &cores_efficiency_attr_group, ASUS_WMI_DEVID_CORES_MAX }, ++ { &cores_performance_attr_group, ASUS_WMI_DEVID_CORES_MAX }, ++ ++ { &ppt_pl1_spl_attr_group, ASUS_WMI_DEVID_PPT_PL1_SPL }, ++ { &ppt_pl2_sppt_attr_group, ASUS_WMI_DEVID_PPT_PL2_SPPT }, ++ { &ppt_pl3_fppt_attr_group, ASUS_WMI_DEVID_PPT_FPPT }, ++ { &ppt_apu_sppt_attr_group, ASUS_WMI_DEVID_PPT_APU_SPPT }, ++ { &ppt_platform_sppt_attr_group, ASUS_WMI_DEVID_PPT_PLAT_SPPT }, ++ { &nv_dynamic_boost_attr_group, ASUS_WMI_DEVID_NV_DYN_BOOST }, ++ { &nv_temp_target_attr_group, ASUS_WMI_DEVID_NV_THERM_TARGET }, ++ { &nv_base_tgp_attr_group, ASUS_WMI_DEVID_DGPU_BASE_TGP }, ++ { &nv_tgp_attr_group, ASUS_WMI_DEVID_DGPU_SET_TGP }, ++ ++ { &charge_mode_attr_group, ASUS_WMI_DEVID_CHARGE_MODE }, ++ { &boot_sound_attr_group, ASUS_WMI_DEVID_BOOT_SOUND }, ++ { &mcu_powersave_attr_group, ASUS_WMI_DEVID_MCU_POWERSAVE }, ++ { &panel_od_attr_group, ASUS_WMI_DEVID_PANEL_OD }, ++ { &panel_hd_mode_attr_group, ASUS_WMI_DEVID_PANEL_HD }, ++ { &screen_auto_brightness_attr_group, ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS }, ++}; ++ ++/** ++ * 
is_power_tunable_attr - Determines if an attribute is a power-related tunable ++ * @name: The name of the attribute to check ++ * ++ * This function checks if the given attribute name is related to power tuning. ++ * ++ * Return: true if the attribute is a power-related tunable, false otherwise ++ */ ++static bool is_power_tunable_attr(const char *name) ++{ ++ static const char * const power_tunable_attrs[] = { ++ ATTR_PPT_PL1_SPL, ATTR_PPT_PL2_SPPT, ++ ATTR_PPT_PL3_FPPT, ATTR_PPT_APU_SPPT, ++ ATTR_PPT_PLATFORM_SPPT, ATTR_NV_DYNAMIC_BOOST, ++ ATTR_NV_TEMP_TARGET, ATTR_NV_BASE_TGP, ++ ATTR_NV_TGP ++ }; ++ ++ for (unsigned int i = 0; i < ARRAY_SIZE(power_tunable_attrs); i++) { ++ if (!strcmp(name, power_tunable_attrs[i])) ++ return true; ++ } ++ ++ return false; ++} ++ ++/** ++ * has_valid_limit - Checks if a power-related attribute has a valid limit value ++ * @name: The name of the attribute to check ++ * @limits: Pointer to the power_limits structure containing limit values ++ * ++ * This function checks if a power-related attribute has a valid limit value. ++ * It returns false if limits is NULL or if the corresponding limit value is zero. ++ * ++ * Return: true if the attribute has a valid limit value, false otherwise ++ */ ++static bool has_valid_limit(const char *name, const struct power_limits *limits) ++{ ++ u32 limit_value = 0; ++ ++ if (!limits) ++ return false; ++ ++ if (!strcmp(name, ATTR_PPT_PL1_SPL)) ++ limit_value = limits->ppt_pl1_spl_max; ++ else if (!strcmp(name, ATTR_PPT_PL2_SPPT)) ++ limit_value = limits->ppt_pl2_sppt_max; ++ else if (!strcmp(name, ATTR_PPT_PL3_FPPT)) ++ limit_value = limits->ppt_pl3_fppt_max; ++ else if (!strcmp(name, ATTR_PPT_APU_SPPT)) ++ limit_value = limits->ppt_apu_sppt_max; ++ else if (!strcmp(name, ATTR_PPT_PLATFORM_SPPT)) ++ limit_value = limits->ppt_platform_sppt_max; ++ else if (!strcmp(name, ATTR_NV_DYNAMIC_BOOST)) ++ limit_value = limits->nv_dynamic_boost_max; ++ else if (!strcmp(name, ATTR_NV_TEMP_TARGET)) ++ limit_value = limits->nv_temp_target_max; ++ else if (!strcmp(name, ATTR_NV_BASE_TGP) || ++ !strcmp(name, ATTR_NV_TGP)) ++ limit_value = limits->nv_tgp_max; ++ ++ return limit_value > 0; ++} ++ ++static int asus_fw_attr_add(void) ++{ ++ const struct power_limits *limits; ++ bool should_create; ++ const char *name; ++ int err, i; ++ ++ asus_armoury.fw_attr_dev = device_create(&firmware_attributes_class, NULL, MKDEV(0, 0), ++ NULL, "%s", DRIVER_NAME); ++ if (IS_ERR(asus_armoury.fw_attr_dev)) { ++ err = PTR_ERR(asus_armoury.fw_attr_dev); ++ goto fail_class_get; ++ } ++ ++ asus_armoury.fw_attr_kset = kset_create_and_add("attributes", NULL, ++ &asus_armoury.fw_attr_dev->kobj); ++ if (!asus_armoury.fw_attr_kset) { ++ err = -ENOMEM; ++ goto err_destroy_classdev; ++ } ++ ++ err = sysfs_create_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); ++ if (err) { ++ pr_err("Failed to create sysfs level attributes\n"); ++ goto err_destroy_kset; ++ } ++ ++ asus_armoury.mini_led_dev_id = 0; ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_MINI_LED_MODE)) ++ asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE; ++ else if (asus_wmi_is_present(ASUS_WMI_DEVID_MINI_LED_MODE2)) ++ asus_armoury.mini_led_dev_id = ASUS_WMI_DEVID_MINI_LED_MODE2; ++ ++ if (asus_armoury.mini_led_dev_id) { ++ err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, ++ &mini_led_mode_attr_group); ++ if (err) { ++ pr_err("Failed to create sysfs-group for mini_led\n"); ++ goto err_remove_file; ++ } ++ } ++ ++ asus_armoury.gpu_mux_dev_id = 0; ++ if 
(asus_wmi_is_present(ASUS_WMI_DEVID_GPU_MUX)) ++ asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX; ++ else if (asus_wmi_is_present(ASUS_WMI_DEVID_GPU_MUX_VIVO)) ++ asus_armoury.gpu_mux_dev_id = ASUS_WMI_DEVID_GPU_MUX_VIVO; ++ ++ if (asus_armoury.gpu_mux_dev_id) { ++ err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, ++ &gpu_mux_mode_attr_group); ++ if (err) { ++ pr_err("Failed to create sysfs-group for gpu_mux\n"); ++ goto err_remove_mini_led_group; ++ } ++ } ++ ++ for (i = 0; i < ARRAY_SIZE(armoury_attr_groups); i++) { ++ if (!asus_wmi_is_present(armoury_attr_groups[i].wmi_devid)) ++ continue; ++ ++ /* Always create by default, unless PPT is not present */ ++ should_create = true; ++ name = armoury_attr_groups[i].attr_group->name; ++ ++ /* Check if this is a power-related tunable requiring limits */ ++ if (asus_armoury.rog_tunables[1] && asus_armoury.rog_tunables[1]->power_limits && ++ is_power_tunable_attr(name)) { ++ limits = asus_armoury.rog_tunables[1]->power_limits; ++ /* Check only AC, if DC is not present then AC won't be either */ ++ should_create = has_valid_limit(name, limits); ++ if (!should_create) { ++ pr_debug("Missing max value on %s for tunable: %s\n", ++ dmi_get_system_info(DMI_BOARD_NAME), name); ++ } ++ } ++ ++ if (should_create) { ++ err = sysfs_create_group(&asus_armoury.fw_attr_kset->kobj, ++ armoury_attr_groups[i].attr_group); ++ if (err) { ++ pr_err("Failed to create sysfs-group for %s\n", ++ armoury_attr_groups[i].attr_group->name); ++ goto err_remove_groups; ++ } ++ } ++ } ++ ++ return 0; ++ ++err_remove_groups: ++ while (i--) { ++ if (asus_wmi_is_present(armoury_attr_groups[i].wmi_devid)) ++ sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, ++ armoury_attr_groups[i].attr_group); ++ } ++ if (asus_armoury.gpu_mux_dev_id) ++ sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &gpu_mux_mode_attr_group); ++err_remove_mini_led_group: ++ if (asus_armoury.mini_led_dev_id) ++ sysfs_remove_group(&asus_armoury.fw_attr_kset->kobj, &mini_led_mode_attr_group); ++err_remove_file: ++ sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); ++err_destroy_kset: ++ kset_unregister(asus_armoury.fw_attr_kset); ++err_destroy_classdev: ++fail_class_get: ++ device_destroy(&firmware_attributes_class, MKDEV(0, 0)); ++ return err; ++} ++ ++/* Init / exit ****************************************************************/ ++ ++/* Set up the min/max and defaults for ROG tunables */ ++static void init_rog_tunables(void) ++{ ++ const struct power_limits *ac_limits, *dc_limits; ++ const struct power_data *power_data; ++ const struct dmi_system_id *dmi_id; ++ bool ac_initialized = false, dc_initialized = false; ++ ++ /* Match the system against the power_limits table */ ++ dmi_id = dmi_first_match(power_limits); ++ if (!dmi_id) { ++ pr_warn("No matching power limits found for this system\n"); ++ return; ++ } ++ ++ /* Get the power data for this system */ ++ power_data = dmi_id->driver_data; ++ if (!power_data) { ++ pr_info("No power data available for this system\n"); ++ return; ++ } ++ ++ /* Initialize AC power tunables */ ++ ac_limits = power_data->ac_data; ++ if (ac_limits) { ++ asus_armoury.rog_tunables[1] = ++ kzalloc(sizeof(*asus_armoury.rog_tunables[1]), GFP_KERNEL); ++ if (!asus_armoury.rog_tunables[1]) ++ goto err_nomem; ++ ++ asus_armoury.rog_tunables[1]->power_limits = ac_limits; ++ ++ /* Set initial AC values */ ++ asus_armoury.rog_tunables[1]->ppt_pl1_spl = ++ ac_limits->ppt_pl1_spl_def ? 
++ ac_limits->ppt_pl1_spl_def : ++ ac_limits->ppt_pl1_spl_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_pl2_sppt = ++ ac_limits->ppt_pl2_sppt_def ? ++ ac_limits->ppt_pl2_sppt_def : ++ ac_limits->ppt_pl2_sppt_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_pl3_fppt = ++ ac_limits->ppt_pl3_fppt_def ? ++ ac_limits->ppt_pl3_fppt_def : ++ ac_limits->ppt_pl3_fppt_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_apu_sppt = ++ ac_limits->ppt_apu_sppt_def ? ++ ac_limits->ppt_apu_sppt_def : ++ ac_limits->ppt_apu_sppt_max; ++ ++ asus_armoury.rog_tunables[1]->ppt_platform_sppt = ++ ac_limits->ppt_platform_sppt_def ? ++ ac_limits->ppt_platform_sppt_def : ++ ac_limits->ppt_platform_sppt_max; ++ ++ asus_armoury.rog_tunables[1]->nv_dynamic_boost = ++ ac_limits->nv_dynamic_boost_max; ++ asus_armoury.rog_tunables[1]->nv_temp_target = ++ ac_limits->nv_temp_target_max; ++ asus_armoury.rog_tunables[1]->nv_tgp = ac_limits->nv_tgp_max; ++ ++ ac_initialized = true; ++ pr_debug("AC power limits initialized for %s\n", dmi_id->matches[0].substr); ++ } ++ ++ /* Initialize DC power tunables */ ++ dc_limits = power_data->dc_data; ++ if (dc_limits) { ++ asus_armoury.rog_tunables[0] = ++ kzalloc(sizeof(*asus_armoury.rog_tunables[0]), GFP_KERNEL); ++ if (!asus_armoury.rog_tunables[0]) { ++ if (ac_initialized) ++ kfree(asus_armoury.rog_tunables[1]); ++ goto err_nomem; ++ } ++ ++ asus_armoury.rog_tunables[0]->power_limits = dc_limits; ++ ++ /* Set initial DC values */ ++ asus_armoury.rog_tunables[0]->ppt_pl1_spl = ++ dc_limits->ppt_pl1_spl_def ? ++ dc_limits->ppt_pl1_spl_def : ++ dc_limits->ppt_pl1_spl_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_pl2_sppt = ++ dc_limits->ppt_pl2_sppt_def ? ++ dc_limits->ppt_pl2_sppt_def : ++ dc_limits->ppt_pl2_sppt_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_pl3_fppt = ++ dc_limits->ppt_pl3_fppt_def ? ++ dc_limits->ppt_pl3_fppt_def : ++ dc_limits->ppt_pl3_fppt_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_apu_sppt = ++ dc_limits->ppt_apu_sppt_def ? ++ dc_limits->ppt_apu_sppt_def : ++ dc_limits->ppt_apu_sppt_max; ++ ++ asus_armoury.rog_tunables[0]->ppt_platform_sppt = ++ dc_limits->ppt_platform_sppt_def ? ++ dc_limits->ppt_platform_sppt_def : ++ dc_limits->ppt_platform_sppt_max; ++ ++ asus_armoury.rog_tunables[0]->nv_dynamic_boost = ++ dc_limits->nv_dynamic_boost_max; ++ asus_armoury.rog_tunables[0]->nv_temp_target = ++ dc_limits->nv_temp_target_max; ++ asus_armoury.rog_tunables[0]->nv_tgp = dc_limits->nv_tgp_max; ++ ++ dc_initialized = true; ++ pr_debug("DC power limits initialized for %s\n", dmi_id->matches[0].substr); ++ } ++ ++ if (!ac_initialized) ++ pr_debug("No AC PPT limits defined\n"); ++ ++ if (!dc_initialized) ++ pr_debug("No DC PPT limits defined\n"); ++ ++ return; ++ ++err_nomem: ++ pr_err("Failed to allocate memory for tunables\n"); ++} ++ ++static int __init asus_fw_init(void) ++{ ++ char *wmi_uid; ++ int err; ++ ++ wmi_uid = wmi_get_acpi_device_uid(ASUS_WMI_MGMT_GUID); ++ if (!wmi_uid) ++ return -ENODEV; ++ ++ /* ++ * if equal to "ASUSWMI" then it's DCTS that can't be used for this ++ * driver, DSTS is required. 
++ */ ++ if (!strcmp(wmi_uid, ASUS_ACPI_UID_ASUSWMI)) ++ return -ENODEV; ++ ++ if (asus_wmi_is_present(ASUS_WMI_DEVID_CORES_MAX)) { ++ err = init_max_cpu_cores(); ++ if (err) { ++ pr_err("Could not initialise CPU core control %d\n", err); ++ return err; ++ } ++ } ++ ++ init_rog_tunables(); ++ ++ /* Must always be last step to ensure data is available */ ++ return asus_fw_attr_add(); ++} ++ ++static void __exit asus_fw_exit(void) ++{ ++ sysfs_remove_file(&asus_armoury.fw_attr_kset->kobj, &pending_reboot.attr); ++ kset_unregister(asus_armoury.fw_attr_kset); ++ device_destroy(&firmware_attributes_class, MKDEV(0, 0)); ++ ++ kfree(asus_armoury.rog_tunables[0]); ++ kfree(asus_armoury.rog_tunables[1]); ++} ++ ++module_init(asus_fw_init); ++module_exit(asus_fw_exit); ++ ++MODULE_IMPORT_NS("ASUS_WMI"); ++MODULE_AUTHOR("Luke Jones "); ++MODULE_DESCRIPTION("ASUS BIOS Configuration Driver"); ++MODULE_LICENSE("GPL"); ++MODULE_ALIAS("wmi:" ASUS_NB_WMI_EVENT_GUID); +diff --git a/drivers/platform/x86/asus-armoury.h b/drivers/platform/x86/asus-armoury.h +new file mode 100644 +index 000000000000..438768ea14cc +--- /dev/null ++++ b/drivers/platform/x86/asus-armoury.h +@@ -0,0 +1,1278 @@ ++/* SPDX-License-Identifier: GPL-2.0 ++ * ++ * Definitions for kernel modules using asus-armoury driver ++ * ++ * Copyright (c) 2024 Luke Jones ++ */ ++ ++#ifndef _ASUS_ARMOURY_H_ ++#define _ASUS_ARMOURY_H_ ++ ++#include ++#include ++#include ++ ++#define DRIVER_NAME "asus-armoury" ++ ++#define __ASUS_ATTR_RO(_func, _name) \ ++ { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = _func##_##_name##_show, \ ++ } ++ ++#define __ASUS_ATTR_RO_AS(_name, _show) \ ++ { \ ++ .attr = { .name = __stringify(_name), .mode = 0444 }, \ ++ .show = _show, \ ++ } ++ ++#define __ASUS_ATTR_RW(_func, _name) \ ++ __ATTR(_name, 0644, _func##_##_name##_show, _func##_##_name##_store) ++ ++#define __WMI_STORE_INT(_attr, _min, _max, _wmi) \ ++ static ssize_t _attr##_store(struct kobject *kobj, \ ++ struct kobj_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ return attr_uint_store(kobj, attr, buf, count, _min, \ ++ _max, NULL, _wmi); \ ++ } ++ ++#define WMI_SHOW_INT(_attr, _fmt, _wmi) \ ++ static ssize_t _attr##_show(struct kobject *kobj, \ ++ struct kobj_attribute *attr, char *buf) \ ++ { \ ++ u32 result; \ ++ int err; \ ++ \ ++ err = asus_wmi_get_devstate_dsts(_wmi, &result); \ ++ if (err) \ ++ return err; \ ++ return sysfs_emit(buf, _fmt, \ ++ result & ~ASUS_WMI_DSTS_PRESENCE_BIT); \ ++ } ++ ++/* Create functions and attributes for use in other macros or on their own */ ++ ++/* Shows a formatted static variable */ ++#define __ATTR_SHOW_FMT(_prop, _attrname, _fmt, _val) \ ++ static ssize_t _attrname##_##_prop##_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ return sysfs_emit(buf, _fmt, _val); \ ++ } \ ++ static struct kobj_attribute attr_##_attrname##_##_prop = \ ++ __ASUS_ATTR_RO(_attrname, _prop) ++ ++#define __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname)\ ++ WMI_SHOW_INT(_attrname##_current_value, "%d\n", _wmi); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RO(_attrname, current_value); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ 
&attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++#define __ATTR_RW_INT_GROUP_ENUM(_attrname, _minv, _maxv, _wmi, _fsname,\ ++ _possible, _dispname) \ ++ __WMI_STORE_INT(_attrname##_current_value, _minv, _maxv, _wmi); \ ++ WMI_SHOW_INT(_attrname##_current_value, "%d\n", _wmi); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* Boolean style enumeration, base macro. Requires adding show/store */ ++#define __ATTR_GROUP_ENUM(_attrname, _fsname, _possible, _dispname) \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ __ATTR_SHOW_FMT(possible_values, _attrname, "%s\n", _possible); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++#define ATTR_GROUP_BOOL_RO(_attrname, _fsname, _wmi, _dispname) \ ++ __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, "0;1", _dispname) ++ ++ ++#define ATTR_GROUP_BOOL_RW(_attrname, _fsname, _wmi, _dispname) \ ++ __ATTR_RW_INT_GROUP_ENUM(_attrname, 0, 1, _wmi, _fsname, "0;1", _dispname) ++ ++#define ATTR_GROUP_ENUM_INT_RO(_attrname, _fsname, _wmi, _possible, _dispname) \ ++ __ATTR_RO_INT_GROUP_ENUM(_attrname, _wmi, _fsname, _possible, _dispname) ++ ++/* ++ * Requires _current_value_show(), _current_value_store() ++ */ ++#define ATTR_GROUP_BOOL_CUSTOM(_attrname, _fsname, _dispname) \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ __ATTR_GROUP_ENUM(_attrname, _fsname, "0;1", _dispname) ++ ++/* ++ * Requires _current_value_show(), _current_value_store() ++ * and _possible_values_show() ++ */ ++#define ATTR_GROUP_ENUM_CUSTOM(_attrname, _fsname, _dispname) \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ static struct kobj_attribute attr_##_attrname##_possible_values = \ ++ __ASUS_ATTR_RO(_attrname, possible_values); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, enum_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++
&attr_##_attrname##_possible_values.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* CPU core attributes need slightly different setup */ ++#define ATTR_GROUP_CORES_RW(_attrname, _fsname, _dispname) \ ++ __ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RW(_attrname, current_value); \ ++ static struct kobj_attribute attr_##_attrname##_default_value = \ ++ __ASUS_ATTR_RO(_attrname, default_value); \ ++ static struct kobj_attribute attr_##_attrname##_min_value = \ ++ __ASUS_ATTR_RO(_attrname, min_value); \ ++ static struct kobj_attribute attr_##_attrname##_max_value = \ ++ __ASUS_ATTR_RO(_attrname, max_value); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, int_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_default_value.attr, \ ++ &attr_##_attrname##_min_value.attr, \ ++ &attr_##_attrname##_max_value.attr, \ ++ &attr_##_attrname##_scalar_increment.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++#define ATTR_GROUP_INT_VALUE_ONLY_RO(_attrname, _fsname, _wmi, _dispname) \ ++ WMI_SHOW_INT(_attrname##_current_value, "%d\n", _wmi); \ ++ static struct kobj_attribute attr_##_attrname##_current_value = \ ++ __ASUS_ATTR_RO(_attrname, current_value); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, int_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_type.attr, NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* ++ * ROG PPT attributes need slightly different setup as they ++ * require rog_tunables members. ++ */ ++ ++#define __ROG_TUNABLE_SHOW(_prop, _attrname, _val) \ ++ static ssize_t _attrname##_##_prop##_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables || !tunables->power_limits) \ ++ return -ENODEV; \ ++ \ ++ return sysfs_emit(buf, "%d\n", tunables->power_limits->_val); \ ++ } \ ++ static struct kobj_attribute attr_##_attrname##_##_prop = \ ++ __ASUS_ATTR_RO(_attrname, _prop) ++ ++#define __ROG_TUNABLE_SHOW_DEFAULT(_attrname) \ ++ static ssize_t _attrname##_default_value_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables || !tunables->power_limits) \ ++ return -ENODEV; \ ++ \ ++ return sysfs_emit( \ ++ buf, "%d\n", \ ++ tunables->power_limits->_attrname##_def ?
\ ++ tunables->power_limits->_attrname##_def : \ ++ tunables->power_limits->_attrname##_max); \ ++ } \ ++ static struct kobj_attribute attr_##_attrname##_default_value = \ ++ __ASUS_ATTR_RO(_attrname, default_value) ++ ++#define __ROG_TUNABLE_RW(_attr, _wmi) \ ++ static ssize_t _attr##_current_value_store( \ ++ struct kobject *kobj, struct kobj_attribute *attr, \ ++ const char *buf, size_t count) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables || !tunables->power_limits) \ ++ return -ENODEV; \ ++ \ ++ return attr_uint_store(kobj, attr, buf, count, \ ++ tunables->power_limits->_attr##_min, \ ++ tunables->power_limits->_attr##_max, \ ++ &tunables->_attr, _wmi); \ ++ } \ ++ static ssize_t _attr##_current_value_show( \ ++ struct kobject *kobj, struct kobj_attribute *attr, char *buf) \ ++ { \ ++ struct rog_tunables *tunables = get_current_tunables(); \ ++ \ ++ if (!tunables) \ ++ return -ENODEV; \ ++ \ ++ return sysfs_emit(buf, "%u\n", tunables->_attr); \ ++ } \ ++ static struct kobj_attribute attr_##_attr##_current_value = \ ++ __ASUS_ATTR_RW(_attr, current_value) ++ ++#define ATTR_GROUP_ROG_TUNABLE(_attrname, _fsname, _wmi, _dispname) \ ++ __ROG_TUNABLE_RW(_attrname, _wmi); \ ++ __ROG_TUNABLE_SHOW_DEFAULT(_attrname); \ ++ __ROG_TUNABLE_SHOW(min_value, _attrname, _attrname##_min); \ ++ __ROG_TUNABLE_SHOW(max_value, _attrname, _attrname##_max); \ ++ __ATTR_SHOW_FMT(scalar_increment, _attrname, "%d\n", 1); \ ++ __ATTR_SHOW_FMT(display_name, _attrname, "%s\n", _dispname); \ ++ static struct kobj_attribute attr_##_attrname##_type = \ ++ __ASUS_ATTR_RO_AS(type, int_type_show); \ ++ static struct attribute *_attrname##_attrs[] = { \ ++ &attr_##_attrname##_current_value.attr, \ ++ &attr_##_attrname##_default_value.attr, \ ++ &attr_##_attrname##_min_value.attr, \ ++ &attr_##_attrname##_max_value.attr, \ ++ &attr_##_attrname##_scalar_increment.attr, \ ++ &attr_##_attrname##_display_name.attr, \ ++ &attr_##_attrname##_type.attr, \ ++ NULL \ ++ }; \ ++ static const struct attribute_group _attrname##_attr_group = { \ ++ .name = _fsname, .attrs = _attrname##_attrs \ ++ } ++ ++/* Default is always the maximum value unless *_def is specified */ ++struct power_limits { ++ u8 ppt_pl1_spl_min; ++ u8 ppt_pl1_spl_def; ++ u8 ppt_pl1_spl_max; ++ u8 ppt_pl2_sppt_min; ++ u8 ppt_pl2_sppt_def; ++ u8 ppt_pl2_sppt_max; ++ u8 ppt_pl3_fppt_min; ++ u8 ppt_pl3_fppt_def; ++ u8 ppt_pl3_fppt_max; ++ u8 ppt_apu_sppt_min; ++ u8 ppt_apu_sppt_def; ++ u8 ppt_apu_sppt_max; ++ u8 ppt_platform_sppt_min; ++ u8 ppt_platform_sppt_def; ++ u8 ppt_platform_sppt_max; ++ /* Nvidia GPU specific, default is always max */ ++ u8 nv_dynamic_boost_def; // unused. exists for macro ++ u8 nv_dynamic_boost_min; ++ u8 nv_dynamic_boost_max; ++ u8 nv_temp_target_def; // unused. exists for macro ++ u8 nv_temp_target_min; ++ u8 nv_temp_target_max; ++ u8 nv_tgp_def; // unused. exists for macro ++ u8 nv_tgp_min; ++ u8 nv_tgp_max; ++}; ++ ++struct power_data { ++ const struct power_limits *ac_data; ++ const struct power_limits *dc_data; ++ bool requires_fan_curve; ++}; ++ ++/* ++ * For each available attribute there must be a min and a max. ++ * _def is not required and will be assumed to be default == max if missing.
++ */ ++static const struct dmi_system_id power_limits[] = { ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA401W"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 75, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 30, ++ .ppt_pl2_sppt_min = 31, ++ .ppt_pl2_sppt_max = 44, ++ .ppt_pl3_fppt_min = 45, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507N"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 45, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80 ++ }, ++ .dc_data = NULL ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507X"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 45, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA507Z"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 105, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 15, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 60, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA607P"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 30, ++ .ppt_pl1_spl_def = 100, 
++ .ppt_pl1_spl_max = 135, ++ .ppt_pl2_sppt_min = 30, ++ .ppt_pl2_sppt_def = 115, ++ .ppt_pl2_sppt_max = 135, ++ .ppt_pl3_fppt_min = 30, ++ .ppt_pl3_fppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 115, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_def = 45, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_def = 60, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 25, ++ .ppt_pl3_fppt_max = 80, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA617NS"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 120 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 25, ++ .ppt_apu_sppt_max = 35, ++ .ppt_platform_sppt_min = 45, ++ .ppt_platform_sppt_max = 100 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA617NT"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 115 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 45, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 50 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FA617XS"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 120, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 25, ++ .ppt_apu_sppt_max = 35, ++ .ppt_platform_sppt_min = 45, ++ .ppt_platform_sppt_max = 100, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "FX507Z"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 15, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 60, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA401Q"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 80, ++ }, ++ .dc_data = NULL ++ }, ++ }, ++ { ++ .matches = { ++ // This model is full AMD. No Nvidia dGPU. 
++ DMI_MATCH(DMI_BOARD_NAME, "GA402R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 15, ++ .ppt_apu_sppt_max = 80, ++ .ppt_platform_sppt_min = 30, ++ .ppt_platform_sppt_max = 115, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_apu_sppt_min = 25, ++ .ppt_apu_sppt_def = 30, ++ .ppt_apu_sppt_max = 45, ++ .ppt_platform_sppt_min = 40, ++ .ppt_platform_sppt_max = 60, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA402X"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 35, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_def = 65, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 35, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA403U"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 65, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 35, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA503R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 35, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 65, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 25, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 60, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GA605W"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 31, ++ .ppt_pl2_sppt_max = 44, ++ .ppt_pl3_fppt_min = 45, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME,
"GU603Z"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 60, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 135, ++ /* Only allowed in AC mode */ ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 40, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 40, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GU604V"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 65, ++ .ppt_pl1_spl_max = 120, ++ .ppt_pl2_sppt_min = 65, ++ .ppt_pl2_sppt_max = 150, ++ /* Only allowed in AC mode */ ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 40, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 40, ++ .ppt_pl2_sppt_max = 60, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GU605M"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 38, ++ .ppt_pl2_sppt_max = 53, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV301Q"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 65, ++ .ppt_pl2_sppt_max = 80, ++ }, ++ .dc_data = NULL ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV301R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 45, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 54, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 35, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV601R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 35, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 54, ++ .ppt_pl2_sppt_max = 100, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 80, ++ .ppt_pl3_fppt_max = 125, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 28, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 40, ++ .ppt_pl2_sppt_max = 60, ++
.ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 80, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GV601V"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_def = 100, ++ .ppt_pl1_spl_max = 110, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 40, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 40, ++ .ppt_pl2_sppt_max = 60, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "GX650P"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 110, ++ .ppt_pl1_spl_max = 130, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 125, ++ .ppt_pl2_sppt_max = 130, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 125, ++ .ppt_pl3_fppt_max = 135, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_def = 25, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_def = 35, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_def = 42, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G513I"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ /* Yes this laptop is very limited */ ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 80, ++ }, ++ .dc_data = NULL, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G513QM"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ /* Yes this laptop is very limited */ ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 100, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 190, ++ }, ++ .dc_data = NULL, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G513R"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 35, ++ .ppt_pl1_spl_max = 90, ++ .ppt_pl2_sppt_min = 54, ++ .ppt_pl2_sppt_max = 100, ++ .ppt_pl3_fppt_min = 54, ++ .ppt_pl3_fppt_max = 125, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 50, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 50, ++ .ppt_pl3_fppt_min = 28, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G614J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct 
power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G634J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G733C"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 170, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 35, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G733P"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 30, ++ .ppt_pl1_spl_def = 100, ++ .ppt_pl1_spl_max = 130, ++ .ppt_pl2_sppt_min = 65, ++ .ppt_pl2_sppt_def = 125, ++ .ppt_pl2_sppt_max = 130, ++ .ppt_pl3_fppt_min = 65, ++ .ppt_pl3_fppt_def = 125, ++ .ppt_pl3_fppt_max = 130, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 65, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 65, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 75, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G814J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 140, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "G834J"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 28, ++ .ppt_pl1_spl_max = 140, ++ .ppt_pl2_sppt_min = 28, ++ .ppt_pl2_sppt_max = 175, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 25, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 55, ++ .ppt_pl2_sppt_min = 25, ++ .ppt_pl2_sppt_max = 70, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ }, ++ .requires_fan_curve = true, ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "H7606W"), ++ }, ++ .driver_data = &(struct 
power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 15, ++ .ppt_pl1_spl_max = 80, ++ .ppt_pl2_sppt_min = 35, ++ .ppt_pl2_sppt_max = 80, ++ .ppt_pl3_fppt_min = 35, ++ .ppt_pl3_fppt_max = 80, ++ .nv_dynamic_boost_min = 5, ++ .nv_dynamic_boost_max = 20, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ .nv_tgp_min = 55, ++ .nv_tgp_max = 85, ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 25, ++ .ppt_pl1_spl_max = 35, ++ .ppt_pl2_sppt_min = 31, ++ .ppt_pl2_sppt_max = 44, ++ .ppt_pl3_fppt_min = 45, ++ .ppt_pl3_fppt_max = 65, ++ .nv_temp_target_min = 75, ++ .nv_temp_target_max = 87, ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "RC71"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_max = 30, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 43, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_max = 53 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_def = 15, ++ .ppt_pl1_spl_max = 25, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_def = 20, ++ .ppt_pl2_sppt_max = 30, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_def = 25, ++ .ppt_pl3_fppt_max = 35 ++ } ++ }, ++ }, ++ { ++ .matches = { ++ DMI_MATCH(DMI_BOARD_NAME, "RC72"), ++ }, ++ .driver_data = &(struct power_data) { ++ .ac_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_max = 30, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_max = 43, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_max = 53 ++ }, ++ .dc_data = &(struct power_limits) { ++ .ppt_pl1_spl_min = 7, ++ .ppt_pl1_spl_def = 17, ++ .ppt_pl1_spl_max = 25, ++ .ppt_pl2_sppt_min = 15, ++ .ppt_pl2_sppt_def = 24, ++ .ppt_pl2_sppt_max = 30, ++ .ppt_pl3_fppt_min = 15, ++ .ppt_pl3_fppt_def = 30, ++ .ppt_pl3_fppt_max = 35 ++ } ++ }, ++ }, ++ {} ++}; ++ ++#endif /* _ASUS_ARMOURY_H_ */ +diff --git a/drivers/platform/x86/asus-wmi.c b/drivers/platform/x86/asus-wmi.c +index f7191fdded14..a6d6efdb50b7 100644 +--- a/drivers/platform/x86/asus-wmi.c ++++ b/drivers/platform/x86/asus-wmi.c +@@ -55,8 +55,6 @@ module_param(fnlock_default, bool, 0444); + #define to_asus_wmi_driver(pdrv) \ + (container_of((pdrv), struct asus_wmi_driver, platform_driver)) + +-#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" +- + #define NOTIFY_BRNUP_MIN 0x11 + #define NOTIFY_BRNUP_MAX 0x1f + #define NOTIFY_BRNDOWN_MIN 0x20 +@@ -105,8 +103,6 @@ module_param(fnlock_default, bool, 0444); + #define USB_INTEL_XUSB2PR 0xD0 + #define PCI_DEVICE_ID_INTEL_LYNXPOINT_LP_XHCI 0x9c31 + +-#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" +- + #define WMI_EVENT_MASK 0xFFFF + + #define FAN_CURVE_POINTS 8 +@@ -340,6 +336,13 @@ struct asus_wmi { + /* Global to allow setting externally without requiring driver data */ + static enum asus_ally_mcu_hack use_ally_mcu_hack = ASUS_WMI_ALLY_MCU_HACK_INIT; + ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) ++static void asus_wmi_show_deprecated(void) ++{ ++ pr_notice_once("Accessing attributes through /sys/bus/platform/asus_wmi is deprecated and will be removed in a future release. 
Please switch over to /sys/class/firmware_attributes.\n"); ++} ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ ++ + /* WMI ************************************************************************/ + + static int asus_wmi_evaluate_method3(u32 method_id, +@@ -390,7 +393,7 @@ int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval) + { + return asus_wmi_evaluate_method3(method_id, arg0, arg1, 0, retval); + } +-EXPORT_SYMBOL_GPL(asus_wmi_evaluate_method); ++EXPORT_SYMBOL_NS_GPL(asus_wmi_evaluate_method, "ASUS_WMI"); + + static int asus_wmi_evaluate_method5(u32 method_id, + u32 arg0, u32 arg1, u32 arg2, u32 arg3, u32 arg4, u32 *retval) +@@ -554,12 +557,46 @@ static int asus_wmi_get_devstate(struct asus_wmi *asus, u32 dev_id, u32 *retval) + return 0; + } + +-int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, +- u32 *retval) ++/** ++ * asus_wmi_get_devstate_dsts() - Get the WMI function state. ++ * @dev_id: The WMI method ID to call. ++ * @retval: A pointer to where to store the value returned from WMI. ++ * @return: 0 on success and retval is filled. ++ * @return: -ENODEV if the method ID is unsupported. ++ * @return: everything else is an error from WMI call. ++ */ ++int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) ++{ ++ int err; ++ ++ err = asus_wmi_evaluate_method(ASUS_WMI_METHODID_DSTS, dev_id, 0, retval); ++ if (err) ++ return err; ++ ++ if (*retval == ASUS_WMI_UNSUPPORTED_METHOD) ++ return -ENODEV; ++ ++ return 0; ++} ++EXPORT_SYMBOL_NS_GPL(asus_wmi_get_devstate_dsts, "ASUS_WMI"); ++ ++/** ++ * asus_wmi_set_devstate() - Set the WMI function state. ++ * @dev_id: The WMI function to call. ++ * @ctrl_param: The argument to be used for this WMI function. ++ * @retval: A pointer to where to store the value returned from WMI. ++ * @return: 0 on success and retval is filled. ++ * @return: everything else is an error from WMI call. ++ * ++ * A asus_wmi_set_devstate() call must be paired with a ++ * asus_wmi_get_devstate_dsts() to check if the WMI function is supported. 
++ */ ++int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) + { + return asus_wmi_evaluate_method(ASUS_WMI_METHODID_DEVS, dev_id, + ctrl_param, retval); + } ++EXPORT_SYMBOL_NS_GPL(asus_wmi_set_devstate, "ASUS_WMI"); + + /* Helper for special devices with magic return codes */ + static int asus_wmi_get_devstate_bits(struct asus_wmi *asus, +@@ -692,6 +729,7 @@ static void asus_wmi_tablet_mode_get_state(struct asus_wmi *asus) + } + + /* Charging mode, 1=Barrel, 2=USB ******************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t charge_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -702,12 +740,16 @@ static ssize_t charge_mode_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", value & 0xff); + } + + static DEVICE_ATTR_RO(charge_mode); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* dGPU ********************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t dgpu_disable_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -718,6 +760,8 @@ static ssize_t dgpu_disable_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -771,8 +815,10 @@ static ssize_t dgpu_disable_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(dgpu_disable); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* eGPU ********************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t egpu_enable_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -783,6 +829,8 @@ static ssize_t egpu_enable_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -839,8 +887,10 @@ static ssize_t egpu_enable_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(egpu_enable); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Is eGPU connected? 
*********************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t egpu_connected_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -851,12 +901,16 @@ static ssize_t egpu_connected_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + + static DEVICE_ATTR_RO(egpu_connected); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* gpu mux switch *************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t gpu_mux_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -867,6 +921,8 @@ static ssize_t gpu_mux_mode_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -925,6 +981,7 @@ static ssize_t gpu_mux_mode_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(gpu_mux_mode); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* TUF Laptop Keyboard RGB Modes **********************************************/ + static ssize_t kbd_rgb_mode_store(struct device *dev, +@@ -1048,6 +1105,7 @@ static const struct attribute_group *kbd_rgb_mode_groups[] = { + }; + + /* Tunable: PPT: Intel=PL1, AMD=SPPT *****************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t ppt_pl2_sppt_store(struct device *dev, + struct device_attribute *attr, + const char *buf, size_t count) +@@ -1086,6 +1144,8 @@ static ssize_t ppt_pl2_sppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_pl2_sppt); + } + static DEVICE_ATTR_RW(ppt_pl2_sppt); +@@ -1128,6 +1188,8 @@ static ssize_t ppt_pl1_spl_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_pl1_spl); + } + static DEVICE_ATTR_RW(ppt_pl1_spl); +@@ -1171,6 +1233,8 @@ static ssize_t ppt_fppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_fppt); + } + static DEVICE_ATTR_RW(ppt_fppt); +@@ -1214,6 +1278,8 @@ static ssize_t ppt_apu_sppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_apu_sppt); + } + static DEVICE_ATTR_RW(ppt_apu_sppt); +@@ -1257,6 +1323,8 @@ static ssize_t ppt_platform_sppt_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->ppt_platform_sppt); + } + static DEVICE_ATTR_RW(ppt_platform_sppt); +@@ -1300,6 +1368,8 @@ static ssize_t nv_dynamic_boost_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->nv_dynamic_boost); + } + static DEVICE_ATTR_RW(nv_dynamic_boost); +@@ -1343,9 +1413,12 @@ static ssize_t nv_temp_target_show(struct device *dev, + { + struct asus_wmi *asus = dev_get_drvdata(dev); + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%u\n", asus->nv_temp_target); + } + static DEVICE_ATTR_RW(nv_temp_target); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Ally MCU 
Powersave ********************************************************/ + +@@ -1386,6 +1459,7 @@ void set_ally_mcu_powersave(bool enabled) + } + EXPORT_SYMBOL_NS_GPL(set_ally_mcu_powersave, "ASUS_WMI"); + ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t mcu_powersave_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -1396,6 +1470,8 @@ static ssize_t mcu_powersave_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -1431,6 +1507,7 @@ static ssize_t mcu_powersave_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(mcu_powersave); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Battery ********************************************************************/ + +@@ -2304,6 +2381,7 @@ static int asus_wmi_rfkill_init(struct asus_wmi *asus) + } + + /* Panel Overdrive ************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t panel_od_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -2314,6 +2392,8 @@ static ssize_t panel_od_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -2350,9 +2430,10 @@ static ssize_t panel_od_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(panel_od); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Bootup sound ***************************************************************/ +- ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t boot_sound_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -2363,6 +2444,8 @@ static ssize_t boot_sound_show(struct device *dev, + if (result < 0) + return result; + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", result); + } + +@@ -2398,8 +2481,10 @@ static ssize_t boot_sound_store(struct device *dev, + return count; + } + static DEVICE_ATTR_RW(boot_sound); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Mini-LED mode **************************************************************/ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t mini_led_mode_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -2430,6 +2515,8 @@ static ssize_t mini_led_mode_show(struct device *dev, + } + } + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "%d\n", value); + } + +@@ -2500,10 +2587,13 @@ static ssize_t available_mini_led_mode_show(struct device *dev, + return sysfs_emit(buf, "0 1 2\n"); + } + ++ asus_wmi_show_deprecated(); ++ + return sysfs_emit(buf, "0\n"); + } + + static DEVICE_ATTR_RO(available_mini_led_mode); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + /* Quirks *********************************************************************/ + +@@ -3791,6 +3881,7 @@ static int throttle_thermal_policy_set_default(struct asus_wmi *asus) + return throttle_thermal_policy_write(asus); + } + ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + static ssize_t throttle_thermal_policy_show(struct device *dev, + struct device_attribute *attr, char *buf) + { +@@ -3834,6 +3925,7 @@ static ssize_t throttle_thermal_policy_store(struct device *dev, + * Throttle thermal policy: 0 - default, 1 - overboost, 2 - silent + */ + static DEVICE_ATTR_RW(throttle_thermal_policy); ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + 
/* Platform profile ***********************************************************/ + static int asus_wmi_platform_profile_get(struct device *dev, +@@ -3853,7 +3945,7 @@ static int asus_wmi_platform_profile_get(struct device *dev, + *profile = PLATFORM_PROFILE_PERFORMANCE; + break; + case ASUS_THROTTLE_THERMAL_POLICY_SILENT: +- *profile = PLATFORM_PROFILE_QUIET; ++ *profile = PLATFORM_PROFILE_LOW_POWER; + break; + default: + return -EINVAL; +@@ -3877,7 +3969,7 @@ static int asus_wmi_platform_profile_set(struct device *dev, + case PLATFORM_PROFILE_BALANCED: + tp = ASUS_THROTTLE_THERMAL_POLICY_DEFAULT; + break; +- case PLATFORM_PROFILE_QUIET: ++ case PLATFORM_PROFILE_LOW_POWER: + tp = ASUS_THROTTLE_THERMAL_POLICY_SILENT; + break; + default: +@@ -3890,7 +3982,7 @@ static int asus_wmi_platform_profile_set(struct device *dev, + + static int asus_wmi_platform_profile_probe(void *drvdata, unsigned long *choices) + { +- set_bit(PLATFORM_PROFILE_QUIET, choices); ++ set_bit(PLATFORM_PROFILE_LOW_POWER, choices); + set_bit(PLATFORM_PROFILE_BALANCED, choices); + set_bit(PLATFORM_PROFILE_PERFORMANCE, choices); + +@@ -4435,27 +4527,29 @@ static struct attribute *platform_attributes[] = { + &dev_attr_camera.attr, + &dev_attr_cardr.attr, + &dev_attr_touchpad.attr, +- &dev_attr_charge_mode.attr, +- &dev_attr_egpu_enable.attr, +- &dev_attr_egpu_connected.attr, +- &dev_attr_dgpu_disable.attr, +- &dev_attr_gpu_mux_mode.attr, + &dev_attr_lid_resume.attr, + &dev_attr_als_enable.attr, + &dev_attr_fan_boost_mode.attr, +- &dev_attr_throttle_thermal_policy.attr, +- &dev_attr_ppt_pl2_sppt.attr, +- &dev_attr_ppt_pl1_spl.attr, +- &dev_attr_ppt_fppt.attr, +- &dev_attr_ppt_apu_sppt.attr, +- &dev_attr_ppt_platform_sppt.attr, +- &dev_attr_nv_dynamic_boost.attr, +- &dev_attr_nv_temp_target.attr, +- &dev_attr_mcu_powersave.attr, +- &dev_attr_boot_sound.attr, +- &dev_attr_panel_od.attr, +- &dev_attr_mini_led_mode.attr, +- &dev_attr_available_mini_led_mode.attr, ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) ++ &dev_attr_charge_mode.attr, ++ &dev_attr_egpu_enable.attr, ++ &dev_attr_egpu_connected.attr, ++ &dev_attr_dgpu_disable.attr, ++ &dev_attr_gpu_mux_mode.attr, ++ &dev_attr_ppt_pl2_sppt.attr, ++ &dev_attr_ppt_pl1_spl.attr, ++ &dev_attr_ppt_fppt.attr, ++ &dev_attr_ppt_apu_sppt.attr, ++ &dev_attr_ppt_platform_sppt.attr, ++ &dev_attr_nv_dynamic_boost.attr, ++ &dev_attr_nv_temp_target.attr, ++ &dev_attr_mcu_powersave.attr, ++ &dev_attr_boot_sound.attr, ++ &dev_attr_panel_od.attr, ++ &dev_attr_mini_led_mode.attr, ++ &dev_attr_available_mini_led_mode.attr, ++ &dev_attr_throttle_thermal_policy.attr, ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + NULL + }; + +@@ -4477,7 +4571,11 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, + devid = ASUS_WMI_DEVID_LID_RESUME; + else if (attr == &dev_attr_als_enable.attr) + devid = ASUS_WMI_DEVID_ALS_ENABLE; +- else if (attr == &dev_attr_charge_mode.attr) ++ else if (attr == &dev_attr_fan_boost_mode.attr) ++ ok = asus->fan_boost_mode_available; ++ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) ++ if (attr == &dev_attr_charge_mode.attr) + devid = ASUS_WMI_DEVID_CHARGE_MODE; + else if (attr == &dev_attr_egpu_enable.attr) + ok = asus->egpu_enable_available; +@@ -4515,6 +4613,7 @@ static umode_t asus_sysfs_is_visible(struct kobject *kobj, + ok = asus->mini_led_dev_id != 0; + else if (attr == &dev_attr_available_mini_led_mode.attr) + ok = asus->mini_led_dev_id != 0; ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + if (devid != -1) { + ok = 
!(asus_wmi_get_devstate_simple(asus, devid) < 0); +@@ -4770,6 +4869,7 @@ static int asus_wmi_add(struct platform_device *pdev) + } + + /* ensure defaults for tunables */ ++#if IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) + asus->ppt_pl2_sppt = 5; + asus->ppt_pl1_spl = 5; + asus->ppt_apu_sppt = 5; +@@ -4792,17 +4892,18 @@ static int asus_wmi_add(struct platform_device *pdev) + asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX; + else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_GPU_MUX_VIVO)) + asus->gpu_mux_dev = ASUS_WMI_DEVID_GPU_MUX_VIVO; +- +- if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE)) +- asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE; +- else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE2)) +- asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE2; ++#endif /* IS_ENABLED(CONFIG_ASUS_WMI_DEPRECATED_ATTRS) */ + + if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY)) + asus->throttle_thermal_policy_dev = ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY; + else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO)) + asus->throttle_thermal_policy_dev = ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO; + ++ if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE)) ++ asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE; ++ else if (asus_wmi_dev_is_present(asus, ASUS_WMI_DEVID_TUF_RGB_MODE2)) ++ asus->kbd_rgb_dev = ASUS_WMI_DEVID_TUF_RGB_MODE2; ++ + err = fan_boost_mode_check_present(asus); + if (err) + goto fail_fan_boost_mode; +diff --git a/include/linux/platform_data/x86/asus-wmi.h b/include/linux/platform_data/x86/asus-wmi.h +index 8a515179113d..86279da06ea2 100644 +--- a/include/linux/platform_data/x86/asus-wmi.h ++++ b/include/linux/platform_data/x86/asus-wmi.h +@@ -6,6 +6,9 @@ + #include + #include + ++#define ASUS_WMI_MGMT_GUID "97845ED0-4E6D-11DE-8A39-0800200C9A66" ++#define ASUS_ACPI_UID_ASUSWMI "ASUSWMI" ++ + /* WMI Methods */ + #define ASUS_WMI_METHODID_SPEC 0x43455053 /* BIOS SPECification */ + #define ASUS_WMI_METHODID_SFBD 0x44424653 /* Set First Boot Device */ +@@ -73,12 +76,14 @@ + #define ASUS_WMI_DEVID_THROTTLE_THERMAL_POLICY_VIVO 0x00110019 + + /* Misc */ ++#define ASUS_WMI_DEVID_PANEL_HD 0x0005001C + #define ASUS_WMI_DEVID_PANEL_OD 0x00050019 + #define ASUS_WMI_DEVID_CAMERA 0x00060013 + #define ASUS_WMI_DEVID_LID_FLIP 0x00060062 + #define ASUS_WMI_DEVID_LID_FLIP_ROG 0x00060077 + #define ASUS_WMI_DEVID_MINI_LED_MODE 0x0005001E + #define ASUS_WMI_DEVID_MINI_LED_MODE2 0x0005002E ++#define ASUS_WMI_DEVID_SCREEN_AUTO_BRIGHTNESS 0x0005002A + + /* Storage */ + #define ASUS_WMI_DEVID_CARDREADER 0x00080013 +@@ -133,6 +138,16 @@ + /* dgpu on/off */ + #define ASUS_WMI_DEVID_DGPU 0x00090020 + ++/* Intel E-core and P-core configuration in a format 0x0[E]0[P] */ ++#define ASUS_WMI_DEVID_CORES 0x001200D2 ++ /* Maximum Intel E-core and P-core availability */ ++#define ASUS_WMI_DEVID_CORES_MAX 0x001200D3 ++ ++#define ASUS_WMI_DEVID_APU_MEM 0x000600C1 ++ ++#define ASUS_WMI_DEVID_DGPU_BASE_TGP 0x00120099 ++#define ASUS_WMI_DEVID_DGPU_SET_TGP 0x00120098 ++ + /* gpu mux switch, 0 = dGPU, 1 = Optimus */ + #define ASUS_WMI_DEVID_GPU_MUX 0x00090016 + #define ASUS_WMI_DEVID_GPU_MUX_VIVO 0x00090026 +@@ -166,6 +181,7 @@ enum asus_ally_mcu_hack { + #if IS_REACHABLE(CONFIG_ASUS_WMI) + void set_ally_mcu_hack(enum asus_ally_mcu_hack status); + void set_ally_mcu_powersave(bool enabled); ++int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval); + int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval); + int 
asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, u32 *retval); + #else +@@ -179,6 +195,10 @@ static inline int asus_wmi_set_devstate(u32 dev_id, u32 ctrl_param, u32 *retval) + { + return -ENODEV; + } ++static inline int asus_wmi_get_devstate_dsts(u32 dev_id, u32 *retval) ++{ ++ return -ENODEV; ++} + static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, + u32 *retval) + { +@@ -187,6 +207,7 @@ static inline int asus_wmi_evaluate_method(u32 method_id, u32 arg0, u32 arg1, + #endif + + /* To be used by both hid-asus and asus-wmi to determine which controls kbd_brightness */ ++#if IS_REACHABLE(CONFIG_ASUS_WMI) || IS_REACHABLE(CONFIG_HID_ASUS) + static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + { + .matches = { +@@ -225,5 +246,6 @@ static const struct dmi_system_id asus_use_hid_led_dmi_ids[] = { + }, + { }, + }; ++#endif + + #endif /* __PLATFORM_DATA_X86_ASUS_WMI_H */ +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0002-bbr3.patch b/sys-kernel/git-sources/0002-bbr3.patch new file mode 100644 index 0000000..dcc5932 --- /dev/null +++ b/sys-kernel/git-sources/0002-bbr3.patch @@ -0,0 +1,3404 @@ +From 3205f6b619a4a9a62d914442d0925738f05854ac Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:54 +0800 +Subject: [PATCH 2/4] bbr3 + +Signed-off-by: Eric Naim +--- + include/linux/tcp.h | 6 +- + include/net/inet_connection_sock.h | 4 +- + include/net/tcp.h | 73 +- + include/uapi/linux/inet_diag.h | 23 + + include/uapi/linux/rtnetlink.h | 4 +- + include/uapi/linux/tcp.h | 1 + + net/ipv4/Kconfig | 21 +- + net/ipv4/bpf_tcp_ca.c | 4 +- + net/ipv4/tcp.c | 3 + + net/ipv4/tcp_bbr.c | 2232 +++++++++++++++++++++------- + net/ipv4/tcp_cong.c | 1 + + net/ipv4/tcp_input.c | 40 +- + net/ipv4/tcp_minisocks.c | 2 + + net/ipv4/tcp_output.c | 48 +- + net/ipv4/tcp_rate.c | 30 +- + net/ipv4/tcp_timer.c | 4 +- + 16 files changed, 1941 insertions(+), 555 deletions(-) + +diff --git a/include/linux/tcp.h b/include/linux/tcp.h +index 57e478bfaef2..0ea92792629c 100644 +--- a/include/linux/tcp.h ++++ b/include/linux/tcp.h +@@ -247,7 +247,8 @@ struct tcp_sock { + void (*tcp_clean_acked)(struct sock *sk, u32 acked_seq); + #endif + u32 snd_ssthresh; /* Slow start size threshold */ +- u8 recvmsg_inq : 1;/* Indicate # of bytes in queue upon recvmsg */ ++ u32 recvmsg_inq : 1,/* Indicate # of bytes in queue upon recvmsg */ ++ fast_ack_mode:1;/* ack ASAP if >1 rcv_mss received? */ + __cacheline_group_end(tcp_sock_read_rx); + + /* TX read-write hotpath cache lines */ +@@ -304,7 +305,8 @@ struct tcp_sock { + */ + struct tcp_options_received rx_opt; + u8 nonagle : 4,/* Disable Nagle algorithm? */ +- rate_app_limited:1; /* rate_{delivered,interval_us} limited? */ ++ rate_app_limited:1, /* rate_{delivered,interval_us} limited? */ ++ tlp_orig_data_app_limited:1; /* app-limited before TLP rtx? 
*/ + __cacheline_group_end(tcp_sock_write_txrx); + + /* RX read-write hotpath cache lines */ +diff --git a/include/net/inet_connection_sock.h b/include/net/inet_connection_sock.h +index 1735db332aab..2c4a94af7093 100644 +--- a/include/net/inet_connection_sock.h ++++ b/include/net/inet_connection_sock.h +@@ -132,8 +132,8 @@ struct inet_connection_sock { + u32 icsk_probes_tstamp; + u32 icsk_user_timeout; + +- u64 icsk_ca_priv[104 / sizeof(u64)]; +-#define ICSK_CA_PRIV_SIZE sizeof_field(struct inet_connection_sock, icsk_ca_priv) ++#define ICSK_CA_PRIV_SIZE (144) ++ u64 icsk_ca_priv[ICSK_CA_PRIV_SIZE / sizeof(u64)]; + }; + + #define ICSK_TIME_RETRANS 1 /* Retransmit timer */ +diff --git a/include/net/tcp.h b/include/net/tcp.h +index 526a26e7a150..564084c537c7 100644 +--- a/include/net/tcp.h ++++ b/include/net/tcp.h +@@ -378,11 +378,14 @@ static inline void tcp_dec_quickack_mode(struct sock *sk) + #define TCP_ECN_DEMAND_CWR BIT(2) + #define TCP_ECN_SEEN BIT(3) + #define TCP_ECN_MODE_ACCECN BIT(4) ++#define TCP_ECN_LOW BIT(5) ++#define TCP_ECN_ECT_PERMANENT BIT(6) + + #define TCP_ECN_DISABLED 0 + #define TCP_ECN_MODE_PENDING (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + #define TCP_ECN_MODE_ANY (TCP_ECN_MODE_RFC3168 | TCP_ECN_MODE_ACCECN) + ++ + static inline bool tcp_ecn_mode_any(const struct tcp_sock *tp) + { + return tp->ecn_flags & TCP_ECN_MODE_ANY; +@@ -840,6 +843,15 @@ static inline void tcp_fast_path_check(struct sock *sk) + + u32 tcp_delack_max(const struct sock *sk); + ++static inline void tcp_set_ecn_low_from_dst(struct sock *sk, ++ const struct dst_entry *dst) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ ++ if (dst_feature(dst, RTAX_FEATURE_ECN_LOW)) ++ tp->ecn_flags |= TCP_ECN_LOW; ++} ++ + /* Compute the actual rto_min value */ + static inline u32 tcp_rto_min(const struct sock *sk) + { +@@ -945,6 +957,11 @@ static inline u32 tcp_stamp_us_delta(u64 t1, u64 t0) + return max_t(s64, t1 - t0, 0); + } + ++static inline u32 tcp_stamp32_us_delta(u32 t1, u32 t0) ++{ ++ return max_t(s32, t1 - t0, 0); ++} ++ + /* provide the departure time in us unit */ + static inline u64 tcp_skb_timestamp_us(const struct sk_buff *skb) + { +@@ -1043,9 +1060,14 @@ struct tcp_skb_cb { + /* pkts S/ACKed so far upon tx of skb, incl retrans: */ + __u32 delivered; + /* start of send pipeline phase */ +- u64 first_tx_mstamp; ++ u32 first_tx_mstamp; + /* when we reached the "delivered" count */ +- u64 delivered_mstamp; ++ u32 delivered_mstamp; ++#define TCPCB_IN_FLIGHT_BITS 20 ++#define TCPCB_IN_FLIGHT_MAX ((1U << TCPCB_IN_FLIGHT_BITS) - 1) ++ u32 in_flight:20, /* packets in flight at transmit */ ++ unused2:12; ++ u32 lost; /* packets lost so far upon tx of skb */ + } tx; /* only used for outgoing skbs */ + union { + struct inet_skb_parm h4; +@@ -1158,6 +1180,7 @@ enum tcp_ca_event { + CA_EVENT_LOSS, /* loss timeout */ + CA_EVENT_ECN_NO_CE, /* ECT set, but not CE marked */ + CA_EVENT_ECN_IS_CE, /* received CE marked IP packet */ ++ CA_EVENT_TLP_RECOVERY, /* a lost segment was repaired by TLP probe */ + }; + + /* Information about inbound ACK, passed to cong_ops->in_ack_event() */ +@@ -1180,7 +1203,11 @@ enum tcp_ca_ack_event_flags { + #define TCP_CONG_NON_RESTRICTED BIT(0) + /* Requires ECN/ECT set on all packets */ + #define TCP_CONG_NEEDS_ECN BIT(1) +-#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | TCP_CONG_NEEDS_ECN) ++/* Wants notification of CE events (CA_EVENT_ECN_IS_CE, CA_EVENT_ECN_NO_CE). 
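 * (A congestion control module opts in by setting this bit in the .flags of
 * its struct tcp_congestion_ops; the CE/non-CE transitions are then delivered
 * as CA_EVENT_ECN_IS_CE / CA_EVENT_ECN_NO_CE via the cwnd_event() hook, which
 * is how bbr_cwnd_event() in tcp_bbr.c consumes them later in this patch.)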
*/ ++#define TCP_CONG_WANTS_CE_EVENTS BIT(2) ++#define TCP_CONG_MASK (TCP_CONG_NON_RESTRICTED | \ ++ TCP_CONG_NEEDS_ECN | \ ++ TCP_CONG_WANTS_CE_EVENTS) + + union tcp_cc_info; + +@@ -1200,10 +1227,13 @@ struct ack_sample { + */ + struct rate_sample { + u64 prior_mstamp; /* starting timestamp for interval */ ++ u32 prior_lost; /* tp->lost at "prior_mstamp" */ + u32 prior_delivered; /* tp->delivered at "prior_mstamp" */ + u32 prior_delivered_ce;/* tp->delivered_ce at "prior_mstamp" */ ++ u32 tx_in_flight; /* packets in flight at starting timestamp */ ++ s32 lost; /* number of packets lost over interval */ + s32 delivered; /* number of packets delivered over interval */ +- s32 delivered_ce; /* number of packets delivered w/ CE marks*/ ++ s32 delivered_ce; /* packets delivered w/ CE mark over interval */ + long interval_us; /* time for tp->delivered to incr "delivered" */ + u32 snd_interval_us; /* snd interval for delivered packets */ + u32 rcv_interval_us; /* rcv interval for delivered packets */ +@@ -1214,7 +1244,9 @@ struct rate_sample { + u32 last_end_seq; /* end_seq of most recently ACKed packet */ + bool is_app_limited; /* is sample from packet with bubble in pipe? */ + bool is_retrans; /* is sample from retransmission? */ ++ bool is_acking_tlp_retrans_seq; /* ACKed a TLP retransmit sequence? */ + bool is_ack_delayed; /* is this (likely) a delayed ACK? */ ++ bool is_ece; /* did this ACK have ECN marked? */ + }; + + struct tcp_congestion_ops { +@@ -1238,8 +1270,11 @@ struct tcp_congestion_ops { + /* hook for packet ack accounting (optional) */ + void (*pkts_acked)(struct sock *sk, const struct ack_sample *sample); + +- /* override sysctl_tcp_min_tso_segs */ +- u32 (*min_tso_segs)(struct sock *sk); ++ /* pick target number of segments per TSO/GSO skb (optional): */ ++ u32 (*tso_segs)(struct sock *sk, unsigned int mss_now); ++ ++ /* react to a specific lost skb (optional) */ ++ void (*skb_marked_lost)(struct sock *sk, const struct sk_buff *skb); + + /* call when packets are delivered to update cwnd and pacing rate, + * after all the ca_state processing. (optional) +@@ -1305,6 +1340,14 @@ static inline char *tcp_ca_get_name_by_key(u32 key, char *buffer) + } + #endif + ++static inline bool tcp_ca_wants_ce_events(const struct sock *sk) ++{ ++ const struct inet_connection_sock *icsk = inet_csk(sk); ++ ++ return icsk->icsk_ca_ops->flags & (TCP_CONG_NEEDS_ECN | ++ TCP_CONG_WANTS_CE_EVENTS); ++} ++ + static inline bool tcp_ca_needs_ecn(const struct sock *sk) + { + const struct inet_connection_sock *icsk = inet_csk(sk); +@@ -1324,6 +1367,7 @@ static inline void tcp_ca_event(struct sock *sk, const enum tcp_ca_event event) + void tcp_set_ca_state(struct sock *sk, const u8 ca_state); + + /* From tcp_rate.c */ ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb); + void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + struct rate_sample *rs); +@@ -1336,6 +1380,21 @@ static inline bool tcp_skb_sent_after(u64 t1, u64 t2, u32 seq1, u32 seq2) + return t1 > t2 || (t1 == t2 && after(seq1, seq2)); + } + ++/* If a retransmit failed due to local qdisc congestion or other local issues, ++ * then we may have called tcp_set_skb_tso_segs() to increase the number of ++ * segments in the skb without increasing the tx.in_flight. In all other cases, ++ * the tx.in_flight should be at least as big as the pcount of the sk_buff. 
We ++ * do not have the state to know whether a retransmit failed due to local qdisc ++ * congestion or other local issues, so to avoid spurious warnings we consider ++ * that any skb marked lost may have suffered that fate. ++ */ ++static inline bool tcp_skb_tx_in_flight_is_suspicious(u32 skb_pcount, ++ u32 skb_sacked_flags, ++ u32 tx_in_flight) ++{ ++ return (skb_pcount > tx_in_flight) && !(skb_sacked_flags & TCPCB_LOST); ++} ++ + /* These functions determine how the current flow behaves in respect of SACK + * handling. SACK is negotiated with the peer, and therefore it can vary + * between different flows. +@@ -2483,7 +2542,7 @@ struct tcp_plb_state { + u8 consec_cong_rounds:5, /* consecutive congested rounds */ + unused:3; + u32 pause_until; /* jiffies32 when PLB can resume rerouting */ +-}; ++} __attribute__ ((__packed__)); + + static inline void tcp_plb_init(const struct sock *sk, + struct tcp_plb_state *plb) +diff --git a/include/uapi/linux/inet_diag.h b/include/uapi/linux/inet_diag.h +index 86bb2e8b17c9..9d9a3eb2ce9b 100644 +--- a/include/uapi/linux/inet_diag.h ++++ b/include/uapi/linux/inet_diag.h +@@ -229,6 +229,29 @@ struct tcp_bbr_info { + __u32 bbr_min_rtt; /* min-filtered RTT in uSec */ + __u32 bbr_pacing_gain; /* pacing gain shifted left 8 bits */ + __u32 bbr_cwnd_gain; /* cwnd gain shifted left 8 bits */ ++ __u32 bbr_bw_hi_lsb; /* lower 32 bits of bw_hi */ ++ __u32 bbr_bw_hi_msb; /* upper 32 bits of bw_hi */ ++ __u32 bbr_bw_lo_lsb; /* lower 32 bits of bw_lo */ ++ __u32 bbr_bw_lo_msb; /* upper 32 bits of bw_lo */ ++ __u8 bbr_mode; /* current bbr_mode in state machine */ ++ __u8 bbr_phase; /* current state machine phase */ ++ __u8 unused1; /* alignment padding; not used yet */ ++ __u8 bbr_version; /* BBR algorithm version */ ++ __u32 bbr_inflight_lo; /* lower short-term data volume bound */ ++ __u32 bbr_inflight_hi; /* higher long-term data volume bound */ ++ __u32 bbr_extra_acked; /* max excess packets ACKed in epoch */ ++}; ++ ++/* TCP BBR congestion control bbr_phase as reported in netlink/ss stats. 
*/ ++enum tcp_bbr_phase { ++ BBR_PHASE_INVALID = 0, ++ BBR_PHASE_STARTUP = 1, ++ BBR_PHASE_DRAIN = 2, ++ BBR_PHASE_PROBE_RTT = 3, ++ BBR_PHASE_PROBE_BW_UP = 4, ++ BBR_PHASE_PROBE_BW_DOWN = 5, ++ BBR_PHASE_PROBE_BW_CRUISE = 6, ++ BBR_PHASE_PROBE_BW_REFILL = 7, + }; + + union tcp_cc_info { +diff --git a/include/uapi/linux/rtnetlink.h b/include/uapi/linux/rtnetlink.h +index dab9493c791b..cce4975fdcfe 100644 +--- a/include/uapi/linux/rtnetlink.h ++++ b/include/uapi/linux/rtnetlink.h +@@ -517,12 +517,14 @@ enum { + #define RTAX_FEATURE_TIMESTAMP (1 << 2) /* unused */ + #define RTAX_FEATURE_ALLFRAG (1 << 3) /* unused */ + #define RTAX_FEATURE_TCP_USEC_TS (1 << 4) ++#define RTAX_FEATURE_ECN_LOW (1 << 5) + + #define RTAX_FEATURE_MASK (RTAX_FEATURE_ECN | \ + RTAX_FEATURE_SACK | \ + RTAX_FEATURE_TIMESTAMP | \ + RTAX_FEATURE_ALLFRAG | \ +- RTAX_FEATURE_TCP_USEC_TS) ++ RTAX_FEATURE_TCP_USEC_TS | \ ++ RTAX_FEATURE_ECN_LOW) + + struct rta_session { + __u8 proto; +diff --git a/include/uapi/linux/tcp.h b/include/uapi/linux/tcp.h +index bdac8c42fa82..362644a272ba 100644 +--- a/include/uapi/linux/tcp.h ++++ b/include/uapi/linux/tcp.h +@@ -185,6 +185,7 @@ enum tcp_fastopen_client_fail { + #define TCPI_OPT_SYN_DATA 32 /* SYN-ACK acked data in SYN sent or rcvd */ + #define TCPI_OPT_USEC_TS 64 /* usec timestamps */ + #define TCPI_OPT_TFO_CHILD 128 /* child from a Fast Open option on SYN */ ++#define TCPI_OPT_ECN_LOW 256 /* Low-latency ECN configured at init */ + + /* + * Sender's congestion state indicating normal or abnormal situations +diff --git a/net/ipv4/Kconfig b/net/ipv4/Kconfig +index 12850a277251..3b8b96692fb4 100644 +--- a/net/ipv4/Kconfig ++++ b/net/ipv4/Kconfig +@@ -669,15 +669,18 @@ config TCP_CONG_BBR + default n + help + +- BBR (Bottleneck Bandwidth and RTT) TCP congestion control aims to +- maximize network utilization and minimize queues. It builds an explicit +- model of the bottleneck delivery rate and path round-trip propagation +- delay. It tolerates packet loss and delay unrelated to congestion. It +- can operate over LAN, WAN, cellular, wifi, or cable modem links. It can +- coexist with flows that use loss-based congestion control, and can +- operate with shallow buffers, deep buffers, bufferbloat, policers, or +- AQM schemes that do not provide a delay signal. It requires the fq +- ("Fair Queue") pacing packet scheduler. ++ BBR (Bottleneck Bandwidth and RTT) TCP congestion control is a ++ model-based congestion control algorithm that aims to maximize ++ network utilization, keep queues and retransmit rates low, and to be ++ able to coexist with Reno/CUBIC in common scenarios. It builds an ++ explicit model of the network path. It tolerates a targeted degree ++ of random packet loss and delay. It can operate over LAN, WAN, ++ cellular, wifi, or cable modem links, and can use shallow-threshold ++ ECN signals. It can coexist to some degree with flows that use ++ loss-based congestion control, and can operate with shallow buffers, ++ deep buffers, bufferbloat, policers, or AQM schemes that do not ++ provide a delay signal. It requires pacing, using either TCP internal ++ pacing or the fq ("Fair Queue") pacing packet scheduler. 
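For orientation only, and not part of the patch itself: once this option is enabled, a program can request the congestion control per socket through the long-standing TCP_CONGESTION socket option, while the system-wide default is picked by the net.ipv4.tcp_congestion_control sysctl. A minimal userspace sketch, assuming glibc's netinet/tcp.h exposes TCP_CONGESTION:

#include <stdio.h>
#include <string.h>
#include <netinet/in.h>
#include <netinet/tcp.h>
#include <sys/socket.h>

int main(void)
{
	int fd = socket(AF_INET, SOCK_STREAM, 0);
	char buf[16] = "";
	socklen_t len = sizeof(buf);

	if (fd < 0)
		return 1;
	/* Ask for BBR on this socket only; this fails if the chosen
	 * congestion control is neither built in nor loaded.
	 */
	if (setsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, "bbr", strlen("bbr")))
		perror("setsockopt(TCP_CONGESTION)");
	/* Read back what the socket is actually using. */
	if (!getsockopt(fd, IPPROTO_TCP, TCP_CONGESTION, buf, &len))
		printf("congestion control: %s\n", buf);
	return 0;
}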
+ + choice + prompt "Default TCP congestion control" +diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c +index e01492234b0b..27893b774e08 100644 +--- a/net/ipv4/bpf_tcp_ca.c ++++ b/net/ipv4/bpf_tcp_ca.c +@@ -280,7 +280,7 @@ static void bpf_tcp_ca_pkts_acked(struct sock *sk, const struct ack_sample *samp + { + } + +-static u32 bpf_tcp_ca_min_tso_segs(struct sock *sk) ++static u32 bpf_tcp_ca_tso_segs(struct sock *sk, unsigned int mss_now) + { + return 0; + } +@@ -315,7 +315,7 @@ static struct tcp_congestion_ops __bpf_ops_tcp_congestion_ops = { + .cwnd_event = bpf_tcp_ca_cwnd_event, + .in_ack_event = bpf_tcp_ca_in_ack_event, + .pkts_acked = bpf_tcp_ca_pkts_acked, +- .min_tso_segs = bpf_tcp_ca_min_tso_segs, ++ .tso_segs = bpf_tcp_ca_tso_segs, + .cong_control = bpf_tcp_ca_cong_control, + .undo_cwnd = bpf_tcp_ca_undo_cwnd, + .sndbuf_expand = bpf_tcp_ca_sndbuf_expand, +diff --git a/net/ipv4/tcp.c b/net/ipv4/tcp.c +index 71a956fbfc55..f9866bd97ac4 100644 +--- a/net/ipv4/tcp.c ++++ b/net/ipv4/tcp.c +@@ -3439,6 +3439,7 @@ int tcp_disconnect(struct sock *sk, int flags) + tp->rx_opt.dsack = 0; + tp->rx_opt.num_sacks = 0; + tp->rcv_ooopack = 0; ++ tp->fast_ack_mode = 0; + + + /* Clean up fastopen related fields */ +@@ -4191,6 +4192,8 @@ void tcp_get_info(struct sock *sk, struct tcp_info *info) + info->tcpi_options |= TCPI_OPT_ECN; + if (tp->ecn_flags & TCP_ECN_SEEN) + info->tcpi_options |= TCPI_OPT_ECN_SEEN; ++ if (tp->ecn_flags & TCP_ECN_LOW) ++ info->tcpi_options |= TCPI_OPT_ECN_LOW; + if (tp->syn_data_acked) + info->tcpi_options |= TCPI_OPT_SYN_DATA; + if (tp->tcp_usec_ts) +diff --git a/net/ipv4/tcp_bbr.c b/net/ipv4/tcp_bbr.c +index 760941e55153..066da5e5747c 100644 +--- a/net/ipv4/tcp_bbr.c ++++ b/net/ipv4/tcp_bbr.c +@@ -1,18 +1,19 @@ +-/* Bottleneck Bandwidth and RTT (BBR) congestion control ++/* BBR (Bottleneck Bandwidth and RTT) congestion control + * +- * BBR congestion control computes the sending rate based on the delivery +- * rate (throughput) estimated from ACKs. In a nutshell: ++ * BBR is a model-based congestion control algorithm that aims for low queues, ++ * low loss, and (bounded) Reno/CUBIC coexistence. To maintain a model of the ++ * network path, it uses measurements of bandwidth and RTT, as well as (if they ++ * occur) packet loss and/or shallow-threshold ECN signals. Note that although ++ * it can use ECN or loss signals explicitly, it does not require either; it ++ * can bound its in-flight data based on its estimate of the BDP. + * +- * On each ACK, update our model of the network path: +- * bottleneck_bandwidth = windowed_max(delivered / elapsed, 10 round trips) +- * min_rtt = windowed_min(rtt, 10 seconds) +- * pacing_rate = pacing_gain * bottleneck_bandwidth +- * cwnd = max(cwnd_gain * bottleneck_bandwidth * min_rtt, 4) +- * +- * The core algorithm does not react directly to packet losses or delays, +- * although BBR may adjust the size of next send per ACK when loss is +- * observed, or adjust the sending rate if it estimates there is a +- * traffic policer, in order to keep the drop rate reasonable. ++ * The model has both higher and lower bounds for the operating range: ++ * lo: bw_lo, inflight_lo: conservative short-term lower bound ++ * hi: bw_hi, inflight_hi: robust long-term upper bound ++ * The bandwidth-probing time scale is (a) extended dynamically based on ++ * estimated BDP to improve coexistence with Reno/CUBIC; (b) bounded by ++ * an interactive wall-clock time-scale to be more scalable and responsive ++ * than Reno and CUBIC. 
+ * + * Here is a state transition diagram for BBR: + * +@@ -65,6 +66,13 @@ + #include + #include + ++#include ++#include "tcp_dctcp.h" ++ ++#define BBR_VERSION 3 ++ ++#define bbr_param(sk,name) (bbr_ ## name) ++ + /* Scale factor for rate in pkt/uSec unit to avoid truncation in bandwidth + * estimation. The rate unit ~= (1500 bytes / 1 usec / 2^24) ~= 715 bps. + * This handles bandwidths from 0.06pps (715bps) to 256Mpps (3Tbps) in a u32. +@@ -85,36 +93,41 @@ enum bbr_mode { + BBR_PROBE_RTT, /* cut inflight to min to probe min_rtt */ + }; + ++/* How does the incoming ACK stream relate to our bandwidth probing? */ ++enum bbr_ack_phase { ++ BBR_ACKS_INIT, /* not probing; not getting probe feedback */ ++ BBR_ACKS_REFILLING, /* sending at est. bw to fill pipe */ ++ BBR_ACKS_PROBE_STARTING, /* inflight rising to probe bw */ ++ BBR_ACKS_PROBE_FEEDBACK, /* getting feedback from bw probing */ ++ BBR_ACKS_PROBE_STOPPING, /* stopped probing; still getting feedback */ ++}; ++ + /* BBR congestion control block */ + struct bbr { + u32 min_rtt_us; /* min RTT in min_rtt_win_sec window */ + u32 min_rtt_stamp; /* timestamp of min_rtt_us */ + u32 probe_rtt_done_stamp; /* end time for BBR_PROBE_RTT mode */ +- struct minmax bw; /* Max recent delivery rate in pkts/uS << 24 */ +- u32 rtt_cnt; /* count of packet-timed rounds elapsed */ ++ u32 probe_rtt_min_us; /* min RTT in probe_rtt_win_ms win */ ++ u32 probe_rtt_min_stamp; /* timestamp of probe_rtt_min_us*/ + u32 next_rtt_delivered; /* scb->tx.delivered at end of round */ + u64 cycle_mstamp; /* time of this cycle phase start */ +- u32 mode:3, /* current bbr_mode in state machine */ ++ u32 mode:2, /* current bbr_mode in state machine */ + prev_ca_state:3, /* CA state on previous ACK */ +- packet_conservation:1, /* use packet conservation? */ + round_start:1, /* start of packet-timed tx->ack round? */ ++ ce_state:1, /* If most recent data has CE bit set */ ++ bw_probe_up_rounds:5, /* cwnd-limited rounds in PROBE_UP */ ++ try_fast_path:1, /* can we take fast path? */ + idle_restart:1, /* restarting after idle? */ + probe_rtt_round_done:1, /* a BBR_PROBE_RTT round at 4 pkts? */ +- unused:13, +- lt_is_sampling:1, /* taking long-term ("LT") samples now? */ +- lt_rtt_cnt:7, /* round trips in long-term interval */ +- lt_use_bw:1; /* use lt_bw as our bw estimate? */ +- u32 lt_bw; /* LT est delivery rate in pkts/uS << 24 */ +- u32 lt_last_delivered; /* LT intvl start: tp->delivered */ +- u32 lt_last_stamp; /* LT intvl start: tp->delivered_mstamp */ +- u32 lt_last_lost; /* LT intvl start: tp->lost */ ++ init_cwnd:7, /* initial cwnd */ ++ unused_1:10; + u32 pacing_gain:10, /* current gain for setting pacing rate */ + cwnd_gain:10, /* current gain for setting cwnd */ + full_bw_reached:1, /* reached full bw in Startup? */ + full_bw_cnt:2, /* number of rounds without large bw gains */ +- cycle_idx:3, /* current index in pacing_gain cycle array */ ++ cycle_idx:2, /* current index in pacing_gain cycle array */ + has_seen_rtt:1, /* have we seen an RTT sample yet? */ +- unused_b:5; ++ unused_2:6; + u32 prior_cwnd; /* prior cwnd upon entering loss recovery */ + u32 full_bw; /* recent bw, to estimate if pipe is full */ + +@@ -124,19 +137,67 @@ struct bbr { + u32 ack_epoch_acked:20, /* packets (S)ACKed in sampling epoch */ + extra_acked_win_rtts:5, /* age of extra_acked, in round trips */ + extra_acked_win_idx:1, /* current index in extra_acked array */ +- unused_c:6; ++ /* BBR v3 state: */ ++ full_bw_now:1, /* recently reached full bw plateau? 
*/ ++ startup_ecn_rounds:2, /* consecutive hi ECN STARTUP rounds */ ++ loss_in_cycle:1, /* packet loss in this cycle? */ ++ ecn_in_cycle:1, /* ECN in this cycle? */ ++ unused_3:1; ++ u32 loss_round_delivered; /* scb->tx.delivered ending loss round */ ++ u32 undo_bw_lo; /* bw_lo before latest losses */ ++ u32 undo_inflight_lo; /* inflight_lo before latest losses */ ++ u32 undo_inflight_hi; /* inflight_hi before latest losses */ ++ u32 bw_latest; /* max delivered bw in last round trip */ ++ u32 bw_lo; /* lower bound on sending bandwidth */ ++ u32 bw_hi[2]; /* max recent measured bw sample */ ++ u32 inflight_latest; /* max delivered data in last round trip */ ++ u32 inflight_lo; /* lower bound of inflight data range */ ++ u32 inflight_hi; /* upper bound of inflight data range */ ++ u32 bw_probe_up_cnt; /* packets delivered per inflight_hi incr */ ++ u32 bw_probe_up_acks; /* packets (S)ACKed since inflight_hi incr */ ++ u32 probe_wait_us; /* PROBE_DOWN until next clock-driven probe */ ++ u32 prior_rcv_nxt; /* tp->rcv_nxt when CE state last changed */ ++ u32 ecn_eligible:1, /* sender can use ECN (RTT, handshake)? */ ++ ecn_alpha:9, /* EWMA delivered_ce/delivered; 0..256 */ ++ bw_probe_samples:1, /* rate samples reflect bw probing? */ ++ prev_probe_too_high:1, /* did last PROBE_UP go too high? */ ++ stopped_risky_probe:1, /* last PROBE_UP stopped due to risk? */ ++ rounds_since_probe:8, /* packet-timed rounds since probed bw */ ++ loss_round_start:1, /* loss_round_delivered round trip? */ ++ loss_in_round:1, /* loss marked in this round trip? */ ++ ecn_in_round:1, /* ECN marked in this round trip? */ ++ ack_phase:3, /* bbr_ack_phase: meaning of ACKs */ ++ loss_events_in_round:4,/* losses in STARTUP round */ ++ initialized:1; /* has bbr_init() been called? */ ++ u32 alpha_last_delivered; /* tp->delivered at alpha update */ ++ u32 alpha_last_delivered_ce; /* tp->delivered_ce at alpha update */ ++ ++ u8 unused_4; /* to preserve alignment */ ++ struct tcp_plb_state plb; + }; + +-#define CYCLE_LEN 8 /* number of phases in a pacing gain cycle */ ++struct bbr_context { ++ u32 sample_bw; ++}; + +-/* Window length of bw filter (in rounds): */ +-static const int bbr_bw_rtts = CYCLE_LEN + 2; + /* Window length of min_rtt filter (in sec): */ + static const u32 bbr_min_rtt_win_sec = 10; + /* Minimum time (in ms) spent at bbr_cwnd_min_target in BBR_PROBE_RTT mode: */ + static const u32 bbr_probe_rtt_mode_ms = 200; +-/* Skip TSO below the following bandwidth (bits/sec): */ +-static const int bbr_min_tso_rate = 1200000; ++/* Window length of probe_rtt_min_us filter (in ms), and consequently the ++ * typical interval between PROBE_RTT mode entries. The default is 5000ms. ++ * Note that bbr_probe_rtt_win_ms must be <= bbr_min_rtt_win_sec * MSEC_PER_SEC ++ */ ++static const u32 bbr_probe_rtt_win_ms = 5000; ++/* Proportion of cwnd to estimated BDP in PROBE_RTT, in units of BBR_UNIT: */ ++static const u32 bbr_probe_rtt_cwnd_gain = BBR_UNIT * 1 / 2; ++ ++/* Use min_rtt to help adapt TSO burst size, with smaller min_rtt resulting ++ * in bigger TSO bursts. We cut the RTT-based allowance in half ++ * for every 2^9 usec (aka 512 us) of RTT, so that the RTT-based allowance ++ * is below 1500 bytes after 6 * ~500 usec = 3ms. ++ */ ++static const u32 bbr_tso_rtt_shift = 9; + + /* Pace at ~1% below estimated bw, on average, to reduce queue at bottleneck. 
+ * In order to help drive the network toward lower queues and low latency while +@@ -146,13 +207,15 @@ static const int bbr_min_tso_rate = 1200000; + */ + static const int bbr_pacing_margin_percent = 1; + +-/* We use a high_gain value of 2/ln(2) because it's the smallest pacing gain ++/* We use a startup_pacing_gain of 4*ln(2) because it's the smallest value + * that will allow a smoothly increasing pacing rate that will double each RTT + * and send the same number of packets per RTT that an un-paced, slow-starting + * Reno or CUBIC flow would: + */ +-static const int bbr_high_gain = BBR_UNIT * 2885 / 1000 + 1; +-/* The pacing gain of 1/high_gain in BBR_DRAIN is calculated to typically drain ++static const int bbr_startup_pacing_gain = BBR_UNIT * 277 / 100 + 1; ++/* The gain for deriving startup cwnd: */ ++static const int bbr_startup_cwnd_gain = BBR_UNIT * 2; ++/* The pacing gain in BBR_DRAIN is calculated to typically drain + * the queue created in BBR_STARTUP in a single round: + */ + static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; +@@ -160,13 +223,17 @@ static const int bbr_drain_gain = BBR_UNIT * 1000 / 2885; + static const int bbr_cwnd_gain = BBR_UNIT * 2; + /* The pacing_gain values for the PROBE_BW gain cycle, to discover/share bw: */ + static const int bbr_pacing_gain[] = { +- BBR_UNIT * 5 / 4, /* probe for more available bw */ +- BBR_UNIT * 3 / 4, /* drain queue and/or yield bw to other flows */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT, /* cruise at 1.0*bw to utilize pipe, */ +- BBR_UNIT, BBR_UNIT, BBR_UNIT /* without creating excess queue... */ ++ BBR_UNIT * 5 / 4, /* UP: probe for more available bw */ ++ BBR_UNIT * 91 / 100, /* DOWN: drain queue and/or yield bw */ ++ BBR_UNIT, /* CRUISE: try to use pipe w/ some headroom */ ++ BBR_UNIT, /* REFILL: refill pipe to estimated 100% */ ++}; ++enum bbr_pacing_gain_phase { ++ BBR_BW_PROBE_UP = 0, /* push up inflight to probe for bw/vol */ ++ BBR_BW_PROBE_DOWN = 1, /* drain excess inflight from the queue */ ++ BBR_BW_PROBE_CRUISE = 2, /* use pipe, w/ headroom in queue/pipe */ ++ BBR_BW_PROBE_REFILL = 3, /* refill the pipe again to 100% */ + }; +-/* Randomize the starting gain cycling phase over N phases: */ +-static const u32 bbr_cycle_rand = 7; + + /* Try to keep at least this many packets in flight, if things go smoothly. For + * smooth functioning, a sliding window protocol ACKing every other packet +@@ -174,24 +241,12 @@ static const u32 bbr_cycle_rand = 7; + */ + static const u32 bbr_cwnd_min_target = 4; + +-/* To estimate if BBR_STARTUP mode (i.e. high_gain) has filled pipe... */ ++/* To estimate if BBR_STARTUP or BBR_BW_PROBE_UP has filled pipe... */ + /* If bw has increased significantly (1.25x), there may be more bw available: */ + static const u32 bbr_full_bw_thresh = BBR_UNIT * 5 / 4; + /* But after 3 rounds w/o significant bw growth, estimate pipe is full: */ + static const u32 bbr_full_bw_cnt = 3; + +-/* "long-term" ("LT") bandwidth estimator parameters... 
*/ +-/* The minimum number of rounds in an LT bw sampling interval: */ +-static const u32 bbr_lt_intvl_min_rtts = 4; +-/* If lost/delivered ratio > 20%, interval is "lossy" and we may be policed: */ +-static const u32 bbr_lt_loss_thresh = 50; +-/* If 2 intervals have a bw ratio <= 1/8, their bw is "consistent": */ +-static const u32 bbr_lt_bw_ratio = BBR_UNIT / 8; +-/* If 2 intervals have a bw diff <= 4 Kbit/sec their bw is "consistent": */ +-static const u32 bbr_lt_bw_diff = 4000 / 8; +-/* If we estimate we're policed, use lt_bw for this many round trips: */ +-static const u32 bbr_lt_bw_max_rtts = 48; +- + /* Gain factor for adding extra_acked to target cwnd: */ + static const int bbr_extra_acked_gain = BBR_UNIT; + /* Window length of extra_acked window. */ +@@ -201,8 +256,122 @@ static const u32 bbr_ack_epoch_acked_reset_thresh = 1U << 20; + /* Time period for clamping cwnd increment due to ack aggregation */ + static const u32 bbr_extra_acked_max_us = 100 * 1000; + ++/* Flags to control BBR ECN-related behavior... */ ++ ++/* Ensure ACKs only ACK packets with consistent ECN CE status? */ ++static const bool bbr_precise_ece_ack = true; ++ ++/* Max RTT (in usec) at which to use sender-side ECN logic. ++ * Disabled when 0 (ECN allowed at any RTT). ++ */ ++static const u32 bbr_ecn_max_rtt_us = 5000; ++ ++/* On losses, scale down inflight and pacing rate by beta scaled by BBR_SCALE. ++ * No loss response when 0. ++ */ ++static const u32 bbr_beta = BBR_UNIT * 30 / 100; ++ ++/* Gain factor for ECN mark ratio samples, scaled by BBR_SCALE (1/16 = 6.25%) */ ++static const u32 bbr_ecn_alpha_gain = BBR_UNIT * 1 / 16; ++ ++/* The initial value for ecn_alpha; 1.0 allows a flow to respond quickly ++ * to congestion if the bottleneck is congested when the flow starts up. ++ */ ++static const u32 bbr_ecn_alpha_init = BBR_UNIT; ++ ++/* On ECN, cut inflight_lo to (1 - ecn_factor * ecn_alpha) scaled by BBR_SCALE. ++ * No ECN based bounding when 0. ++ */ ++static const u32 bbr_ecn_factor = BBR_UNIT * 1 / 3; /* 1/3 = 33% */ ++ ++/* Estimate bw probing has gone too far if CE ratio exceeds this threshold. ++ * Scaled by BBR_SCALE. Disabled when 0. ++ */ ++static const u32 bbr_ecn_thresh = BBR_UNIT * 1 / 2; /* 1/2 = 50% */ ++ ++/* If non-zero, if in a cycle with no losses but some ECN marks, after ECN ++ * clears then make the first round's increment to inflight_hi the following ++ * fraction of inflight_hi. ++ */ ++static const u32 bbr_ecn_reprobe_gain = BBR_UNIT * 1 / 2; ++ ++/* Estimate bw probing has gone too far if loss rate exceeds this level. */ ++static const u32 bbr_loss_thresh = BBR_UNIT * 2 / 100; /* 2% loss */ ++ ++/* Slow down for a packet loss recovered by TLP? */ ++static const bool bbr_loss_probe_recovery = true; ++ ++/* Exit STARTUP if number of loss marking events in a Recovery round is >= N, ++ * and loss rate is higher than bbr_loss_thresh. ++ * Disabled if 0. ++ */ ++static const u32 bbr_full_loss_cnt = 6; ++ ++/* Exit STARTUP if number of round trips with ECN mark rate above ecn_thresh ++ * meets this count. ++ */ ++static const u32 bbr_full_ecn_cnt = 2; ++ ++/* Fraction of unutilized headroom to try to leave in path upon high loss. */ ++static const u32 bbr_inflight_headroom = BBR_UNIT * 15 / 100; ++ ++/* How much do we increase cwnd_gain when probing for bandwidth in ++ * BBR_BW_PROBE_UP? This specifies the increment in units of ++ * BBR_UNIT/4. The default is 1, meaning 0.25. ++ * The min value is 0 (meaning 0.0); max is 3 (meaning 0.75). 
++ */ ++static const u32 bbr_bw_probe_cwnd_gain = 1; ++ ++/* Max number of packet-timed rounds to wait before probing for bandwidth. If ++ * we want to tolerate 1% random loss per round, and not have this cut our ++ * inflight too much, we must probe for bw periodically on roughly this scale. ++ * If low, limits Reno/CUBIC coexistence; if high, limits loss tolerance. ++ * We aim to be fair with Reno/CUBIC up to a BDP of at least: ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ */ ++static const u32 bbr_bw_probe_max_rounds = 63; ++ ++/* Max amount of randomness to inject in round counting for Reno-coexistence. ++ */ ++static const u32 bbr_bw_probe_rand_rounds = 2; ++ ++/* Use BBR-native probe time scale starting at this many usec. ++ * We aim to be fair with Reno/CUBIC up to an inter-loss time epoch of at least: ++ * BDP*RTT = 25Mbps * .030sec /(1514bytes) * 0.030sec = 1.9 secs ++ */ ++static const u32 bbr_bw_probe_base_us = 2 * USEC_PER_SEC; /* 2 secs */ ++ ++/* Use BBR-native probes spread over this many usec: */ ++static const u32 bbr_bw_probe_rand_us = 1 * USEC_PER_SEC; /* 1 secs */ ++ ++/* Use fast path if app-limited, no loss/ECN, and target cwnd was reached? */ ++static const bool bbr_fast_path = true; ++ ++/* Use fast ack mode? */ ++static const bool bbr_fast_ack_mode = true; ++ ++static u32 bbr_max_bw(const struct sock *sk); ++static u32 bbr_bw(const struct sock *sk); ++static void bbr_exit_probe_rtt(struct sock *sk); ++static void bbr_reset_congestion_signals(struct sock *sk); ++static void bbr_run_loss_probe_recovery(struct sock *sk); ++ + static void bbr_check_probe_rtt_done(struct sock *sk); + ++/* This connection can use ECN if both endpoints have signaled ECN support in ++ * the handshake and the per-route settings indicated this is a ++ * shallow-threshold ECN environment, meaning both: ++ * (a) ECN CE marks indicate low-latency/shallow-threshold congestion, and ++ * (b) TCP endpoints provide precise ACKs that only ACK data segments ++ * with consistent ECN CE status ++ */ ++static bool bbr_can_use_ecn(const struct sock *sk) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ ++ return (tcp_ecn_mode_any(tp)) && (tp->ecn_flags & TCP_ECN_LOW); ++} ++ + /* Do we estimate that STARTUP filled the pipe? */ + static bool bbr_full_bw_reached(const struct sock *sk) + { +@@ -214,17 +383,17 @@ static bool bbr_full_bw_reached(const struct sock *sk) + /* Return the windowed max recent bandwidth sample, in pkts/uS << BW_SCALE. */ + static u32 bbr_max_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return minmax_get(&bbr->bw); ++ return max(bbr->bw_hi[0], bbr->bw_hi[1]); + } + + /* Return the estimated bandwidth of the path, in pkts/uS << BW_SCALE. */ + static u32 bbr_bw(const struct sock *sk) + { +- struct bbr *bbr = inet_csk_ca(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); + +- return bbr->lt_use_bw ? bbr->lt_bw : bbr_max_bw(sk); ++ return min(bbr_max_bw(sk), bbr->bw_lo); + } + + /* Return maximum extra acked in past k-2k round trips, +@@ -241,15 +410,23 @@ static u16 bbr_extra_acked(const struct sock *sk) + * The order here is chosen carefully to avoid overflow of u64. This should + * work for input rates of up to 2.9Tbit/sec and gain of 2.89x. 
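 * (Illustrative arithmetic, not taken from the patch: assuming the usual
 * BBR_SCALE of 8 and BW_SCALE of 24, a model rate of 1 pkt/usec is stored as
 * 1 << 24; with mss = 1500, gain = BBR_UNIT and margin = 0 the steps below
 * give 1500 * 10^6 bytes/sec, i.e. ~12 Gbit/sec, consistent with the ~715 bps
 * granularity of the scaled unit: 715 bps * 2^24 ~= 12 Gbit/sec.)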
+ */ +-static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain) ++static u64 bbr_rate_bytes_per_sec(struct sock *sk, u64 rate, int gain, ++ int margin) + { + unsigned int mss = tcp_sk(sk)->mss_cache; + + rate *= mss; + rate *= gain; + rate >>= BBR_SCALE; +- rate *= USEC_PER_SEC / 100 * (100 - bbr_pacing_margin_percent); +- return rate >> BW_SCALE; ++ rate *= USEC_PER_SEC / 100 * (100 - margin); ++ rate >>= BW_SCALE; ++ rate = max(rate, 1ULL); ++ return rate; ++} ++ ++static u64 bbr_bw_bytes_per_sec(struct sock *sk, u64 rate) ++{ ++ return bbr_rate_bytes_per_sec(sk, rate, BBR_UNIT, 0); + } + + /* Convert a BBR bw and gain factor to a pacing rate in bytes per second. */ +@@ -257,12 +434,13 @@ static unsigned long bbr_bw_to_pacing_rate(struct sock *sk, u32 bw, int gain) + { + u64 rate = bw; + +- rate = bbr_rate_bytes_per_sec(sk, rate, gain); ++ rate = bbr_rate_bytes_per_sec(sk, rate, gain, ++ bbr_pacing_margin_percent); + rate = min_t(u64, rate, READ_ONCE(sk->sk_max_pacing_rate)); + return rate; + } + +-/* Initialize pacing rate to: high_gain * init_cwnd / RTT. */ ++/* Initialize pacing rate to: startup_pacing_gain * init_cwnd / RTT. */ + static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -279,7 +457,8 @@ static void bbr_init_pacing_rate_from_rtt(struct sock *sk) + bw = (u64)tcp_snd_cwnd(tp) * BW_UNIT; + do_div(bw, rtt_us); + WRITE_ONCE(sk->sk_pacing_rate, +- bbr_bw_to_pacing_rate(sk, bw, bbr_high_gain)); ++ bbr_bw_to_pacing_rate(sk, bw, ++ bbr_param(sk, startup_pacing_gain))); + } + + /* Pace using current bw estimate and a gain factor. */ +@@ -295,26 +474,48 @@ static void bbr_set_pacing_rate(struct sock *sk, u32 bw, int gain) + WRITE_ONCE(sk->sk_pacing_rate, rate); + } + +-/* override sysctl_tcp_min_tso_segs */ +-__bpf_kfunc static u32 bbr_min_tso_segs(struct sock *sk) ++/* Return the number of segments BBR would like in a TSO/GSO skb, given a ++ * particular max gso size as a constraint. TODO: make this simpler and more ++ * consistent by switching bbr to just call tcp_tso_autosize(). ++ */ ++static u32 bbr_tso_segs_generic(struct sock *sk, unsigned int mss_now, ++ u32 gso_max_size) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 segs, r; ++ u64 bytes; ++ ++ /* Budget a TSO/GSO burst size allowance based on bw (pacing_rate). */ ++ bytes = READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift); ++ ++ /* Budget a TSO/GSO burst size allowance based on min_rtt. For every ++ * K = 2^tso_rtt_shift microseconds of min_rtt, halve the burst. ++ * The min_rtt-based burst allowance is: 64 KBytes / 2^(min_rtt/K) ++ */ ++ if (bbr_param(sk, tso_rtt_shift)) { ++ r = bbr->min_rtt_us >> bbr_param(sk, tso_rtt_shift); ++ if (r < BITS_PER_TYPE(u32)) /* prevent undefined behavior */ ++ bytes += GSO_LEGACY_MAX_SIZE >> r; ++ } ++ ++ bytes = min_t(u32, bytes, gso_max_size - 1 - MAX_TCP_HEADER); ++ segs = max_t(u32, bytes / mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ return segs; ++} ++ ++/* Custom tcp_tso_autosize() for BBR, used at transmit time to cap skb size. */ ++__bpf_kfunc static u32 bbr_tso_segs(struct sock *sk, unsigned int mss_now) + { +- return READ_ONCE(sk->sk_pacing_rate) < (bbr_min_tso_rate >> 3) ? 1 : 2; ++ return bbr_tso_segs_generic(sk, mss_now, sk->sk_gso_max_size); + } + ++/* Like bbr_tso_segs(), using mss_cache, ignoring driver's sk_gso_max_size. 
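 * (For the min_rtt-based part of the budget above, the allowance is roughly
 * GSO_LEGACY_MAX_SIZE >> (min_rtt_us >> 9): about 64 KB below ~0.5 ms, 32 KB
 * around 1 ms, and under one MSS once min_rtt exceeds ~3 ms, as noted at
 * bbr_tso_rtt_shift. Illustrative numbers only.)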
*/ + static u32 bbr_tso_segs_goal(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +- u32 segs, bytes; +- +- /* Sort of tcp_tso_autosize() but ignoring +- * driver provided sk_gso_max_size. +- */ +- bytes = min_t(unsigned long, +- READ_ONCE(sk->sk_pacing_rate) >> READ_ONCE(sk->sk_pacing_shift), +- GSO_LEGACY_MAX_SIZE - 1 - MAX_TCP_HEADER); +- segs = max_t(u32, bytes / tp->mss_cache, bbr_min_tso_segs(sk)); + +- return min(segs, 0x7FU); ++ return bbr_tso_segs_generic(sk, tp->mss_cache, GSO_LEGACY_MAX_SIZE); + } + + /* Save "last known good" cwnd so we can restore it after losses or PROBE_RTT */ +@@ -334,7 +535,9 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- if (event == CA_EVENT_TX_START && tp->app_limited) { ++ if (event == CA_EVENT_TX_START) { ++ if (!tp->app_limited) ++ return; + bbr->idle_restart = 1; + bbr->ack_epoch_mstamp = tp->tcp_mstamp; + bbr->ack_epoch_acked = 0; +@@ -345,6 +548,16 @@ __bpf_kfunc static void bbr_cwnd_event(struct sock *sk, enum tcp_ca_event event) + bbr_set_pacing_rate(sk, bbr_bw(sk), BBR_UNIT); + else if (bbr->mode == BBR_PROBE_RTT) + bbr_check_probe_rtt_done(sk); ++ } else if ((event == CA_EVENT_ECN_IS_CE || ++ event == CA_EVENT_ECN_NO_CE) && ++ bbr_can_use_ecn(sk) && ++ bbr_param(sk, precise_ece_ack)) { ++ u32 state = bbr->ce_state; ++ dctcp_ece_ack_update(sk, event, &bbr->prior_rcv_nxt, &state); ++ bbr->ce_state = state; ++ } else if (event == CA_EVENT_TLP_RECOVERY && ++ bbr_param(sk, loss_probe_recovery)) { ++ bbr_run_loss_probe_recovery(sk); + } + } + +@@ -367,10 +580,10 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * default. This should only happen when the connection is not using TCP + * timestamps and has retransmitted all of the SYN/SYNACK/data packets + * ACKed so far. In this case, an RTO can cut cwnd to 1, in which +- * case we need to slow-start up toward something safe: TCP_INIT_CWND. ++ * case we need to slow-start up toward something safe: initial cwnd. + */ + if (unlikely(bbr->min_rtt_us == ~0U)) /* no valid RTT samples yet? */ +- return TCP_INIT_CWND; /* be safe: cap at default initial cwnd*/ ++ return bbr->init_cwnd; /* be safe: cap at initial cwnd */ + + w = (u64)bw * bbr->min_rtt_us; + +@@ -387,23 +600,23 @@ static u32 bbr_bdp(struct sock *sk, u32 bw, int gain) + * - one skb in sending host Qdisc, + * - one skb in sending host TSO/GSO engine + * - one skb being received by receiver host LRO/GRO/delayed-ACK engine +- * Don't worry, at low rates (bbr_min_tso_rate) this won't bloat cwnd because +- * in such cases tso_segs_goal is 1. The minimum cwnd is 4 packets, ++ * Don't worry, at low rates this won't bloat cwnd because ++ * in such cases tso_segs_goal is small. The minimum cwnd is 4 packets, + * which allows 2 outstanding 2-packet sequences, to try to keep pipe + * full even with ACK-every-other-packet delayed ACKs. + */ + static u32 bbr_quantization_budget(struct sock *sk, u32 cwnd) + { + struct bbr *bbr = inet_csk_ca(sk); ++ u32 tso_segs_goal; + +- /* Allow enough full-sized skbs in flight to utilize end systems. */ +- cwnd += 3 * bbr_tso_segs_goal(sk); +- +- /* Reduce delayed ACKs by rounding up cwnd to the next even number. */ +- cwnd = (cwnd + 1) & ~1U; ++ tso_segs_goal = 3 * bbr_tso_segs_goal(sk); + ++ /* Allow enough full-sized skbs in flight to utilize end systems. 
*/ ++ cwnd = max_t(u32, cwnd, tso_segs_goal); ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + /* Ensure gain cycling gets inflight above BDP even for small BDPs. */ +- if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == 0) ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) + cwnd += 2; + + return cwnd; +@@ -458,10 +671,10 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + { + u32 max_aggr_cwnd, aggr_cwnd = 0; + +- if (bbr_extra_acked_gain && bbr_full_bw_reached(sk)) { ++ if (bbr_param(sk, extra_acked_gain)) { + max_aggr_cwnd = ((u64)bbr_bw(sk) * bbr_extra_acked_max_us) + / BW_UNIT; +- aggr_cwnd = (bbr_extra_acked_gain * bbr_extra_acked(sk)) ++ aggr_cwnd = (bbr_param(sk, extra_acked_gain) * bbr_extra_acked(sk)) + >> BBR_SCALE; + aggr_cwnd = min(aggr_cwnd, max_aggr_cwnd); + } +@@ -469,66 +682,27 @@ static u32 bbr_ack_aggregation_cwnd(struct sock *sk) + return aggr_cwnd; + } + +-/* An optimization in BBR to reduce losses: On the first round of recovery, we +- * follow the packet conservation principle: send P packets per P packets acked. +- * After that, we slow-start and send at most 2*P packets per P packets acked. +- * After recovery finishes, or upon undo, we restore the cwnd we had when +- * recovery started (capped by the target cwnd based on estimated BDP). +- * +- * TODO(ycheng/ncardwell): implement a rate-based approach. +- */ +-static bool bbr_set_cwnd_to_recover_or_restore( +- struct sock *sk, const struct rate_sample *rs, u32 acked, u32 *new_cwnd) ++/* Returns the cwnd for PROBE_RTT mode. */ ++static u32 bbr_probe_rtt_cwnd(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u8 prev_state = bbr->prev_ca_state, state = inet_csk(sk)->icsk_ca_state; +- u32 cwnd = tcp_snd_cwnd(tp); +- +- /* An ACK for P pkts should release at most 2*P packets. We do this +- * in two steps. First, here we deduct the number of lost packets. +- * Then, in bbr_set_cwnd() we slow start up toward the target cwnd. +- */ +- if (rs->losses > 0) +- cwnd = max_t(s32, cwnd - rs->losses, 1); +- +- if (state == TCP_CA_Recovery && prev_state != TCP_CA_Recovery) { +- /* Starting 1st round of Recovery, so do packet conservation. */ +- bbr->packet_conservation = 1; +- bbr->next_rtt_delivered = tp->delivered; /* start round now */ +- /* Cut unused cwnd from app behavior, TSQ, or TSO deferral: */ +- cwnd = tcp_packets_in_flight(tp) + acked; +- } else if (prev_state >= TCP_CA_Recovery && state < TCP_CA_Recovery) { +- /* Exiting loss recovery; restore cwnd saved before recovery. */ +- cwnd = max(cwnd, bbr->prior_cwnd); +- bbr->packet_conservation = 0; +- } +- bbr->prev_ca_state = state; +- +- if (bbr->packet_conservation) { +- *new_cwnd = max(cwnd, tcp_packets_in_flight(tp) + acked); +- return true; /* yes, using packet conservation */ +- } +- *new_cwnd = cwnd; +- return false; ++ return max_t(u32, bbr_param(sk, cwnd_min_target), ++ bbr_bdp(sk, bbr_bw(sk), bbr_param(sk, probe_rtt_cwnd_gain))); + } + + /* Slow-start up toward target cwnd (if bw estimate is growing, or packet loss + * has drawn us down below target), or snap down to target if we're above it. 
+ */ + static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, +- u32 acked, u32 bw, int gain) ++ u32 acked, u32 bw, int gain, u32 cwnd, ++ struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u32 cwnd = tcp_snd_cwnd(tp), target_cwnd = 0; ++ u32 target_cwnd = 0; + + if (!acked) + goto done; /* no packet fully ACKed; just apply caps */ + +- if (bbr_set_cwnd_to_recover_or_restore(sk, rs, acked, &cwnd)) +- goto done; +- + target_cwnd = bbr_bdp(sk, bw, gain); + + /* Increment the cwnd to account for excess ACKed data that seems +@@ -537,74 +711,26 @@ static void bbr_set_cwnd(struct sock *sk, const struct rate_sample *rs, + target_cwnd += bbr_ack_aggregation_cwnd(sk); + target_cwnd = bbr_quantization_budget(sk, target_cwnd); + +- /* If we're below target cwnd, slow start cwnd toward target cwnd. */ +- if (bbr_full_bw_reached(sk)) /* only cut cwnd if we filled the pipe */ +- cwnd = min(cwnd + acked, target_cwnd); +- else if (cwnd < target_cwnd || tp->delivered < TCP_INIT_CWND) +- cwnd = cwnd + acked; +- cwnd = max(cwnd, bbr_cwnd_min_target); ++ /* Update cwnd and enable fast path if cwnd reaches target_cwnd. */ ++ bbr->try_fast_path = 0; ++ if (bbr_full_bw_reached(sk)) { /* only cut cwnd if we filled the pipe */ ++ cwnd += acked; ++ if (cwnd >= target_cwnd) { ++ cwnd = target_cwnd; ++ bbr->try_fast_path = 1; ++ } ++ } else if (cwnd < target_cwnd || cwnd < 2 * bbr->init_cwnd) { ++ cwnd += acked; ++ } else { ++ bbr->try_fast_path = 1; ++ } + ++ cwnd = max_t(u32, cwnd, bbr_param(sk, cwnd_min_target)); + done: +- tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* apply global cap */ ++ tcp_snd_cwnd_set(tp, min(cwnd, tp->snd_cwnd_clamp)); /* global cap */ + if (bbr->mode == BBR_PROBE_RTT) /* drain queue, refresh min_rtt */ +- tcp_snd_cwnd_set(tp, min(tcp_snd_cwnd(tp), bbr_cwnd_min_target)); +-} +- +-/* End cycle phase if it's time and/or we hit the phase's in-flight target. */ +-static bool bbr_is_next_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- bool is_full_length = +- tcp_stamp_us_delta(tp->delivered_mstamp, bbr->cycle_mstamp) > +- bbr->min_rtt_us; +- u32 inflight, bw; +- +- /* The pacing_gain of 1.0 paces at the estimated bw to try to fully +- * use the pipe without increasing the queue. +- */ +- if (bbr->pacing_gain == BBR_UNIT) +- return is_full_length; /* just use wall clock time */ +- +- inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); +- bw = bbr_max_bw(sk); +- +- /* A pacing_gain > 1.0 probes for bw by trying to raise inflight to at +- * least pacing_gain*BDP; this may take more than min_rtt if min_rtt is +- * small (e.g. on a LAN). We do not persist if packets are lost, since +- * a path with small buffers may not hold that much. +- */ +- if (bbr->pacing_gain > BBR_UNIT) +- return is_full_length && +- (rs->losses || /* perhaps pacing_gain*BDP won't fit */ +- inflight >= bbr_inflight(sk, bw, bbr->pacing_gain)); +- +- /* A pacing_gain < 1.0 tries to drain extra queue we added if bw +- * probing didn't find more bw. If inflight falls to match BDP then we +- * estimate queue is drained; persisting would underutilize the pipe. 
+- */ +- return is_full_length || +- inflight <= bbr_inflight(sk, bw, BBR_UNIT); +-} +- +-static void bbr_advance_cycle_phase(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->cycle_idx = (bbr->cycle_idx + 1) & (CYCLE_LEN - 1); +- bbr->cycle_mstamp = tp->delivered_mstamp; +-} +- +-/* Gain cycling: cycle pacing gain to converge to fair share of available bw. */ +-static void bbr_update_cycle_phase(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_PROBE_BW && bbr_is_next_cycle_phase(sk, rs)) +- bbr_advance_cycle_phase(sk); ++ tcp_snd_cwnd_set(tp, min_t(u32, tcp_snd_cwnd(tp), ++ bbr_probe_rtt_cwnd(sk))); + } + + static void bbr_reset_startup_mode(struct sock *sk) +@@ -614,191 +740,49 @@ static void bbr_reset_startup_mode(struct sock *sk) + bbr->mode = BBR_STARTUP; + } + +-static void bbr_reset_probe_bw_mode(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->mode = BBR_PROBE_BW; +- bbr->cycle_idx = CYCLE_LEN - 1 - get_random_u32_below(bbr_cycle_rand); +- bbr_advance_cycle_phase(sk); /* flip to next phase of gain cycle */ +-} +- +-static void bbr_reset_mode(struct sock *sk) +-{ +- if (!bbr_full_bw_reached(sk)) +- bbr_reset_startup_mode(sk); +- else +- bbr_reset_probe_bw_mode(sk); +-} +- +-/* Start a new long-term sampling interval. */ +-static void bbr_reset_lt_bw_sampling_interval(struct sock *sk) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_last_stamp = div_u64(tp->delivered_mstamp, USEC_PER_MSEC); +- bbr->lt_last_delivered = tp->delivered; +- bbr->lt_last_lost = tp->lost; +- bbr->lt_rtt_cnt = 0; +-} +- +-/* Completely reset long-term bandwidth sampling. */ +-static void bbr_reset_lt_bw_sampling(struct sock *sk) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- bbr->lt_bw = 0; +- bbr->lt_use_bw = 0; +- bbr->lt_is_sampling = false; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Long-term bw sampling interval is done. Estimate whether we're policed. */ +-static void bbr_lt_bw_interval_done(struct sock *sk, u32 bw) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 diff; +- +- if (bbr->lt_bw) { /* do we have bw from a previous interval? */ +- /* Is new bw close to the lt_bw from the previous interval? */ +- diff = abs(bw - bbr->lt_bw); +- if ((diff * BBR_UNIT <= bbr_lt_bw_ratio * bbr->lt_bw) || +- (bbr_rate_bytes_per_sec(sk, diff, BBR_UNIT) <= +- bbr_lt_bw_diff)) { +- /* All criteria are met; estimate we're policed. */ +- bbr->lt_bw = (bw + bbr->lt_bw) >> 1; /* avg 2 intvls */ +- bbr->lt_use_bw = 1; +- bbr->pacing_gain = BBR_UNIT; /* try to avoid drops */ +- bbr->lt_rtt_cnt = 0; +- return; +- } +- } +- bbr->lt_bw = bw; +- bbr_reset_lt_bw_sampling_interval(sk); +-} +- +-/* Token-bucket traffic policers are common (see "An Internet-Wide Analysis of +- * Traffic Policing", SIGCOMM 2016). BBR detects token-bucket policers and +- * explicitly models their policed rate, to reduce unnecessary losses. We +- * estimate that we're policed if we see 2 consecutive sampling intervals with +- * consistent throughput and high packet loss. If we think we're being policed, +- * set lt_bw to the "long-term" average delivery rate from those 2 intervals. ++/* See if we have reached next round trip. Upon start of the new round, ++ * returns packets delivered since previous round start plus this ACK. 
+ */ +-static void bbr_lt_bw_sampling(struct sock *sk, const struct rate_sample *rs) +-{ +- struct tcp_sock *tp = tcp_sk(sk); +- struct bbr *bbr = inet_csk_ca(sk); +- u32 lost, delivered; +- u64 bw; +- u32 t; +- +- if (bbr->lt_use_bw) { /* already using long-term rate, lt_bw? */ +- if (bbr->mode == BBR_PROBE_BW && bbr->round_start && +- ++bbr->lt_rtt_cnt >= bbr_lt_bw_max_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* stop using lt_bw */ +- bbr_reset_probe_bw_mode(sk); /* restart gain cycling */ +- } +- return; +- } +- +- /* Wait for the first loss before sampling, to let the policer exhaust +- * its tokens and estimate the steady-state rate allowed by the policer. +- * Starting samples earlier includes bursts that over-estimate the bw. +- */ +- if (!bbr->lt_is_sampling) { +- if (!rs->losses) +- return; +- bbr_reset_lt_bw_sampling_interval(sk); +- bbr->lt_is_sampling = true; +- } +- +- /* To avoid underestimates, reset sampling if we run out of data. */ +- if (rs->is_app_limited) { +- bbr_reset_lt_bw_sampling(sk); +- return; +- } +- +- if (bbr->round_start) +- bbr->lt_rtt_cnt++; /* count round trips in this interval */ +- if (bbr->lt_rtt_cnt < bbr_lt_intvl_min_rtts) +- return; /* sampling interval needs to be longer */ +- if (bbr->lt_rtt_cnt > 4 * bbr_lt_intvl_min_rtts) { +- bbr_reset_lt_bw_sampling(sk); /* interval is too long */ +- return; +- } +- +- /* End sampling interval when a packet is lost, so we estimate the +- * policer tokens were exhausted. Stopping the sampling before the +- * tokens are exhausted under-estimates the policed rate. +- */ +- if (!rs->losses) +- return; +- +- /* Calculate packets lost and delivered in sampling interval. */ +- lost = tp->lost - bbr->lt_last_lost; +- delivered = tp->delivered - bbr->lt_last_delivered; +- /* Is loss rate (lost/delivered) >= lt_loss_thresh? If not, wait. */ +- if (!delivered || (lost << BBR_SCALE) < bbr_lt_loss_thresh * delivered) +- return; +- +- /* Find average delivery rate in this sampling interval. 
*/ +- t = div_u64(tp->delivered_mstamp, USEC_PER_MSEC) - bbr->lt_last_stamp; +- if ((s32)t < 1) +- return; /* interval is less than one ms, so wait */ +- /* Check if can multiply without overflow */ +- if (t >= ~0U / USEC_PER_MSEC) { +- bbr_reset_lt_bw_sampling(sk); /* interval too long; reset */ +- return; +- } +- t *= USEC_PER_MSEC; +- bw = (u64)delivered * BW_UNIT; +- do_div(bw, t); +- bbr_lt_bw_interval_done(sk, bw); +-} +- +-/* Estimate the bandwidth based on how fast packets are delivered */ +-static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) ++static u32 bbr_update_round_start(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw; ++ u32 round_delivered = 0; + + bbr->round_start = 0; +- if (rs->delivered < 0 || rs->interval_us <= 0) +- return; /* Not a valid observation */ + + /* See if we've reached the next RTT */ +- if (!before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ if (rs->interval_us > 0 && ++ !before(rs->prior_delivered, bbr->next_rtt_delivered)) { ++ round_delivered = tp->delivered - bbr->next_rtt_delivered; + bbr->next_rtt_delivered = tp->delivered; +- bbr->rtt_cnt++; + bbr->round_start = 1; +- bbr->packet_conservation = 0; + } ++ return round_delivered; ++} + +- bbr_lt_bw_sampling(sk, rs); ++/* Calculate the bandwidth based on how fast packets are delivered */ ++static void bbr_calculate_bw_sample(struct sock *sk, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ u64 bw = 0; + + /* Divide delivered by the interval to find a (lower bound) bottleneck + * bandwidth sample. Delivered is in packets and interval_us in uS and + * ratio will be <<1 for most connections. So delivered is first scaled. ++ * Round up to allow growth at low rates, even with integer division. + */ +- bw = div64_long((u64)rs->delivered * BW_UNIT, rs->interval_us); +- +- /* If this sample is application-limited, it is likely to have a very +- * low delivered count that represents application behavior rather than +- * the available network rate. Such a sample could drag down estimated +- * bw, causing needless slow-down. Thus, to continue to send at the +- * last measured network rate, we filter out app-limited samples unless +- * they describe the path bw at least as well as our bw model. +- * +- * So the goal during app-limited phase is to proceed with the best +- * network rate no matter how long. We automatically leave this +- * phase when app writes faster than the network can deliver :) +- */ +- if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) { +- /* Incorporate new sample into our max bw filter. */ +- minmax_running_max(&bbr->bw, bbr_bw_rtts, bbr->rtt_cnt, bw); ++ if (rs->interval_us > 0) { ++ if (WARN_ONCE(rs->delivered < 0, ++ "negative delivered: %d interval_us: %ld\n", ++ rs->delivered, rs->interval_us)) ++ return; ++ ++ bw = DIV_ROUND_UP_ULL((u64)rs->delivered * BW_UNIT, rs->interval_us); + } ++ ++ ctx->sample_bw = bw; + } + + /* Estimates the windowed max degree of ack aggregation. +@@ -812,7 +796,7 @@ static void bbr_update_bw(struct sock *sk, const struct rate_sample *rs) + * + * Max extra_acked is clamped by cwnd and bw * bbr_extra_acked_max_us (100 ms). + * Max filter is an approximate sliding window of 5-10 (packet timed) round +- * trips. ++ * trips for non-startup phase, and 1-2 round trips for startup. 
+ */ + static void bbr_update_ack_aggregation(struct sock *sk, + const struct rate_sample *rs) +@@ -820,15 +804,19 @@ static void bbr_update_ack_aggregation(struct sock *sk, + u32 epoch_us, expected_acked, extra_acked; + struct bbr *bbr = inet_csk_ca(sk); + struct tcp_sock *tp = tcp_sk(sk); ++ u32 extra_acked_win_rtts_thresh = bbr_param(sk, extra_acked_win_rtts); + +- if (!bbr_extra_acked_gain || rs->acked_sacked <= 0 || ++ if (!bbr_param(sk, extra_acked_gain) || rs->acked_sacked <= 0 || + rs->delivered < 0 || rs->interval_us <= 0) + return; + + if (bbr->round_start) { + bbr->extra_acked_win_rtts = min(0x1F, + bbr->extra_acked_win_rtts + 1); +- if (bbr->extra_acked_win_rtts >= bbr_extra_acked_win_rtts) { ++ if (!bbr_full_bw_reached(sk)) ++ extra_acked_win_rtts_thresh = 1; ++ if (bbr->extra_acked_win_rtts >= ++ extra_acked_win_rtts_thresh) { + bbr->extra_acked_win_rtts = 0; + bbr->extra_acked_win_idx = bbr->extra_acked_win_idx ? + 0 : 1; +@@ -862,49 +850,6 @@ static void bbr_update_ack_aggregation(struct sock *sk, + bbr->extra_acked[bbr->extra_acked_win_idx] = extra_acked; + } + +-/* Estimate when the pipe is full, using the change in delivery rate: BBR +- * estimates that STARTUP filled the pipe if the estimated bw hasn't changed by +- * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited +- * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the +- * higher rwin, 3: we get higher delivery rate samples. Or transient +- * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar +- * design goal, but uses delay and inter-ACK spacing instead of bandwidth. +- */ +-static void bbr_check_full_bw_reached(struct sock *sk, +- const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- u32 bw_thresh; +- +- if (bbr_full_bw_reached(sk) || !bbr->round_start || rs->is_app_limited) +- return; +- +- bw_thresh = (u64)bbr->full_bw * bbr_full_bw_thresh >> BBR_SCALE; +- if (bbr_max_bw(sk) >= bw_thresh) { +- bbr->full_bw = bbr_max_bw(sk); +- bbr->full_bw_cnt = 0; +- return; +- } +- ++bbr->full_bw_cnt; +- bbr->full_bw_reached = bbr->full_bw_cnt >= bbr_full_bw_cnt; +-} +- +-/* If pipe is probably full, drain the queue and then enter steady-state. 
*/ +-static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs) +-{ +- struct bbr *bbr = inet_csk_ca(sk); +- +- if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { +- bbr->mode = BBR_DRAIN; /* drain queue we created */ +- tcp_sk(sk)->snd_ssthresh = +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); +- } /* fall through to check if in-flight is already small: */ +- if (bbr->mode == BBR_DRAIN && +- bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= +- bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) +- bbr_reset_probe_bw_mode(sk); /* we estimate queue is drained */ +-} +- + static void bbr_check_probe_rtt_done(struct sock *sk) + { + struct tcp_sock *tp = tcp_sk(sk); +@@ -914,9 +859,9 @@ static void bbr_check_probe_rtt_done(struct sock *sk) + after(tcp_jiffies32, bbr->probe_rtt_done_stamp))) + return; + +- bbr->min_rtt_stamp = tcp_jiffies32; /* wait a while until PROBE_RTT */ ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; /* schedule next PROBE_RTT */ + tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); +- bbr_reset_mode(sk); ++ bbr_exit_probe_rtt(sk); + } + + /* The goal of PROBE_RTT mode is to have BBR flows cooperatively and +@@ -942,23 +887,35 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- bool filter_expired; ++ bool probe_rtt_expired, min_rtt_expired; ++ u32 expire; + +- /* Track min RTT seen in the min_rtt_win_sec filter window: */ +- filter_expired = after(tcp_jiffies32, +- bbr->min_rtt_stamp + bbr_min_rtt_win_sec * HZ); ++ /* Track min RTT in probe_rtt_win_ms to time next PROBE_RTT state. */ ++ expire = bbr->probe_rtt_min_stamp + ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_win_ms)); ++ probe_rtt_expired = after(tcp_jiffies32, expire); + if (rs->rtt_us >= 0 && +- (rs->rtt_us < bbr->min_rtt_us || +- (filter_expired && !rs->is_ack_delayed))) { +- bbr->min_rtt_us = rs->rtt_us; +- bbr->min_rtt_stamp = tcp_jiffies32; ++ (rs->rtt_us < bbr->probe_rtt_min_us || ++ (probe_rtt_expired && !rs->is_ack_delayed))) { ++ bbr->probe_rtt_min_us = rs->rtt_us; ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ } ++ /* Track min RTT seen in the min_rtt_win_sec filter window: */ ++ expire = bbr->min_rtt_stamp + bbr_param(sk, min_rtt_win_sec) * HZ; ++ min_rtt_expired = after(tcp_jiffies32, expire); ++ if (bbr->probe_rtt_min_us <= bbr->min_rtt_us || ++ min_rtt_expired) { ++ bbr->min_rtt_us = bbr->probe_rtt_min_us; ++ bbr->min_rtt_stamp = bbr->probe_rtt_min_stamp; + } + +- if (bbr_probe_rtt_mode_ms > 0 && filter_expired && ++ if (bbr_param(sk, probe_rtt_mode_ms) > 0 && probe_rtt_expired && + !bbr->idle_restart && bbr->mode != BBR_PROBE_RTT) { + bbr->mode = BBR_PROBE_RTT; /* dip, drain queue */ + bbr_save_cwnd(sk); /* note cwnd so we can restore it */ + bbr->probe_rtt_done_stamp = 0; ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; + } + + if (bbr->mode == BBR_PROBE_RTT) { +@@ -967,9 +924,9 @@ static void bbr_update_min_rtt(struct sock *sk, const struct rate_sample *rs) + (tp->delivered + tcp_packets_in_flight(tp)) ? : 1; + /* Maintain min packets in flight for max(200 ms, 1 round). 
*/ + if (!bbr->probe_rtt_done_stamp && +- tcp_packets_in_flight(tp) <= bbr_cwnd_min_target) { ++ tcp_packets_in_flight(tp) <= bbr_probe_rtt_cwnd(sk)) { + bbr->probe_rtt_done_stamp = tcp_jiffies32 + +- msecs_to_jiffies(bbr_probe_rtt_mode_ms); ++ msecs_to_jiffies(bbr_param(sk, probe_rtt_mode_ms)); + bbr->probe_rtt_round_done = 0; + bbr->next_rtt_delivered = tp->delivered; + } else if (bbr->probe_rtt_done_stamp) { +@@ -990,18 +947,20 @@ static void bbr_update_gains(struct sock *sk) + + switch (bbr->mode) { + case BBR_STARTUP: +- bbr->pacing_gain = bbr_high_gain; +- bbr->cwnd_gain = bbr_high_gain; ++ bbr->pacing_gain = bbr_param(sk, startup_pacing_gain); ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); + break; + case BBR_DRAIN: +- bbr->pacing_gain = bbr_drain_gain; /* slow, to drain */ +- bbr->cwnd_gain = bbr_high_gain; /* keep cwnd */ ++ bbr->pacing_gain = bbr_param(sk, drain_gain); /* slow, to drain */ ++ bbr->cwnd_gain = bbr_param(sk, startup_cwnd_gain); /* keep cwnd */ + break; + case BBR_PROBE_BW: +- bbr->pacing_gain = (bbr->lt_use_bw ? +- BBR_UNIT : +- bbr_pacing_gain[bbr->cycle_idx]); +- bbr->cwnd_gain = bbr_cwnd_gain; ++ bbr->pacing_gain = bbr_pacing_gain[bbr->cycle_idx]; ++ bbr->cwnd_gain = bbr_param(sk, cwnd_gain); ++ if (bbr_param(sk, bw_probe_cwnd_gain) && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr->cwnd_gain += ++ BBR_UNIT * bbr_param(sk, bw_probe_cwnd_gain) / 4; + break; + case BBR_PROBE_RTT: + bbr->pacing_gain = BBR_UNIT; +@@ -1013,144 +972,1387 @@ static void bbr_update_gains(struct sock *sk) + } + } + +-static void bbr_update_model(struct sock *sk, const struct rate_sample *rs) ++__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) + { +- bbr_update_bw(sk, rs); +- bbr_update_ack_aggregation(sk, rs); +- bbr_update_cycle_phase(sk, rs); +- bbr_check_full_bw_reached(sk, rs); +- bbr_check_drain(sk, rs); +- bbr_update_min_rtt(sk, rs); +- bbr_update_gains(sk); ++ /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ ++ return 3; + } + +-__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, const struct rate_sample *rs) ++/* Incorporate a new bw sample into the current window of our max filter. */ ++static void bbr_take_max_bw_sample(struct sock *sk, u32 bw) + { + struct bbr *bbr = inet_csk_ca(sk); +- u32 bw; +- +- bbr_update_model(sk, rs); + +- bw = bbr_bw(sk); +- bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); +- bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain); ++ bbr->bw_hi[1] = max(bw, bbr->bw_hi[1]); + } + +-__bpf_kfunc static void bbr_init(struct sock *sk) ++/* Keep max of last 1-2 cycles. Each PROBE_BW cycle, flip filter window. */ ++static void bbr_advance_max_bw_filter(struct sock *sk) + { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + +- bbr->prior_cwnd = 0; +- tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; +- bbr->rtt_cnt = 0; +- bbr->next_rtt_delivered = tp->delivered; +- bbr->prev_ca_state = TCP_CA_Open; +- bbr->packet_conservation = 0; +- +- bbr->probe_rtt_done_stamp = 0; +- bbr->probe_rtt_round_done = 0; +- bbr->min_rtt_us = tcp_min_rtt(tp); +- bbr->min_rtt_stamp = tcp_jiffies32; +- +- minmax_reset(&bbr->bw, bbr->rtt_cnt, 0); /* init max bw to 0 */ ++ if (!bbr->bw_hi[1]) ++ return; /* no samples in this window; remember old window */ ++ bbr->bw_hi[0] = bbr->bw_hi[1]; ++ bbr->bw_hi[1] = 0; ++} + +- bbr->has_seen_rtt = 0; +- bbr_init_pacing_rate_from_rtt(sk); ++/* Reset the estimator for reaching full bandwidth based on bw plateau. 
*/ ++static void bbr_reset_full_bw(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); + +- bbr->round_start = 0; +- bbr->idle_restart = 0; +- bbr->full_bw_reached = 0; + bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr->cycle_mstamp = 0; +- bbr->cycle_idx = 0; +- bbr_reset_lt_bw_sampling(sk); +- bbr_reset_startup_mode(sk); ++ bbr->full_bw_now = 0; ++} + +- bbr->ack_epoch_mstamp = tp->tcp_mstamp; +- bbr->ack_epoch_acked = 0; +- bbr->extra_acked_win_rtts = 0; +- bbr->extra_acked_win_idx = 0; +- bbr->extra_acked[0] = 0; +- bbr->extra_acked[1] = 0; ++/* How much do we want in flight? Our BDP, unless congestion cut cwnd. */ ++static u32 bbr_target_inflight(struct sock *sk) ++{ ++ u32 bdp = bbr_inflight(sk, bbr_bw(sk), BBR_UNIT); + +- cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ return min(bdp, tcp_sk(sk)->snd_cwnd); + } + +-__bpf_kfunc static u32 bbr_sndbuf_expand(struct sock *sk) ++static bool bbr_is_probing_bandwidth(struct sock *sk) + { +- /* Provision 3 * cwnd since BBR may slow-start even during recovery. */ +- return 3; ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return (bbr->mode == BBR_STARTUP) || ++ (bbr->mode == BBR_PROBE_BW && ++ (bbr->cycle_idx == BBR_BW_PROBE_REFILL || ++ bbr->cycle_idx == BBR_BW_PROBE_UP)); ++} ++ ++/* Has the given amount of time elapsed since we marked the phase start? */ ++static bool bbr_has_elapsed_in_phase(const struct sock *sk, u32 interval_us) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ const struct bbr *bbr = inet_csk_ca(sk); ++ ++ return tcp_stamp_us_delta(tp->tcp_mstamp, ++ bbr->cycle_mstamp + interval_us) > 0; ++} ++ ++static void bbr_handle_queue_too_high_in_startup(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bdp; /* estimated BDP in packets, with quantization budget */ ++ ++ bbr->full_bw_reached = 1; ++ ++ bdp = bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr->inflight_hi = max(bdp, bbr->inflight_latest); ++} ++ ++/* Exit STARTUP upon N consecutive rounds with ECN mark rate > ecn_thresh. */ ++static void bbr_check_ecn_too_high_in_startup(struct sock *sk, u32 ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk) || !bbr->ecn_eligible || ++ !bbr_param(sk, full_ecn_cnt) || !bbr_param(sk, ecn_thresh)) ++ return; ++ ++ if (ce_ratio >= bbr_param(sk, ecn_thresh)) ++ bbr->startup_ecn_rounds++; ++ else ++ bbr->startup_ecn_rounds = 0; ++ ++ if (bbr->startup_ecn_rounds >= bbr_param(sk, full_ecn_cnt)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++} ++ ++/* Updates ecn_alpha and returns ce_ratio. -1 if not available. */ ++static int bbr_update_ecn_alpha(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct net *net = sock_net(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ s32 delivered, delivered_ce; ++ u64 alpha, ce_ratio; ++ u32 gain; ++ bool want_ecn_alpha; ++ ++ /* See if we should use ECN sender logic for this connection. */ ++ if (!bbr->ecn_eligible && bbr_can_use_ecn(sk) && ++ !!bbr_param(sk, ecn_factor) && ++ (bbr->min_rtt_us <= bbr_ecn_max_rtt_us || ++ !bbr_ecn_max_rtt_us)) ++ bbr->ecn_eligible = 1; ++ ++ /* Skip updating alpha only if not ECN-eligible and PLB is disabled. 
*/ ++ want_ecn_alpha = (bbr->ecn_eligible || ++ (bbr_can_use_ecn(sk) && ++ READ_ONCE(net->ipv4.sysctl_tcp_plb_enabled))); ++ if (!want_ecn_alpha) ++ return -1; ++ ++ delivered = tp->delivered - bbr->alpha_last_delivered; ++ delivered_ce = tp->delivered_ce - bbr->alpha_last_delivered_ce; ++ ++ if (delivered == 0 || /* avoid divide by zero */ ++ WARN_ON_ONCE(delivered < 0 || delivered_ce < 0)) /* backwards? */ ++ return -1; ++ ++ BUILD_BUG_ON(BBR_SCALE != TCP_PLB_SCALE); ++ ce_ratio = (u64)delivered_ce << BBR_SCALE; ++ do_div(ce_ratio, delivered); ++ ++ gain = bbr_param(sk, ecn_alpha_gain); ++ alpha = ((BBR_UNIT - gain) * bbr->ecn_alpha) >> BBR_SCALE; ++ alpha += (gain * ce_ratio) >> BBR_SCALE; ++ bbr->ecn_alpha = min_t(u32, alpha, BBR_UNIT); ++ ++ bbr->alpha_last_delivered = tp->delivered; ++ bbr->alpha_last_delivered_ce = tp->delivered_ce; ++ ++ bbr_check_ecn_too_high_in_startup(sk, ce_ratio); ++ return (int)ce_ratio; + } + +-/* In theory BBR does not need to undo the cwnd since it does not +- * always reduce cwnd on losses (see bbr_main()). Keep it for now. ++/* Protective Load Balancing (PLB). PLB rehashes outgoing data (to a new IPv6 ++ * flow label) if it encounters sustained congestion in the form of ECN marks. + */ +-__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++static void bbr_plb(struct sock *sk, const struct rate_sample *rs, int ce_ratio) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->round_start && ce_ratio >= 0) ++ tcp_plb_update_state(sk, &bbr->plb, ce_ratio); ++ ++ tcp_plb_check_rehash(sk, &bbr->plb); ++} ++ ++/* Each round trip of BBR_BW_PROBE_UP, double volume of probing data. */ ++static void bbr_raise_inflight_hi_slope(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 growth_this_round, cnt; ++ ++ /* Calculate "slope": packets S/Acked per inflight_hi increment. */ ++ growth_this_round = 1 << bbr->bw_probe_up_rounds; ++ bbr->bw_probe_up_rounds = min(bbr->bw_probe_up_rounds + 1, 30); ++ cnt = tcp_snd_cwnd(tp) / growth_this_round; ++ cnt = max(cnt, 1U); ++ bbr->bw_probe_up_cnt = cnt; ++} ++ ++/* In BBR_BW_PROBE_UP, not seeing high loss/ECN/queue, so raise inflight_hi. */ ++static void bbr_probe_inflight_hi_upward(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 delta; ++ ++ if (!tp->is_cwnd_limited || tcp_snd_cwnd(tp) < bbr->inflight_hi) ++ return; /* not fully using inflight_hi, so don't grow it */ ++ ++ /* For each bw_probe_up_cnt packets ACKed, increase inflight_hi by 1. */ ++ bbr->bw_probe_up_acks += rs->acked_sacked; ++ if (bbr->bw_probe_up_acks >= bbr->bw_probe_up_cnt) { ++ delta = bbr->bw_probe_up_acks / bbr->bw_probe_up_cnt; ++ bbr->bw_probe_up_acks -= delta * bbr->bw_probe_up_cnt; ++ bbr->inflight_hi += delta; ++ bbr->try_fast_path = 0; /* Need to update cwnd */ ++ } ++ ++ if (bbr->round_start) ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Does loss/ECN rate for this sample say inflight is "too high"? ++ * This is used by both the bbr_check_loss_too_high_in_startup() function, ++ * and in PROBE_UP. 
++ */ ++static bool bbr_is_inflight_too_high(const struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ const struct bbr *bbr = inet_csk_ca(sk); ++ u32 loss_thresh, ecn_thresh; ++ ++ if (rs->lost > 0 && rs->tx_in_flight) { ++ loss_thresh = (u64)rs->tx_in_flight * bbr_param(sk, loss_thresh) >> ++ BBR_SCALE; ++ if (rs->lost > loss_thresh) { ++ return true; ++ } ++ } ++ ++ if (rs->delivered_ce > 0 && rs->delivered > 0 && ++ bbr->ecn_eligible && !!bbr_param(sk, ecn_thresh)) { ++ ecn_thresh = (u64)rs->delivered * bbr_param(sk, ecn_thresh) >> ++ BBR_SCALE; ++ if (rs->delivered_ce > ecn_thresh) { ++ return true; ++ } ++ } ++ ++ return false; ++} ++ ++/* Calculate the tx_in_flight level that corresponded to excessive loss. ++ * We find "lost_prefix" segs of the skb where loss rate went too high, ++ * by solving for "lost_prefix" in the following equation: ++ * lost / inflight >= loss_thresh ++ * (lost_prev + lost_prefix) / (inflight_prev + lost_prefix) >= loss_thresh ++ * Then we take that equation, convert it to fixed point, and ++ * round up to the nearest packet. ++ */ ++static u32 bbr_inflight_hi_from_lost_skb(const struct sock *sk, ++ const struct rate_sample *rs, ++ const struct sk_buff *skb) ++{ ++ const struct tcp_sock *tp = tcp_sk(sk); ++ u32 loss_thresh = bbr_param(sk, loss_thresh); ++ u32 pcount, divisor, inflight_hi; ++ s32 inflight_prev, lost_prev; ++ u64 loss_budget, lost_prefix; ++ ++ pcount = tcp_skb_pcount(skb); ++ ++ /* How much data was in flight before this skb? */ ++ inflight_prev = rs->tx_in_flight - pcount; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ pcount, ++ TCP_SKB_CB(skb)->sacked, ++ rs->tx_in_flight), ++ "tx_in_flight: %u pcount: %u reneg: %u", ++ rs->tx_in_flight, pcount, tcp_sk(sk)->is_sack_reneg); ++ return ~0U; ++ } ++ ++ /* How much inflight data was marked lost before this skb? */ ++ lost_prev = rs->lost - pcount; ++ if (WARN_ONCE(lost_prev < 0, ++ "cwnd: %u ca: %d out: %u lost: %u pif: %u " ++ "tx_in_flight: %u tx.lost: %u tp->lost: %u rs->lost: %d " ++ "lost_prev: %d pcount: %d seq: %u end_seq: %u reneg: %u", ++ tcp_snd_cwnd(tp), inet_csk(sk)->icsk_ca_state, ++ tp->packets_out, tp->lost_out, tcp_packets_in_flight(tp), ++ rs->tx_in_flight, TCP_SKB_CB(skb)->tx.lost, tp->lost, ++ rs->lost, lost_prev, pcount, ++ TCP_SKB_CB(skb)->seq, TCP_SKB_CB(skb)->end_seq, ++ tp->is_sack_reneg)) ++ return ~0U; ++ ++ /* At what prefix of this lost skb did losss rate exceed loss_thresh? */ ++ loss_budget = (u64)inflight_prev * loss_thresh + BBR_UNIT - 1; ++ loss_budget >>= BBR_SCALE; ++ if (lost_prev >= loss_budget) { ++ lost_prefix = 0; /* previous losses crossed loss_thresh */ ++ } else { ++ lost_prefix = loss_budget - lost_prev; ++ lost_prefix <<= BBR_SCALE; ++ divisor = BBR_UNIT - loss_thresh; ++ if (WARN_ON_ONCE(!divisor)) /* loss_thresh is 8 bits */ ++ return ~0U; ++ do_div(lost_prefix, divisor); ++ } ++ ++ inflight_hi = inflight_prev + lost_prefix; ++ return inflight_hi; ++} ++ ++/* If loss/ECN rates during probing indicated we may have overfilled a ++ * buffer, return an operating point that tries to leave unutilized headroom in ++ * the path for other flows, for fairness convergence and lower RTTs and loss. 
++ */ ++static u32 bbr_inflight_with_headroom(const struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 headroom, headroom_fraction; ++ ++ if (bbr->inflight_hi == ~0U) ++ return ~0U; ++ ++ headroom_fraction = bbr_param(sk, inflight_headroom); ++ headroom = ((u64)bbr->inflight_hi * headroom_fraction) >> BBR_SCALE; ++ headroom = max(headroom, 1U); ++ return max_t(s32, bbr->inflight_hi - headroom, ++ bbr_param(sk, cwnd_min_target)); ++} ++ ++/* Bound cwnd to a sensible level, based on our current probing state ++ * machine phase and model of a good inflight level (inflight_lo, inflight_hi). ++ */ ++static void bbr_bound_cwnd_for_inflight_model(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 cap; ++ ++ /* tcp_rcv_synsent_state_process() currently calls tcp_ack() ++ * and thus cong_control() without first initializing us(!). ++ */ ++ if (!bbr->initialized) ++ return; ++ ++ cap = ~0U; ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx != BBR_BW_PROBE_CRUISE) { ++ /* Probe to see if more packets fit in the path. */ ++ cap = bbr->inflight_hi; ++ } else { ++ if (bbr->mode == BBR_PROBE_RTT || ++ (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_CRUISE)) ++ cap = bbr_inflight_with_headroom(sk); ++ } ++ /* Adapt to any loss/ECN since our last bw probe. */ ++ cap = min(cap, bbr->inflight_lo); ++ ++ cap = max_t(u32, cap, bbr_param(sk, cwnd_min_target)); ++ tcp_snd_cwnd_set(tp, min(cap, tcp_snd_cwnd(tp))); ++} ++ ++/* How should we multiplicatively cut bw or inflight limits based on ECN? */ ++static u32 bbr_ecn_cut(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ return BBR_UNIT - ++ ((bbr->ecn_alpha * bbr_param(sk, ecn_factor)) >> BBR_SCALE); ++} ++ ++/* Init lower bounds if have not inited yet. */ ++static void bbr_init_lower_bounds(struct sock *sk, bool init_bw) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (init_bw && bbr->bw_lo == ~0U) ++ bbr->bw_lo = bbr_max_bw(sk); ++ if (bbr->inflight_lo == ~0U) ++ bbr->inflight_lo = tcp_snd_cwnd(tp); ++} ++ ++/* Reduce bw and inflight to (1 - beta). */ ++static void bbr_loss_lower_bounds(struct sock *sk, u32 *bw, u32 *inflight) ++{ ++ struct bbr* bbr = inet_csk_ca(sk); ++ u32 loss_cut = BBR_UNIT - bbr_param(sk, beta); ++ ++ *bw = max_t(u32, bbr->bw_latest, ++ (u64)bbr->bw_lo * loss_cut >> BBR_SCALE); ++ *inflight = max_t(u32, bbr->inflight_latest, ++ (u64)bbr->inflight_lo * loss_cut >> BBR_SCALE); ++} ++ ++/* Reduce inflight to (1 - alpha*ecn_factor). */ ++static void bbr_ecn_lower_bounds(struct sock *sk, u32 *inflight) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_cut = bbr_ecn_cut(sk); ++ ++ *inflight = (u64)bbr->inflight_lo * ecn_cut >> BBR_SCALE; ++} ++ ++/* Estimate a short-term lower bound on the capacity available now, based ++ * on measurements of the current delivery process and recent history. When we ++ * are seeing loss/ECN at times when we are not probing bw, then conservatively ++ * move toward flow balance by multiplicatively cutting our short-term ++ * estimated safe rate and volume of data (bw_lo and inflight_lo). We use a ++ * multiplicative decrease in order to converge to a lower capacity in time ++ * logarithmic in the magnitude of the decrease. ++ * ++ * However, we do not cut our short-term estimates lower than the current rate ++ * and volume of delivered data from this round trip, since from the current ++ * delivery process we can estimate the measured capacity available now. 
++ * ++ * Anything faster than that approach would knowingly risk high loss, which can ++ * cause low bw for Reno/CUBIC and high loss recovery latency for ++ * request/response flows using any congestion control. ++ */ ++static void bbr_adapt_lower_bounds(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 ecn_inflight_lo = ~0U; ++ ++ /* We only use lower-bound estimates when not probing bw. ++ * When probing we need to push inflight higher to probe bw. ++ */ ++ if (bbr_is_probing_bandwidth(sk)) ++ return; ++ ++ /* ECN response. */ ++ if (bbr->ecn_in_round && !!bbr_param(sk, ecn_factor)) { ++ bbr_init_lower_bounds(sk, false); ++ bbr_ecn_lower_bounds(sk, &ecn_inflight_lo); ++ } ++ ++ /* Loss response. */ ++ if (bbr->loss_in_round) { ++ bbr_init_lower_bounds(sk, true); ++ bbr_loss_lower_bounds(sk, &bbr->bw_lo, &bbr->inflight_lo); ++ } ++ ++ /* Adjust to the lower of the levels implied by loss/ECN. */ ++ bbr->inflight_lo = min(bbr->inflight_lo, ecn_inflight_lo); ++ bbr->bw_lo = max(1U, bbr->bw_lo); ++} ++ ++/* Reset any short-term lower-bound adaptation to congestion, so that we can ++ * push our inflight up. ++ */ ++static void bbr_reset_lower_bounds(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->bw_lo = ~0U; ++ bbr->inflight_lo = ~0U; ++} ++ ++/* After bw probing (STARTUP/PROBE_UP), reset signals before entering a state ++ * machine phase where we adapt our lower bound based on congestion signals. ++ */ ++static void bbr_reset_congestion_signals(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++ bbr->loss_in_cycle = 0; ++ bbr->ecn_in_cycle = 0; ++ bbr->bw_latest = 0; ++ bbr->inflight_latest = 0; ++} ++ ++static void bbr_exit_loss_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ tcp_snd_cwnd_set(tp, max(tcp_snd_cwnd(tp), bbr->prior_cwnd)); ++ bbr->try_fast_path = 0; /* bound cwnd using latest model */ ++} ++ ++/* Update rate and volume of delivered data from latest round trip. */ ++static void bbr_update_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->loss_round_start = 0; ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ ++ bbr->bw_latest = max_t(u32, bbr->bw_latest, ctx->sample_bw); ++ bbr->inflight_latest = max_t(u32, bbr->inflight_latest, rs->delivered); ++ ++ if (!before(rs->prior_delivered, bbr->loss_round_delivered)) { ++ bbr->loss_round_delivered = tp->delivered; ++ bbr->loss_round_start = 1; /* mark start of new round trip */ ++ } ++} ++ ++/* Once per round, reset filter for latest rate and volume of delivered data. */ ++static void bbr_advance_latest_delivery_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* If ACK matches a TLP retransmit, persist the filter. If we detect ++ * that a TLP retransmit plugged a tail loss, we'll want to remember ++ * how much data the path delivered before the tail loss. ++ */ ++ if (bbr->loss_round_start && !rs->is_acking_tlp_retrans_seq) { ++ bbr->bw_latest = ctx->sample_bw; ++ bbr->inflight_latest = rs->delivered; ++ } ++} ++ ++/* Update (most of) our congestion signals: track the recent rate and volume of ++ * delivered data, presence of loss, and EWMA degree of ECN marking. 
++ */ ++static void bbr_update_congestion_signals( ++ struct sock *sk, const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u64 bw; ++ ++ if (rs->interval_us <= 0 || !rs->acked_sacked) ++ return; /* Not a valid observation */ ++ bw = ctx->sample_bw; ++ ++ if (!rs->is_app_limited || bw >= bbr_max_bw(sk)) ++ bbr_take_max_bw_sample(sk, bw); ++ ++ bbr->loss_in_round |= (rs->losses > 0); ++ ++ if (!bbr->loss_round_start) ++ return; /* skip the per-round-trip updates */ ++ /* Now do per-round-trip updates. */ ++ bbr_adapt_lower_bounds(sk, rs); ++ ++ bbr->loss_in_round = 0; ++ bbr->ecn_in_round = 0; ++} ++ ++/* Bandwidth probing can cause loss. To help coexistence with loss-based ++ * congestion control we spread out our probing in a Reno-conscious way. Due to ++ * the shape of the Reno sawtooth, the time required between loss epochs for an ++ * idealized Reno flow is a number of round trips that is the BDP of that ++ * flow. We count packet-timed round trips directly, since measured RTT can ++ * vary widely, and Reno is driven by packet-timed round trips. ++ */ ++static bool bbr_is_reno_coexistence_probe_time(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 rounds; ++ ++ /* Random loss can shave some small percentage off of our inflight ++ * in each round. To survive this, flows need robust periodic probes. ++ */ ++ rounds = min_t(u32, bbr_param(sk, bw_probe_max_rounds), bbr_target_inflight(sk)); ++ return bbr->rounds_since_probe >= rounds; ++} ++ ++/* How long do we want to wait before probing for bandwidth (and risking ++ * loss)? We randomize the wait, for better mixing and fairness convergence. ++ * ++ * We bound the Reno-coexistence inter-bw-probe time to be 62-63 round trips. ++ * This is calculated to allow fairness with a 25Mbps, 30ms Reno flow, ++ * (eg 4K video to a broadband user): ++ * BDP = 25Mbps * .030sec /(1514bytes) = 61.9 packets ++ * ++ * We bound the BBR-native inter-bw-probe wall clock time to be: ++ * (a) higher than 2 sec: to try to avoid causing loss for a long enough time ++ * to allow Reno at 30ms to get 4K video bw, the inter-bw-probe time must ++ * be at least: 25Mbps * .030sec / (1514bytes) * 0.030sec = 1.9secs ++ * (b) lower than 3 sec: to ensure flows can start probing in a reasonable ++ * amount of time to discover unutilized bw on human-scale interactive ++ * time-scales (e.g. perhaps traffic from a web page download that we ++ * were competing with is now complete). ++ */ ++static void bbr_pick_probe_wait(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Decide the random round-trip bound for wait until probe: */ ++ bbr->rounds_since_probe = ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_rounds)); ++ /* Decide the random wall clock bound for wait until probe: */ ++ bbr->probe_wait_us = bbr_param(sk, bw_probe_base_us) + ++ get_random_u32_below(bbr_param(sk, bw_probe_rand_us)); ++} ++ ++static void bbr_set_cycle_idx(struct sock *sk, int cycle_idx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->cycle_idx = cycle_idx; ++ /* New phase, so need to update cwnd and pacing rate. */ ++ bbr->try_fast_path = 0; ++} ++ ++/* Send at estimated bw to fill the pipe, but not queue. We need this phase ++ * before PROBE_UP, because as soon as we send faster than the available bw ++ * we will start building a queue, and if the buffer is shallow we can cause ++ * loss. If we do not fill the pipe before we cause this loss, our bw_hi and ++ * inflight_hi estimates will underestimate. 
++ */ ++static void bbr_start_bw_probe_refill(struct sock *sk, u32 bw_probe_up_rounds) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ bbr->bw_probe_up_rounds = bw_probe_up_rounds; ++ bbr->bw_probe_up_acks = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_REFILLING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_REFILL); ++} ++ ++/* Now probe max deliverable data rate and volume. */ ++static void bbr_start_bw_probe_up(struct sock *sk, struct bbr_context *ctx) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->ack_phase = BBR_ACKS_PROBE_STARTING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->cycle_mstamp = tp->tcp_mstamp; ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_UP); ++ bbr_raise_inflight_hi_slope(sk); ++} ++ ++/* Start a new PROBE_BW probing cycle of some wall clock length. Pick a wall ++ * clock time at which to probe beyond an inflight that we think to be ++ * safe. This will knowingly risk packet loss, so we want to do this rarely, to ++ * keep packet loss rates low. Also start a round-trip counter, to probe faster ++ * if we estimate a Reno flow at our BDP would probe faster. ++ */ ++static void bbr_start_bw_probe_down(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_probe_up_cnt = ~0U; /* not growing inflight_hi any more */ ++ bbr_pick_probe_wait(sk); ++ bbr->cycle_mstamp = tp->tcp_mstamp; /* start wall clock */ ++ bbr->ack_phase = BBR_ACKS_PROBE_STOPPING; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_DOWN); ++} ++ ++/* Cruise: maintain what we estimate to be a neutral, conservative ++ * operating point, without attempting to probe up for bandwidth or down for ++ * RTT, and only reducing inflight in response to loss/ECN signals. ++ */ ++static void bbr_start_bw_probe_cruise(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->inflight_lo != ~0U) ++ bbr->inflight_lo = min(bbr->inflight_lo, bbr->inflight_hi); ++ ++ bbr_set_cycle_idx(sk, BBR_BW_PROBE_CRUISE); ++} ++ ++/* Loss and/or ECN rate is too high while probing. ++ * Adapt (once per bw probe) by cutting inflight_hi and then restarting cycle. ++ */ ++static void bbr_handle_inflight_too_high(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ const u32 beta = bbr_param(sk, beta); ++ ++ bbr->prev_probe_too_high = 1; ++ bbr->bw_probe_samples = 0; /* only react once per probe */ ++ /* If we are app-limited then we are not robustly ++ * probing the max volume of inflight data we think ++ * might be safe (analogous to how app-limited bw ++ * samples are not known to be robustly probing bw). ++ */ ++ if (!rs->is_app_limited) { ++ bbr->inflight_hi = max_t(u32, rs->tx_in_flight, ++ (u64)bbr_target_inflight(sk) * ++ (BBR_UNIT - beta) >> BBR_SCALE); ++ } ++ if (bbr->mode == BBR_PROBE_BW && bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_start_bw_probe_down(sk); ++} ++ ++/* If we're seeing bw and loss samples reflecting our bw probing, adapt ++ * using the signals we see. If loss or ECN mark rate gets too high, then adapt ++ * inflight_hi downward. If we're able to push inflight higher without such ++ * signals, push higher: adapt inflight_hi upward. 
++ */ ++static bool bbr_adapt_upper_bounds(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Track when we'll see bw/loss samples resulting from our bw probes. */ ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STARTING && bbr->round_start) ++ bbr->ack_phase = BBR_ACKS_PROBE_FEEDBACK; ++ if (bbr->ack_phase == BBR_ACKS_PROBE_STOPPING && bbr->round_start) { ++ /* End of samples from bw probing phase. */ ++ bbr->bw_probe_samples = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ /* At this point in the cycle, our current bw sample is also ++ * our best recent chance at finding the highest available bw ++ * for this flow. So now is the best time to forget the bw ++ * samples from the previous cycle, by advancing the window. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && !rs->is_app_limited) ++ bbr_advance_max_bw_filter(sk); ++ /* If we had an inflight_hi, then probed and pushed inflight all ++ * the way up to hit that inflight_hi without seeing any ++ * high loss/ECN in all the resulting ACKs from that probing, ++ * then probe up again, this time letting inflight persist at ++ * inflight_hi for a round trip, then accelerating beyond. ++ */ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->stopped_risky_probe && !bbr->prev_probe_too_high) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; /* yes, decided state transition */ ++ } ++ } ++ if (bbr_is_inflight_too_high(sk, rs)) { ++ if (bbr->bw_probe_samples) /* sample is from bw probing? */ ++ bbr_handle_inflight_too_high(sk, rs); ++ } else { ++ /* Loss/ECN rate is declared safe. Adjust upper bound upward. */ ++ ++ if (bbr->inflight_hi == ~0U) ++ return false; /* no excess queue signals yet */ ++ ++ /* To be resilient to random loss, we must raise bw/inflight_hi ++ * if we observe in any phase that a higher level is safe. ++ */ ++ if (rs->tx_in_flight > bbr->inflight_hi) { ++ bbr->inflight_hi = rs->tx_in_flight; ++ } ++ ++ if (bbr->mode == BBR_PROBE_BW && ++ bbr->cycle_idx == BBR_BW_PROBE_UP) ++ bbr_probe_inflight_hi_upward(sk, rs); ++ } ++ ++ return false; ++} ++ ++/* Check if it's time to probe for bandwidth now, and if so, kick it off. */ ++static bool bbr_check_time_to_probe_bw(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 n; ++ ++ /* If we seem to be at an operating point where we are not seeing loss ++ * but we are seeing ECN marks, then when the ECN marks cease we reprobe ++ * quickly (in case cross-traffic has ceased and freed up bw). ++ */ ++ if (bbr_param(sk, ecn_reprobe_gain) && bbr->ecn_eligible && ++ bbr->ecn_in_cycle && !bbr->loss_in_cycle && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Open) { ++ /* Calculate n so that when bbr_raise_inflight_hi_slope() ++ * computes growth_this_round as 2^n it will be roughly the ++ * desired volume of data (inflight_hi*ecn_reprobe_gain). ++ */ ++ n = ilog2((((u64)bbr->inflight_hi * ++ bbr_param(sk, ecn_reprobe_gain)) >> BBR_SCALE)); ++ bbr_start_bw_probe_refill(sk, n); ++ return true; ++ } ++ ++ if (bbr_has_elapsed_in_phase(sk, bbr->probe_wait_us) || ++ bbr_is_reno_coexistence_probe_time(sk)) { ++ bbr_start_bw_probe_refill(sk, 0); ++ return true; ++ } ++ return false; ++} ++ ++/* Is it time to transition from PROBE_DOWN to PROBE_CRUISE? */ ++static bool bbr_check_time_to_cruise(struct sock *sk, u32 inflight, u32 bw) ++{ ++ /* Always need to pull inflight down to leave headroom in queue. 
*/ ++ if (inflight > bbr_inflight_with_headroom(sk)) ++ return false; ++ ++ return inflight <= bbr_inflight(sk, bw, BBR_UNIT); ++} ++ ++/* PROBE_BW state machine: cruise, refill, probe for bw, or drain? */ ++static void bbr_update_cycle_phase(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); ++ bool is_bw_probe_done = false; ++ u32 inflight, bw; ++ ++ if (!bbr_full_bw_reached(sk)) ++ return; ++ ++ /* In DRAIN, PROBE_BW, or PROBE_RTT, adjust upper bounds. */ ++ if (bbr_adapt_upper_bounds(sk, rs, ctx)) ++ return; /* already decided state transition */ ++ ++ if (bbr->mode != BBR_PROBE_BW) ++ return; ++ ++ inflight = bbr_packets_in_net_at_edt(sk, rs->prior_in_flight); ++ bw = bbr_max_bw(sk); ++ ++ switch (bbr->cycle_idx) { ++ /* First we spend most of our time cruising with a pacing_gain of 1.0, ++ * which paces at the estimated bw, to try to fully use the pipe ++ * without building queue. If we encounter loss/ECN marks, we adapt ++ * by slowing down. ++ */ ++ case BBR_BW_PROBE_CRUISE: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ break; ++ ++ /* After cruising, when it's time to probe, we first "refill": we send ++ * at the estimated bw to fill the pipe, before probing higher and ++ * knowingly risking overflowing the bottleneck buffer (causing loss). ++ */ ++ case BBR_BW_PROBE_REFILL: ++ if (bbr->round_start) { ++ /* After one full round trip of sending in REFILL, we ++ * start to see bw samples reflecting our REFILL, which ++ * may be putting too much data in flight. ++ */ ++ bbr->bw_probe_samples = 1; ++ bbr_start_bw_probe_up(sk, ctx); ++ } ++ break; + +- bbr->full_bw = 0; /* spurious slow-down; reset full pipe detection */ ++ /* After we refill the pipe, we probe by using a pacing_gain > 1.0, to ++ * probe for bw. If we have not seen loss/ECN, we try to raise inflight ++ * to at least pacing_gain*BDP; note that this may take more than ++ * min_rtt if min_rtt is small (e.g. on a LAN). ++ * ++ * We terminate PROBE_UP bandwidth probing upon any of the following: ++ * ++ * (1) We've pushed inflight up to hit the inflight_hi target set in the ++ * most recent previous bw probe phase. Thus we want to start ++ * draining the queue immediately because it's very likely the most ++ * recently sent packets will fill the queue and cause drops. ++ * (2) If inflight_hi has not limited bandwidth growth recently, and ++ * yet delivered bandwidth has not increased much recently ++ * (bbr->full_bw_now). ++ * (3) Loss filter says loss rate is "too high". ++ * (4) ECN filter says ECN mark rate is "too high". ++ * ++ * (1) (2) checked here, (3) (4) checked in bbr_is_inflight_too_high() ++ */ ++ case BBR_BW_PROBE_UP: ++ if (bbr->prev_probe_too_high && ++ inflight >= bbr->inflight_hi) { ++ bbr->stopped_risky_probe = 1; ++ is_bw_probe_done = true; ++ } else { ++ if (tp->is_cwnd_limited && ++ tcp_snd_cwnd(tp) >= bbr->inflight_hi) { ++ /* inflight_hi is limiting bw growth */ ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ } else if (bbr->full_bw_now) { ++ /* Plateau in estimated bw. Pipe looks full. */ ++ is_bw_probe_done = true; ++ } ++ } ++ if (is_bw_probe_done) { ++ bbr->prev_probe_too_high = 0; /* no loss/ECN (yet) */ ++ bbr_start_bw_probe_down(sk); /* restart w/ down */ ++ } ++ break; ++ ++ /* After probing in PROBE_UP, we have usually accumulated some data in ++ * the bottleneck buffer (if bw probing didn't find more bw). 
We next ++ * enter PROBE_DOWN to try to drain any excess data from the queue. To ++ * do this, we use a pacing_gain < 1.0. We hold this pacing gain until ++ * our inflight is less then that target cruising point, which is the ++ * minimum of (a) the amount needed to leave headroom, and (b) the ++ * estimated BDP. Once inflight falls to match the target, we estimate ++ * the queue is drained; persisting would underutilize the pipe. ++ */ ++ case BBR_BW_PROBE_DOWN: ++ if (bbr_check_time_to_probe_bw(sk, rs)) ++ return; /* already decided state transition */ ++ if (bbr_check_time_to_cruise(sk, inflight, bw)) ++ bbr_start_bw_probe_cruise(sk); ++ break; ++ ++ default: ++ WARN_ONCE(1, "BBR invalid cycle index %u\n", bbr->cycle_idx); ++ } ++} ++ ++/* Exiting PROBE_RTT, so return to bandwidth probing in STARTUP or PROBE_BW. */ ++static void bbr_exit_probe_rtt(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_lower_bounds(sk); ++ if (bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_PROBE_BW; ++ /* Raising inflight after PROBE_RTT may cause loss, so reset ++ * the PROBE_BW clock and schedule the next bandwidth probe for ++ * a friendly and randomized future point in time. ++ */ ++ bbr_start_bw_probe_down(sk); ++ /* Since we are exiting PROBE_RTT, we know inflight is ++ * below our estimated BDP, so it is reasonable to cruise. ++ */ ++ bbr_start_bw_probe_cruise(sk); ++ } else { ++ bbr->mode = BBR_STARTUP; ++ } ++} ++ ++/* Exit STARTUP based on loss rate > 1% and loss gaps in round >= N. Wait until ++ * the end of the round in recovery to get a good estimate of how many packets ++ * have been lost, and how many we need to drain with a low pacing rate. ++ */ ++static void bbr_check_loss_too_high_in_startup(struct sock *sk, ++ const struct rate_sample *rs) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr_full_bw_reached(sk)) ++ return; ++ ++ /* For STARTUP exit, check the loss rate at the end of each round trip ++ * of Recovery episodes in STARTUP. We check the loss rate at the end ++ * of the round trip to filter out noisy/low loss and have a better ++ * sense of inflight (extent of loss), so we can drain more accurately. ++ */ ++ if (rs->losses && bbr->loss_events_in_round < 0xf) ++ bbr->loss_events_in_round++; /* update saturating counter */ ++ if (bbr_param(sk, full_loss_cnt) && bbr->loss_round_start && ++ inet_csk(sk)->icsk_ca_state == TCP_CA_Recovery && ++ bbr->loss_events_in_round >= bbr_param(sk, full_loss_cnt) && ++ bbr_is_inflight_too_high(sk, rs)) { ++ bbr_handle_queue_too_high_in_startup(sk); ++ return; ++ } ++ if (bbr->loss_round_start) ++ bbr->loss_events_in_round = 0; ++} ++ ++/* Estimate when the pipe is full, using the change in delivery rate: BBR ++ * estimates bw probing filled the pipe if the estimated bw hasn't changed by ++ * at least bbr_full_bw_thresh (25%) after bbr_full_bw_cnt (3) non-app-limited ++ * rounds. Why 3 rounds: 1: rwin autotuning grows the rwin, 2: we fill the ++ * higher rwin, 3: we get higher delivery rate samples. Or transient ++ * cross-traffic or radio noise can go away. CUBIC Hystart shares a similar ++ * design goal, but uses delay and inter-ACK spacing instead of bandwidth. 
++ */ ++static void bbr_check_full_bw_reached(struct sock *sk, ++ const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 bw_thresh, full_cnt, thresh; ++ ++ if (bbr->full_bw_now || rs->is_app_limited) ++ return; ++ ++ thresh = bbr_param(sk, full_bw_thresh); ++ full_cnt = bbr_param(sk, full_bw_cnt); ++ bw_thresh = (u64)bbr->full_bw * thresh >> BBR_SCALE; ++ if (ctx->sample_bw >= bw_thresh) { ++ bbr_reset_full_bw(sk); ++ bbr->full_bw = ctx->sample_bw; ++ return; ++ } ++ if (!bbr->round_start) ++ return; ++ ++bbr->full_bw_cnt; ++ bbr->full_bw_now = bbr->full_bw_cnt >= full_cnt; ++ bbr->full_bw_reached |= bbr->full_bw_now; ++} ++ ++/* If pipe is probably full, drain the queue and then enter steady-state. */ ++static void bbr_check_drain(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ if (bbr->mode == BBR_STARTUP && bbr_full_bw_reached(sk)) { ++ bbr->mode = BBR_DRAIN; /* drain queue we created */ ++ /* Set ssthresh to export purely for monitoring, to signal ++ * completion of initial STARTUP by setting to a non- ++ * TCP_INFINITE_SSTHRESH value (ssthresh is not used by BBR). ++ */ ++ tcp_sk(sk)->snd_ssthresh = ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT); ++ bbr_reset_congestion_signals(sk); ++ } /* fall through to check if in-flight is already small: */ ++ if (bbr->mode == BBR_DRAIN && ++ bbr_packets_in_net_at_edt(sk, tcp_packets_in_flight(tcp_sk(sk))) <= ++ bbr_inflight(sk, bbr_max_bw(sk), BBR_UNIT)) { ++ bbr->mode = BBR_PROBE_BW; ++ bbr_start_bw_probe_down(sk); ++ } ++} ++ ++static void bbr_update_model(struct sock *sk, const struct rate_sample *rs, ++ struct bbr_context *ctx) ++{ ++ bbr_update_congestion_signals(sk, rs, ctx); ++ bbr_update_ack_aggregation(sk, rs); ++ bbr_check_loss_too_high_in_startup(sk, rs); ++ bbr_check_full_bw_reached(sk, rs, ctx); ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++} ++ ++/* Fast path for app-limited case. ++ * ++ * On each ack, we execute bbr state machine, which primarily consists of: ++ * 1) update model based on new rate sample, and ++ * 2) update control based on updated model or state change. ++ * ++ * There are certain workload/scenarios, e.g. app-limited case, where ++ * either we can skip updating model or we can skip update of both model ++ * as well as control. This provides signifcant softirq cpu savings for ++ * processing incoming acks. ++ * ++ * In case of app-limited, if there is no congestion (loss/ecn) and ++ * if observed bw sample is less than current estimated bw, then we can ++ * skip some of the computation in bbr state processing: ++ * ++ * - if there is no rtt/mode/phase change: In this case, since all the ++ * parameters of the network model are constant, we can skip model ++ * as well control update. ++ * ++ * - else we can skip rest of the model update. But we still need to ++ * update the control to account for the new rtt/mode/phase. ++ * ++ * Returns whether we can take fast path or not. 
++ */ ++static bool bbr_run_fast_path(struct sock *sk, bool *update_model, ++ const struct rate_sample *rs, struct bbr_context *ctx) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ u32 prev_min_rtt_us, prev_mode; ++ ++ if (bbr_param(sk, fast_path) && bbr->try_fast_path && ++ rs->is_app_limited && ctx->sample_bw < bbr_max_bw(sk) && ++ !bbr->loss_in_round && !bbr->ecn_in_round ) { ++ prev_mode = bbr->mode; ++ prev_min_rtt_us = bbr->min_rtt_us; ++ bbr_check_drain(sk, rs, ctx); ++ bbr_update_cycle_phase(sk, rs, ctx); ++ bbr_update_min_rtt(sk, rs); ++ ++ if (bbr->mode == prev_mode && ++ bbr->min_rtt_us == prev_min_rtt_us && ++ bbr->try_fast_path) { ++ return true; ++ } ++ ++ /* Skip model update, but control still needs to be updated */ ++ *update_model = false; ++ } ++ return false; ++} ++ ++__bpf_kfunc static void bbr_main(struct sock *sk, u32 ack, int flag, ++ const struct rate_sample *rs) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct bbr_context ctx = { 0 }; ++ bool update_model = true; ++ u32 bw, round_delivered; ++ int ce_ratio = -1; ++ ++ round_delivered = bbr_update_round_start(sk, rs, &ctx); ++ if (bbr->round_start) { ++ bbr->rounds_since_probe = ++ min_t(s32, bbr->rounds_since_probe + 1, 0xFF); ++ ce_ratio = bbr_update_ecn_alpha(sk); ++ } ++ bbr_plb(sk, rs, ce_ratio); ++ ++ bbr->ecn_in_round |= (bbr->ecn_eligible && rs->is_ece); ++ bbr_calculate_bw_sample(sk, rs, &ctx); ++ bbr_update_latest_delivery_signals(sk, rs, &ctx); ++ ++ if (bbr_run_fast_path(sk, &update_model, rs, &ctx)) ++ goto out; ++ ++ if (update_model) ++ bbr_update_model(sk, rs, &ctx); ++ ++ bbr_update_gains(sk); ++ bw = bbr_bw(sk); ++ bbr_set_pacing_rate(sk, bw, bbr->pacing_gain); ++ bbr_set_cwnd(sk, rs, rs->acked_sacked, bw, bbr->cwnd_gain, ++ tcp_snd_cwnd(tp), &ctx); ++ bbr_bound_cwnd_for_inflight_model(sk); ++ ++out: ++ bbr_advance_latest_delivery_signals(sk, rs, &ctx); ++ bbr->prev_ca_state = inet_csk(sk)->icsk_ca_state; ++ bbr->loss_in_cycle |= rs->lost > 0; ++ bbr->ecn_in_cycle |= rs->delivered_ce > 0; ++} ++ ++__bpf_kfunc static void bbr_init(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr->initialized = 1; ++ ++ bbr->init_cwnd = min(0x7FU, tcp_snd_cwnd(tp)); ++ bbr->prior_cwnd = tp->prior_cwnd; ++ tp->snd_ssthresh = TCP_INFINITE_SSTHRESH; ++ bbr->next_rtt_delivered = tp->delivered; ++ bbr->prev_ca_state = TCP_CA_Open; ++ ++ bbr->probe_rtt_done_stamp = 0; ++ bbr->probe_rtt_round_done = 0; ++ bbr->probe_rtt_min_us = tcp_min_rtt(tp); ++ bbr->probe_rtt_min_stamp = tcp_jiffies32; ++ bbr->min_rtt_us = tcp_min_rtt(tp); ++ bbr->min_rtt_stamp = tcp_jiffies32; ++ ++ bbr->has_seen_rtt = 0; ++ bbr_init_pacing_rate_from_rtt(sk); ++ ++ bbr->round_start = 0; ++ bbr->idle_restart = 0; ++ bbr->full_bw_reached = 0; ++ bbr->full_bw = 0; + bbr->full_bw_cnt = 0; +- bbr_reset_lt_bw_sampling(sk); +- return tcp_snd_cwnd(tcp_sk(sk)); ++ bbr->cycle_mstamp = 0; ++ bbr->cycle_idx = 0; ++ ++ bbr_reset_startup_mode(sk); ++ ++ bbr->ack_epoch_mstamp = tp->tcp_mstamp; ++ bbr->ack_epoch_acked = 0; ++ bbr->extra_acked_win_rtts = 0; ++ bbr->extra_acked_win_idx = 0; ++ bbr->extra_acked[0] = 0; ++ bbr->extra_acked[1] = 0; ++ ++ bbr->ce_state = 0; ++ bbr->prior_rcv_nxt = tp->rcv_nxt; ++ bbr->try_fast_path = 0; ++ ++ cmpxchg(&sk->sk_pacing_status, SK_PACING_NONE, SK_PACING_NEEDED); ++ ++ /* Start sampling ECN mark rate after first full flight is ACKed: */ ++ bbr->loss_round_delivered = tp->delivered + 1; ++ bbr->loss_round_start = 0; ++ bbr->undo_bw_lo = 
0; ++ bbr->undo_inflight_lo = 0; ++ bbr->undo_inflight_hi = 0; ++ bbr->loss_events_in_round = 0; ++ bbr->startup_ecn_rounds = 0; ++ bbr_reset_congestion_signals(sk); ++ bbr->bw_lo = ~0U; ++ bbr->bw_hi[0] = 0; ++ bbr->bw_hi[1] = 0; ++ bbr->inflight_lo = ~0U; ++ bbr->inflight_hi = ~0U; ++ bbr_reset_full_bw(sk); ++ bbr->bw_probe_up_cnt = ~0U; ++ bbr->bw_probe_up_acks = 0; ++ bbr->bw_probe_up_rounds = 0; ++ bbr->probe_wait_us = 0; ++ bbr->stopped_risky_probe = 0; ++ bbr->ack_phase = BBR_ACKS_INIT; ++ bbr->rounds_since_probe = 0; ++ bbr->bw_probe_samples = 0; ++ bbr->prev_probe_too_high = 0; ++ bbr->ecn_eligible = 0; ++ bbr->ecn_alpha = bbr_param(sk, ecn_alpha_init); ++ bbr->alpha_last_delivered = 0; ++ bbr->alpha_last_delivered_ce = 0; ++ bbr->plb.pause_until = 0; ++ ++ tp->fast_ack_mode = bbr_fast_ack_mode ? 1 : 0; ++ ++ if (bbr_can_use_ecn(sk)) ++ tp->ecn_flags |= TCP_ECN_ECT_PERMANENT; ++} ++ ++/* BBR marks the current round trip as a loss round. */ ++static void bbr_note_loss(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ /* Capture "current" data over the full round trip of loss, to ++ * have a better chance of observing the full capacity of the path. ++ */ ++ if (!bbr->loss_in_round) /* first loss in this round trip? */ ++ bbr->loss_round_delivered = tp->delivered; /* set round trip */ ++ bbr->loss_in_round = 1; ++ bbr->loss_in_cycle = 1; + } + +-/* Entering loss recovery, so save cwnd for when we exit or undo recovery. */ ++/* Core TCP stack informs us that the given skb was just marked lost. */ ++__bpf_kfunc static void bbr_skb_marked_lost(struct sock *sk, ++ const struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct tcp_skb_cb *scb = TCP_SKB_CB(skb); ++ struct rate_sample rs = {}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not an skb sent while probing for bandwidth */ ++ if (unlikely(!scb->tx.delivered_mstamp)) ++ return; /* skb was SACKed, reneged, marked lost; ignore it */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this lost skb, ++ * then see if the loss rate went too high, and if so at which packet. ++ */ ++ rs.tx_in_flight = scb->tx.in_flight; ++ rs.lost = tp->lost - scb->tx.lost; ++ rs.is_app_limited = scb->tx.is_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) { ++ rs.tx_in_flight = bbr_inflight_hi_from_lost_skb(sk, &rs, skb); ++ bbr_handle_inflight_too_high(sk, &rs); ++ } ++} ++ ++static void bbr_run_loss_probe_recovery(struct sock *sk) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ struct bbr *bbr = inet_csk_ca(sk); ++ struct rate_sample rs = {0}; ++ ++ bbr_note_loss(sk); ++ ++ if (!bbr->bw_probe_samples) ++ return; /* not sent while probing for bandwidth */ ++ /* We are probing for bandwidth. Construct a rate sample that ++ * estimates what happened in the flight leading up to this ++ * loss, then see if the loss rate went too high. ++ */ ++ rs.lost = 1; /* TLP probe repaired loss of a single segment */ ++ rs.tx_in_flight = bbr->inflight_latest + rs.lost; ++ rs.is_app_limited = tp->tlp_orig_data_app_limited; ++ if (bbr_is_inflight_too_high(sk, &rs)) ++ bbr_handle_inflight_too_high(sk, &rs); ++} ++ ++/* Revert short-term model if current loss recovery event was spurious. 
*/ ++__bpf_kfunc static u32 bbr_undo_cwnd(struct sock *sk) ++{ ++ struct bbr *bbr = inet_csk_ca(sk); ++ ++ bbr_reset_full_bw(sk); /* spurious slow-down; reset full bw detector */ ++ bbr->loss_in_round = 0; ++ ++ /* Revert to cwnd and other state saved before loss episode. */ ++ bbr->bw_lo = max(bbr->bw_lo, bbr->undo_bw_lo); ++ bbr->inflight_lo = max(bbr->inflight_lo, bbr->undo_inflight_lo); ++ bbr->inflight_hi = max(bbr->inflight_hi, bbr->undo_inflight_hi); ++ bbr->try_fast_path = 0; /* take slow path to set proper cwnd, pacing */ ++ return bbr->prior_cwnd; ++} ++ ++/* Entering loss recovery, so save state for when we undo recovery. */ + __bpf_kfunc static u32 bbr_ssthresh(struct sock *sk) + { ++ struct bbr *bbr = inet_csk_ca(sk); ++ + bbr_save_cwnd(sk); ++ /* For undo, save state that adapts based on loss signal. */ ++ bbr->undo_bw_lo = bbr->bw_lo; ++ bbr->undo_inflight_lo = bbr->inflight_lo; ++ bbr->undo_inflight_hi = bbr->inflight_hi; + return tcp_sk(sk)->snd_ssthresh; + } + ++static enum tcp_bbr_phase bbr_get_phase(struct bbr *bbr) ++{ ++ switch (bbr->mode) { ++ case BBR_STARTUP: ++ return BBR_PHASE_STARTUP; ++ case BBR_DRAIN: ++ return BBR_PHASE_DRAIN; ++ case BBR_PROBE_BW: ++ break; ++ case BBR_PROBE_RTT: ++ return BBR_PHASE_PROBE_RTT; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++ switch (bbr->cycle_idx) { ++ case BBR_BW_PROBE_UP: ++ return BBR_PHASE_PROBE_BW_UP; ++ case BBR_BW_PROBE_DOWN: ++ return BBR_PHASE_PROBE_BW_DOWN; ++ case BBR_BW_PROBE_CRUISE: ++ return BBR_PHASE_PROBE_BW_CRUISE; ++ case BBR_BW_PROBE_REFILL: ++ return BBR_PHASE_PROBE_BW_REFILL; ++ default: ++ return BBR_PHASE_INVALID; ++ } ++} ++ + static size_t bbr_get_info(struct sock *sk, u32 ext, int *attr, +- union tcp_cc_info *info) ++ union tcp_cc_info *info) + { + if (ext & (1 << (INET_DIAG_BBRINFO - 1)) || + ext & (1 << (INET_DIAG_VEGASINFO - 1))) { +- struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); +- u64 bw = bbr_bw(sk); +- +- bw = bw * tp->mss_cache * USEC_PER_SEC >> BW_SCALE; +- memset(&info->bbr, 0, sizeof(info->bbr)); +- info->bbr.bbr_bw_lo = (u32)bw; +- info->bbr.bbr_bw_hi = (u32)(bw >> 32); +- info->bbr.bbr_min_rtt = bbr->min_rtt_us; +- info->bbr.bbr_pacing_gain = bbr->pacing_gain; +- info->bbr.bbr_cwnd_gain = bbr->cwnd_gain; ++ u64 bw = bbr_bw_bytes_per_sec(sk, bbr_bw(sk)); ++ u64 bw_hi = bbr_bw_bytes_per_sec(sk, bbr_max_bw(sk)); ++ u64 bw_lo = bbr->bw_lo == ~0U ? 
++ ~0ULL : bbr_bw_bytes_per_sec(sk, bbr->bw_lo); ++ struct tcp_bbr_info *bbr_info = &info->bbr; ++ ++ memset(bbr_info, 0, sizeof(*bbr_info)); ++ bbr_info->bbr_bw_lo = (u32)bw; ++ bbr_info->bbr_bw_hi = (u32)(bw >> 32); ++ bbr_info->bbr_min_rtt = bbr->min_rtt_us; ++ bbr_info->bbr_pacing_gain = bbr->pacing_gain; ++ bbr_info->bbr_cwnd_gain = bbr->cwnd_gain; ++ bbr_info->bbr_bw_hi_lsb = (u32)bw_hi; ++ bbr_info->bbr_bw_hi_msb = (u32)(bw_hi >> 32); ++ bbr_info->bbr_bw_lo_lsb = (u32)bw_lo; ++ bbr_info->bbr_bw_lo_msb = (u32)(bw_lo >> 32); ++ bbr_info->bbr_mode = bbr->mode; ++ bbr_info->bbr_phase = (__u8)bbr_get_phase(bbr); ++ bbr_info->bbr_version = (__u8)BBR_VERSION; ++ bbr_info->bbr_inflight_lo = bbr->inflight_lo; ++ bbr_info->bbr_inflight_hi = bbr->inflight_hi; ++ bbr_info->bbr_extra_acked = bbr_extra_acked(sk); + *attr = INET_DIAG_BBRINFO; +- return sizeof(info->bbr); ++ return sizeof(*bbr_info); + } + return 0; + } + + __bpf_kfunc static void bbr_set_state(struct sock *sk, u8 new_state) + { ++ struct tcp_sock *tp = tcp_sk(sk); + struct bbr *bbr = inet_csk_ca(sk); + + if (new_state == TCP_CA_Loss) { +- struct rate_sample rs = { .losses = 1 }; + + bbr->prev_ca_state = TCP_CA_Loss; +- bbr->full_bw = 0; +- bbr->round_start = 1; /* treat RTO like end of a round */ +- bbr_lt_bw_sampling(sk, &rs); ++ tcp_plb_update_state_upon_rto(sk, &bbr->plb); ++ /* The tcp_write_timeout() call to sk_rethink_txhash() likely ++ * repathed this flow, so re-learn the min network RTT on the ++ * new path: ++ */ ++ bbr_reset_full_bw(sk); ++ if (!bbr_is_probing_bandwidth(sk) && bbr->inflight_lo == ~0U) { ++ /* bbr_adapt_lower_bounds() needs cwnd before ++ * we suffered an RTO, to update inflight_lo: ++ */ ++ bbr->inflight_lo = ++ max(tcp_snd_cwnd(tp), bbr->prior_cwnd); ++ } ++ } else if (bbr->prev_ca_state == TCP_CA_Loss && ++ new_state != TCP_CA_Loss) { ++ bbr_exit_loss_recovery(sk); + } + } + ++ + static struct tcp_congestion_ops tcp_bbr_cong_ops __read_mostly = { +- .flags = TCP_CONG_NON_RESTRICTED, ++ .flags = TCP_CONG_NON_RESTRICTED | TCP_CONG_WANTS_CE_EVENTS, + .name = "bbr", + .owner = THIS_MODULE, + .init = bbr_init, + .cong_control = bbr_main, + .sndbuf_expand = bbr_sndbuf_expand, ++ .skb_marked_lost = bbr_skb_marked_lost, + .undo_cwnd = bbr_undo_cwnd, + .cwnd_event = bbr_cwnd_event, + .ssthresh = bbr_ssthresh, +- .min_tso_segs = bbr_min_tso_segs, ++ .tso_segs = bbr_tso_segs, + .get_info = bbr_get_info, + .set_state = bbr_set_state, + }; +@@ -1159,10 +2361,11 @@ BTF_KFUNCS_START(tcp_bbr_check_kfunc_ids) + BTF_ID_FLAGS(func, bbr_init) + BTF_ID_FLAGS(func, bbr_main) + BTF_ID_FLAGS(func, bbr_sndbuf_expand) ++BTF_ID_FLAGS(func, bbr_skb_marked_lost) + BTF_ID_FLAGS(func, bbr_undo_cwnd) + BTF_ID_FLAGS(func, bbr_cwnd_event) + BTF_ID_FLAGS(func, bbr_ssthresh) +-BTF_ID_FLAGS(func, bbr_min_tso_segs) ++BTF_ID_FLAGS(func, bbr_tso_segs) + BTF_ID_FLAGS(func, bbr_set_state) + BTF_KFUNCS_END(tcp_bbr_check_kfunc_ids) + +@@ -1195,5 +2398,12 @@ MODULE_AUTHOR("Van Jacobson "); + MODULE_AUTHOR("Neal Cardwell "); + MODULE_AUTHOR("Yuchung Cheng "); + MODULE_AUTHOR("Soheil Hassas Yeganeh "); ++MODULE_AUTHOR("Priyaranjan Jha "); ++MODULE_AUTHOR("Yousuk Seung "); ++MODULE_AUTHOR("Kevin Yang "); ++MODULE_AUTHOR("Arjun Roy "); ++MODULE_AUTHOR("David Morley "); ++ + MODULE_LICENSE("Dual BSD/GPL"); + MODULE_DESCRIPTION("TCP BBR (Bottleneck Bandwidth and RTT)"); ++MODULE_VERSION(__stringify(BBR_VERSION)); +diff --git a/net/ipv4/tcp_cong.c b/net/ipv4/tcp_cong.c +index df758adbb445..e98e5dbc050e 100644 +--- a/net/ipv4/tcp_cong.c ++++ 
b/net/ipv4/tcp_cong.c +@@ -237,6 +237,7 @@ void tcp_init_congestion_control(struct sock *sk) + struct inet_connection_sock *icsk = inet_csk(sk); + + tcp_sk(sk)->prior_ssthresh = 0; ++ tcp_sk(sk)->fast_ack_mode = 0; + if (icsk->icsk_ca_ops->init) + icsk->icsk_ca_ops->init(sk); + if (tcp_ca_needs_ecn(sk)) +diff --git a/net/ipv4/tcp_input.c b/net/ipv4/tcp_input.c +index 71b76e98371a..d7bdfbae1a1e 100644 +--- a/net/ipv4/tcp_input.c ++++ b/net/ipv4/tcp_input.c +@@ -381,7 +381,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tcp_enter_quickack_mode(sk, 2); + break; + case INET_ECN_CE: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_IS_CE); + + if (!(tp->ecn_flags & TCP_ECN_DEMAND_CWR)) { +@@ -392,7 +392,7 @@ static void tcp_data_ecn_check(struct sock *sk, const struct sk_buff *skb) + tp->ecn_flags |= TCP_ECN_SEEN; + break; + default: +- if (tcp_ca_needs_ecn(sk)) ++ if (tcp_ca_wants_ce_events(sk)) + tcp_ca_event(sk, CA_EVENT_ECN_NO_CE); + tp->ecn_flags |= TCP_ECN_SEEN; + break; +@@ -1134,7 +1134,12 @@ static void tcp_verify_retransmit_hint(struct tcp_sock *tp, struct sk_buff *skb) + */ + static void tcp_notify_skb_loss_event(struct tcp_sock *tp, const struct sk_buff *skb) + { ++ struct sock *sk = (struct sock *)tp; ++ const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; ++ + tp->lost += tcp_skb_pcount(skb); ++ if (ca_ops->skb_marked_lost) ++ ca_ops->skb_marked_lost(sk, skb); + } + + void tcp_mark_skb_lost(struct sock *sk, struct sk_buff *skb) +@@ -1498,6 +1503,17 @@ static bool tcp_shifted_skb(struct sock *sk, struct sk_buff *prev, + WARN_ON_ONCE(tcp_skb_pcount(skb) < pcount); + tcp_skb_pcount_add(skb, -pcount); + ++ /* Adjust tx.in_flight as pcount is shifted from skb to prev. */ ++ if (WARN_ONCE(TCP_SKB_CB(skb)->tx.in_flight < pcount, ++ "prev in_flight: %u skb in_flight: %u pcount: %u", ++ TCP_SKB_CB(prev)->tx.in_flight, ++ TCP_SKB_CB(skb)->tx.in_flight, ++ pcount)) ++ TCP_SKB_CB(skb)->tx.in_flight = 0; ++ else ++ TCP_SKB_CB(skb)->tx.in_flight -= pcount; ++ TCP_SKB_CB(prev)->tx.in_flight += pcount; ++ + /* When we're adding to gso_segs == 1, gso_size will be zero, + * in theory this shouldn't be necessary but as long as DSACK + * code can come after this skb later on it's better to keep +@@ -3716,7 +3732,8 @@ static int tcp_replace_ts_recent(struct tcp_sock *tp, u32 seq) + /* This routine deals with acks during a TLP episode and ends an episode by + * resetting tlp_high_seq. Ref: TLP algorithm in RFC8985 + */ +-static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) ++static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag, ++ struct rate_sample *rs) + { + struct tcp_sock *tp = tcp_sk(sk); + +@@ -3733,6 +3750,7 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + /* ACK advances: there was a loss, so reduce cwnd. Reset + * tlp_high_seq in tcp_init_cwnd_reduction() + */ ++ tcp_ca_event(sk, CA_EVENT_TLP_RECOVERY); + tcp_init_cwnd_reduction(sk); + tcp_set_ca_state(sk, TCP_CA_CWR); + tcp_end_cwnd_reduction(sk); +@@ -3743,6 +3761,11 @@ static void tcp_process_tlp_ack(struct sock *sk, u32 ack, int flag) + FLAG_NOT_DUP | FLAG_DATA_SACKED))) { + /* Pure dupack: original and TLP probe arrived; no loss */ + tp->tlp_high_seq = 0; ++ } else { ++ /* This ACK matches a TLP retransmit. We cannot yet tell if ++ * this ACK is for the original or the TLP retransmit. 
++ */ ++ rs->is_acking_tlp_retrans_seq = 1; + } + } + +@@ -3862,6 +3885,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + + prior_fack = tcp_is_sack(tp) ? tcp_highest_sack_seq(tp) : tp->snd_una; + rs.prior_in_flight = tcp_packets_in_flight(tp); ++ tcp_rate_check_app_limited(sk); + + /* ts_recent update must be made after we are sure that the packet + * is in window. +@@ -3927,7 +3951,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_in_ack_event(sk, flag); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + + if (tcp_ack_is_dubious(sk, flag)) { + if (!(flag & (FLAG_SND_UNA_ADVANCED | +@@ -3951,6 +3975,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + delivered = tcp_newly_delivered(sk, delivered, flag); + lost = tp->lost - lost; /* freshly marked lost */ + rs.is_ack_delayed = !!(flag & FLAG_ACK_MAYBE_DELAYED); ++ rs.is_ece = !!(flag & FLAG_ECE); + tcp_rate_gen(sk, delivered, lost, is_sack_reneg, sack_state.rate); + tcp_cong_control(sk, ack, delivered, flag, sack_state.rate); + tcp_xmit_recovery(sk, rexmit); +@@ -3971,7 +3996,7 @@ static int tcp_ack(struct sock *sk, const struct sk_buff *skb, int flag) + tcp_ack_probe(sk); + + if (tp->tlp_high_seq) +- tcp_process_tlp_ack(sk, ack, flag); ++ tcp_process_tlp_ack(sk, ack, flag, &rs); + return 1; + + old_ack: +@@ -5677,13 +5702,14 @@ static void __tcp_ack_snd_check(struct sock *sk, int ofo_possible) + + /* More than one full frame received... */ + if (((tp->rcv_nxt - tp->rcv_wup) > inet_csk(sk)->icsk_ack.rcv_mss && ++ (tp->fast_ack_mode == 1 || + /* ... and right edge of window advances far enough. + * (tcp_recvmsg() will send ACK otherwise). + * If application uses SO_RCVLOWAT, we want send ack now if + * we have not received enough bytes to satisfy the condition. + */ +- (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || +- __tcp_select_window(sk) >= tp->rcv_wnd)) || ++ (tp->rcv_nxt - tp->copied_seq < sk->sk_rcvlowat || ++ __tcp_select_window(sk) >= tp->rcv_wnd))) || + /* We ACK each frame or... 
*/ + tcp_in_quickack_mode(sk) || + /* Protocol state mandates a one-time immediate ACK */ +diff --git a/net/ipv4/tcp_minisocks.c b/net/ipv4/tcp_minisocks.c +index 2994c9222c9c..a53af9d32e09 100644 +--- a/net/ipv4/tcp_minisocks.c ++++ b/net/ipv4/tcp_minisocks.c +@@ -475,6 +475,8 @@ void tcp_ca_openreq_child(struct sock *sk, const struct dst_entry *dst) + u32 ca_key = dst_metric(dst, RTAX_CC_ALGO); + bool ca_got_dst = false; + ++ tcp_set_ecn_low_from_dst(sk, dst); ++ + if (ca_key != TCP_CA_UNSPEC) { + const struct tcp_congestion_ops *ca; + +diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c +index caf11920a878..61e45fbd3e5f 100644 +--- a/net/ipv4/tcp_output.c ++++ b/net/ipv4/tcp_output.c +@@ -339,10 +339,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + bool bpf_needs_ecn = tcp_bpf_ca_needs_ecn(sk); + bool use_ecn = READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_ecn) == 1 || + tcp_ca_needs_ecn(sk) || bpf_needs_ecn; ++ const struct dst_entry *dst = __sk_dst_get(sk); + + if (!use_ecn) { +- const struct dst_entry *dst = __sk_dst_get(sk); +- + if (dst && dst_feature(dst, RTAX_FEATURE_ECN)) + use_ecn = true; + } +@@ -354,6 +353,9 @@ static void tcp_ecn_send_syn(struct sock *sk, struct sk_buff *skb) + tcp_ecn_mode_set(tp, TCP_ECN_MODE_RFC3168); + if (tcp_ca_needs_ecn(sk) || bpf_needs_ecn) + INET_ECN_xmit(sk); ++ ++ if (dst) ++ tcp_set_ecn_low_from_dst(sk, dst); + } + } + +@@ -391,7 +393,8 @@ static void tcp_ecn_send(struct sock *sk, struct sk_buff *skb, + th->cwr = 1; + skb_shinfo(skb)->gso_type |= SKB_GSO_TCP_ECN; + } +- } else if (!tcp_ca_needs_ecn(sk)) { ++ } else if (!(tp->ecn_flags & TCP_ECN_ECT_PERMANENT) && ++ !tcp_ca_needs_ecn(sk)) { + /* ACK or retransmitted segment: clear ECT|CE */ + INET_ECN_dontxmit(sk); + } +@@ -1609,7 +1612,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + { + struct tcp_sock *tp = tcp_sk(sk); + struct sk_buff *buff; +- int old_factor; ++ int old_factor, inflight_prev; + long limit; + u16 flags; + int nlen; +@@ -1684,6 +1687,30 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, + + if (diff) + tcp_adjust_pcount(sk, skb, diff); ++ ++ inflight_prev = TCP_SKB_CB(skb)->tx.in_flight - old_factor; ++ if (inflight_prev < 0) { ++ WARN_ONCE(tcp_skb_tx_in_flight_is_suspicious( ++ old_factor, ++ TCP_SKB_CB(skb)->sacked, ++ TCP_SKB_CB(skb)->tx.in_flight), ++ "inconsistent: tx.in_flight: %u " ++ "old_factor: %d mss: %u sacked: %u " ++ "1st pcount: %d 2nd pcount: %d " ++ "1st len: %u 2nd len: %u ", ++ TCP_SKB_CB(skb)->tx.in_flight, old_factor, ++ mss_now, TCP_SKB_CB(skb)->sacked, ++ tcp_skb_pcount(skb), tcp_skb_pcount(buff), ++ skb->len, buff->len); ++ inflight_prev = 0; ++ } ++ /* Set 1st tx.in_flight as if 1st were sent by itself: */ ++ TCP_SKB_CB(skb)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb); ++ /* Set 2nd tx.in_flight with new 1st and 2nd pcounts: */ ++ TCP_SKB_CB(buff)->tx.in_flight = inflight_prev + ++ tcp_skb_pcount(skb) + ++ tcp_skb_pcount(buff); + } + + /* Link BUFF into the send queue. */ +@@ -2040,13 +2067,12 @@ static u32 tcp_tso_autosize(const struct sock *sk, unsigned int mss_now, + static u32 tcp_tso_segs(struct sock *sk, unsigned int mss_now) + { + const struct tcp_congestion_ops *ca_ops = inet_csk(sk)->icsk_ca_ops; +- u32 min_tso, tso_segs; +- +- min_tso = ca_ops->min_tso_segs ? +- ca_ops->min_tso_segs(sk) : +- READ_ONCE(sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); ++ u32 tso_segs; + +- tso_segs = tcp_tso_autosize(sk, mss_now, min_tso); ++ tso_segs = ca_ops->tso_segs ? 
++ ca_ops->tso_segs(sk, mss_now) : ++ tcp_tso_autosize(sk, mss_now, ++ sock_net(sk)->ipv4.sysctl_tcp_min_tso_segs); + return min_t(u32, tso_segs, sk->sk_gso_max_segs); + } + +@@ -2771,6 +2797,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, + skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); + list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); + tcp_init_tso_segs(skb, mss_now); ++ tcp_set_tx_in_flight(sk, skb); + goto repair; /* Skip network transmission */ + } + +@@ -2983,6 +3010,7 @@ void tcp_send_loss_probe(struct sock *sk) + if (WARN_ON(!skb || !tcp_skb_pcount(skb))) + goto rearm_timer; + ++ tp->tlp_orig_data_app_limited = TCP_SKB_CB(skb)->tx.is_app_limited; + if (__tcp_retransmit_skb(sk, skb, 1)) + goto rearm_timer; + +diff --git a/net/ipv4/tcp_rate.c b/net/ipv4/tcp_rate.c +index a8f6d9d06f2e..8737f2134648 100644 +--- a/net/ipv4/tcp_rate.c ++++ b/net/ipv4/tcp_rate.c +@@ -34,6 +34,24 @@ + * ready to send in the write queue. + */ + ++void tcp_set_tx_in_flight(struct sock *sk, struct sk_buff *skb) ++{ ++ struct tcp_sock *tp = tcp_sk(sk); ++ u32 in_flight; ++ ++ /* Check, sanitize, and record packets in flight after skb was sent. */ ++ in_flight = tcp_packets_in_flight(tp) + tcp_skb_pcount(skb); ++ if (WARN_ONCE(in_flight > TCPCB_IN_FLIGHT_MAX, ++ "insane in_flight %u cc %s mss %u " ++ "cwnd %u pif %u %u %u %u\n", ++ in_flight, inet_csk(sk)->icsk_ca_ops->name, ++ tp->mss_cache, tp->snd_cwnd, ++ tp->packets_out, tp->retrans_out, ++ tp->sacked_out, tp->lost_out)) ++ in_flight = TCPCB_IN_FLIGHT_MAX; ++ TCP_SKB_CB(skb)->tx.in_flight = in_flight; ++} ++ + /* Snapshot the current delivery information in the skb, to generate + * a rate sample later when the skb is (s)acked in tcp_rate_skb_delivered(). + */ +@@ -66,7 +84,9 @@ void tcp_rate_skb_sent(struct sock *sk, struct sk_buff *skb) + TCP_SKB_CB(skb)->tx.delivered_mstamp = tp->delivered_mstamp; + TCP_SKB_CB(skb)->tx.delivered = tp->delivered; + TCP_SKB_CB(skb)->tx.delivered_ce = tp->delivered_ce; ++ TCP_SKB_CB(skb)->tx.lost = tp->lost; + TCP_SKB_CB(skb)->tx.is_app_limited = tp->app_limited ? 
1 : 0; ++ tcp_set_tx_in_flight(sk, skb); + } + + /* When an skb is sacked or acked, we fill in the rate sample with the (prior) +@@ -91,18 +111,21 @@ void tcp_rate_skb_delivered(struct sock *sk, struct sk_buff *skb, + if (!rs->prior_delivered || + tcp_skb_sent_after(tx_tstamp, tp->first_tx_mstamp, + scb->end_seq, rs->last_end_seq)) { ++ rs->prior_lost = scb->tx.lost; + rs->prior_delivered_ce = scb->tx.delivered_ce; + rs->prior_delivered = scb->tx.delivered; + rs->prior_mstamp = scb->tx.delivered_mstamp; + rs->is_app_limited = scb->tx.is_app_limited; + rs->is_retrans = scb->sacked & TCPCB_RETRANS; ++ rs->tx_in_flight = scb->tx.in_flight; + rs->last_end_seq = scb->end_seq; + + /* Record send time of most recently ACKed packet: */ + tp->first_tx_mstamp = tx_tstamp; + /* Find the duration of the "send phase" of this window: */ +- rs->interval_us = tcp_stamp_us_delta(tp->first_tx_mstamp, +- scb->tx.first_tx_mstamp); ++ rs->interval_us = tcp_stamp32_us_delta( ++ tp->first_tx_mstamp, ++ scb->tx.first_tx_mstamp); + + } + /* Mark off the skb delivered once it's sacked to avoid being +@@ -144,6 +167,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + return; + } + rs->delivered = tp->delivered - rs->prior_delivered; ++ rs->lost = tp->lost - rs->prior_lost; + + rs->delivered_ce = tp->delivered_ce - rs->prior_delivered_ce; + /* delivered_ce occupies less than 32 bits in the skb control block */ +@@ -155,7 +179,7 @@ void tcp_rate_gen(struct sock *sk, u32 delivered, u32 lost, + * longer phase. + */ + snd_us = rs->interval_us; /* send phase */ +- ack_us = tcp_stamp_us_delta(tp->tcp_mstamp, ++ ack_us = tcp_stamp32_us_delta(tp->tcp_mstamp, + rs->prior_mstamp); /* ack phase */ + rs->interval_us = max(snd_us, ack_us); + +diff --git a/net/ipv4/tcp_timer.c b/net/ipv4/tcp_timer.c +index a207877270fb..0e67c7281410 100644 +--- a/net/ipv4/tcp_timer.c ++++ b/net/ipv4/tcp_timer.c +@@ -565,7 +565,7 @@ void tcp_retransmit_timer(struct sock *sk) + struct inet_sock *inet = inet_sk(sk); + u32 rtx_delta; + +- rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: ++ rtx_delta = tcp_time_stamp_ts(tp) - (tp->retrans_stamp ?: + tcp_skb_timestamp_ts(tp->tcp_usec_ts, skb)); + if (tp->tcp_usec_ts) + rtx_delta /= USEC_PER_MSEC; +@@ -702,6 +702,8 @@ void tcp_write_timer_handler(struct sock *sk) + icsk_timeout(icsk)); + return; + } ++ ++ tcp_rate_check_app_limited(sk); + tcp_mstamp_refresh(tcp_sk(sk)); + event = icsk->icsk_pending; + +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0003-cachy.patch b/sys-kernel/git-sources/0003-cachy.patch new file mode 100644 index 0000000..0a55a31 --- /dev/null +++ b/sys-kernel/git-sources/0003-cachy.patch @@ -0,0 +1,9540 @@ +From 657b2f3ce3beb8717754f7b0c4ab900f8f3fe0a6 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:54 +0800 +Subject: [PATCH 3/4] cachy + +Signed-off-by: Eric Naim +--- + .gitignore | 2 + + .../admin-guide/kernel-parameters.txt | 12 + + Documentation/admin-guide/sysctl/vm.rst | 72 + + Makefile | 33 +- + arch/Kconfig | 19 + + arch/x86/Kconfig.cpu | 46 + + arch/x86/Makefile | 16 +- + arch/x86/include/asm/pci.h | 6 + + arch/x86/pci/common.c | 7 +- + block/Kconfig.iosched | 14 + + block/Makefile | 8 + + block/adios.c | 1881 ++++++++++ + block/elevator.c | 26 +- + drivers/Makefile | 13 +- + drivers/ata/ahci.c | 23 +- + drivers/cpufreq/Kconfig.x86 | 2 - + drivers/cpufreq/intel_pstate.c | 2 + + drivers/gpu/drm/amd/amdgpu/amdgpu.h | 1 + + drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c | 10 + + drivers/gpu/drm/amd/display/Kconfig | 6 + + 
.../gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_color.c | 2 +- + .../amd/display/amdgpu_dm/amdgpu_dm_crtc.c | 6 +- + .../amd/display/amdgpu_dm/amdgpu_dm_plane.c | 6 +- + drivers/gpu/drm/amd/pm/amdgpu_pm.c | 3 + + drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c | 14 +- + drivers/input/evdev.c | 19 +- + drivers/md/dm-crypt.c | 5 + + drivers/media/v4l2-core/Kconfig | 5 + + drivers/media/v4l2-core/Makefile | 2 + + drivers/media/v4l2-core/v4l2loopback.c | 3316 +++++++++++++++++ + drivers/media/v4l2-core/v4l2loopback.h | 108 + + .../media/v4l2-core/v4l2loopback_formats.h | 445 +++ + drivers/pci/controller/Makefile | 6 + + drivers/pci/controller/intel-nvme-remap.c | 462 +++ + drivers/pci/quirks.c | 101 + + drivers/scsi/Kconfig | 2 + + drivers/scsi/Makefile | 1 + + drivers/scsi/vhba/Kconfig | 9 + + drivers/scsi/vhba/Makefile | 4 + + drivers/scsi/vhba/vhba.c | 1132 ++++++ + include/linux/mm.h | 8 + + include/linux/pagemap.h | 2 +- + include/linux/user_namespace.h | 4 + + init/Kconfig | 26 + + kernel/Kconfig.hz | 24 + + kernel/Kconfig.preempt | 2 +- + kernel/fork.c | 14 + + kernel/locking/rwsem.c | 4 +- + kernel/sched/fair.c | 13 + + kernel/sched/sched.h | 2 +- + kernel/sysctl.c | 13 + + kernel/user_namespace.c | 7 + + mm/Kconfig | 65 +- + mm/compaction.c | 4 + + mm/huge_memory.c | 4 + + mm/mm_init.c | 1 + + mm/page-writeback.c | 8 + + mm/page_alloc.c | 4 + + mm/swap.c | 5 + + mm/util.c | 34 + + mm/vmpressure.c | 4 + + mm/vmscan.c | 157 +- + scripts/Makefile.thinlto | 38 + + scripts/Makefile.vmlinux_a | 83 + + scripts/mod/modpost.c | 15 +- + 66 files changed, 8314 insertions(+), 76 deletions(-) + create mode 100644 block/adios.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.c + create mode 100644 drivers/media/v4l2-core/v4l2loopback.h + create mode 100644 drivers/media/v4l2-core/v4l2loopback_formats.h + create mode 100644 drivers/pci/controller/intel-nvme-remap.c + create mode 100644 drivers/scsi/vhba/Kconfig + create mode 100644 drivers/scsi/vhba/Makefile + create mode 100644 drivers/scsi/vhba/vhba.c + create mode 100644 scripts/Makefile.thinlto + create mode 100644 scripts/Makefile.vmlinux_a + +diff --git a/.gitignore b/.gitignore +index 929054df5212..e4b492cc3993 100644 +--- a/.gitignore ++++ b/.gitignore +@@ -55,6 +55,7 @@ + *.zst + Module.symvers + dtbs-list ++builtin.order + modules.order + + # +@@ -66,6 +67,7 @@ modules.order + /vmlinux.32 + /vmlinux.map + /vmlinux.symvers ++/vmlinux.thinlto-index + /vmlinux.unstripped + /vmlinux-gdb.py + /vmlinuz +diff --git a/Documentation/admin-guide/kernel-parameters.txt b/Documentation/admin-guide/kernel-parameters.txt +index 747a55abf494..71751ccf0755 100644 +--- a/Documentation/admin-guide/kernel-parameters.txt ++++ b/Documentation/admin-guide/kernel-parameters.txt +@@ -2384,6 +2384,9 @@ + disable + Do not enable intel_pstate as the default + scaling driver for the supported processors ++ enable ++ Enable intel_pstate in-case "disable" was passed ++ previously in the kernel boot parameters + active + Use intel_pstate driver to bypass the scaling + governors layer of cpufreq and provides it own +@@ -4799,6 +4802,15 @@ + nomsi [MSI] If the PCI_MSI kernel config parameter is + enabled, this kernel boot option can be used to + disable the use of MSI interrupts system-wide. 
++ pcie_acs_override = ++ [PCIE] Override missing PCIe ACS support for: ++ downstream ++ All downstream ports - full ACS capabilities ++ multfunction ++ All multifunction devices - multifunction ACS subset ++ id:nnnn:nnnn ++ Specfic device - full ACS capabilities ++ Specified as vid:did (vendor/device ID) in hex + noioapicquirk [APIC] Disable all boot interrupt quirks. + Safety option to keep boot IRQs enabled. This + should never be necessary. +diff --git a/Documentation/admin-guide/sysctl/vm.rst b/Documentation/admin-guide/sysctl/vm.rst +index 4d71211fdad8..57af938f1969 100644 +--- a/Documentation/admin-guide/sysctl/vm.rst ++++ b/Documentation/admin-guide/sysctl/vm.rst +@@ -25,6 +25,9 @@ files can be found in mm/swap.c. + Currently, these files are in /proc/sys/vm: + + - admin_reserve_kbytes ++- anon_min_ratio ++- clean_low_ratio ++- clean_min_ratio + - compact_memory + - compaction_proactiveness + - compact_unevictable_allowed +@@ -110,6 +113,67 @@ On x86_64 this is about 128MB. + Changing this takes effect whenever an application requests memory. + + ++anon_min_ratio ++============== ++ ++This knob provides *hard* protection of anonymous pages. The anonymous pages ++on the current node won't be reclaimed under any conditions when their amount ++is below vm.anon_min_ratio. ++ ++This knob may be used to prevent excessive swap thrashing when anonymous ++memory is low (for example, when memory is going to be overfilled by ++compressed data of zram module). ++ ++Setting this value too high (close to 100) can result in inability to ++swap and can lead to early OOM under memory pressure. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 1. ++ ++ ++clean_low_ratio ++================ ++ ++This knob provides *best-effort* protection of clean file pages. The file pages ++on the current node won't be reclaimed under memory pressure when the amount of ++clean file pages is below vm.clean_low_ratio *unless* we threaten to OOM. ++ ++Protection of clean file pages using this knob may be used when swapping is ++still possible to ++ - prevent disk I/O thrashing under memory pressure; ++ - improve performance in disk cache-bound tasks under memory pressure. ++ ++Setting it to a high value may result in a early eviction of anonymous pages ++into the swap space by attempting to hold the protected amount of clean file ++pages in memory. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 15. ++ ++ ++clean_min_ratio ++================ ++ ++This knob provides *hard* protection of clean file pages. The file pages on the ++current node won't be reclaimed under memory pressure when the amount of clean ++file pages is below vm.clean_min_ratio. ++ ++Hard protection of clean file pages using this knob may be used to ++ - prevent disk I/O thrashing under memory pressure even with no free swap space; ++ - improve performance in disk cache-bound tasks under memory pressure; ++ - avoid high latency and prevent livelock in near-OOM conditions. ++ ++Setting it to a high value may result in a early out-of-memory condition due to ++the inability to reclaim the protected amount of clean file pages when other ++types of pages cannot be reclaimed. ++ ++The unit of measurement is the percentage of the total memory of the node. ++ ++The default value is 4. ++ ++ + compact_memory + ============== + +@@ -980,6 +1044,14 @@ be 133 (x + 2x = 200, 2x = 133.33). 
+ At 0, the kernel will not initiate swap until the amount of free and + file-backed pages is less than the high watermark in a zone. + ++This knob has no effect if the amount of clean file pages on the current ++node is below vm.clean_low_ratio or vm.clean_min_ratio. In this case, ++only anonymous pages can be reclaimed. ++ ++If the number of anonymous pages on the current node is below ++vm.anon_min_ratio, then only file pages can be reclaimed with ++any vm.swappiness value. ++ + + unprivileged_userfaultfd + ======================== +diff --git a/Makefile b/Makefile +index b9c661913250..8fc00895b0ba 100644 +--- a/Makefile ++++ b/Makefile +@@ -869,11 +869,19 @@ KBUILD_CFLAGS += -fno-delete-null-pointer-checks + ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE + KBUILD_CFLAGS += -O2 + KBUILD_RUSTFLAGS += -Copt-level=2 ++else ifdef CONFIG_CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++KBUILD_CFLAGS += -O3 ++KBUILD_RUSTFLAGS += -Copt-level=3 + else ifdef CONFIG_CC_OPTIMIZE_FOR_SIZE + KBUILD_CFLAGS += -Os + KBUILD_RUSTFLAGS += -Copt-level=s + endif + ++# Perform swing modulo scheduling immediately before the first scheduling pass. ++# This pass looks at innermost loops and reorders their instructions by ++# overlapping different iterations. ++KBUILD_CFLAGS += $(call cc-option,-fmodulo-sched -fmodulo-sched-allow-regmoves -fivopts -fmodulo-sched) ++ + # Always set `debug-assertions` and `overflow-checks` because their default + # depends on `opt-level` and `debug-assertions`, respectively. + KBUILD_RUSTFLAGS += -Cdebug-assertions=$(if $(CONFIG_RUST_DEBUG_ASSERTIONS),y,n) +@@ -1003,10 +1011,10 @@ export CC_FLAGS_SCS + endif + + ifdef CONFIG_LTO_CLANG +-ifdef CONFIG_LTO_CLANG_THIN +-CC_FLAGS_LTO := -flto=thin -fsplit-lto-unit +-else ++ifdef CONFIG_LTO_CLANG_FULL + CC_FLAGS_LTO := -flto ++else ++CC_FLAGS_LTO := -flto=thin -fsplit-lto-unit + endif + CC_FLAGS_LTO += -fvisibility=hidden + +@@ -1200,7 +1208,7 @@ export ARCH_DRIVERS := $(drivers-y) $(drivers-m) + KBUILD_VMLINUX_OBJS := built-in.a $(patsubst %/, %/lib.a, $(filter %/, $(libs-y))) + KBUILD_VMLINUX_LIBS := $(filter-out %/, $(libs-y)) + +-export KBUILD_VMLINUX_LIBS ++export KBUILD_VMLINUX_OBJS KBUILD_VMLINUX_LIBS + export KBUILD_LDS := arch/$(SRCARCH)/kernel/vmlinux.lds + + ifdef CONFIG_TRIM_UNUSED_KSYMS +@@ -1209,16 +1217,12 @@ ifdef CONFIG_TRIM_UNUSED_KSYMS + KBUILD_MODULES := y + endif + +-# '$(AR) mPi' needs 'T' to workaround the bug of llvm-ar <= 14 +-quiet_cmd_ar_vmlinux.a = AR $@ +- cmd_ar_vmlinux.a = \ +- rm -f $@; \ +- $(AR) cDPrST $@ $(KBUILD_VMLINUX_OBJS); \ +- $(AR) mPiT $$($(AR) t $@ | sed -n 1p) $@ $$($(AR) t $@ | grep -F -f $(srctree)/scripts/head-object-list.txt) ++PHONY += vmlinux_a ++vmlinux_a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt FORCE ++ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.vmlinux_a + +-targets += vmlinux.a +-vmlinux.a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt FORCE +- $(call if_changed,ar_vmlinux.a) ++vmlinux.a: vmlinux_a ++ @: + + PHONY += vmlinux_o + vmlinux_o: vmlinux.a $(KBUILD_VMLINUX_LIBS) +@@ -1578,6 +1582,7 @@ endif # CONFIG_MODULES + CLEAN_FILES += vmlinux.symvers modules-only.symvers \ + modules.builtin modules.builtin.modinfo modules.nsdeps \ + modules.builtin.ranges vmlinux.o.map vmlinux.unstripped \ ++ vmlinux.thinlto-index builtin.order \ + compile_commands.json rust/test \ + rust-project.json .vmlinux.objs .vmlinux.export.c \ + .builtin-dtbs-list .builtin-dtb.S +@@ -2019,7 +2024,7 @@ clean: $(clean-dirs) + $(call cmd,rmfiles) + @find . 
$(RCS_FIND_IGNORE) \ + \( -name '*.[aios]' -o -name '*.rsi' -o -name '*.ko' -o -name '.*.cmd' \ +- -o -name '*.ko.*' \ ++ -o -name '*.ko.*' -o -name '*.o.thinlto.bc' \ + -o -name '*.dtb' -o -name '*.dtbo' \ + -o -name '*.dtb.S' -o -name '*.dtbo.S' \ + -o -name '*.dt.yaml' -o -name 'dtbs-list' \ +diff --git a/arch/Kconfig b/arch/Kconfig +index d1b4ffd6e085..9ea0ac45923e 100644 +--- a/arch/Kconfig ++++ b/arch/Kconfig +@@ -826,6 +826,25 @@ config LTO_CLANG_THIN + https://clang.llvm.org/docs/ThinLTO.html + + If unsure, say Y. ++ ++config LTO_CLANG_THIN_DIST ++ bool "Clang ThinLTO in distributed mode (EXPERIMENTAL)" ++ depends on HAS_LTO_CLANG && ARCH_SUPPORTS_LTO_CLANG_THIN ++ select LTO_CLANG ++ help ++ This option enables Clang's ThinLTO in distributed build mode. ++ In this mode, the linker performs the thin-link, generating ++ ThinLTO index files. Subsequently, the build system explicitly ++ invokes ThinLTO backend compilation using these index files ++ and pre-linked IR objects. The resulting native object files ++ are with the .thinlto-native.o suffix. ++ ++ This build mode offers improved visibility into the ThinLTO ++ process through explicit subcommand exposure. It also makes ++ final native object files directly available, benefiting ++ tools like objtool and kpatch. Additionally, it provides ++ crucial granular control over back-end options, enabling ++ module-specific compiler options, and simplifies debugging. + endchoice + + config ARCH_SUPPORTS_AUTOFDO_CLANG +diff --git a/arch/x86/Kconfig.cpu b/arch/x86/Kconfig.cpu +index f928cf6e3252..d4ce964d9713 100644 +--- a/arch/x86/Kconfig.cpu ++++ b/arch/x86/Kconfig.cpu +@@ -255,6 +255,11 @@ config CC_HAS_MARCH_NATIVE + # usage warnings that only appear wth '-march=native'. + depends on CC_IS_GCC || CLANG_VERSION >= 190100 + ++ ++choice ++ prompt "x86_64 Compiler Build Optimization" ++ default GENERIC_CPU ++ + config X86_NATIVE_CPU + bool "Build and optimize for local/native CPU" + depends on X86_64 +@@ -269,6 +274,47 @@ config X86_NATIVE_CPU + + If unsure, say N. + ++config GENERIC_CPU ++ bool "Generic-x86-64" ++ depends on X86_64 ++ help ++ Generic x86-64 CPU. ++ Runs equally well on all x86-64 CPUs. ++ ++config MZEN4 ++ bool "AMD Ryzen 4" ++ depends on (CC_IS_GCC && GCC_VERSION >= 130000) || (CC_IS_CLANG && CLANG_VERSION >= 160000) ++ help ++ Select this for AMD Family 19h Zen 4 processors. ++ ++ Enables -march=znver4 ++ ++endchoice ++ ++config X86_64_VERSION ++ int "x86-64 compiler ISA level" ++ range 1 4 ++ depends on (CC_IS_GCC && GCC_VERSION > 110000) || (CC_IS_CLANG && CLANG_VERSION >= 120000) ++ depends on X86_64 && GENERIC_CPU ++ help ++ Specify a specific x86-64 compiler ISA level. ++ ++ There are three x86-64 ISA levels that work on top of ++ the x86-64 baseline, namely: x86-64-v2 and x86-64-v3. ++ ++ x86-64-v2 brings support for vector instructions up to Streaming SIMD ++ Extensions 4.2 (SSE4.2) and Supplemental Streaming SIMD Extensions 3 ++ (SSSE3), the POPCNT instruction, and CMPXCHG16B. ++ ++ x86-64-v3 adds vector instructions up to AVX2, MOVBE, and additional ++ bit-manipulation instructions. 
++ ++ x86-64-v4 is not included since the kernel does not use AVX512 instructions ++ ++ You can find the best version for your CPU by running one of the following: ++ /lib/ld-linux-x86-64.so.2 --help | grep supported ++ /lib64/ld-linux-x86-64.so.2 --help | grep supported ++ + config X86_GENERIC + bool "Generic x86 support" + depends on X86_32 +diff --git a/arch/x86/Makefile b/arch/x86/Makefile +index 1913d342969b..82358ed864bb 100644 +--- a/arch/x86/Makefile ++++ b/arch/x86/Makefile +@@ -176,10 +176,22 @@ else + ifdef CONFIG_X86_NATIVE_CPU + KBUILD_CFLAGS += -march=native + KBUILD_RUSTFLAGS += -Ctarget-cpu=native +-else ++endif ++ ++ifdef CONFIG_MZEN4 ++ KBUILD_CFLAGS += -march=znver4 ++ KBUILD_RUSTFLAGS += -Ctarget-cpu=znver4 ++endif ++ ++ifdef CONFIG_GENERIC_CPU ++ifeq ($(CONFIG_X86_64_VERSION),1) + KBUILD_CFLAGS += -march=x86-64 -mtune=generic + KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64 -Ztune-cpu=generic +-endif ++else ++ KBUILD_CFLAGS +=-march=x86-64-v$(CONFIG_X86_64_VERSION) ++ KBUILD_RUSTFLAGS += -Ctarget-cpu=x86-64-v$(CONFIG_X86_64_VERSION) ++endif # CONFIG_X86_64_VERSION ++endif # CONFIG_GENERIC_CPU + + KBUILD_CFLAGS += -mno-red-zone + KBUILD_CFLAGS += -mcmodel=kernel +diff --git a/arch/x86/include/asm/pci.h b/arch/x86/include/asm/pci.h +index b3ab80a03365..5e883b397ff3 100644 +--- a/arch/x86/include/asm/pci.h ++++ b/arch/x86/include/asm/pci.h +@@ -26,6 +26,7 @@ struct pci_sysdata { + #if IS_ENABLED(CONFIG_VMD) + struct pci_dev *vmd_dev; /* VMD Device if in Intel VMD domain */ + #endif ++ struct pci_dev *nvme_remap_dev; /* AHCI Device if NVME remapped bus */ + }; + + extern int pci_routeirq; +@@ -69,6 +70,11 @@ static inline bool is_vmd(struct pci_bus *bus) + #define is_vmd(bus) false + #endif /* CONFIG_VMD */ + ++static inline bool is_nvme_remap(struct pci_bus *bus) ++{ ++ return to_pci_sysdata(bus)->nvme_remap_dev != NULL; ++} ++ + /* Can be used to override the logic in pci_scan_bus for skipping + already-configured bus numbers - to be used for buggy BIOSes + or architectures with incomplete PCI setup by the loader */ +diff --git a/arch/x86/pci/common.c b/arch/x86/pci/common.c +index ddb798603201..7c20387d8202 100644 +--- a/arch/x86/pci/common.c ++++ b/arch/x86/pci/common.c +@@ -723,12 +723,15 @@ int pci_ext_cfg_avail(void) + return 0; + } + +-#if IS_ENABLED(CONFIG_VMD) + struct pci_dev *pci_real_dma_dev(struct pci_dev *dev) + { ++#if IS_ENABLED(CONFIG_VMD) + if (is_vmd(dev->bus)) + return to_pci_sysdata(dev->bus)->vmd_dev; ++#endif ++ ++ if (is_nvme_remap(dev->bus)) ++ return to_pci_sysdata(dev->bus)->nvme_remap_dev; + + return dev; + } +-#endif +diff --git a/block/Kconfig.iosched b/block/Kconfig.iosched +index 27f11320b8d1..e98585dd83e0 100644 +--- a/block/Kconfig.iosched ++++ b/block/Kconfig.iosched +@@ -16,6 +16,20 @@ config MQ_IOSCHED_KYBER + synchronous writes, it will self-tune queue depths to achieve that + goal. + ++config MQ_IOSCHED_ADIOS ++ tristate "Adaptive Deadline I/O scheduler" ++ default m ++ help ++ The Adaptive Deadline I/O Scheduler (ADIOS) is a multi-queue I/O ++ scheduler with learning-based adaptive latency control. ++ ++config MQ_IOSCHED_DEFAULT_ADIOS ++ bool "Enable ADIOS I/O scheduler as default MQ I/O scheduler" ++ depends on MQ_IOSCHED_ADIOS=y ++ default n ++ help ++ Enable the ADIOS I/O scheduler as the default scheduler for MQ I/O. 
++ + config IOSCHED_BFQ + tristate "BFQ I/O scheduler" + select BLK_ICQ +diff --git a/block/Makefile b/block/Makefile +index c65f4da93702..105b12fd86b8 100644 +--- a/block/Makefile ++++ b/block/Makefile +@@ -22,6 +22,7 @@ obj-$(CONFIG_BLK_CGROUP_IOLATENCY) += blk-iolatency.o + obj-$(CONFIG_BLK_CGROUP_IOCOST) += blk-iocost.o + obj-$(CONFIG_MQ_IOSCHED_DEADLINE) += mq-deadline.o + obj-$(CONFIG_MQ_IOSCHED_KYBER) += kyber-iosched.o ++obj-$(CONFIG_MQ_IOSCHED_ADIOS) += adios.o + bfq-y := bfq-iosched.o bfq-wf2q.o bfq-cgroup.o + obj-$(CONFIG_IOSCHED_BFQ) += bfq.o + +@@ -36,3 +37,10 @@ obj-$(CONFIG_BLK_INLINE_ENCRYPTION) += blk-crypto.o blk-crypto-profile.o \ + blk-crypto-sysfs.o + obj-$(CONFIG_BLK_INLINE_ENCRYPTION_FALLBACK) += blk-crypto-fallback.o + obj-$(CONFIG_BLOCK_HOLDER_DEPRECATED) += holder.o ++ ++all: ++ make -C /lib/modules/$(shell uname -r)/build M=$(PWD) modules ++ ++clean: ++ make -C /lib/modules/$(shell uname -r)/build M=$(PWD) clean ++ +diff --git a/block/adios.c b/block/adios.c +new file mode 100644 +index 000000000000..bcc90564b9ce +--- /dev/null ++++ b/block/adios.c +@@ -0,0 +1,1881 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Adaptive Deadline I/O Scheduler (ADIOS) ++ * Copyright (C) 2025 Masahito Suzuki ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include "elevator.h" ++#include "blk.h" ++#include "blk-mq.h" ++#include "blk-mq-sched.h" ++ ++#define ADIOS_VERSION "3.0.1" ++ ++/* Request Types: ++ * ++ * Tier 0 (Highest Priority): Emergency & System Integrity Requests ++ * ----------------------------------------------------------------- ++ * - Target: Requests with the BLK_MQ_INSERT_AT_HEAD flag. ++ * - Purpose: For critical, non-negotiable operations such as device error ++ * recovery or flush sequences that must bypass all other scheduling logic. ++ * - Implementation: Placed in a dedicated, high-priority FIFO queue ++ * (`prio_queue[0]`) for immediate dispatch. ++ * ++ * Tier 1 (High Priority): Data Persistence & Ordering Guarantees ++ * --------------------------------------------------------------- ++ * - Target: Requests with integrity-sensitive flags like REQ_FUA or ++ * REQ_PREFLUSH, typically originating from O_DIRECT I/O. ++ * - Purpose: To ensure strict ordering and data persistence guarantees, ++ * preventing data corruption in applications like databases. ++ * - Implementation: Handled in a separate, secondary FIFO queue ++ * (`prio_queue[1]`) to ensure they are processed in submission order and ++ * before any lower-priority requests. ++ * ++ * Tier 2 (Medium Priority): Application Responsiveness ++ * ---------------------------------------------------- ++ * - Target: Normal synchronous requests (e.g., from standard file reads). ++ * - Purpose: To ensure correct application behavior for operations that ++ * depend on sequential I/O completion (e.g., file system mounts) and to ++ * provide low latency for interactive applications. ++ * - Implementation: The deadline for these requests is set to their start ++ * time (`rq->start_time_ns`). This effectively enforces FIFO-like behavior ++ * within the deadline-sorted red-black tree, preventing out-of-order ++ * execution of dependent synchronous operations. ++ * ++ * Tier 3 (Normal Priority): Background Throughput ++ * ----------------------------------------------- ++ * - Target: Asynchronous requests. 
++ * - Purpose: To maximize disk throughput for background tasks where latency ++ * is not critical. ++ * - Implementation: These are the only requests where ADIOS's adaptive ++ * latency prediction model is used. A dynamic deadline is calculated based ++ * on the predicted I/O latency, allowing for aggressive reordering to ++ * optimize I/O efficiency. ++ * ++ * Dispatch Logic: ++ * The scheduler always dispatches requests in strict priority order: ++ * 1. prio_queue[0] (Tier 0) ++ * 2. prio_queue[1] (Tier 1) ++ * 3. The deadline-sorted batch queue (which naturally prioritizes Tier 2 ++ * over Tier 3 due to their calculated deadlines). ++ */ ++ ++// Global variable to control the latency ++static u64 default_global_latency_window = 16000000ULL; ++static u64 default_global_latency_window_rotational = 22000000ULL; ++// Ratio below which batch queues should be refilled ++static u8 default_bq_refill_below_ratio = 20; ++// Maximum latency sample to input ++static u64 default_lat_model_latency_limit = 500000000ULL; ++// Batch ordering strategy ++static u64 default_batch_order = 0; ++// Flags to control compliance with block layer constraints ++static u64 default_compliance_flags = 0x7; ++ ++/* Compliance Flags: ++ * 0x1: REQ_FUA requests will be handled as Tier-1, strictly prioritized ++ * 0x2: REQ_PREFLUSH requests will be handled as Tier-1, strictly prioritized ++ * 0x4: Async requests will not be reordered based on the predicted latency ++ */ ++enum adios_compliance_flags { ++ ADIOS_CF_PRIO_FUA = 1U << 0, ++ ADIOS_CF_PRIO_PF = 1U << 1, ++ ADIOS_CF_FIXORDER = 1U << 2, ++}; ++ ++// Dynamic thresholds for shrinkage ++static u32 default_lm_shrink_at_kreqs = 5000; ++static u32 default_lm_shrink_at_gbytes = 50; ++static u32 default_lm_shrink_resist = 2; ++ ++enum adios_optype { ++ ADIOS_READ = 0, ++ ADIOS_WRITE = 1, ++ ADIOS_DISCARD = 2, ++ ADIOS_OTHER = 3, ++ ADIOS_OPTYPES = 4, ++}; ++ ++// Latency targets for each operation type ++static u64 default_latency_target[ADIOS_OPTYPES] = { ++ [ADIOS_READ] = 2ULL * NSEC_PER_MSEC, ++ [ADIOS_WRITE] = 2000ULL * NSEC_PER_MSEC, ++ [ADIOS_DISCARD] = 8000ULL * NSEC_PER_MSEC, ++ [ADIOS_OTHER] = 0ULL * NSEC_PER_MSEC, ++}; ++ ++// Maximum batch size limits for each operation type ++static u32 default_batch_limit[ADIOS_OPTYPES] = { ++ [ADIOS_READ] = 36, ++ [ADIOS_WRITE] = 72, ++ [ADIOS_DISCARD] = 1, ++ [ADIOS_OTHER] = 1, ++}; ++ ++enum adios_batch_order { ++ ADIOS_BO_OPTYPE = 0, ++ ADIOS_BO_ELEVATOR = 1, ++}; ++ ++// Thresholds for latency model control ++#define LM_BLOCK_SIZE_THRESHOLD 4096 ++#define LM_SAMPLES_THRESHOLD 1024 ++#define LM_INTERVAL_THRESHOLD 1500 ++#define LM_OUTLIER_PERCENTILE 99 ++#define LM_LAT_BUCKET_COUNT 64 ++ ++#define ADIOS_PQ_LEVELS 2 ++#define ADIOS_DL_TYPES 2 ++#define ADIOS_BQ_PAGES 2 ++ ++static u32 default_dl_prio[ADIOS_DL_TYPES] = {8, 0}; ++ ++// Bit flags for the atomic state variable, indicating which queues have requests. 
++enum adios_state_flags { ++ ADIOS_STATE_PQ_0 = 1U << 0, ++ ADIOS_STATE_PQ_1 = 1U << 1, ++ ADIOS_STATE_DL_0 = 1U << 2, ++ ADIOS_STATE_DL_1 = 1U << 3, ++ ADIOS_STATE_BQ_PAGE_0 = 1U << 4, ++ ADIOS_STATE_BQ_PAGE_1 = 1U << 5, ++}; ++#define ADIOS_STATE_PQ 0 ++#define ADIOS_STATE_DL 2 ++#define ADIOS_STATE_BQ 4 ++ ++// Temporal granularity of the deadline tree node (dl_group) ++#define ADIOS_QUANTUM_SHIFT 20 ++ ++#define ADIOS_MAX_INSERTS_PER_LOCK 72 ++#define ADIOS_MAX_DELETES_PER_LOCK 24 ++ ++// Structure to hold latency bucket data for small requests ++struct latency_bucket_small { ++ u64 weighted_sum_latency; ++ u64 sum_of_weights; ++}; ++ ++// Structure to hold latency bucket data for large requests ++struct latency_bucket_large { ++ u64 weighted_sum_latency; ++ u64 weighted_sum_block_size; ++ u64 sum_of_weights; ++}; ++ ++// Structure to hold per-cpu buckets, improving data locality and code clarity. ++struct lm_buckets { ++ struct latency_bucket_small small_bucket[LM_LAT_BUCKET_COUNT]; ++ struct latency_bucket_large large_bucket[LM_LAT_BUCKET_COUNT]; ++}; ++ ++// Structure to hold RCU-protected latency model parameters ++struct latency_model_params { ++ u64 base; ++ u64 slope; ++ u64 small_sum_delay; ++ u64 small_count; ++ u64 large_sum_delay; ++ u64 large_sum_bsize; ++ u64 last_update_jiffies; ++ struct rcu_head rcu; ++}; ++ ++// Structure to hold the latency model context data ++struct latency_model { ++ spinlock_t update_lock; ++ struct latency_model_params __rcu *params; ++ ++ // Per-CPU buckets to avoid lock contention on the completion path ++ struct lm_buckets __percpu *pcpu_buckets; ++ ++ u32 lm_shrink_at_kreqs; ++ u32 lm_shrink_at_gbytes; ++ u8 lm_shrink_resist; ++}; ++ ++// Adios scheduler data ++struct adios_data { ++ spinlock_t pq_lock; ++ struct list_head prio_queue[2]; ++ ++ struct rb_root_cached dl_tree[2]; ++ spinlock_t lock; ++ s64 dl_bias; ++ s32 dl_prio[2]; ++ ++ atomic_t state; ++ u8 bq_state[ADIOS_BQ_PAGES]; ++ ++ void (*insert_request_fn)(struct blk_mq_hw_ctx *, struct request *, ++ blk_insert_t, struct list_head *); ++ ++ u64 global_latency_window; ++ u64 compliance_flags; ++ u64 latency_target[ADIOS_OPTYPES]; ++ u32 batch_limit[ADIOS_OPTYPES]; ++ u32 batch_actual_max_size[ADIOS_OPTYPES]; ++ u32 batch_actual_max_total; ++ u32 async_depth; ++ u32 lat_model_latency_limit; ++ u8 bq_refill_below_ratio; ++ u8 is_rotational; ++ u8 batch_order; ++ u8 elv_direction; ++ sector_t head_pos; ++ sector_t last_completed_pos; ++ ++ bool bq_page; ++ struct list_head batch_queue[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; ++ u32 batch_count[ADIOS_BQ_PAGES][ADIOS_OPTYPES]; ++ u8 bq_batch_order[ADIOS_BQ_PAGES]; ++ spinlock_t bq_lock; ++ ++ struct lm_buckets *aggr_buckets; ++ ++ struct latency_model latency_model[ADIOS_OPTYPES]; ++ struct timer_list update_timer; ++ ++ atomic64_t total_pred_lat; ++ u64 last_completed_time; ++ ++ struct kmem_cache *rq_data_pool; ++ struct kmem_cache *dl_group_pool; ++ ++ struct request_queue *queue; ++}; ++ ++// List of requests with the same deadline in the deadline-sorted tree ++struct dl_group { ++ struct rb_node node; ++ struct list_head rqs; ++ u64 deadline; ++} __attribute__((aligned(64))); ++ ++// Structure to hold scheduler-specific data for each request ++struct adios_rq_data { ++ struct list_head *dl_group; ++ struct list_head dl_node; ++ ++ struct request *rq; ++ u64 deadline; ++ u64 pred_lat; ++ u32 block_size; ++} __attribute__((aligned(64))); ++ ++static const int adios_prio_to_wmult[40] = { ++ /* -20 */ 88761, 71755, 56483, 46273, 36291, ++ /* -15 
*/ 29154, 23254, 18705, 14949, 11916, ++ /* -10 */ 9548, 7620, 6100, 4904, 3906, ++ /* -5 */ 3121, 2501, 1991, 1586, 1277, ++ /* 0 */ 1024, 820, 655, 526, 423, ++ /* 5 */ 335, 272, 215, 172, 137, ++ /* 10 */ 110, 87, 70, 56, 45, ++ /* 15 */ 36, 29, 23, 18, 15, ++}; ++ ++static inline bool compliant(struct adios_data *ad, u32 flag) { ++ return ad->compliance_flags & flag; ++} ++ ++// Count the number of entries in aggregated small buckets ++static u64 lm_count_small_entries(struct latency_bucket_small *buckets) { ++ u64 total_weight = 0; ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) ++ total_weight += buckets[i].sum_of_weights; ++ return total_weight; ++} ++ ++// Update the small buckets in the latency model from aggregated data ++static bool lm_update_small_buckets(struct latency_model *model, ++ struct latency_model_params *params, ++ struct latency_bucket_small *buckets, ++ u64 total_weight, bool count_all) { ++ u64 sum_latency = 0; ++ u64 sum_weight = 0; ++ u64 cumulative_weight = 0, threshold_weight = 0; ++ u8 outlier_threshold_bucket = 0; ++ u8 outlier_percentile = LM_OUTLIER_PERCENTILE; ++ u8 reduction; ++ ++ if (count_all) ++ outlier_percentile = 100; ++ ++ // Calculate the threshold weight for outlier detection ++ threshold_weight = (total_weight * outlier_percentile) / 100; ++ ++ // Identify the bucket that corresponds to the outlier threshold ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { ++ cumulative_weight += buckets[i].sum_of_weights; ++ if (cumulative_weight >= threshold_weight) { ++ outlier_threshold_bucket = i; ++ break; ++ } ++ } ++ ++ // Calculate the average latency, excluding outliers ++ for (u8 i = 0; i <= outlier_threshold_bucket; i++) { ++ struct latency_bucket_small *bucket = &buckets[i]; ++ if (i < outlier_threshold_bucket) { ++ sum_latency += bucket->weighted_sum_latency; ++ sum_weight += bucket->sum_of_weights; ++ } else { ++ // The threshold bucket's contribution is proportional ++ u64 remaining_weight = ++ threshold_weight - (cumulative_weight - bucket->sum_of_weights); ++ if (bucket->sum_of_weights > 0) { ++ sum_latency += div_u64(bucket->weighted_sum_latency * ++ remaining_weight, bucket->sum_of_weights); ++ sum_weight += remaining_weight; ++ } ++ } ++ } ++ ++ // Shrink the model if it reaches at the readjustment threshold ++ if (params->small_count >= 1000ULL * model->lm_shrink_at_kreqs) { ++ reduction = model->lm_shrink_resist; ++ if (params->small_count >> reduction) { ++ params->small_sum_delay -= params->small_sum_delay >> reduction; ++ params->small_count -= params->small_count >> reduction; ++ } ++ } ++ ++ if (!sum_weight) ++ return false; ++ ++ // Accumulate the average latency into the statistics ++ params->small_sum_delay += sum_latency; ++ params->small_count += sum_weight; ++ ++ return true; ++} ++ ++// Count the number of entries in aggregated large buckets ++static u64 lm_count_large_entries(struct latency_bucket_large *buckets) { ++ u64 total_weight = 0; ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) ++ total_weight += buckets[i].sum_of_weights; ++ return total_weight; ++} ++ ++// Update the large buckets in the latency model from aggregated data ++static bool lm_update_large_buckets(struct latency_model *model, ++ struct latency_model_params *params, ++ struct latency_bucket_large *buckets, ++ u64 total_weight, bool count_all) { ++ s64 sum_latency = 0; ++ u64 sum_block_size = 0, intercept; ++ u64 cumulative_weight = 0, threshold_weight = 0; ++ u64 sum_weight = 0; ++ u8 outlier_threshold_bucket = 0; ++ u8 outlier_percentile = 
LM_OUTLIER_PERCENTILE; ++ u8 reduction; ++ ++ if (count_all) ++ outlier_percentile = 100; ++ ++ // Calculate the threshold weight for outlier detection ++ threshold_weight = (total_weight * outlier_percentile) / 100; ++ ++ // Identify the bucket that corresponds to the outlier threshold ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { ++ cumulative_weight += buckets[i].sum_of_weights; ++ if (cumulative_weight >= threshold_weight) { ++ outlier_threshold_bucket = i; ++ break; ++ } ++ } ++ ++ // Calculate the average latency and block size, excluding outliers ++ for (u8 i = 0; i <= outlier_threshold_bucket; i++) { ++ struct latency_bucket_large *bucket = &buckets[i]; ++ if (i < outlier_threshold_bucket) { ++ sum_latency += bucket->weighted_sum_latency; ++ sum_block_size += bucket->weighted_sum_block_size; ++ sum_weight += bucket->sum_of_weights; ++ } else { ++ // The threshold bucket's contribution is proportional ++ u64 remaining_weight = ++ threshold_weight - (cumulative_weight - bucket->sum_of_weights); ++ if (bucket->sum_of_weights > 0) { ++ sum_latency += div_u64(bucket->weighted_sum_latency * ++ remaining_weight, bucket->sum_of_weights); ++ sum_block_size += div_u64(bucket->weighted_sum_block_size * ++ remaining_weight, bucket->sum_of_weights); ++ sum_weight += remaining_weight; ++ } ++ } ++ } ++ ++ if (!sum_weight) ++ return false; ++ ++ // Shrink the model if it reaches at the readjustment threshold ++ if (params->large_sum_bsize >= 0x40000000ULL * model->lm_shrink_at_gbytes) { ++ reduction = model->lm_shrink_resist; ++ if (params->large_sum_bsize >> reduction) { ++ params->large_sum_delay -= params->large_sum_delay >> reduction; ++ params->large_sum_bsize -= params->large_sum_bsize >> reduction; ++ } ++ } ++ ++ // Accumulate the average delay into the statistics ++ intercept = params->base; ++ if (sum_latency > intercept) ++ sum_latency -= intercept; ++ ++ params->large_sum_delay += sum_latency; ++ params->large_sum_bsize += sum_block_size; ++ ++ return true; ++} ++ ++static void reset_buckets(struct lm_buckets *buckets) ++{ memset(buckets, 0, sizeof(*buckets)); } ++ ++static void lm_reset_pcpu_buckets(struct latency_model *model) { ++ int cpu; ++ for_each_possible_cpu(cpu) ++ reset_buckets(per_cpu_ptr(model->pcpu_buckets, cpu)); ++} ++ ++// Update the latency model parameters and statistics ++static void latency_model_update( ++ struct adios_data *ad, struct latency_model *model) { ++ u64 now; ++ u64 small_weight, large_weight; ++ bool time_elapsed; ++ bool small_processed = false, large_processed = false; ++ struct lm_buckets *aggr = ad->aggr_buckets; ++ struct latency_bucket_small *asb; ++ struct latency_bucket_large *alb; ++ struct lm_buckets *pcpu_b; ++ unsigned long flags; ++ int cpu; ++ struct latency_model_params *old_params, *new_params; ++ ++ spin_lock_irqsave(&model->update_lock, flags); ++ ++ old_params = rcu_dereference_protected(model->params, ++ lockdep_is_held(&model->update_lock)); ++ new_params = kmemdup(old_params, sizeof(*new_params), GFP_ATOMIC); ++ if (!new_params) { ++ spin_unlock_irqrestore(&model->update_lock, flags); ++ return; ++ } ++ ++ // Aggregate data from all CPUs and reset per-cpu buckets. 
++ for_each_possible_cpu(cpu) { ++ pcpu_b = per_cpu_ptr(model->pcpu_buckets, cpu); ++ ++ for (u8 i = 0; i < LM_LAT_BUCKET_COUNT; i++) { ++ if (pcpu_b->small_bucket[i].sum_of_weights) { ++ asb = &aggr->small_bucket[i]; ++ asb->sum_of_weights += ++ pcpu_b->small_bucket[i].sum_of_weights; ++ asb->weighted_sum_latency += ++ pcpu_b->small_bucket[i].weighted_sum_latency; ++ } ++ if (pcpu_b->large_bucket[i].sum_of_weights) { ++ alb = &aggr->large_bucket[i]; ++ alb->sum_of_weights += ++ pcpu_b->large_bucket[i].sum_of_weights; ++ alb->weighted_sum_latency += ++ pcpu_b->large_bucket[i].weighted_sum_latency; ++ alb->weighted_sum_block_size += ++ pcpu_b->large_bucket[i].weighted_sum_block_size; ++ } ++ } ++ // Reset per-cpu buckets after aggregating ++ reset_buckets(pcpu_b); ++ } ++ ++ // Count the number of entries in aggregated buckets ++ small_weight = lm_count_small_entries(aggr->small_bucket); ++ large_weight = lm_count_large_entries(aggr->large_bucket); ++ ++ // Whether enough time has elapsed since the last update ++ now = jiffies; ++ time_elapsed = unlikely(!new_params->base) || ++ new_params->last_update_jiffies + ++ msecs_to_jiffies(LM_INTERVAL_THRESHOLD) <= now; ++ ++ // Update small buckets ++ if (small_weight && (time_elapsed || ++ LM_SAMPLES_THRESHOLD <= small_weight || !new_params->base)) { ++ small_processed = lm_update_small_buckets(model, new_params, ++ aggr->small_bucket, small_weight, !new_params->base); ++ memset(&aggr->small_bucket[0], 0, sizeof(aggr->small_bucket)); ++ } ++ // Update large buckets ++ if (large_weight && (time_elapsed || ++ LM_SAMPLES_THRESHOLD <= large_weight || !new_params->slope)) { ++ large_processed = lm_update_large_buckets(model, new_params, ++ aggr->large_bucket, large_weight, !new_params->slope); ++ memset(&aggr->large_bucket[0], 0, sizeof(aggr->large_bucket)); ++ } ++ ++ // Update the base parameter if small bucket was processed ++ if (small_processed && likely(new_params->small_count)) ++ new_params->base = div_u64(new_params->small_sum_delay, ++ new_params->small_count); ++ ++ // Update the slope parameter if large bucket was processed ++ if (large_processed && likely(new_params->large_sum_bsize)) ++ new_params->slope = div_u64(new_params->large_sum_delay, ++ DIV_ROUND_UP_ULL(new_params->large_sum_bsize, 1024)); ++ ++ // Update last updated jiffies if update happened or time has elapsed ++ if (small_processed || large_processed || time_elapsed) ++ new_params->last_update_jiffies = now; ++ ++ rcu_assign_pointer(model->params, new_params); ++ spin_unlock_irqrestore(&model->update_lock, flags); ++ ++ kfree_rcu(old_params, rcu); ++} ++ ++// Determine the bucket index for a given measured and predicted latency ++static u8 lm_input_bucket_index(u64 measured, u64 predicted) { ++ u8 bucket_index; ++ ++ if (measured < predicted * 2) ++ bucket_index = div_u64((measured * 20), predicted); ++ else if (measured < predicted * 5) ++ bucket_index = div_u64((measured * 10), predicted) + 20; ++ else ++ bucket_index = div_u64((measured * 3), predicted) + 40; ++ ++ return bucket_index; ++} ++ ++// Input latency data into the latency model ++static void latency_model_input(struct adios_data *ad, ++ struct latency_model *model, ++ u32 block_size, u64 latency, u64 pred_lat, u32 weight) { ++ unsigned long flags; ++ u8 bucket_index; ++ struct lm_buckets *buckets; ++ u64 current_base; ++ struct latency_model_params *params; ++ ++ local_irq_save(flags); ++ buckets = per_cpu_ptr(model->pcpu_buckets, __smp_processor_id()); ++ ++ rcu_read_lock(); ++ params = 
rcu_dereference(model->params); ++ current_base = params->base; ++ rcu_read_unlock(); ++ ++ if (block_size <= LM_BLOCK_SIZE_THRESHOLD) { ++ // Handle small requests ++ bucket_index = lm_input_bucket_index(latency, current_base ?: 1); ++ ++ if (bucket_index >= LM_LAT_BUCKET_COUNT) ++ bucket_index = LM_LAT_BUCKET_COUNT - 1; ++ ++ buckets->small_bucket[bucket_index].sum_of_weights += weight; ++ buckets->small_bucket[bucket_index].weighted_sum_latency += ++ latency * weight; ++ ++ local_irq_restore(flags); ++ ++ if (unlikely(!current_base)) { ++ latency_model_update(ad, model); ++ return; ++ } ++ } else { ++ // Handle large requests ++ if (!current_base || !pred_lat) { ++ local_irq_restore(flags); ++ return; ++ } ++ ++ bucket_index = lm_input_bucket_index(latency, pred_lat); ++ ++ if (bucket_index >= LM_LAT_BUCKET_COUNT) ++ bucket_index = LM_LAT_BUCKET_COUNT - 1; ++ ++ buckets->large_bucket[bucket_index].sum_of_weights += weight; ++ buckets->large_bucket[bucket_index].weighted_sum_latency += ++ latency * weight; ++ buckets->large_bucket[bucket_index].weighted_sum_block_size += ++ block_size * weight; ++ ++ local_irq_restore(flags); ++ } ++} ++ ++// Predict the latency for a given block size using the latency model ++static u64 latency_model_predict(struct latency_model *model, u32 block_size) { ++ u64 result; ++ struct latency_model_params *params; ++ ++ rcu_read_lock(); ++ params = rcu_dereference(model->params); ++ ++ result = params->base; ++ if (block_size > LM_BLOCK_SIZE_THRESHOLD) ++ result += params->slope * ++ DIV_ROUND_UP_ULL(block_size - LM_BLOCK_SIZE_THRESHOLD, 1024); ++ ++ rcu_read_unlock(); ++ ++ return result; ++} ++ ++// Determine the type of operation based on request flags ++static u8 adios_optype(struct request *rq) { ++ switch (rq->cmd_flags & REQ_OP_MASK) { ++ case REQ_OP_READ: ++ return ADIOS_READ; ++ case REQ_OP_WRITE: ++ return ADIOS_WRITE; ++ case REQ_OP_DISCARD: ++ return ADIOS_DISCARD; ++ default: ++ return ADIOS_OTHER; ++ } ++} ++ ++static inline u8 adios_optype_not_read(struct request *rq) { ++ return (rq->cmd_flags & REQ_OP_MASK) != REQ_OP_READ; ++} ++ ++// Helper function to retrieve adios_rq_data from a request ++static inline struct adios_rq_data *get_rq_data(struct request *rq) { ++ return rq->elv.priv[0]; ++} ++ ++static inline ++void set_adios_state(struct adios_data *ad, u32 shift, u32 idx, bool flag) { ++ if (flag) ++ atomic_or(1U << (idx + shift), &ad->state); ++ else ++ atomic_andnot(1U << (idx + shift), &ad->state); ++} ++ ++static inline u32 get_adios_state(struct adios_data *ad, u32 shift) ++{ return (atomic_read(&ad->state) >> shift) & 0x3; } ++ ++// Add a request to the deadline-sorted red-black tree ++static void add_to_dl_tree( ++ struct adios_data *ad, bool dl_idx, struct request *rq) { ++ struct rb_root_cached *root = &ad->dl_tree[dl_idx]; ++ struct rb_node **link = &(root->rb_root.rb_node), *parent = NULL; ++ bool leftmost = true; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ struct dl_group *dlg; ++ u64 deadline; ++ bool was_empty = RB_EMPTY_ROOT(&root->rb_root); ++ ++ /* Tier-2: Synchronous Requests ++ * - Needs to be FIFO within a same optype ++ * - Relaxed order between different optypes ++ * - basically needs to be processed in early time */ ++ rd->deadline = rq->start_time_ns; ++ ++ /* Tier-3: Aynchronous Requests ++ * - Can be reordered and delayed freely */ ++ if (!(rq->cmd_flags & REQ_SYNC)) { ++ rd->deadline += ad->latency_target[adios_optype(rq)]; ++ if (!compliant(ad, ADIOS_CF_FIXORDER)) ++ rd->deadline += rd->pred_lat; ++ } ++ 
++ // Now quantize the deadline (-> dlg->deadline == RB-Tree key) ++ deadline = rd->deadline & ~((1ULL << ADIOS_QUANTUM_SHIFT) - 1); ++ ++ while (*link) { ++ dlg = rb_entry(*link, struct dl_group, node); ++ s64 diff = deadline - dlg->deadline; ++ ++ parent = *link; ++ if (diff < 0) { ++ link = &((*link)->rb_left); ++ } else if (diff > 0) { ++ link = &((*link)->rb_right); ++ leftmost = false; ++ } else { // diff == 0 ++ goto found; ++ } ++ } ++ ++ dlg = rb_entry_safe(parent, struct dl_group, node); ++ if (!dlg || dlg->deadline != deadline) { ++ dlg = kmem_cache_zalloc(ad->dl_group_pool, GFP_ATOMIC); ++ if (!dlg) ++ return; ++ dlg->deadline = deadline; ++ INIT_LIST_HEAD(&dlg->rqs); ++ rb_link_node(&dlg->node, parent, link); ++ rb_insert_color_cached(&dlg->node, root, leftmost); ++ } ++found: ++ list_add_tail(&rd->dl_node, &dlg->rqs); ++ rd->dl_group = &dlg->rqs; ++ ++ if (was_empty) ++ set_adios_state(ad, ADIOS_STATE_DL, dl_idx, true); ++} ++ ++// Remove a request from the deadline-sorted red-black tree ++static void del_from_dl_tree( ++ struct adios_data *ad, bool dl_idx, struct request *rq) { ++ struct rb_root_cached *root = &ad->dl_tree[dl_idx]; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ struct dl_group *dlg = container_of(rd->dl_group, struct dl_group, rqs); ++ ++ list_del_init(&rd->dl_node); ++ if (list_empty(&dlg->rqs)) { ++ rb_erase_cached(&dlg->node, root); ++ kmem_cache_free(ad->dl_group_pool, dlg); ++ } ++ rd->dl_group = NULL; ++ ++ if (RB_EMPTY_ROOT(&ad->dl_tree[dl_idx].rb_root)) ++ set_adios_state(ad, ADIOS_STATE_DL, dl_idx, false); ++} ++ ++// Remove a request from the scheduler ++static void remove_request(struct adios_data *ad, struct request *rq) { ++ bool dl_idx = adios_optype_not_read(rq); ++ struct request_queue *q = rq->q; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ ++ list_del_init(&rq->queuelist); ++ ++ // We might not be on the rbtree, if we are doing an insert merge ++ if (rd->dl_group) ++ del_from_dl_tree(ad, dl_idx, rq); ++ ++ elv_rqhash_del(q, rq); ++ if (q->last_merge == rq) ++ q->last_merge = NULL; ++} ++ ++// Convert a queue depth to the corresponding word depth for shallow allocation ++static int to_word_depth(struct blk_mq_hw_ctx *hctx, unsigned int qdepth) { ++ struct sbitmap_queue *bt = &hctx->sched_tags->bitmap_tags; ++ const unsigned int nrr = hctx->queue->nr_requests; ++ ++ return ((qdepth << bt->sb.shift) + nrr - 1) / nrr; ++} ++ ++// We limit the depth of request allocation for asynchronous and write requests ++static void adios_limit_depth(blk_opf_t opf, struct blk_mq_alloc_data *data) { ++ struct adios_data *ad = data->q->elevator->elevator_data; ++ ++ // Do not throttle synchronous reads ++ if (op_is_sync(opf) && !op_is_write(opf)) ++ return; ++ ++ data->shallow_depth = to_word_depth(data->hctx, ad->async_depth); ++} ++ ++// The number of requests in the queue was notified from the block layer ++static void adios_depth_updated(struct blk_mq_hw_ctx *hctx) { ++ struct request_queue *q = hctx->queue; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct blk_mq_tags *tags = hctx->sched_tags; ++ ++ ad->async_depth = q->nr_requests; ++ ++ sbitmap_queue_min_shallow_depth(&tags->bitmap_tags, 1); ++} ++ ++// Handle request merging after a merge operation ++static void adios_request_merged(struct request_queue *q, struct request *req, ++ enum elv_merge type) { ++ bool dl_idx = adios_optype_not_read(req); ++ struct adios_data *ad = q->elevator->elevator_data; ++ ++ // Reposition request in the deadline-sorted tree ++ del_from_dl_tree(ad, 
dl_idx, req); ++ add_to_dl_tree(ad, dl_idx, req); ++} ++ ++// Handle merging of requests after one has been merged into another ++static void adios_merged_requests(struct request_queue *q, struct request *req, ++ struct request *next) { ++ struct adios_data *ad = q->elevator->elevator_data; ++ ++ lockdep_assert_held(&ad->lock); ++ ++ // kill knowledge of next, this one is a goner ++ remove_request(ad, next); ++} ++ ++// Try to merge a bio into an existing rq before associating it with an rq ++static bool adios_bio_merge(struct request_queue *q, struct bio *bio, ++ unsigned int nr_segs) { ++ unsigned long flags; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct request *free = NULL; ++ bool ret; ++ ++ if (!spin_trylock_irqsave(&ad->lock, flags)) ++ return false; ++ ++ ret = blk_mq_sched_try_merge(q, bio, nr_segs, &free); ++ spin_unlock_irqrestore(&ad->lock, flags); ++ ++ if (free) ++ blk_mq_free_request(free); ++ ++ return ret; ++} ++ ++// Insert a request into the scheduler (after Read & Write models stabilized) ++static void insert_request_post_stability(struct blk_mq_hw_ctx *hctx, ++ struct request *rq, blk_insert_t insert_flags, struct list_head *free) { ++ struct request_queue *q = hctx->queue; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ bool dl_idx; ++ u8 optype = adios_optype(rq); ++ u8 insert_pq_flags = 0; ++ ++ rd->block_size = blk_rq_bytes(rq); ++ rd->pred_lat = ++ latency_model_predict(&ad->latency_model[optype], rd->block_size); ++ ++ /* Tier-0: BLK_MQ_INSERT_AT_HEAD Requests ++ * - Needs to be processed ASAP at all costs in any case */ ++ if (insert_flags & BLK_MQ_INSERT_AT_HEAD) ++ { insert_pq_flags |= 0x2; } ++ /* Tier-1: Integrity-sensitive Requests ++ * - Needs to be FIFO across all optypes */ ++ if ((compliant(ad, ADIOS_CF_PRIO_FUA) && (rq->cmd_flags & REQ_FUA)) || ++ (compliant(ad, ADIOS_CF_PRIO_PF ) && (rq->cmd_flags & REQ_PREFLUSH))) ++ { insert_pq_flags |= 0x1; } ++ ++ if (insert_pq_flags) { ++ u8 pq_idx = !(insert_pq_flags >> 1); ++ if (rd->pred_lat) ++ atomic64_add(rd->pred_lat, &ad->total_pred_lat); ++ scoped_guard(spinlock_irqsave, &ad->pq_lock) { ++ bool was_empty = list_empty(&ad->prio_queue[pq_idx]); ++ list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); ++ if (was_empty) ++ set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, true); ++ } ++ return; ++ } ++ ++ if (blk_mq_sched_try_insert_merge(q, rq, free)) ++ return; ++ ++ dl_idx = adios_optype_not_read(rq); ++ add_to_dl_tree(ad, dl_idx, rq); ++ ++ if (rq_mergeable(rq)) { ++ elv_rqhash_add(q, rq); ++ if (!q->last_merge) ++ q->last_merge = rq; ++ } ++} ++ ++// Insert a request into the scheduler (before Read & Write models stabilizes) ++static void insert_request_pre_stability(struct blk_mq_hw_ctx *hctx, ++ struct request *rq, blk_insert_t insert_flags, struct list_head *free) { ++ struct adios_data *ad = hctx->queue->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ u8 optype = adios_optype(rq); ++ u8 pq_idx = !(insert_flags & BLK_MQ_INSERT_AT_HEAD); ++ bool models_stable = false; ++ ++ rd->block_size = blk_rq_bytes(rq); ++ rd->pred_lat = ++ latency_model_predict(&ad->latency_model[optype], rd->block_size); ++ ++ if (rd->pred_lat) ++ atomic64_add(rd->pred_lat, &ad->total_pred_lat); ++ ++ scoped_guard(spinlock_irqsave, &ad->pq_lock) { ++ bool was_empty = list_empty(&ad->prio_queue[pq_idx]); ++ list_add_tail(&rq->queuelist, &ad->prio_queue[pq_idx]); ++ if (was_empty) ++ set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, 
true); ++ } ++ ++ rcu_read_lock(); ++ if (rcu_dereference(ad->latency_model[ADIOS_READ].params)->base > 0 && ++ rcu_dereference(ad->latency_model[ADIOS_WRITE].params)->base > 0) ++ models_stable = true; ++ rcu_read_unlock(); ++ ++ if (models_stable) ++ ad->insert_request_fn = insert_request_post_stability; ++} ++ ++// Insert multiple requests into the scheduler ++static void adios_insert_requests(struct blk_mq_hw_ctx *hctx, ++ struct list_head *list, ++ blk_insert_t insert_flags) { ++ struct request_queue *q = hctx->queue; ++ struct adios_data *ad = q->elevator->elevator_data; ++ struct request *rq; ++ bool stop = false; ++ LIST_HEAD(free); ++ ++ do { ++ scoped_guard(spinlock_irqsave, &ad->lock) ++ for (int i = 0; i < ADIOS_MAX_INSERTS_PER_LOCK; i++) { ++ if (list_empty(list)) { ++ stop = true; ++ break; ++ } ++ rq = list_first_entry(list, struct request, queuelist); ++ list_del_init(&rq->queuelist); ++ ad->insert_request_fn(hctx, rq, insert_flags, &free); ++ }} while (!stop); ++ ++ blk_mq_free_requests(&free); ++} ++ ++// Prepare a request before it is inserted into the scheduler ++static void adios_prepare_request(struct request *rq) { ++ struct adios_data *ad = rq->q->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ ++ rq->elv.priv[0] = NULL; ++ ++ /* Allocate adios_rq_data from the memory pool */ ++ rd = kmem_cache_zalloc(ad->rq_data_pool, GFP_ATOMIC); ++ if (WARN(!rd, "adios_prepare_request: " ++ "Failed to allocate memory from rq_data_pool. rd is NULL\n")) ++ return; ++ ++ rd->rq = rq; ++ rq->elv.priv[0] = rd; ++} ++ ++static struct adios_rq_data *get_dl_first_rd(struct adios_data *ad, bool idx) { ++ struct rb_root_cached *root = &ad->dl_tree[idx]; ++ struct rb_node *first = rb_first_cached(root); ++ struct dl_group *dl_group = rb_entry(first, struct dl_group, node); ++ ++ return list_first_entry(&dl_group->rqs, struct adios_rq_data, dl_node); ++} ++ ++// Comparison function for sorting requests by block address ++static int cmp_rq_pos(void *priv, ++ const struct list_head *a, const struct list_head *b) { ++ struct request *rq_a = list_entry(a, struct request, queuelist); ++ struct request *rq_b = list_entry(b, struct request, queuelist); ++ u64 pos_a = blk_rq_pos(rq_a); ++ u64 pos_b = blk_rq_pos(rq_b); ++ ++ return (int)(pos_a > pos_b) - (int)(pos_a < pos_b); ++} ++ ++#ifndef list_last_entry_or_null ++#define list_last_entry_or_null(ptr, type, member) \ ++ (!list_empty(ptr) ? 
list_last_entry(ptr, type, member) : NULL) ++#endif ++ ++// Update the elevator direction ++static void update_elv_direction(struct adios_data *ad) { ++ if (!ad->is_rotational) ++ return; ++ ++ bool page = ad->bq_page; ++ struct list_head *q = &ad->batch_queue[page][1]; ++ if (ad->bq_batch_order[page] < ADIOS_BO_ELEVATOR || list_empty(q)) { ++ ad->elv_direction = 0; ++ return; ++ } ++ ++ // Get first and last request positions in the queue ++ struct request *rq_a = list_first_entry(q, struct request, queuelist); ++ struct request *rq_b = list_last_entry (q, struct request, queuelist); ++ u64 pos_a = blk_rq_pos(rq_a); ++ u64 pos_b = blk_rq_pos(rq_b); ++ u64 avg_rq_pos = (pos_a + pos_b) >> 1; ++ ++ ad->elv_direction = !!(ad->head_pos > avg_rq_pos); ++} ++ ++// Fill the batch queues with requests from the deadline-sorted red-black tree ++static bool fill_batch_queues(struct adios_data *ad, u64 tpl) { ++ struct adios_rq_data *rd; ++ struct request *rq; ++ struct list_head *dest_q; ++ u8 dest_idx; ++ u64 added_lat = 0; ++ u32 optype_count[ADIOS_OPTYPES] = {0}; ++ u32 count = 0; ++ u8 optype; ++ bool page = !ad->bq_page, dl_idx, bias_idx, update_bias; ++ u32 dl_queued; ++ u8 bq_batch_order; ++ bool stop = false; ++ ++ // Reset batch queue counts for the back page ++ memset(&ad->batch_count[page], 0, sizeof(ad->batch_count[page])); ++ ++ ad->bq_batch_order[page] = ++ bq_batch_order = ad->batch_order; ++ ++ do { ++ scoped_guard(spinlock_irqsave, &ad->lock) ++ for (int i = 0; i < ADIOS_MAX_DELETES_PER_LOCK; i++) { ++ bool has_base = false; ++ ++ dl_queued = get_adios_state(ad, ADIOS_STATE_DL); ++ // Check if there are any requests queued in the deadline tree ++ if (!dl_queued) { ++ stop = true; ++ break; ++ } ++ ++ // Reads if both queues have requests, otherwise pick the non-empty. ++ dl_idx = dl_queued >> 1; ++ ++ // Get the first request from the deadline-sorted tree ++ rd = get_dl_first_rd(ad, dl_idx); ++ ++ bias_idx = ad->dl_bias < 0; ++ // If read and write requests are queued, choose one based on bias ++ if (dl_queued == 0x3) { ++ struct adios_rq_data *trd[2] = {get_dl_first_rd(ad, 0), rd}; ++ rd = trd[bias_idx]; ++ ++ update_bias = (trd[bias_idx]->deadline > trd[!bias_idx]->deadline); ++ } else ++ update_bias = (bias_idx == dl_idx); ++ ++ rq = rd->rq; ++ optype = adios_optype(rq); ++ ++ rcu_read_lock(); ++ has_base = ++ !!rcu_dereference(ad->latency_model[optype].params)->base; ++ rcu_read_unlock(); ++ ++ // Check batch size and total predicted latency ++ if (count && (!has_base || ++ ad->batch_count[page][optype] >= ad->batch_limit[optype] || ++ (tpl + added_lat + rd->pred_lat) > ad->global_latency_window)) { ++ stop = true; ++ break; ++ } ++ ++ if (update_bias) { ++ s64 sign = ((s64)bias_idx << 1) - 1; ++ if (unlikely(!rd->pred_lat)) ++ ad->dl_bias = sign; ++ else ++ // Adjust the bias based on the predicted latency ++ ad->dl_bias += sign * (s64)((rd->pred_lat * ++ adios_prio_to_wmult[ad->dl_prio[bias_idx] + 20]) >> 10); ++ } ++ ++ remove_request(ad, rq); ++ ++ // Add request to the corresponding batch queue ++ dest_idx = (bq_batch_order == ADIOS_BO_OPTYPE || optype == ADIOS_OTHER)? 
++ optype : !!(rd->deadline != rq->start_time_ns); ++ dest_q = &ad->batch_queue[page][dest_idx]; ++ list_add_tail(&rq->queuelist, dest_q); ++ ad->bq_state[page] |= 1U << dest_idx; ++ ad->batch_count[page][optype]++; ++ optype_count[optype]++; ++ added_lat += rd->pred_lat; ++ count++; ++ }} while (!stop); ++ ++ if (bq_batch_order == ADIOS_BO_ELEVATOR && ad->batch_count[page][1] > 1) ++ list_sort(NULL, &ad->batch_queue[page][1], cmp_rq_pos); ++ ++ if (count) { ++ if (added_lat) ++ atomic64_add(added_lat, &ad->total_pred_lat); ++ ++ set_adios_state(ad, ADIOS_STATE_BQ, page, true); ++ ++ for (optype = 0; optype < ADIOS_OPTYPES; optype++) ++ if (ad->batch_actual_max_size[optype] < optype_count[optype]) ++ ad->batch_actual_max_size[optype] = optype_count[optype]; ++ if (ad->batch_actual_max_total < count) ++ ad->batch_actual_max_total = count; ++ } ++ return count; ++} ++ ++// Flip to the next batch queue page ++static void flip_bq_page(struct adios_data *ad) { ++ ad->bq_page = !ad->bq_page; ++ update_elv_direction(ad); ++} ++ ++// Pop a request from the specified index (optype or elevator tier) ++static inline struct request *pop_bq_request( ++ struct adios_data *ad, u8 idx, bool direction) { ++ bool page = ad->bq_page; ++ struct list_head *q = &ad->batch_queue[page][idx]; ++ struct request *rq = direction ? ++ list_last_entry_or_null (q, struct request, queuelist): ++ list_first_entry_or_null(q, struct request, queuelist); ++ if (rq) { ++ list_del_init(&rq->queuelist); ++ if (list_empty(q)) ++ ad->bq_state[page] &= ~(1U << idx); ++ } ++ return rq; ++} ++ ++static struct request *pop_next_bq_request_optype(struct adios_data *ad) { ++ u32 bq_state = ad->bq_state[ad->bq_page]; ++ if (!bq_state) return NULL; ++ ++ struct request *rq; ++ u32 bq_idx = 31 - __builtin_clz(bq_state); ++ ++ // Dispatch based on optype (FIFO within each) or single-queue elevator ++ rq = pop_bq_request(ad, bq_idx, false); ++ return rq; ++} ++ ++static struct request *pop_next_bq_request_elevator(struct adios_data *ad) { ++ u32 bq_state = ad->bq_state[ad->bq_page]; ++ if (!bq_state) return NULL; ++ ++ struct request *rq; ++ u32 bq_idx = 31 - __builtin_clz(bq_state); ++ bool direction = (bq_idx == 1) & ad->elv_direction; ++ ++ // Tier-2 (sync) is always high priority ++ // Tier-3 (async) uses the pre-calculated elevator direction ++ rq = pop_bq_request(ad, bq_idx, direction); ++ ++ /* If batch queue for the sync requests just became empty */ ++ if (bq_idx == 0 && rq && !(bq_state & 0x1)) ++ update_elv_direction(ad); ++ ++ return rq; ++} ++ ++// Returns the state of the other batch queue page ++static bool more_bq_ready(struct adios_data *ad, bool page) { ++ u32 state = get_adios_state(ad, ADIOS_STATE_BQ); ++ return state & (1U << !page); ++} ++ ++// Dispatch a request from the batch queues ++static struct request *dispatch_from_bq(struct adios_data *ad) { ++ struct request *rq; ++ ++ guard(spinlock_irqsave)(&ad->bq_lock); ++ ++ u64 tpl = atomic64_read(&ad->total_pred_lat); ++ ++ if (!more_bq_ready(ad, ad->bq_page) && (!tpl || tpl < div_u64( ++ ad->global_latency_window * ad->bq_refill_below_ratio, 100))) ++ fill_batch_queues(ad, tpl); ++ ++again: ++ // Use the per-page state to decide the dispatch logic, ensuring correctness ++ rq = (ad->bq_batch_order[ad->bq_page] == ADIOS_BO_ELEVATOR) ? 
++ pop_next_bq_request_elevator(ad): ++ pop_next_bq_request_optype(ad); ++ ++ if (rq) { ++ bool page = ad->bq_page; ++ bool is_empty = !ad->bq_state[page]; ++ if (is_empty) ++ set_adios_state(ad, ADIOS_STATE_BQ, page, false); ++ return rq; ++ } ++ ++ // If there's more batch queue page available, flip to it and retry ++ if (more_bq_ready(ad, ad->bq_page)) { ++ flip_bq_page(ad); ++ goto again; ++ } ++ ++ return NULL; ++} ++ ++// Dispatch a request from the priority queue ++static struct request *dispatch_from_pq(struct adios_data *ad) { ++ struct request *rq = NULL; ++ ++ guard(spinlock_irqsave)(&ad->pq_lock); ++ u32 pq_state = get_adios_state(ad, ADIOS_STATE_PQ); ++ u8 pq_idx = pq_state >> 1; ++ struct list_head *q = &ad->prio_queue[pq_idx]; ++ ++ if (unlikely(list_empty(q))) return NULL; ++ ++ rq = list_first_entry(q, struct request, queuelist); ++ list_del_init(&rq->queuelist); ++ if (list_empty(q)) { ++ set_adios_state(ad, ADIOS_STATE_PQ, pq_idx, false); ++ update_elv_direction(ad); ++ } ++ return rq; ++} ++ ++// Dispatch a request to the hardware queue ++static struct request *adios_dispatch_request(struct blk_mq_hw_ctx *hctx) { ++ struct adios_data *ad = hctx->queue->elevator->elevator_data; ++ struct request *rq; ++ ++ rq = dispatch_from_pq(ad); ++ if (rq) goto found; ++ rq = dispatch_from_bq(ad); ++ if (!rq) return NULL; ++found: ++ if (ad->is_rotational) ++ ad->head_pos = blk_rq_pos(rq) + blk_rq_sectors(rq); ++ ++ rq->rq_flags |= RQF_STARTED; ++ return rq; ++} ++ ++// Timer callback function to periodically update latency models ++static void update_timer_callback(struct timer_list *t) { ++ struct adios_data *ad = timer_container_of(ad, t, update_timer); ++ ++ for (u8 optype = 0; optype < ADIOS_OPTYPES; optype++) ++ latency_model_update(ad, &ad->latency_model[optype]); ++} ++ ++// Handle the completion of a request ++static void adios_completed_request(struct request *rq, u64 now) { ++ struct adios_data *ad = rq->q->elevator->elevator_data; ++ struct adios_rq_data *rd = get_rq_data(rq); ++ ++ u64 tpl_after = atomic64_sub_return(rd->pred_lat, &ad->total_pred_lat); ++ u8 optype = adios_optype(rq); ++ ++ if (optype == ADIOS_OTHER) { ++ // Non-positional commands make the head position unpredictable. ++ // Invalidate our knowledge of the last completed position. ++ if (ad->is_rotational) ++ ad->last_completed_pos = 0; ++ return; ++ } ++ ++ u64 lct = ad->last_completed_time ?: rq->io_start_time_ns; ++ ad->last_completed_time = (tpl_after) ? now : 0; ++ ++ if (!rq->io_start_time_ns || !rd->block_size || unlikely(now < lct)) ++ return; ++ ++ u64 latency = now - lct; ++ if (latency > ad->lat_model_latency_limit) ++ return; ++ ++ u32 weight = 1; ++ if (ad->is_rotational) { ++ sector_t current_pos = blk_rq_pos(rq); ++ // Only calculate seek distance if we have a valid last position. ++ if (ad->last_completed_pos > 0) { ++ u64 seek_distance = abs( ++ (s64)current_pos - (s64)ad->last_completed_pos); ++ weight = 65 - __builtin_clzll(seek_distance); ++ } ++ // Update (or re-synchronize) our knowledge of the head position. 
++ ad->last_completed_pos = current_pos + blk_rq_sectors(rq); ++ } ++ ++ latency_model_input(ad, &ad->latency_model[optype], ++ rd->block_size, latency, rd->pred_lat, weight); ++ timer_reduce(&ad->update_timer, jiffies + msecs_to_jiffies(100)); ++} ++ ++// Clean up after a request is finished ++static void adios_finish_request(struct request *rq) { ++ struct adios_data *ad = rq->q->elevator->elevator_data; ++ ++ if (rq->elv.priv[0]) { ++ // Free adios_rq_data back to the memory pool ++ kmem_cache_free(ad->rq_data_pool, get_rq_data(rq)); ++ rq->elv.priv[0] = NULL; ++ } ++} ++ ++// Check if there are any requests available for dispatch ++static bool adios_has_work(struct blk_mq_hw_ctx *hctx) { ++ struct adios_data *ad = hctx->queue->elevator->elevator_data; ++ ++ return atomic_read(&ad->state) != 0; ++} ++ ++// Initialize the scheduler-specific data for a hardware queue ++static int adios_init_hctx(struct blk_mq_hw_ctx *hctx, unsigned int hctx_idx) { ++ adios_depth_updated(hctx); ++ return 0; ++} ++ ++// Initialize the scheduler-specific data when initializing the request queue ++static int adios_init_sched(struct request_queue *q, struct elevator_type *e) { ++ struct adios_data *ad; ++ struct elevator_queue *eq; ++ int ret = -ENOMEM; ++ u8 optype = 0; ++ ++ eq = elevator_alloc(q, e); ++ if (!eq) { ++ pr_err("adios: Failed to allocate the elevator\n"); ++ return ret; ++ } ++ ++ ad = kzalloc_node(sizeof(*ad), GFP_KERNEL, q->node); ++ if (!ad) { ++ pr_err("adios: Failed to create adios_data\n"); ++ goto put_eq; ++ } ++ ++ // Create a memory pool for adios_rq_data ++ ad->rq_data_pool = kmem_cache_create("rq_data_pool", ++ sizeof(struct adios_rq_data), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (!ad->rq_data_pool) { ++ pr_err("adios: Failed to create rq_data_pool\n"); ++ goto free_ad; ++ } ++ ++ /* Create a memory pool for dl_group */ ++ ad->dl_group_pool = kmem_cache_create("dl_group_pool", ++ sizeof(struct dl_group), ++ 0, SLAB_HWCACHE_ALIGN, NULL); ++ if (!ad->dl_group_pool) { ++ pr_err("adios: Failed to create dl_group_pool\n"); ++ goto destroy_rq_data_pool; ++ } ++ ++ for (int i = 0; i < ADIOS_PQ_LEVELS; i++) ++ INIT_LIST_HEAD(&ad->prio_queue[i]); ++ ++ for (u8 i = 0; i < ADIOS_DL_TYPES; i++) { ++ ad->dl_tree[i] = RB_ROOT_CACHED; ++ ad->dl_prio[i] = default_dl_prio[i]; ++ } ++ ad->dl_bias = 0; ++ ++ for (u8 page = 0; page < ADIOS_BQ_PAGES; page++) ++ for (optype = 0; optype < ADIOS_OPTYPES; optype++) ++ INIT_LIST_HEAD(&ad->batch_queue[page][optype]); ++ ++ ad->aggr_buckets = kzalloc(sizeof(*ad->aggr_buckets), GFP_KERNEL); ++ if (!ad->aggr_buckets) { ++ pr_err("adios: Failed to allocate aggregation buckets\n"); ++ goto destroy_dl_group_pool; ++ } ++ ++ for (optype = 0; optype < ADIOS_OPTYPES; optype++) { ++ struct latency_model *model = &ad->latency_model[optype]; ++ struct latency_model_params *params; ++ ++ spin_lock_init(&model->update_lock); ++ params = kzalloc(sizeof(*params), GFP_KERNEL); ++ if (!params) { ++ pr_err("adios: Failed to allocate latency_model_params\n"); ++ goto free_buckets; ++ } ++ params->last_update_jiffies = jiffies; ++ RCU_INIT_POINTER(model->params, params); ++ ++ model->pcpu_buckets = alloc_percpu(struct lm_buckets); ++ if (!model->pcpu_buckets) { ++ pr_err("adios: Failed to allocate per-CPU buckets\n"); ++ kfree(params); ++ goto free_buckets; ++ } ++ ++ model->lm_shrink_at_kreqs = default_lm_shrink_at_kreqs; ++ model->lm_shrink_at_gbytes = default_lm_shrink_at_gbytes; ++ model->lm_shrink_resist = default_lm_shrink_resist; ++ } ++ ++ for (optype = 0; optype < 
ADIOS_OPTYPES; optype++) { ++ ad->latency_target[optype] = default_latency_target[optype]; ++ ad->batch_limit[optype] = default_batch_limit[optype]; ++ } ++ ++ eq->elevator_data = ad; ++ ++ ad->is_rotational = !!(q->limits.features & BLK_FEAT_ROTATIONAL); ++ ad->global_latency_window = (ad->is_rotational)? ++ default_global_latency_window_rotational: ++ default_global_latency_window; ++ ad->bq_refill_below_ratio = default_bq_refill_below_ratio; ++ ad->lat_model_latency_limit = default_lat_model_latency_limit; ++ ad->batch_order = default_batch_order; ++ ad->compliance_flags = default_compliance_flags; ++ ++ ad->insert_request_fn = insert_request_pre_stability; ++ ++ atomic_set(&ad->state, 0); ++ ++ spin_lock_init(&ad->lock); ++ spin_lock_init(&ad->pq_lock); ++ spin_lock_init(&ad->bq_lock); ++ ++ timer_setup(&ad->update_timer, update_timer_callback, 0); ++ ++ /* We dispatch from request queue wide instead of hw queue */ ++ blk_queue_flag_set(QUEUE_FLAG_SQ_SCHED, q); ++ ++ ad->queue = q; ++ blk_stat_enable_accounting(q); ++ ++ q->elevator = eq; ++ return 0; ++ ++free_buckets: ++ pr_err("adios: Failed to allocate per-cpu buckets\n"); ++ while (optype-- > 0) { ++ struct latency_model *prev_model = &ad->latency_model[optype]; ++ kfree(rcu_access_pointer(prev_model->params)); ++ free_percpu(prev_model->pcpu_buckets); ++ } ++ kfree(ad->aggr_buckets); ++destroy_dl_group_pool: ++ kmem_cache_destroy(ad->dl_group_pool); ++destroy_rq_data_pool: ++ kmem_cache_destroy(ad->rq_data_pool); ++free_ad: ++ kfree(ad); ++put_eq: ++ kobject_put(&eq->kobj); ++ return ret; ++} ++ ++// Clean up and free resources when exiting the scheduler ++static void adios_exit_sched(struct elevator_queue *e) { ++ struct adios_data *ad = e->elevator_data; ++ ++ timer_shutdown_sync(&ad->update_timer); ++ ++ for (int i = 0; i < 2; i++) ++ WARN_ON_ONCE(!list_empty(&ad->prio_queue[i])); ++ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) { ++ struct latency_model *model = &ad->latency_model[i]; ++ struct latency_model_params *params = rcu_access_pointer(model->params); ++ ++ RCU_INIT_POINTER(model->params, NULL); ++ kfree_rcu(params, rcu); ++ ++ free_percpu(model->pcpu_buckets); ++ } ++ ++ synchronize_rcu(); ++ ++ kfree(ad->aggr_buckets); ++ ++ if (ad->rq_data_pool) ++ kmem_cache_destroy(ad->rq_data_pool); ++ ++ if (ad->dl_group_pool) ++ kmem_cache_destroy(ad->dl_group_pool); ++ ++ blk_stat_disable_accounting(ad->queue); ++ ++ kfree(ad); ++} ++ ++static void sideload_latency_model( ++ struct latency_model *model, u64 base, u64 slope) { ++ struct latency_model_params *old_params, *new_params; ++ unsigned long flags; ++ ++ new_params = kzalloc(sizeof(*new_params), GFP_KERNEL); ++ if (!new_params) ++ return; ++ ++ spin_lock_irqsave(&model->update_lock, flags); ++ ++ old_params = rcu_dereference_protected(model->params, ++ lockdep_is_held(&model->update_lock)); ++ ++ new_params->last_update_jiffies = jiffies; ++ ++ // Initialize base and its statistics as a single sample. ++ new_params->base = base; ++ new_params->small_sum_delay = base; ++ new_params->small_count = 1; ++ ++ // Initialize slope and its statistics as a single sample. 
++ new_params->slope = slope; ++ new_params->large_sum_delay = slope; ++ new_params->large_sum_bsize = 1024; /* Corresponds to 1 KiB */ ++ ++ lm_reset_pcpu_buckets(model); ++ ++ rcu_assign_pointer(model->params, new_params); ++ spin_unlock_irqrestore(&model->update_lock, flags); ++ ++ kfree_rcu(old_params, rcu); ++} ++ ++// Define sysfs attributes for operation types ++#define SYSFS_OPTYPE_DECL(name, optype) \ ++static ssize_t adios_lat_model_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ struct latency_model *model = &ad->latency_model[optype]; \ ++ struct latency_model_params *params; \ ++ ssize_t len = 0; \ ++ u64 base, slope; \ ++ rcu_read_lock(); \ ++ params = rcu_dereference(model->params); \ ++ base = params->base; \ ++ slope = params->slope; \ ++ rcu_read_unlock(); \ ++ len += sprintf(page, "base : %llu ns\n", base); \ ++ len += sprintf(page + len, "slope: %llu ns/KiB\n", slope); \ ++ return len; \ ++} \ ++static ssize_t adios_lat_model_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ struct latency_model *model = &ad->latency_model[optype]; \ ++ u64 base, slope; \ ++ int ret; \ ++ ret = sscanf(page, "%llu %llu", &base, &slope); \ ++ if (ret != 2) \ ++ return -EINVAL; \ ++ sideload_latency_model(model, base, slope); \ ++ reset_buckets(ad->aggr_buckets); \ ++ return count; \ ++} \ ++static ssize_t adios_lat_target_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%llu\n", ad->latency_target[optype]); \ ++} \ ++static ssize_t adios_lat_target_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ unsigned long nsec; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &nsec); \ ++ if (ret) \ ++ return ret; \ ++ sideload_latency_model(&ad->latency_model[optype], 0, 0); \ ++ ad->latency_target[optype] = nsec; \ ++ return count; \ ++} \ ++static ssize_t adios_batch_limit_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%u\n", ad->batch_limit[optype]); \ ++} \ ++static ssize_t adios_batch_limit_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ unsigned long max_batch; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &max_batch); \ ++ if (ret || max_batch == 0) \ ++ return -EINVAL; \ ++ struct adios_data *ad = e->elevator_data; \ ++ ad->batch_limit[optype] = max_batch; \ ++ return count; \ ++} ++ ++SYSFS_OPTYPE_DECL(read, ADIOS_READ); ++SYSFS_OPTYPE_DECL(write, ADIOS_WRITE); ++SYSFS_OPTYPE_DECL(discard, ADIOS_DISCARD); ++ ++// Show the maximum batch size actually achieved for each operation type ++static ssize_t adios_batch_actual_max_show( ++ struct elevator_queue *e, char *page) { ++ struct adios_data *ad = e->elevator_data; ++ u32 total_count, read_count, write_count, discard_count; ++ ++ total_count = ad->batch_actual_max_total; ++ read_count = ad->batch_actual_max_size[ADIOS_READ]; ++ write_count = ad->batch_actual_max_size[ADIOS_WRITE]; ++ discard_count = ad->batch_actual_max_size[ADIOS_DISCARD]; ++ ++ return sprintf(page, ++ "Total : %u\nDiscard: %u\nRead : %u\nWrite : %u\n", ++ total_count, discard_count, read_count, write_count); ++} ++ ++#define SYSFS_ULL_DECL(field, min_val, max_val) \ ++static ssize_t adios_##field##_show( \ ++ struct elevator_queue *e, char *page) { 
\ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%llu\n", ad->field); \ ++} \ ++static ssize_t adios_##field##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ unsigned long val; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &val); \ ++ if (ret || val < (min_val) || val > (max_val)) \ ++ return -EINVAL; \ ++ ad->field = val; \ ++ return count; \ ++} ++ ++SYSFS_ULL_DECL(global_latency_window, 0, ULLONG_MAX) ++SYSFS_ULL_DECL(compliance_flags, 0, ULLONG_MAX) ++ ++#define SYSFS_INT_DECL(field, min_val, max_val) \ ++static ssize_t adios_##field##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ return sprintf(page, "%d\n", ad->field); \ ++} \ ++static ssize_t adios_##field##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ int val; \ ++ int ret; \ ++ ret = kstrtoint(page, 10, &val); \ ++ if (ret || val < (min_val) || val > (max_val)) \ ++ return -EINVAL; \ ++ ad->field = val; \ ++ return count; \ ++} ++ ++SYSFS_INT_DECL(bq_refill_below_ratio, 0, 100) ++SYSFS_INT_DECL(lat_model_latency_limit, 0, 2*NSEC_PER_SEC) ++SYSFS_INT_DECL(batch_order, ADIOS_BO_OPTYPE, !!ad->is_rotational) ++ ++// Show the read priority ++static ssize_t adios_read_priority_show( ++ struct elevator_queue *e, char *page) { ++ struct adios_data *ad = e->elevator_data; ++ return sprintf(page, "%d\n", ad->dl_prio[0]); ++} ++ ++// Set the read priority ++static ssize_t adios_read_priority_store( ++ struct elevator_queue *e, const char *page, size_t count) { ++ struct adios_data *ad = e->elevator_data; ++ int prio; ++ int ret; ++ ++ ret = kstrtoint(page, 10, &prio); ++ if (ret || prio < -20 || prio > 19) ++ return -EINVAL; ++ ++ guard(spinlock_irqsave)(&ad->lock); ++ ad->dl_prio[0] = prio; ++ ad->dl_bias = 0; ++ ++ return count; ++} ++ ++// Reset batch queue statistics ++static ssize_t adios_reset_bq_stats_store( ++ struct elevator_queue *e, const char *page, size_t count) { ++ struct adios_data *ad = e->elevator_data; ++ unsigned long val; ++ int ret; ++ ++ ret = kstrtoul(page, 10, &val); ++ if (ret || val != 1) ++ return -EINVAL; ++ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) ++ ad->batch_actual_max_size[i] = 0; ++ ++ ad->batch_actual_max_total = 0; ++ ++ return count; ++} ++ ++// Reset the latency model parameters or load them from user input ++static ssize_t adios_reset_lat_model_store( ++ struct elevator_queue *e, const char *page, size_t count) ++{ ++ struct adios_data *ad = e->elevator_data; ++ struct latency_model *model; ++ int ret; ++ ++ /* ++ * Differentiate between two modes based on input format: ++ * 1. "1": Fully reset the model (backward compatibility). ++ * 2. "R_base R_slope W_base W_slope D_base D_slope": Load values. ++ */ ++ if (!strchr(page, ' ')) { ++ // Mode 1: Full reset. ++ unsigned long val; ++ ++ ret = kstrtoul(page, 10, &val); ++ if (ret || val != 1) ++ return -EINVAL; ++ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) { ++ model = &ad->latency_model[i]; ++ sideload_latency_model(model, 0, 0); ++ } ++ } else { ++ // Mode 2: Load initial values for all latency models. 
++ u64 params[3][2]; /* 0:base, 1:slope for R, W, D */ ++ ++ ret = sscanf(page, "%llu %llu %llu %llu %llu %llu", ++ &params[ADIOS_READ ][0], &params[ADIOS_READ ][1], ++ &params[ADIOS_WRITE ][0], &params[ADIOS_WRITE ][1], ++ &params[ADIOS_DISCARD][0], &params[ADIOS_DISCARD][1]); ++ ++ if (ret != 6) ++ return -EINVAL; ++ ++ for (u8 i = ADIOS_READ; i <= ADIOS_DISCARD; i++) { ++ model = &ad->latency_model[i]; ++ sideload_latency_model(model, params[i][0], params[i][1]); ++ } ++ } ++ reset_buckets(ad->aggr_buckets); ++ ++ return count; ++} ++ ++// Show the ADIOS version ++static ssize_t adios_version_show(struct elevator_queue *e, char *page) { ++ return sprintf(page, "%s\n", ADIOS_VERSION); ++} ++ ++// Define sysfs attributes for dynamic thresholds ++#define SHRINK_THRESHOLD_ATTR_RW(name, model_field, min_value, max_value) \ ++static ssize_t adios_shrink_##name##_store( \ ++ struct elevator_queue *e, const char *page, size_t count) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ unsigned long val; \ ++ int ret; \ ++ ret = kstrtoul(page, 10, &val); \ ++ if (ret || val < min_value || val > max_value) \ ++ return -EINVAL; \ ++ for (u8 i = 0; i < ADIOS_OPTYPES; i++) { \ ++ struct latency_model *model = &ad->latency_model[i]; \ ++ unsigned long flags; \ ++ spin_lock_irqsave(&model->update_lock, flags); \ ++ model->model_field = val; \ ++ spin_unlock_irqrestore(&model->update_lock, flags); \ ++ } \ ++ return count; \ ++} \ ++static ssize_t adios_shrink_##name##_show( \ ++ struct elevator_queue *e, char *page) { \ ++ struct adios_data *ad = e->elevator_data; \ ++ u32 val = 0; \ ++ unsigned long flags; \ ++ struct latency_model *model = &ad->latency_model[0]; \ ++ spin_lock_irqsave(&model->update_lock, flags); \ ++ val = model->model_field; \ ++ spin_unlock_irqrestore(&model->update_lock, flags); \ ++ return sprintf(page, "%u\n", val); \ ++} ++ ++SHRINK_THRESHOLD_ATTR_RW(at_kreqs, lm_shrink_at_kreqs, 1, 100000) ++SHRINK_THRESHOLD_ATTR_RW(at_gbytes, lm_shrink_at_gbytes, 1, 1000) ++SHRINK_THRESHOLD_ATTR_RW(resist, lm_shrink_resist, 1, 3) ++ ++// Define sysfs attributes ++#define AD_ATTR(name, show_func, store_func) \ ++ __ATTR(name, 0644, show_func, store_func) ++#define AD_ATTR_RW(name) \ ++ __ATTR(name, 0644, adios_##name##_show, adios_##name##_store) ++#define AD_ATTR_RO(name) \ ++ __ATTR(name, 0444, adios_##name##_show, NULL) ++#define AD_ATTR_WO(name) \ ++ __ATTR(name, 0200, NULL, adios_##name##_store) ++ ++// Define sysfs attributes for ADIOS scheduler ++static struct elv_fs_entry adios_sched_attrs[] = { ++ AD_ATTR_RO(batch_actual_max), ++ AD_ATTR_RW(bq_refill_below_ratio), ++ AD_ATTR_RW(global_latency_window), ++ AD_ATTR_RW(lat_model_latency_limit), ++ AD_ATTR_RW(batch_order), ++ AD_ATTR_RW(compliance_flags), ++ ++ AD_ATTR_RW(batch_limit_read), ++ AD_ATTR_RW(batch_limit_write), ++ AD_ATTR_RW(batch_limit_discard), ++ ++ AD_ATTR_RW(lat_model_read), ++ AD_ATTR_RW(lat_model_write), ++ AD_ATTR_RW(lat_model_discard), ++ ++ AD_ATTR_RW(lat_target_read), ++ AD_ATTR_RW(lat_target_write), ++ AD_ATTR_RW(lat_target_discard), ++ ++ AD_ATTR_RW(shrink_at_kreqs), ++ AD_ATTR_RW(shrink_at_gbytes), ++ AD_ATTR_RW(shrink_resist), ++ ++ AD_ATTR_RW(read_priority), ++ ++ AD_ATTR_WO(reset_bq_stats), ++ AD_ATTR_WO(reset_lat_model), ++ AD_ATTR(adios_version, adios_version_show, NULL), ++ ++ __ATTR_NULL ++}; ++ ++// Define the ADIOS scheduler type ++static struct elevator_type mq_adios = { ++ .ops = { ++ .next_request = elv_rb_latter_request, ++ .former_request = elv_rb_former_request, ++ .limit_depth = adios_limit_depth, ++ .depth_updated =
adios_depth_updated, ++ .request_merged = adios_request_merged, ++ .requests_merged = adios_merged_requests, ++ .bio_merge = adios_bio_merge, ++ .insert_requests = adios_insert_requests, ++ .prepare_request = adios_prepare_request, ++ .dispatch_request = adios_dispatch_request, ++ .completed_request = adios_completed_request, ++ .finish_request = adios_finish_request, ++ .has_work = adios_has_work, ++ .init_hctx = adios_init_hctx, ++ .init_sched = adios_init_sched, ++ .exit_sched = adios_exit_sched, ++ }, ++ .elevator_attrs = adios_sched_attrs, ++ .elevator_name = "adios", ++ .elevator_owner = THIS_MODULE, ++}; ++MODULE_ALIAS("mq-adios-iosched"); ++ ++#define ADIOS_PROGNAME "Adaptive Deadline I/O Scheduler" ++#define ADIOS_AUTHOR "Masahito Suzuki" ++ ++// Initialize the ADIOS scheduler module ++static int __init adios_init(void) { ++ printk(KERN_INFO "%s %s by %s\n", ++ ADIOS_PROGNAME, ADIOS_VERSION, ADIOS_AUTHOR); ++ return elv_register(&mq_adios); ++} ++ ++// Exit the ADIOS scheduler module ++static void __exit adios_exit(void) { ++ elv_unregister(&mq_adios); ++} ++ ++module_init(adios_init); ++module_exit(adios_exit); ++ ++MODULE_AUTHOR(ADIOS_AUTHOR); ++MODULE_LICENSE("GPL"); ++MODULE_DESCRIPTION(ADIOS_PROGNAME); +\ No newline at end of file +diff --git a/block/elevator.c b/block/elevator.c +index fe96c6f4753c..7b4f2913841f 100644 +--- a/block/elevator.c ++++ b/block/elevator.c +@@ -752,6 +752,21 @@ void elevator_set_default(struct request_queue *q) + if (q->tag_set->flags & BLK_MQ_F_NO_SCHED_BY_DEFAULT) + return; + ++#ifdef CONFIG_MQ_IOSCHED_DEFAULT_ADIOS ++ ctx.name = "adios"; ++#else // !CONFIG_MQ_IOSCHED_DEFAULT_ADIOS ++ bool is_sq = q->nr_hw_queues == 1 || blk_mq_is_shared_tags(q->tag_set->flags); ++#ifdef CONFIG_CACHY ++#ifdef CONFIG_IOSCHED_BFQ ++ if (is_sq) ++ ctx.name = "bfq"; ++#endif /* CONFIG_IOSCHED_BFQ */ ++#else ++ if (!is_sq) ++ return; ++#endif /* CONFIG_CACHY */ ++#endif /* CONFIG_MQ_IOSCHED_DEFAULT_ADIOS */ ++ + /* + * For single queue devices, default to using mq-deadline. 
If we + * have multiple queues or mq-deadline is not available, default +@@ -761,13 +776,10 @@ void elevator_set_default(struct request_queue *q) + if (!e) + return; + +- if ((q->nr_hw_queues == 1 || +- blk_mq_is_shared_tags(q->tag_set->flags))) { +- err = elevator_change(q, &ctx); +- if (err < 0) +- pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", +- ctx.name, err); +- } ++ err = elevator_change(q, &ctx); ++ if (err < 0) ++ pr_warn("\"%s\" elevator initialization, failed %d, falling back to \"none\"\n", ++ ctx.name, err); + elevator_put(e); + } + +diff --git a/drivers/Makefile b/drivers/Makefile +index b5749cf67044..5beba9f57254 100644 +--- a/drivers/Makefile ++++ b/drivers/Makefile +@@ -64,14 +64,8 @@ obj-y += char/ + # iommu/ comes before gpu as gpu are using iommu controllers + obj-y += iommu/ + +-# gpu/ comes after char for AGP vs DRM startup and after iommu +-obj-y += gpu/ +- + obj-$(CONFIG_CONNECTOR) += connector/ + +-# i810fb depends on char/agp/ +-obj-$(CONFIG_FB_I810) += video/fbdev/i810/ +- + obj-$(CONFIG_PARPORT) += parport/ + obj-y += base/ block/ misc/ mfd/ nfc/ + obj-$(CONFIG_LIBNVDIMM) += nvdimm/ +@@ -83,6 +77,13 @@ obj-y += macintosh/ + obj-y += scsi/ + obj-y += nvme/ + obj-$(CONFIG_ATA) += ata/ ++ ++# gpu/ comes after char for AGP vs DRM startup and after iommu ++obj-y += gpu/ ++ ++# i810fb depends on char/agp/ ++obj-$(CONFIG_FB_I810) += video/fbdev/i810/ ++ + obj-$(CONFIG_TARGET_CORE) += target/ + obj-$(CONFIG_MTD) += mtd/ + obj-$(CONFIG_SPI) += spi/ +diff --git a/drivers/ata/ahci.c b/drivers/ata/ahci.c +index 7a7f88b3fa2b..cb26ab099da2 100644 +--- a/drivers/ata/ahci.c ++++ b/drivers/ata/ahci.c +@@ -1672,7 +1672,7 @@ static irqreturn_t ahci_thunderx_irq_handler(int irq, void *dev_instance) + } + #endif + +-static void ahci_remap_check(struct pci_dev *pdev, int bar, ++static int ahci_remap_check(struct pci_dev *pdev, int bar, + struct ahci_host_priv *hpriv) + { + int i; +@@ -1685,7 +1685,7 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, + pci_resource_len(pdev, bar) < SZ_512K || + bar != AHCI_PCI_BAR_STANDARD || + !(readl(hpriv->mmio + AHCI_VSCAP) & 1)) +- return; ++ return 0; + + cap = readq(hpriv->mmio + AHCI_REMAP_CAP); + for (i = 0; i < AHCI_MAX_REMAP; i++) { +@@ -1700,18 +1700,11 @@ static void ahci_remap_check(struct pci_dev *pdev, int bar, + } + + if (!hpriv->remapped_nvme) +- return; +- +- dev_warn(&pdev->dev, "Found %u remapped NVMe devices.\n", +- hpriv->remapped_nvme); +- dev_warn(&pdev->dev, +- "Switch your BIOS from RAID to AHCI mode to use them.\n"); ++ return 0; + +- /* +- * Don't rely on the msi-x capability in the remap case, +- * share the legacy interrupt across ahci and remapped devices. 
+- */ +- hpriv->flags |= AHCI_HFLAG_NO_MSI; ++ /* Abort probe, allowing intel-nvme-remap to step in when available */ ++ dev_info(&pdev->dev, "Device will be handled by intel-nvme-remap.\n"); ++ return -ENODEV; + } + + static int ahci_get_irq_vector(struct ata_host *host, int port) +@@ -1975,7 +1968,9 @@ static int ahci_init_one(struct pci_dev *pdev, const struct pci_device_id *ent) + return -ENOMEM; + + /* detect remapped nvme devices */ +- ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ rc = ahci_remap_check(pdev, ahci_pci_bar, hpriv); ++ if (rc) ++ return rc; + + sysfs_add_file_to_group(&pdev->dev.kobj, + &dev_attr_remapped_nvme.attr, +diff --git a/drivers/cpufreq/Kconfig.x86 b/drivers/cpufreq/Kconfig.x86 +index 2c5c228408bf..918e2bebfe78 100644 +--- a/drivers/cpufreq/Kconfig.x86 ++++ b/drivers/cpufreq/Kconfig.x86 +@@ -9,7 +9,6 @@ config X86_INTEL_PSTATE + select ACPI_PROCESSOR if ACPI + select ACPI_CPPC_LIB if X86_64 && ACPI && SCHED_MC_PRIO + select CPU_FREQ_GOV_PERFORMANCE +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver provides a P state for Intel core processors. + The driver implements an internal governor and will become +@@ -39,7 +38,6 @@ config X86_AMD_PSTATE + depends on X86 && ACPI + select ACPI_PROCESSOR + select ACPI_CPPC_LIB if X86_64 +- select CPU_FREQ_GOV_SCHEDUTIL if SMP + help + This driver adds a CPUFreq driver which utilizes a fine grain + processor performance frequency control range instead of legacy +diff --git a/drivers/cpufreq/intel_pstate.c b/drivers/cpufreq/intel_pstate.c +index f366d35c5840..a04b6bfeb1c2 100644 +--- a/drivers/cpufreq/intel_pstate.c ++++ b/drivers/cpufreq/intel_pstate.c +@@ -3950,6 +3950,8 @@ static int __init intel_pstate_setup(char *str) + + if (!strcmp(str, "disable")) + no_load = 1; ++ else if (!strcmp(str, "enable")) ++ no_load = 0; + else if (!strcmp(str, "active")) + default_driver = &intel_pstate; + else if (!strcmp(str, "passive")) +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu.h b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +index ef3af170dda4..cf918b18db53 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu.h ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu.h +@@ -163,6 +163,7 @@ struct amdgpu_watchdog_timer { + */ + extern int amdgpu_modeset; + extern unsigned int amdgpu_vram_limit; ++extern int amdgpu_ignore_min_pcap; + extern int amdgpu_vis_vram_limit; + extern int amdgpu_gart_size; + extern int amdgpu_gtt_size; +diff --git a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +index 395c6be901ce..fb1607b2805a 100644 +--- a/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c ++++ b/drivers/gpu/drm/amd/amdgpu/amdgpu_drv.c +@@ -148,6 +148,7 @@ enum AMDGPU_DEBUG_MASK { + }; + + unsigned int amdgpu_vram_limit = UINT_MAX; ++int amdgpu_ignore_min_pcap = 0; /* do not ignore by default */ + int amdgpu_vis_vram_limit; + int amdgpu_gart_size = -1; /* auto */ + int amdgpu_gtt_size = -1; /* auto */ +@@ -269,6 +270,15 @@ struct amdgpu_watchdog_timer amdgpu_watchdog_timer = { + .period = 0x0, /* default to 0x0 (timeout disable) */ + }; + ++/** ++ * DOC: ignore_min_pcap (int) ++ * Ignore the minimum power cap. ++ * Useful on graphics cards where the minimum power cap is very high. ++ * The default is 0 (Do not ignore). ++ */ ++MODULE_PARM_DESC(ignore_min_pcap, "Ignore the minimum power cap"); ++module_param_named(ignore_min_pcap, amdgpu_ignore_min_pcap, int, 0600); ++ + /** + * DOC: vramlimit (int) + * Restrict the total amount of VRAM in MiB for testing. The default is 0 (Use full VRAM). 
+diff --git a/drivers/gpu/drm/amd/display/Kconfig b/drivers/gpu/drm/amd/display/Kconfig +index abd3b6564373..46937e6fa78d 100644 +--- a/drivers/gpu/drm/amd/display/Kconfig ++++ b/drivers/gpu/drm/amd/display/Kconfig +@@ -56,4 +56,10 @@ config DRM_AMD_SECURE_DISPLAY + This option enables the calculation of crc of specific region via + debugfs. Cooperate with specific DMCU FW. + ++config AMD_PRIVATE_COLOR ++ bool "Enable KMS color management by AMD for AMD" ++ default n ++ help ++ This option extends the KMS color management API with AMD driver-specific properties to enhance the color management support on AMD Steam Deck. ++ + endmenu +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +index a0ca3b2c6bd8..c4ea09496f95 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm.c +@@ -4675,7 +4675,7 @@ static int amdgpu_dm_mode_config_init(struct amdgpu_device *adev) + return r; + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + if (amdgpu_dm_create_color_properties(adev)) { + dc_state_release(state->context); + kfree(state); +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +index ebabfe3a512f..4d3ebcaacca1 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_color.c +@@ -97,7 +97,7 @@ static inline struct fixed31_32 amdgpu_dm_fixpt_from_s3132(__u64 x) + return val; + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + /* Pre-defined Transfer Functions (TF) + * + * AMD driver supports pre-defined mathematical functions for transferring +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +index 45feb404b097..ee8672919a05 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_crtc.c +@@ -491,7 +491,7 @@ static int amdgpu_dm_crtc_late_register(struct drm_crtc *crtc) + } + #endif + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + /** + * dm_crtc_additional_color_mgmt - enable additional color properties + * @crtc: DRM CRTC +@@ -573,7 +573,7 @@ static const struct drm_crtc_funcs amdgpu_dm_crtc_funcs = { + #if defined(CONFIG_DEBUG_FS) + .late_register = amdgpu_dm_crtc_late_register, + #endif +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + .atomic_set_property = amdgpu_dm_atomic_crtc_set_property, + .atomic_get_property = amdgpu_dm_atomic_crtc_get_property, + #endif +@@ -770,7 +770,7 @@ int amdgpu_dm_crtc_init(struct amdgpu_display_manager *dm, + + drm_mode_crtc_set_gamma_size(&acrtc->base, MAX_COLOR_LEGACY_LUT_ENTRIES); + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + dm_crtc_additional_color_mgmt(&acrtc->base); + #endif + return 0; +diff --git a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +index eef51652ca35..d5c932c191b2 100644 +--- a/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c ++++ b/drivers/gpu/drm/amd/display/amdgpu_dm/amdgpu_dm_plane.c +@@ -1601,7 +1601,7 @@ static void amdgpu_dm_plane_drm_plane_destroy_state(struct drm_plane *plane, + drm_atomic_helper_plane_destroy_state(plane, state); + } + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + static void + dm_atomic_plane_attach_color_mgmt_properties(struct 
amdgpu_display_manager *dm, + struct drm_plane *plane) +@@ -1792,7 +1792,7 @@ static const struct drm_plane_funcs dm_plane_funcs = { + .atomic_duplicate_state = amdgpu_dm_plane_drm_plane_duplicate_state, + .atomic_destroy_state = amdgpu_dm_plane_drm_plane_destroy_state, + .format_mod_supported = amdgpu_dm_plane_format_mod_supported, +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + .atomic_set_property = dm_atomic_plane_set_property, + .atomic_get_property = dm_atomic_plane_get_property, + #endif +@@ -1888,7 +1888,7 @@ int amdgpu_dm_plane_init(struct amdgpu_display_manager *dm, + else + drm_plane_helper_add(plane, &dm_plane_helper_funcs); + +-#ifdef AMD_PRIVATE_COLOR ++#ifdef CONFIG_AMD_PRIVATE_COLOR + dm_atomic_plane_attach_color_mgmt_properties(dm, plane); + #endif + /* Create (reset) the plane state */ +diff --git a/drivers/gpu/drm/amd/pm/amdgpu_pm.c b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +index 5fbfe7333b54..9e81953043be 100644 +--- a/drivers/gpu/drm/amd/pm/amdgpu_pm.c ++++ b/drivers/gpu/drm/amd/pm/amdgpu_pm.c +@@ -3073,6 +3073,9 @@ static ssize_t amdgpu_hwmon_show_power_cap_min(struct device *dev, + struct device_attribute *attr, + char *buf) + { ++ if (amdgpu_ignore_min_pcap) ++ return sysfs_emit(buf, "%i\n", 0); ++ + return amdgpu_hwmon_show_power_cap_generic(dev, attr, buf, PP_PWR_LIMIT_MIN); + } + +diff --git a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +index b47cb4a5f488..f9f6b0d96f97 100644 +--- a/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c ++++ b/drivers/gpu/drm/amd/pm/swsmu/amdgpu_smu.c +@@ -2921,7 +2921,10 @@ int smu_get_power_limit(void *handle, + *limit = smu->max_power_limit; + break; + case SMU_PPT_LIMIT_MIN: +- *limit = smu->min_power_limit; ++ if (amdgpu_ignore_min_pcap) ++ *limit = 0; ++ else ++ *limit = smu->min_power_limit; + break; + default: + return -EINVAL; +@@ -2945,7 +2948,14 @@ static int smu_set_power_limit(void *handle, uint32_t limit) + if (smu->ppt_funcs->set_power_limit) + return smu->ppt_funcs->set_power_limit(smu, limit_type, limit); + +- if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { ++ if (amdgpu_ignore_min_pcap) { ++ if ((limit > smu->max_power_limit)) { ++ dev_err(smu->adev->dev, ++ "New power limit (%d) is over the max allowed %d\n", ++ limit, smu->max_power_limit); ++ return -EINVAL; ++ } ++ } else if ((limit > smu->max_power_limit) || (limit < smu->min_power_limit)) { + dev_err(smu->adev->dev, + "New power limit (%d) is out of range [%d,%d]\n", + limit, smu->min_power_limit, smu->max_power_limit); +diff --git a/drivers/input/evdev.c b/drivers/input/evdev.c +index 90ff6be85cf4..15159c1cf6e1 100644 +--- a/drivers/input/evdev.c ++++ b/drivers/input/evdev.c +@@ -46,6 +46,7 @@ struct evdev_client { + struct fasync_struct *fasync; + struct evdev *evdev; + struct list_head node; ++ struct rcu_head rcu; + enum input_clock_type clk_type; + bool revoked; + unsigned long *evmasks[EV_CNT]; +@@ -368,13 +369,22 @@ static void evdev_attach_client(struct evdev *evdev, + spin_unlock(&evdev->client_lock); + } + ++static void evdev_reclaim_client(struct rcu_head *rp) ++{ ++ struct evdev_client *client = container_of(rp, struct evdev_client, rcu); ++ unsigned int i; ++ for (i = 0; i < EV_CNT; ++i) ++ bitmap_free(client->evmasks[i]); ++ kvfree(client); ++} ++ + static void evdev_detach_client(struct evdev *evdev, + struct evdev_client *client) + { + spin_lock(&evdev->client_lock); + list_del_rcu(&client->node); + spin_unlock(&evdev->client_lock); +- synchronize_rcu(); ++ 
call_rcu(&client->rcu, evdev_reclaim_client); + } + + static int evdev_open_device(struct evdev *evdev) +@@ -427,7 +437,6 @@ static int evdev_release(struct inode *inode, struct file *file) + { + struct evdev_client *client = file->private_data; + struct evdev *evdev = client->evdev; +- unsigned int i; + + mutex_lock(&evdev->mutex); + +@@ -439,11 +448,6 @@ static int evdev_release(struct inode *inode, struct file *file) + + evdev_detach_client(evdev, client); + +- for (i = 0; i < EV_CNT; ++i) +- bitmap_free(client->evmasks[i]); +- +- kvfree(client); +- + evdev_close_device(evdev); + + return 0; +@@ -486,7 +490,6 @@ static int evdev_open(struct inode *inode, struct file *file) + + err_free_client: + evdev_detach_client(evdev, client); +- kvfree(client); + return error; + } + +diff --git a/drivers/md/dm-crypt.c b/drivers/md/dm-crypt.c +index 5ef43231fe77..5d754058c023 100644 +--- a/drivers/md/dm-crypt.c ++++ b/drivers/md/dm-crypt.c +@@ -3305,6 +3305,11 @@ static int crypt_ctr(struct dm_target *ti, unsigned int argc, char **argv) + goto bad; + } + ++#ifdef CONFIG_CACHY ++ set_bit(DM_CRYPT_NO_READ_WORKQUEUE, &cc->flags); ++ set_bit(DM_CRYPT_NO_WRITE_WORKQUEUE, &cc->flags); ++#endif ++ + ret = crypt_ctr_cipher(ti, argv[0], argv[1]); + if (ret < 0) + goto bad; +diff --git a/drivers/media/v4l2-core/Kconfig b/drivers/media/v4l2-core/Kconfig +index 331b8e535e5b..80dabeebf580 100644 +--- a/drivers/media/v4l2-core/Kconfig ++++ b/drivers/media/v4l2-core/Kconfig +@@ -40,6 +40,11 @@ config VIDEO_TUNER + config V4L2_JPEG_HELPER + tristate + ++config V4L2_LOOPBACK ++ tristate "V4L2 loopback device" ++ help ++ V4L2 loopback device ++ + # Used by drivers that need v4l2-h264.ko + config V4L2_H264 + tristate +diff --git a/drivers/media/v4l2-core/Makefile b/drivers/media/v4l2-core/Makefile +index 2177b9d63a8f..c179507cedc4 100644 +--- a/drivers/media/v4l2-core/Makefile ++++ b/drivers/media/v4l2-core/Makefile +@@ -33,5 +33,7 @@ obj-$(CONFIG_V4L2_JPEG_HELPER) += v4l2-jpeg.o + obj-$(CONFIG_V4L2_MEM2MEM_DEV) += v4l2-mem2mem.o + obj-$(CONFIG_V4L2_VP9) += v4l2-vp9.o + ++obj-$(CONFIG_V4L2_LOOPBACK) += v4l2loopback.o ++ + obj-$(CONFIG_VIDEO_TUNER) += tuner.o + obj-$(CONFIG_VIDEO_DEV) += v4l2-dv-timings.o videodev.o +diff --git a/drivers/media/v4l2-core/v4l2loopback.c b/drivers/media/v4l2-core/v4l2loopback.c +new file mode 100644 +index 000000000000..3be7c4abc1e7 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.c +@@ -0,0 +1,3316 @@ ++/* -*- c-file-style: "linux" -*- */ ++/* ++ * v4l2loopback.c -- video4linux2 loopback driver ++ * ++ * Copyright (C) 2005-2009 Vasily Levin (vasaka@gmail.com) ++ * Copyright (C) 2010-2023 IOhannes m zmoelnig (zmoelnig@iem.at) ++ * Copyright (C) 2011 Stefan Diewald (stefan.diewald@mytum.de) ++ * Copyright (C) 2012 Anton Novikov (random.plant@gmail.com) ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ */ ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++#include ++ ++#include ++#include "v4l2loopback.h" ++ ++#define V4L2LOOPBACK_CTL_ADD_legacy 0x4C80 ++#define V4L2LOOPBACK_CTL_REMOVE_legacy 0x4C81 ++#define V4L2LOOPBACK_CTL_QUERY_legacy 0x4C82 ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 0, 0) ++#error This module is not supported on kernels before 4.0.0. ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 3, 0) ++#define strscpy strlcpy ++#endif ++ ++#if defined(timer_setup) ++#define HAVE_TIMER_SETUP ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(5, 7, 0) ++#define VFL_TYPE_VIDEO VFL_TYPE_GRABBER ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 2, 0) ++#define timer_delete_sync del_timer_sync ++#endif ++ ++#define V4L2LOOPBACK_VERSION_CODE \ ++ KERNEL_VERSION(V4L2LOOPBACK_VERSION_MAJOR, V4L2LOOPBACK_VERSION_MINOR, \ ++ V4L2LOOPBACK_VERSION_BUGFIX) ++ ++MODULE_DESCRIPTION("V4L2 loopback video device"); ++MODULE_AUTHOR("Vasily Levin, " ++ "IOhannes m zmoelnig ," ++ "Stefan Diewald," ++ "Anton Novikov" ++ "et al."); ++#ifdef SNAPSHOT_VERSION ++MODULE_VERSION(__stringify(SNAPSHOT_VERSION)); ++#else ++MODULE_VERSION("" __stringify(V4L2LOOPBACK_VERSION_MAJOR) "." __stringify( ++ V4L2LOOPBACK_VERSION_MINOR) "." __stringify(V4L2LOOPBACK_VERSION_BUGFIX)); ++#endif ++MODULE_LICENSE("GPL"); ++ ++/* ++ * helpers ++ */ ++#define dprintk(fmt, args...) \ ++ do { \ ++ if (debug > 0) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++#define MARK() \ ++ do { \ ++ if (debug > 1) { \ ++ printk(KERN_INFO "%s:%d[%s], pid(%d)\n", __FILE__, \ ++ __LINE__, __func__, task_pid_nr(current)); \ ++ } \ ++ } while (0) ++ ++#define dprintkrw(fmt, args...) 
\ ++ do { \ ++ if (debug > 2) { \ ++ printk(KERN_INFO "v4l2-loopback[" __stringify( \ ++ __LINE__) "], pid(%d): " fmt, \ ++ task_pid_nr(current), ##args); \ ++ } \ ++ } while (0) ++ ++static inline void v4l2l_get_timestamp(struct v4l2_buffer *b) ++{ ++ struct timespec64 ts; ++ ktime_get_ts64(&ts); ++ ++ b->timestamp.tv_sec = ts.tv_sec; ++ b->timestamp.tv_usec = (ts.tv_nsec / NSEC_PER_USEC); ++ b->flags |= V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ++ b->flags &= ~V4L2_BUF_FLAG_TIMESTAMP_COPY; ++} ++ ++#if BITS_PER_LONG == 32 ++#include /* do_div() for 64bit division */ ++static inline int v4l2l_mod64(const s64 A, const u32 B) ++{ ++ u64 a = (u64)A; ++ u32 b = B; ++ ++ if (A > 0) ++ return do_div(a, b); ++ a = -A; ++ return -do_div(a, b); ++} ++#else ++static inline int v4l2l_mod64(const s64 A, const u32 B) ++{ ++ return A % B; ++} ++#endif ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 16, 0) ++typedef unsigned __poll_t; ++#endif ++ ++/* module constants ++ * can be overridden during he build process using something like ++ * make KCPPFLAGS="-DMAX_DEVICES=100" ++ */ ++ ++/* maximum number of v4l2loopback devices that can be created */ ++#ifndef MAX_DEVICES ++#define MAX_DEVICES 8 ++#endif ++ ++/* whether the default is to announce capabilities exclusively or not */ ++#ifndef V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++#define V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS 0 ++#endif ++ ++/* when a producer is considered to have gone stale */ ++#ifndef MAX_TIMEOUT ++#define MAX_TIMEOUT (100 * 1000) /* in msecs */ ++#endif ++ ++/* max buffers that can be mapped, actually they ++ * are all mapped to max_buffers buffers */ ++#ifndef MAX_BUFFERS ++#define MAX_BUFFERS 32 ++#endif ++ ++/* module parameters */ ++static int debug = 0; ++module_param(debug, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC(debug, "debugging level (higher values == more verbose)"); ++ ++#define V4L2LOOPBACK_DEFAULT_MAX_BUFFERS 2 ++static int max_buffers = V4L2LOOPBACK_DEFAULT_MAX_BUFFERS; ++module_param(max_buffers, int, S_IRUGO); ++MODULE_PARM_DESC(max_buffers, ++ "how many buffers should be allocated [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_BUFFERS) "]"); ++ ++/* how many times a device can be opened ++ * the per-module default value can be overridden on a per-device basis using ++ * the /sys/devices interface ++ * ++ * note that max_openers should be at least 2 in order to get a working system: ++ * one opener for the producer and one opener for the consumer ++ * however, we leave that to the user ++ */ ++#define V4L2LOOPBACK_DEFAULT_MAX_OPENERS 10 ++static int max_openers = V4L2LOOPBACK_DEFAULT_MAX_OPENERS; ++module_param(max_openers, int, S_IRUGO | S_IWUSR); ++MODULE_PARM_DESC( ++ max_openers, ++ "how many users can open the loopback device [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_MAX_OPENERS) "]"); ++ ++static int devices = -1; ++module_param(devices, int, 0); ++MODULE_PARM_DESC(devices, "how many devices should be created"); ++ ++static int video_nr[MAX_DEVICES] = { [0 ...(MAX_DEVICES - 1)] = -1 }; ++module_param_array(video_nr, int, NULL, 0444); ++MODULE_PARM_DESC(video_nr, ++ "video device numbers (-1=auto, 0=/dev/video0, etc.)"); ++ ++static char *card_label[MAX_DEVICES]; ++module_param_array(card_label, charp, NULL, 0000); ++MODULE_PARM_DESC(card_label, "card labels for each device"); ++ ++static bool exclusive_caps[MAX_DEVICES] = { ++ [0 ...(MAX_DEVICES - 1)] = V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS ++}; ++module_param_array(exclusive_caps, bool, NULL, 0444); ++/* FIXXME: wording */ ++MODULE_PARM_DESC( ++ exclusive_caps, 
++ "whether to announce OUTPUT/CAPTURE capabilities exclusively or not [DEFAULT: " __stringify( ++ V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS) "]"); ++ ++/* format specifications */ ++#define V4L2LOOPBACK_SIZE_MIN_WIDTH 2 ++#define V4L2LOOPBACK_SIZE_MIN_HEIGHT 1 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH 8192 ++#define V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT 8192 ++ ++#define V4L2LOOPBACK_SIZE_DEFAULT_WIDTH 640 ++#define V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT 480 ++ ++static int max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++module_param(max_width, int, S_IRUGO); ++MODULE_PARM_DESC(max_width, ++ "maximum allowed frame width [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH) "]"); ++static int max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++module_param(max_height, int, S_IRUGO); ++MODULE_PARM_DESC(max_height, ++ "maximum allowed frame height [DEFAULT: " __stringify( ++ V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT) "]"); ++ ++static DEFINE_IDR(v4l2loopback_index_idr); ++static DEFINE_MUTEX(v4l2loopback_ctl_mutex); ++ ++/* frame intervals */ ++#define V4L2LOOPBACK_FRAME_INTERVAL_MAX __UINT32_MAX__ ++#define V4L2LOOPBACK_FPS_DEFAULT 30 ++#define V4L2LOOPBACK_FPS_MAX 1000 ++ ++/* control IDs */ ++#define V4L2LOOPBACK_CID_BASE (V4L2_CID_USER_BASE | 0xf000) ++#define CID_KEEP_FORMAT (V4L2LOOPBACK_CID_BASE + 0) ++#define CID_SUSTAIN_FRAMERATE (V4L2LOOPBACK_CID_BASE + 1) ++#define CID_TIMEOUT (V4L2LOOPBACK_CID_BASE + 2) ++#define CID_TIMEOUT_IMAGE_IO (V4L2LOOPBACK_CID_BASE + 3) ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl); ++static const struct v4l2_ctrl_ops v4l2loopback_ctrl_ops = { ++ .s_ctrl = v4l2loopback_s_ctrl, ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_keepformat = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_KEEP_FORMAT, ++ .name = "keep_format", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_sustainframerate = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_SUSTAIN_FRAMERATE, ++ .name = "sustain_framerate", ++ .type = V4L2_CTRL_TYPE_BOOLEAN, ++ .min = 0, ++ .max = 1, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeout = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT, ++ .name = "timeout", ++ .type = V4L2_CTRL_TYPE_INTEGER, ++ .min = 0, ++ .max = MAX_TIMEOUT, ++ .step = 1, ++ .def = 0, ++ // clang-format on ++}; ++static const struct v4l2_ctrl_config v4l2loopback_ctrl_timeoutimageio = { ++ // clang-format off ++ .ops = &v4l2loopback_ctrl_ops, ++ .id = CID_TIMEOUT_IMAGE_IO, ++ .name = "timeout_image_io", ++ .type = V4L2_CTRL_TYPE_BUTTON, ++ .min = 0, ++ .max = 0, ++ .step = 0, ++ .def = 0, ++ // clang-format on ++}; ++ ++/* module structures */ ++struct v4l2loopback_private { ++ int device_nr; ++}; ++ ++/* TODO(vasaka) use typenames which are common to kernel, but first find out if ++ * it is needed */ ++/* struct keeping state and settings of loopback device */ ++ ++struct v4l2l_buffer { ++ struct v4l2_buffer buffer; ++ struct list_head list_head; ++ atomic_t use_count; ++}; ++ ++struct v4l2_loopback_device { ++ struct v4l2_device v4l2_dev; ++ struct v4l2_ctrl_handler ctrl_handler; ++ struct video_device *vdev; ++ ++ /* loopback device-specific parameters */ ++ char card_label[32]; ++ bool announce_all_caps; /* announce both OUTPUT and CAPTURE capabilities ++ * when true; else 
announce OUTPUT when no ++ * writer is streaming, otherwise CAPTURE. */ ++ int max_openers; /* how many times can this device be opened */ ++ int min_width, max_width; ++ int min_height, max_height; ++ ++ /* pixel and stream format */ ++ struct v4l2_pix_format pix_format; ++ bool pix_format_has_valid_sizeimage; ++ struct v4l2_captureparm capture_param; ++ unsigned long frame_jiffies; ++ ++ /* ctrls */ ++ int keep_format; /* CID_KEEP_FORMAT; lock the format, do not free ++ * on close(), and when `!announce_all_caps` do NOT ++ * fall back to OUTPUT when no writers attached (clear ++ * `keep_format` to attach a new writer) */ ++ int sustain_framerate; /* CID_SUSTAIN_FRAMERATE; duplicate frames to maintain ++ (close to) nominal framerate */ ++ unsigned long timeout_jiffies; /* CID_TIMEOUT; 0 means disabled */ ++ int timeout_image_io; /* CID_TIMEOUT_IMAGE_IO; next opener will ++ * queue/dequeue the timeout image buffer */ ++ ++ /* buffers for OUTPUT and CAPTURE */ ++ u8 *image; /* pointer to actual buffers data */ ++ unsigned long image_size; /* number of bytes alloc'd for all buffers */ ++ struct v4l2l_buffer buffers[MAX_BUFFERS]; /* inner driver buffers */ ++ u32 buffer_count; /* should not be big, 4 is a good choice */ ++ u32 buffer_size; /* number of bytes alloc'd per buffer */ ++ u32 used_buffer_count; /* number of buffers allocated to openers */ ++ struct list_head outbufs_list; /* FIFO queue for OUTPUT buffers */ ++ u32 bufpos2index[MAX_BUFFERS]; /* mapping of `(position % used_buffers)` ++ * to `buffers[index]` */ ++ s64 write_position; /* sequence number of last 'displayed' buffer plus ++ * one */ ++ ++ /* synchronization between openers */ ++ atomic_t open_count; ++ struct mutex image_mutex; /* mutex for allocating image(s) and ++ * exchanging format tokens */ ++ spinlock_t lock; /* lock for the timeout and framerate timers */ ++ spinlock_t list_lock; /* lock for the OUTPUT buffer queue */ ++ wait_queue_head_t read_event; ++ u32 format_tokens; /* tokens to 'set format' for OUTPUT, CAPTURE, or ++ * timeout buffers */ ++ u32 stream_tokens; /* tokens to 'start' OUTPUT, CAPTURE, or timeout ++ * stream */ ++ ++ /* sustain framerate */ ++ struct timer_list sustain_timer; ++ unsigned int reread_count; ++ ++ /* timeout */ ++ u8 *timeout_image; /* copied to outgoing buffers when timeout passes */ ++ struct v4l2l_buffer timeout_buffer; ++ u32 timeout_buffer_size; /* number bytes alloc'd for timeout buffer */ ++ struct timer_list timeout_timer; ++ int timeout_happened; ++}; ++ ++enum v4l2l_io_method { ++ V4L2L_IO_NONE = 0, ++ V4L2L_IO_MMAP = 1, ++ V4L2L_IO_FILE = 2, ++ V4L2L_IO_TIMEOUT = 3, ++}; ++ ++/* struct keeping state and type of opener */ ++struct v4l2_loopback_opener { ++ u32 format_token; /* token (if any) for type used in call to S_FMT or ++ * REQBUFS */ ++ u32 stream_token; /* token (if any) for type used in call to STREAMON */ ++ u32 buffer_count; /* number of buffers (if any) that opener acquired via ++ * REQBUFS */ ++ s64 read_position; /* sequence number of the next 'captured' frame */ ++ unsigned int reread_count; ++ enum v4l2l_io_method io_method; ++ ++ struct v4l2_fh fh; ++}; ++ ++#define fh_to_opener(ptr) container_of((ptr), struct v4l2_loopback_opener, fh) ++ ++/* this is heavily inspired by the bttv driver found in the linux kernel */ ++struct v4l2l_format { ++ char *name; ++ int fourcc; /* video4linux 2 */ ++ int depth; /* bit/pixel */ ++ int flags; ++}; ++/* set the v4l2l_format.flags to PLANAR for non-packed formats */ ++#define FORMAT_FLAGS_PLANAR 0x01 ++#define 
FORMAT_FLAGS_COMPRESSED 0x02 ++ ++#include "v4l2loopback_formats.h" ++ ++#ifndef V4L2_TYPE_IS_CAPTURE ++#define V4L2_TYPE_IS_CAPTURE(type) \ ++ ((type) == V4L2_BUF_TYPE_VIDEO_CAPTURE || \ ++ (type) == V4L2_BUF_TYPE_VIDEO_CAPTURE_MPLANE) ++#endif /* V4L2_TYPE_IS_CAPTURE */ ++#ifndef V4L2_TYPE_IS_OUTPUT ++#define V4L2_TYPE_IS_OUTPUT(type) \ ++ ((type) == V4L2_BUF_TYPE_VIDEO_OUTPUT || \ ++ (type) == V4L2_BUF_TYPE_VIDEO_OUTPUT_MPLANE) ++#endif /* V4L2_TYPE_IS_OUTPUT */ ++ ++/* token values for privilege to set format or start/stop stream */ ++#define V4L2L_TOKEN_CAPTURE 0x01 ++#define V4L2L_TOKEN_OUTPUT 0x02 ++#define V4L2L_TOKEN_TIMEOUT 0x04 ++#define V4L2L_TOKEN_MASK \ ++ (V4L2L_TOKEN_CAPTURE | V4L2L_TOKEN_OUTPUT | V4L2L_TOKEN_TIMEOUT) ++ ++/* helpers for token exchange and token status */ ++#define token_from_type(type) \ ++ (V4L2_TYPE_IS_CAPTURE(type) ? V4L2L_TOKEN_CAPTURE : V4L2L_TOKEN_OUTPUT) ++#define acquire_token(dev, opener, label, token) \ ++ do { \ ++ (opener)->label##_token = token; \ ++ (dev)->label##_tokens &= ~token; \ ++ } while (0) ++#define release_token(dev, opener, label) \ ++ do { \ ++ (dev)->label##_tokens |= (opener)->label##_token; \ ++ (opener)->label##_token = 0; \ ++ } while (0) ++#define has_output_token(token) (token & V4L2L_TOKEN_OUTPUT) ++#define has_capture_token(token) (token & V4L2L_TOKEN_CAPTURE) ++#define has_no_owners(dev) ((~((dev)->format_tokens) & V4L2L_TOKEN_MASK) == 0) ++#define has_other_owners(opener, dev) \ ++ (~((dev)->format_tokens ^ (opener)->format_token) & V4L2L_TOKEN_MASK) ++#define need_timeout_buffer(dev, token) \ ++ ((dev)->timeout_jiffies > 0 || (token) & V4L2L_TOKEN_TIMEOUT) ++ ++static const unsigned int FORMATS = ARRAY_SIZE(formats); ++ ++static char *fourcc2str(unsigned int fourcc, char buf[5]) ++{ ++ buf[0] = (fourcc >> 0) & 0xFF; ++ buf[1] = (fourcc >> 8) & 0xFF; ++ buf[2] = (fourcc >> 16) & 0xFF; ++ buf[3] = (fourcc >> 24) & 0xFF; ++ buf[4] = 0; ++ ++ return buf; ++} ++ ++static const struct v4l2l_format *format_by_fourcc(int fourcc) ++{ ++ unsigned int i; ++ char buf[5]; ++ ++ for (i = 0; i < FORMATS; i++) { ++ if (formats[i].fourcc == fourcc) ++ return formats + i; ++ } ++ ++ dprintk("unsupported format '%4s'\n", fourcc2str(fourcc, buf)); ++ return NULL; ++} ++ ++static void pix_format_set_size(struct v4l2_pix_format *f, ++ const struct v4l2l_format *fmt, ++ unsigned int width, unsigned int height) ++{ ++ f->width = width; ++ f->height = height; ++ ++ if (fmt->flags & FORMAT_FLAGS_PLANAR) { ++ f->bytesperline = width; /* Y plane */ ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else if (fmt->flags & FORMAT_FLAGS_COMPRESSED) { ++ /* doesn't make sense for compressed formats */ ++ f->bytesperline = 0; ++ f->sizeimage = (width * height * fmt->depth) >> 3; ++ } else { ++ f->bytesperline = (width * fmt->depth) >> 3; ++ f->sizeimage = height * f->bytesperline; ++ } ++} ++ ++static int v4l2l_fill_format(struct v4l2_format *fmt, const u32 minwidth, ++ const u32 maxwidth, const u32 minheight, ++ const u32 maxheight) ++{ ++ u32 width = fmt->fmt.pix.width, height = fmt->fmt.pix.height; ++ u32 pixelformat = fmt->fmt.pix.pixelformat; ++ struct v4l2_format fmt0 = *fmt; ++ u32 bytesperline = 0, sizeimage = 0; ++ ++ if (!width) ++ width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ if (!height) ++ height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ width = clamp_val(width, minwidth, maxwidth); ++ height = clamp_val(height, minheight, maxheight); ++ ++ /* sets: width,height,pixelformat,bytesperline,sizeimage */ ++ if 
(!(V4L2_TYPE_IS_MULTIPLANAR(fmt0.type))) { ++ fmt0.fmt.pix.bytesperline = 0; ++ fmt0.fmt.pix.sizeimage = 0; ++ } ++ ++ if (0) { ++ ; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) ++ } else if (!v4l2_fill_pixfmt(&fmt0.fmt.pix, pixelformat, width, ++ height)) { ++ ; ++ } else if (!v4l2_fill_pixfmt_mp(&fmt0.fmt.pix_mp, pixelformat, width, ++ height)) { ++ ; ++#endif ++ } else { ++ const struct v4l2l_format *format = ++ format_by_fourcc(pixelformat); ++ if (!format) ++ return -EINVAL; ++ pix_format_set_size(&fmt0.fmt.pix, format, width, height); ++ fmt0.fmt.pix.pixelformat = format->fourcc; ++ } ++ ++ if (V4L2_TYPE_IS_MULTIPLANAR(fmt0.type)) { ++ *fmt = fmt0; ++ ++ if ((fmt->fmt.pix_mp.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix_mp.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix_mp.colorspace = V4L2_COLORSPACE_SRGB; ++ if (V4L2_FIELD_ANY == fmt->fmt.pix_mp.field) ++ fmt->fmt.pix_mp.field = V4L2_FIELD_NONE; ++ } else { ++ bytesperline = fmt->fmt.pix.bytesperline; ++ sizeimage = fmt->fmt.pix.sizeimage; ++ ++ *fmt = fmt0; ++ ++ if (!fmt->fmt.pix.bytesperline) ++ fmt->fmt.pix.bytesperline = bytesperline; ++ if (!fmt->fmt.pix.sizeimage) ++ fmt->fmt.pix.sizeimage = sizeimage; ++ ++ if ((fmt->fmt.pix.colorspace == V4L2_COLORSPACE_DEFAULT) || ++ (fmt->fmt.pix.colorspace > V4L2_COLORSPACE_DCI_P3)) ++ fmt->fmt.pix.colorspace = V4L2_COLORSPACE_SRGB; ++ if (V4L2_FIELD_ANY == fmt->fmt.pix.field) ++ fmt->fmt.pix.field = V4L2_FIELD_NONE; ++ } ++ ++ return 0; ++} ++ ++/* Checks if v4l2l_fill_format() has set a valid, fixed sizeimage val. */ ++static bool v4l2l_pix_format_has_valid_sizeimage(struct v4l2_format *fmt) ++{ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 2, 0) ++ const struct v4l2_format_info *info; ++ ++ info = v4l2_format_info(fmt->fmt.pix.pixelformat); ++ if (info && info->mem_planes == 1) ++ return true; ++#endif ++ ++ return false; ++} ++ ++static int pix_format_eq(const struct v4l2_pix_format *ref, ++ const struct v4l2_pix_format *tgt, int strict) ++{ ++ /* check if the two formats are equivalent. 
++ * ANY fields are handled gracefully ++ */ ++#define _pix_format_eq0(x) \ ++ if (ref->x != tgt->x) \ ++ result = 0 ++#define _pix_format_eq1(x, def) \ ++ do { \ ++ if ((def != tgt->x) && (ref->x != tgt->x)) { \ ++ printk(KERN_INFO #x " failed"); \ ++ result = 0; \ ++ } \ ++ } while (0) ++ int result = 1; ++ _pix_format_eq0(width); ++ _pix_format_eq0(height); ++ _pix_format_eq0(pixelformat); ++ if (!strict) ++ return result; ++ _pix_format_eq1(field, V4L2_FIELD_ANY); ++ _pix_format_eq0(bytesperline); ++ _pix_format_eq0(sizeimage); ++ _pix_format_eq1(colorspace, V4L2_COLORSPACE_DEFAULT); ++ return result; ++} ++ ++static void set_timeperframe(struct v4l2_loopback_device *dev, ++ struct v4l2_fract *tpf) ++{ ++ if (!tpf->denominator && !tpf->numerator) { ++ tpf->numerator = 1; ++ tpf->denominator = V4L2LOOPBACK_FPS_DEFAULT; ++ } else if (tpf->numerator > ++ V4L2LOOPBACK_FRAME_INTERVAL_MAX * tpf->denominator) { ++ /* divide-by-zero or greater than maximum interval => min FPS */ ++ tpf->numerator = V4L2LOOPBACK_FRAME_INTERVAL_MAX; ++ tpf->denominator = 1; ++ } else if (tpf->numerator * V4L2LOOPBACK_FPS_MAX < tpf->denominator) { ++ /* zero or lower than minimum interval => max FPS */ ++ tpf->numerator = 1; ++ tpf->denominator = V4L2LOOPBACK_FPS_MAX; ++ } ++ ++ dev->capture_param.timeperframe = *tpf; ++ dev->frame_jiffies = ++ max(1UL, (msecs_to_jiffies(1000) * tpf->numerator) / ++ tpf->denominator); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd); ++ ++/* device attributes */ ++/* available via sysfs: /sys/devices/virtual/video4linux/video* */ ++ ++static ssize_t attr_show_format(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ /* gets the current format as "FOURCC:WxH@f/s", e.g. "YUYV:320x240@1000/30" */ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ const struct v4l2_fract *tpf; ++ char buf4cc[5], buf_fps[32]; ++ ++ if (!dev || (has_no_owners(dev) && !dev->keep_format)) ++ return 0; ++ tpf = &dev->capture_param.timeperframe; ++ ++ fourcc2str(dev->pix_format.pixelformat, buf4cc); ++ if (tpf->numerator == 1) ++ snprintf(buf_fps, sizeof(buf_fps), "%u", tpf->denominator); ++ else ++ snprintf(buf_fps, sizeof(buf_fps), "%u/%u", tpf->denominator, ++ tpf->numerator); ++ return sprintf(buf, "%4s:%ux%u@%s\n", buf4cc, dev->pix_format.width, ++ dev->pix_format.height, buf_fps); ++} ++ ++static ssize_t attr_store_format(struct device *cd, ++ struct device_attribute *attr, const char *buf, ++ size_t len) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ int fps_num = 0, fps_den = 1; ++ ++ if (!dev) ++ return -ENODEV; ++ ++ /* only fps changing is supported */ ++ if (sscanf(buf, "@%u/%u", &fps_num, &fps_den) > 0) { ++ struct v4l2_fract f = { .numerator = fps_den, ++ .denominator = fps_num }; ++ set_timeperframe(dev, &f); ++ return len; ++ } ++ return -EINVAL; ++} ++ ++static DEVICE_ATTR(format, S_IRUGO | S_IWUSR, attr_show_format, ++ attr_store_format); ++ ++static ssize_t attr_show_buffers(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, "%u\n", dev->used_buffer_count); ++} ++ ++static DEVICE_ATTR(buffers, S_IRUGO, attr_show_buffers, NULL); ++ ++static ssize_t attr_show_maxopeners(struct device *cd, ++ struct device_attribute *attr, char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ return sprintf(buf, 
"%d\n", dev->max_openers); ++} ++ ++static ssize_t attr_store_maxopeners(struct device *cd, ++ struct device_attribute *attr, ++ const char *buf, size_t len) ++{ ++ struct v4l2_loopback_device *dev = NULL; ++ unsigned long curr = 0; ++ ++ if (kstrtoul(buf, 0, &curr)) ++ return -EINVAL; ++ ++ dev = v4l2loopback_cd2dev(cd); ++ if (!dev) ++ return -ENODEV; ++ ++ if (dev->max_openers == curr) ++ return len; ++ ++ if (curr > __INT_MAX__ || dev->open_count.counter > curr) { ++ /* request to limit to less openers as are currently attached to us */ ++ return -EINVAL; ++ } ++ ++ dev->max_openers = (int)curr; ++ ++ return len; ++} ++ ++static DEVICE_ATTR(max_openers, S_IRUGO | S_IWUSR, attr_show_maxopeners, ++ attr_store_maxopeners); ++ ++static ssize_t attr_show_state(struct device *cd, struct device_attribute *attr, ++ char *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_cd2dev(cd); ++ ++ if (!dev) ++ return -ENODEV; ++ ++ if (!has_output_token(dev->stream_tokens) || dev->keep_format) { ++ return sprintf(buf, "capture\n"); ++ } else ++ return sprintf(buf, "output\n"); ++ ++ return -EAGAIN; ++} ++ ++static DEVICE_ATTR(state, S_IRUGO, attr_show_state, NULL); ++ ++static void v4l2loopback_remove_sysfs(struct video_device *vdev) ++{ ++#define V4L2_SYSFS_DESTROY(x) device_remove_file(&vdev->dev, &dev_attr_##x) ++ ++ if (vdev) { ++ V4L2_SYSFS_DESTROY(format); ++ V4L2_SYSFS_DESTROY(buffers); ++ V4L2_SYSFS_DESTROY(max_openers); ++ V4L2_SYSFS_DESTROY(state); ++ /* ... */ ++ } ++} ++ ++static void v4l2loopback_create_sysfs(struct video_device *vdev) ++{ ++ int res = 0; ++ ++#define V4L2_SYSFS_CREATE(x) \ ++ res = device_create_file(&vdev->dev, &dev_attr_##x); \ ++ if (res < 0) \ ++ break ++ if (!vdev) ++ return; ++ do { ++ V4L2_SYSFS_CREATE(format); ++ V4L2_SYSFS_CREATE(buffers); ++ V4L2_SYSFS_CREATE(max_openers); ++ V4L2_SYSFS_CREATE(state); ++ /* ... */ ++ } while (0); ++ ++ if (res >= 0) ++ return; ++ dev_err(&vdev->dev, "%s error: %d\n", __func__, res); ++} ++ ++/* Event APIs */ ++ ++#define V4L2LOOPBACK_EVENT_BASE (V4L2_EVENT_PRIVATE_START) ++#define V4L2LOOPBACK_EVENT_OFFSET 0x08E00000 ++#define V4L2_EVENT_PRI_CLIENT_USAGE \ ++ (V4L2LOOPBACK_EVENT_BASE + V4L2LOOPBACK_EVENT_OFFSET + 1) ++ ++struct v4l2_event_client_usage { ++ __u32 count; ++}; ++ ++/* global module data */ ++/* find a device based on it's device-number (e.g. 
'3' for /dev/video3) */ ++struct v4l2loopback_lookup_cb_data { ++ int device_nr; ++ struct v4l2_loopback_device *device; ++}; ++static int v4l2loopback_lookup_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *device = ptr; ++ struct v4l2loopback_lookup_cb_data *cbdata = data; ++ if (cbdata && device && device->vdev) { ++ if (device->vdev->num == cbdata->device_nr) { ++ cbdata->device = device; ++ cbdata->device_nr = id; ++ return 1; ++ } ++ } ++ return 0; ++} ++static int v4l2loopback_lookup(int device_nr, ++ struct v4l2_loopback_device **device) ++{ ++ struct v4l2loopback_lookup_cb_data data = { ++ .device_nr = device_nr, ++ .device = NULL, ++ }; ++ int err = idr_for_each(&v4l2loopback_index_idr, &v4l2loopback_lookup_cb, ++ &data); ++ if (1 == err) { ++ if (device) ++ *device = data.device; ++ return data.device_nr; ++ } ++ return -ENODEV; ++} ++#define v4l2loopback_get_vdev_nr(vdev) \ ++ ((struct v4l2loopback_private *)video_get_drvdata(vdev))->device_nr ++static struct v4l2_loopback_device *v4l2loopback_cd2dev(struct device *cd) ++{ ++ struct video_device *loopdev = to_video_device(cd); ++ int device_nr = v4l2loopback_get_vdev_nr(loopdev); ++ ++ return idr_find(&v4l2loopback_index_idr, device_nr); ++} ++ ++static struct v4l2_loopback_device *v4l2loopback_getdevice(struct file *f) ++{ ++ struct v4l2loopback_private *ptr = video_drvdata(f); ++ int nr = ptr->device_nr; ++ ++ return idr_find(&v4l2loopback_index_idr, nr); ++} ++ ++/* forward declarations */ ++static void client_usage_queue_event(struct video_device *vdev); ++static bool any_buffers_mapped(struct v4l2_loopback_device *dev); ++static int allocate_buffers(struct v4l2_loopback_device *dev, ++ struct v4l2_pix_format *pix_format); ++static void init_buffers(struct v4l2_loopback_device *dev, u32 bytes_used, ++ u32 buffer_size); ++static void free_buffers(struct v4l2_loopback_device *dev); ++static int allocate_timeout_buffer(struct v4l2_loopback_device *dev); ++static void free_timeout_buffer(struct v4l2_loopback_device *dev); ++static void check_timers(struct v4l2_loopback_device *dev); ++static const struct v4l2_file_operations v4l2_loopback_fops; ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops; ++ ++/* V4L2 ioctl caps and params calls */ ++/* returns device capabilities ++ * called on VIDIOC_QUERYCAP ++ */ ++static int vidioc_querycap(struct file *file, void *fh, ++ struct v4l2_capability *cap) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ int device_nr = v4l2loopback_get_vdev_nr(dev->vdev); ++ __u32 capabilities = V4L2_CAP_STREAMING | V4L2_CAP_READWRITE; ++ ++ strscpy(cap->driver, "v4l2 loopback", sizeof(cap->driver)); ++ snprintf(cap->card, sizeof(cap->card), "%s", dev->card_label); ++ snprintf(cap->bus_info, sizeof(cap->bus_info), ++ "platform:v4l2loopback-%03d", device_nr); ++ ++ if (dev->announce_all_caps) { ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE | V4L2_CAP_VIDEO_OUTPUT; ++ } else { ++ if (opener->io_method == V4L2L_IO_TIMEOUT || ++ (has_output_token(dev->stream_tokens) && ++ !dev->keep_format)) { ++ capabilities |= V4L2_CAP_VIDEO_OUTPUT; ++ } else ++ capabilities |= V4L2_CAP_VIDEO_CAPTURE; ++ } ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ dev->vdev->device_caps = ++#endif /* >=linux-4.7.0 */ ++ cap->device_caps = cap->capabilities = capabilities; ++ ++ cap->capabilities |= V4L2_CAP_DEVICE_CAPS; ++ ++ memset(cap->reserved, 0, sizeof(cap->reserved)); ++ return 0; ++} ++ ++static int 
vidioc_enum_framesizes(struct file *file, void *fh, ++ struct v4l2_frmsizeenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ /* there can be only one... */ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (dev->keep_format || has_other_owners(opener, dev)) { ++ /* only current frame size supported */ ++ if (argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->pix_format.width; ++ argp->discrete.height = dev->pix_format.height; ++ } else { ++ /* return continuous sizes if pixel format is supported */ ++ if (NULL == format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ if (dev->min_width == dev->max_width && ++ dev->min_height == dev->max_height) { ++ argp->type = V4L2_FRMSIZE_TYPE_DISCRETE; ++ ++ argp->discrete.width = dev->min_width; ++ argp->discrete.height = dev->min_height; ++ } else { ++ argp->type = V4L2_FRMSIZE_TYPE_CONTINUOUS; ++ ++ argp->stepwise.min_width = dev->min_width; ++ argp->stepwise.min_height = dev->min_height; ++ ++ argp->stepwise.max_width = dev->max_width; ++ argp->stepwise.max_height = dev->max_height; ++ ++ argp->stepwise.step_width = 1; ++ argp->stepwise.step_height = 1; ++ } ++ } ++ return 0; ++} ++ ++/* Test if the device is currently 'capable' of the buffer (stream) type when ++ * the `exclusive_caps` parameter is set. `keep_format` should lock the format ++ * and prevent free of buffers */ ++static int check_buffer_capability(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener, ++ enum v4l2_buf_type type) ++{ ++ /* short-circuit for (non-compliant) timeout image mode */ ++ if (opener->io_method == V4L2L_IO_TIMEOUT) ++ return 0; ++ if (dev->announce_all_caps) ++ return (type == V4L2_BUF_TYPE_VIDEO_CAPTURE || ++ type == V4L2_BUF_TYPE_VIDEO_OUTPUT) ? ++ 0 : ++ -EINVAL; ++ /* CAPTURE if opener has a capture format or a writer is streaming; ++ * else OUTPUT. */ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (!(has_capture_token(opener->format_token) || ++ !has_output_token(dev->stream_tokens))) ++ return -EINVAL; ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!(has_output_token(opener->format_token) || ++ has_output_token(dev->stream_tokens))) ++ return -EINVAL; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++/* returns frameinterval (fps) for the set resolution ++ * called on VIDIOC_ENUM_FRAMEINTERVALS ++ */ ++static int vidioc_enum_frameintervals(struct file *file, void *fh, ++ struct v4l2_frmivalenum *argp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ /* there can be only one... 
*/ ++ if (argp->index) ++ return -EINVAL; ++ ++ if (dev->keep_format || has_other_owners(opener, dev)) { ++ /* keep_format also locks the frame rate */ ++ if (argp->width != dev->pix_format.width || ++ argp->height != dev->pix_format.height || ++ argp->pixel_format != dev->pix_format.pixelformat) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_DISCRETE; ++ argp->discrete = dev->capture_param.timeperframe; ++ } else { ++ if (argp->width < dev->min_width || ++ argp->width > dev->max_width || ++ argp->height < dev->min_height || ++ argp->height > dev->max_height || ++ !format_by_fourcc(argp->pixel_format)) ++ return -EINVAL; ++ ++ argp->type = V4L2_FRMIVAL_TYPE_CONTINUOUS; ++ argp->stepwise.min.numerator = 1; ++ argp->stepwise.min.denominator = V4L2LOOPBACK_FPS_MAX; ++ argp->stepwise.max.numerator = V4L2LOOPBACK_FRAME_INTERVAL_MAX; ++ argp->stepwise.max.denominator = 1; ++ argp->stepwise.step.numerator = 1; ++ argp->stepwise.step.denominator = 1; ++ } ++ ++ return 0; ++} ++ ++/* Enumerate device formats ++ * Returns: ++ * - EINVAL the index is out of bounds; or if non-zero when format is fixed ++ * - EFAULT unexpected null pointer */ ++static int vidioc_enum_fmt_vid(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ int fixed = dev->keep_format || has_other_owners(opener, dev); ++ const struct v4l2l_format *fmt; ++ ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ ++ if (!(f->index < FORMATS)) ++ return -EINVAL; ++ /* TODO: Support 6.14 V4L2_FMTDESC_FLAG_ENUM_ALL */ ++ if (fixed && f->index) ++ return -EINVAL; ++ ++ fmt = fixed ? format_by_fourcc(dev->pix_format.pixelformat) : ++ &formats[f->index]; ++ if (!fmt) ++ return -EFAULT; ++ ++ f->flags = 0; ++ if (fmt->flags & FORMAT_FLAGS_COMPRESSED) ++ f->flags |= V4L2_FMT_FLAG_COMPRESSED; ++ snprintf(f->description, sizeof(f->description), fmt->name); ++ f->pixelformat = fmt->fourcc; ++ return 0; ++} ++ ++/* Tests (or tries) the format. ++ * Returns: ++ * - EINVAL if the buffer type or format is not supported ++ */ ++static int vidioc_try_fmt_vid(struct file *file, void *fh, ++ struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ if (v4l2l_fill_format(f, dev->min_width, dev->max_width, ++ dev->min_height, dev->max_height) != 0) ++ return -EINVAL; ++ if (dev->keep_format || has_other_owners(opener, dev)) ++ /* use existing format - including colorspace info */ ++ f->fmt.pix = dev->pix_format; ++ ++ return 0; ++} ++ ++/* Sets new format. Fills 'f' argument with the requested or existing format. ++ * Side-effect: buffers are allocated for the (returned) format. ++ * Returns: ++ * - EINVAL if the type is not supported ++ * - EBUSY if buffers are already allocated ++ * TODO: (vasaka) set subregions of input ++ */ ++static int vidioc_s_fmt_vid(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = opener->io_method == V4L2L_IO_TIMEOUT ? 
++ V4L2L_TOKEN_TIMEOUT : ++ token_from_type(f->type); ++ int changed, result; ++ char buf[5]; ++ ++ result = vidioc_try_fmt_vid(file, fh, f); ++ if (result < 0) ++ return result; ++ ++ if (opener->buffer_count > 0) ++ /* must free buffers before format can be set */ ++ return -EBUSY; ++ ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ ++ if (opener->format_token) ++ release_token(dev, opener, format); ++ if (!(dev->format_tokens & token)) { ++ result = -EBUSY; ++ goto exit_s_fmt_unlock; ++ } ++ ++ dprintk("S_FMT[%s] %4s:%ux%u size=%u\n", ++ V4L2_TYPE_IS_CAPTURE(f->type) ? "CAPTURE" : "OUTPUT", ++ fourcc2str(f->fmt.pix.pixelformat, buf), f->fmt.pix.width, ++ f->fmt.pix.height, f->fmt.pix.sizeimage); ++ changed = !pix_format_eq(&dev->pix_format, &f->fmt.pix, 0); ++ if (changed || has_no_owners(dev)) { ++ result = allocate_buffers(dev, &f->fmt.pix); ++ if (result < 0) ++ goto exit_s_fmt_unlock; ++ } ++ if ((dev->timeout_image && changed) || ++ (!dev->timeout_image && need_timeout_buffer(dev, token))) { ++ result = allocate_timeout_buffer(dev); ++ if (result < 0) ++ goto exit_s_fmt_free; ++ } ++ if (changed) { ++ dev->pix_format = f->fmt.pix; ++ dev->pix_format_has_valid_sizeimage = ++ v4l2l_pix_format_has_valid_sizeimage(f); ++ } ++ acquire_token(dev, opener, format, token); ++ if (opener->io_method == V4L2L_IO_TIMEOUT) ++ dev->timeout_image_io = 0; ++ goto exit_s_fmt_unlock; ++exit_s_fmt_free: ++ free_buffers(dev); ++exit_s_fmt_unlock: ++ mutex_unlock(&dev->image_mutex); ++ return result; ++} ++ ++/* ------------------ CAPTURE ----------------------- */ ++/* ioctl for VIDIOC_ENUM_FMT, _G_FMT, _S_FMT, and _TRY_FMT when buffer type ++ * is V4L2_BUF_TYPE_VIDEO_CAPTURE */ ++ ++static int vidioc_enum_fmt_cap(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ return vidioc_enum_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_g_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ f->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++static int vidioc_try_fmt_cap(struct file *file, void *fh, ++ struct v4l2_format *f) ++{ ++ return vidioc_try_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_s_fmt_cap(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ return vidioc_s_fmt_vid(file, fh, f); ++} ++ ++/* ------------------ OUTPUT ----------------------- */ ++/* ioctl for VIDIOC_ENUM_FMT, _G_FMT, _S_FMT, and _TRY_FMT when buffer type ++ * is V4L2_BUF_TYPE_VIDEO_OUTPUT */ ++ ++static int vidioc_enum_fmt_out(struct file *file, void *fh, ++ struct v4l2_fmtdesc *f) ++{ ++ return vidioc_enum_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_g_fmt_out(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, f->type) < 0) ++ return -EINVAL; ++ /* ++ * LATER: this should return the currently valid format ++ * gstreamer doesn't like it, if this returns -EINVAL, as it ++ * then concludes that there is _no_ valid format ++ * CHECK whether this assumption is wrong, ++ * or whether we have to always provide a valid format ++ */ ++ f->fmt.pix = dev->pix_format; ++ return 0; ++} ++ ++static int vidioc_try_fmt_out(struct file *file, void *fh, ++ struct v4l2_format *f) ++{ ++ return 
vidioc_try_fmt_vid(file, fh, f); ++} ++ ++static int vidioc_s_fmt_out(struct file *file, void *fh, struct v4l2_format *f) ++{ ++ return vidioc_s_fmt_vid(file, fh, f); ++} ++ ++// #define V4L2L_OVERLAY ++#ifdef V4L2L_OVERLAY ++/* ------------------ OVERLAY ----------------------- */ ++/* currently unsupported */ ++/* GSTreamer's v4l2sink is buggy, as it requires the overlay to work ++ * while it should only require it, if overlay is requested ++ * once the gstreamer element is fixed, remove the overlay dummies ++ */ ++#warning OVERLAY dummies ++static int vidioc_g_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++ ++static int vidioc_s_fmt_overlay(struct file *file, void *priv, ++ struct v4l2_format *fmt) ++{ ++ return 0; ++} ++#endif /* V4L2L_OVERLAY */ ++ ++/* ------------------ PARAMs ----------------------- */ ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_G_PARM ++ */ ++static int vidioc_g_parm(struct file *file, void *fh, ++ struct v4l2_streamparm *parm) ++{ ++ /* do not care about type of opener, hope these enums would always be ++ * compatible */ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, parm->type) < 0) ++ return -EINVAL; ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++/* get some data flow parameters, only capability, fps and readbuffers has ++ * effect on this driver ++ * called on VIDIOC_S_PARM ++ */ ++static int vidioc_s_parm(struct file *file, void *fh, ++ struct v4l2_streamparm *parm) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ dprintk("S_PARM(frame-time=%u/%u)\n", ++ parm->parm.capture.timeperframe.numerator, ++ parm->parm.capture.timeperframe.denominator); ++ if (check_buffer_capability(dev, opener, parm->type) < 0) ++ return -EINVAL; ++ ++ switch (parm->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ set_timeperframe(dev, &parm->parm.capture.timeperframe); ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ set_timeperframe(dev, &parm->parm.output.timeperframe); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ parm->parm.capture = dev->capture_param; ++ return 0; ++} ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++/* sets a tv standard, actually we do not need to handle this any special way ++ * added to support effecttv ++ * called on VIDIOC_S_STD ++ */ ++static int vidioc_s_std(struct file *file, void *fh, v4l2_std_id *_std) ++{ ++ v4l2_std_id req_std = 0, supported_std = 0; ++ const v4l2_std_id all_std = V4L2_STD_ALL, no_std = 0; ++ ++ if (_std) { ++ req_std = *_std; ++ *_std = all_std; ++ } ++ ++ /* we support everything in V4L2_STD_ALL, but not more... 
*/ ++ supported_std = (all_std & req_std); ++ if (no_std == supported_std) ++ return -EINVAL; ++ ++ return 0; ++} ++ ++/* gets a fake video standard ++ * called on VIDIOC_G_STD ++ */ ++static int vidioc_g_std(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++/* gets a fake video standard ++ * called on VIDIOC_QUERYSTD ++ */ ++static int vidioc_querystd(struct file *file, void *fh, v4l2_std_id *norm) ++{ ++ if (norm) ++ *norm = V4L2_STD_ALL; ++ return 0; ++} ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++static int v4l2loopback_set_ctrl(struct v4l2_loopback_device *dev, u32 id, ++ s64 val) ++{ ++ int result = 0; ++ switch (id) { ++ case CID_KEEP_FORMAT: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ dev->keep_format = val; ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ if (!dev->keep_format) { ++ if (has_no_owners(dev) && !any_buffers_mapped(dev)) ++ free_buffers(dev); ++ } ++ mutex_unlock(&dev->image_mutex); ++ break; ++ case CID_SUSTAIN_FRAMERATE: ++ if (val < 0 || val > 1) ++ return -EINVAL; ++ spin_lock_bh(&dev->lock); ++ dev->sustain_framerate = val; ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT: ++ if (val < 0 || val > MAX_TIMEOUT) ++ return -EINVAL; ++ if (val > 0) { ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ /* on-the-fly allocate if device is owned; else ++ * allocate occurs on next S_FMT or REQBUFS */ ++ if (!has_no_owners(dev)) ++ result = allocate_timeout_buffer(dev); ++ mutex_unlock(&dev->image_mutex); ++ if (result < 0) { ++ /* disable timeout as buffer not alloc'd */ ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = 0; ++ spin_unlock_bh(&dev->lock); ++ return result; ++ } ++ } ++ spin_lock_bh(&dev->lock); ++ dev->timeout_jiffies = msecs_to_jiffies(val); ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++ break; ++ case CID_TIMEOUT_IMAGE_IO: ++ dev->timeout_image_io = 1; ++ break; ++ default: ++ return -EINVAL; ++ } ++ return 0; ++} ++ ++static int v4l2loopback_s_ctrl(struct v4l2_ctrl *ctrl) ++{ ++ struct v4l2_loopback_device *dev = container_of( ++ ctrl->handler, struct v4l2_loopback_device, ctrl_handler); ++ return v4l2loopback_set_ctrl(dev, ctrl->id, ctrl->val); ++} ++ ++/* returns set of device outputs, in our case there is only one ++ * called on VIDIOC_ENUMOUTPUT ++ */ ++static int vidioc_enum_output(struct file *file, void *fh, ++ struct v4l2_output *outp) ++{ ++ __u32 index = outp->index; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -ENOTTY; ++ if (index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(outp, 0, sizeof(*outp)); ++ ++ outp->index = index; ++ strscpy(outp->name, "loopback in", sizeof(outp->name)); ++ outp->type = V4L2_OUTPUT_TYPE_ANALOG; ++ outp->audioset = 0; ++ outp->modulator = 0; ++#ifdef V4L2LOOPBACK_WITH_STD ++ outp->std = V4L2_STD_ALL; ++#ifdef V4L2_OUT_CAP_STD ++ outp->capabilities |= V4L2_OUT_CAP_STD; ++#endif /* V4L2_OUT_CAP_STD */ ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ return 0; ++} ++ ++/* which output is currently active, ++ * called on VIDIOC_G_OUTPUT ++ */ ++static int vidioc_g_output(struct file *file, void *fh, unsigned int *index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener 
*opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -ENOTTY; ++ if (index) ++ *index = 0; ++ return 0; ++} ++ ++/* set output, can make sense if we have more than one video src, ++ * called on VIDIOC_S_OUTPUT ++ */ ++static int vidioc_s_output(struct file *file, void *fh, unsigned int index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -ENOTTY; ++ return index == 0 ? index : -EINVAL; ++} ++ ++/* returns set of device inputs, in our case there is only one, ++ * but later I may add more ++ * called on VIDIOC_ENUMINPUT ++ */ ++static int vidioc_enum_input(struct file *file, void *fh, ++ struct v4l2_input *inp) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ __u32 index = inp->index; ++ ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_CAPTURE)) ++ return -ENOTTY; ++ if (index) ++ return -EINVAL; ++ ++ /* clear all data (including the reserved fields) */ ++ memset(inp, 0, sizeof(*inp)); ++ ++ inp->index = index; ++ strscpy(inp->name, "loopback", sizeof(inp->name)); ++ inp->type = V4L2_INPUT_TYPE_CAMERA; ++ inp->audioset = 0; ++ inp->tuner = 0; ++ inp->status = 0; ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ inp->std = V4L2_STD_ALL; ++#ifdef V4L2_IN_CAP_STD ++ inp->capabilities |= V4L2_IN_CAP_STD; ++#endif ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ if (has_output_token(dev->stream_tokens) && !dev->keep_format) ++ /* if no outputs attached; pretend device is powered off */ ++ inp->status |= V4L2_IN_ST_NO_SIGNAL; ++ ++ return 0; ++} ++ ++/* which input is currently active, ++ * called on VIDIOC_G_INPUT ++ */ ++static int vidioc_g_input(struct file *file, void *fh, unsigned int *index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_CAPTURE)) ++ return -ENOTTY; /* NOTE: -EAGAIN might be more informative */ ++ if (index) ++ *index = 0; ++ return 0; ++} ++ ++/* set input, can make sense if we have more than one video src, ++ * called on VIDIOC_S_INPUT ++ */ ++static int vidioc_s_input(struct file *file, void *fh, unsigned int index) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ if (index != 0) ++ return -EINVAL; ++ if (check_buffer_capability(dev, opener, V4L2_BUF_TYPE_VIDEO_CAPTURE)) ++ return -ENOTTY; /* NOTE: -EAGAIN might be more informative */ ++ return 0; ++} ++ ++/* --------------- V4L2 ioctl buffer related calls ----------------- */ ++ ++#define is_allocated(opener, type, index) \ ++ (opener->format_token & (opener->io_method == V4L2L_IO_TIMEOUT ? 
\ ++ V4L2L_TOKEN_TIMEOUT : \ ++ token_from_type(type)) && \ ++ (index) < (opener)->buffer_count) ++#define BUFFER_DEBUG_FMT_STR \ ++ "buffer#%u @ %p type=%u bytesused=%u length=%u flags=%x " \ ++ "field=%u timestamp= %lld.%06lldsequence=%u\n" ++#define BUFFER_DEBUG_FMT_ARGS(buf) \ ++ (buf)->index, (buf), (buf)->type, (buf)->bytesused, (buf)->length, \ ++ (buf)->flags, (buf)->field, \ ++ (long long)(buf)->timestamp.tv_sec, \ ++ (long long)(buf)->timestamp.tv_usec, (buf)->sequence ++/* Buffer flag helpers */ ++#define unset_flags(flags) \ ++ do { \ ++ flags &= ~V4L2_BUF_FLAG_QUEUED; \ ++ flags &= ~V4L2_BUF_FLAG_DONE; \ ++ } while (0) ++#define set_queued(flags) \ ++ do { \ ++ flags |= V4L2_BUF_FLAG_QUEUED; \ ++ flags &= ~V4L2_BUF_FLAG_DONE; \ ++ } while (0) ++#define set_done(flags) \ ++ do { \ ++ flags &= ~V4L2_BUF_FLAG_QUEUED; \ ++ flags |= V4L2_BUF_FLAG_DONE; \ ++ } while (0) ++ ++static bool any_buffers_mapped(struct v4l2_loopback_device *dev) ++{ ++ u32 index; ++ for (index = 0; index < dev->buffer_count; ++index) ++ if (dev->buffers[index].buffer.flags & V4L2_BUF_FLAG_MAPPED) ++ return true; ++ return false; ++} ++ ++static void prepare_buffer_queue(struct v4l2_loopback_device *dev, int count) ++{ ++ struct v4l2l_buffer *bufd, *n; ++ u32 pos; ++ ++ spin_lock_bh(&dev->list_lock); ++ ++ /* ensure sufficient number of buffers in queue */ ++ for (pos = 0; pos < count; ++pos) { ++ bufd = &dev->buffers[pos]; ++ if (list_empty(&bufd->list_head)) ++ list_add_tail(&bufd->list_head, &dev->outbufs_list); ++ } ++ if (list_empty(&dev->outbufs_list)) ++ goto exit_prepare_queue_unlock; ++ ++ /* remove any excess buffers */ ++ list_for_each_entry_safe(bufd, n, &dev->outbufs_list, list_head) { ++ if (bufd->buffer.index >= count) ++ list_del_init(&bufd->list_head); ++ } ++ ++ /* buffers are no longer queued; and `write_position` will correspond ++ * to the first item of `outbufs_list`. */ ++ pos = v4l2l_mod64(dev->write_position, count); ++ list_for_each_entry(bufd, &dev->outbufs_list, list_head) { ++ unset_flags(bufd->buffer.flags); ++ dev->bufpos2index[pos % count] = bufd->buffer.index; ++ ++pos; ++ } ++exit_prepare_queue_unlock: ++ spin_unlock_bh(&dev->list_lock); ++} ++ ++/* forward declaration */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type); ++/* negotiate buffer type ++ * only mmap streaming supported ++ * called on VIDIOC_REQBUFS ++ */ ++static int vidioc_reqbufs(struct file *file, void *fh, ++ struct v4l2_requestbuffers *reqbuf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = opener->io_method == V4L2L_IO_TIMEOUT ? 
++ V4L2L_TOKEN_TIMEOUT : ++ token_from_type(reqbuf->type); ++ u32 req_count = reqbuf->count; ++ int result = 0; ++ ++ dprintk("REQBUFS(memory=%u, req_count=%u) and device-bufs=%u/%u " ++ "[used/max]\n", ++ reqbuf->memory, req_count, dev->used_buffer_count, ++ dev->buffer_count); ++ ++ switch (reqbuf->memory) { ++ case V4L2_MEMORY_MMAP: ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 20, 0) ++ reqbuf->capabilities = 0; /* only guarantee MMAP support */ ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ reqbuf->flags = 0; /* no memory consistency support */ ++#endif ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (opener->format_token & ~token) ++ /* different (buffer) type already assigned to descriptor by ++ * S_FMT or REQBUFS */ ++ return -EINVAL; ++ ++ MARK(); ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; /* -EINTR */ ++ ++ /* CASE queue/dequeue timeout-buffer only: */ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ opener->buffer_count = req_count; ++ if (req_count == 0) ++ release_token(dev, opener, format); ++ goto exit_reqbufs_unlock; ++ } ++ ++ MARK(); ++ /* CASE count is zero: streamoff, free buffers, release their token */ ++ if (req_count == 0) { ++ if (dev->format_tokens & token) { ++ acquire_token(dev, opener, format, token); ++ opener->io_method = V4L2L_IO_MMAP; ++ } ++ result = vidioc_streamoff(file, fh, reqbuf->type); ++ opener->buffer_count = 0; ++ /* undocumented requirement - REQBUFS with count zero should ++ * ALSO release lock on logical stream */ ++ if (opener->format_token) ++ release_token(dev, opener, format); ++ if (has_no_owners(dev)) ++ dev->used_buffer_count = 0; ++ goto exit_reqbufs_unlock; ++ } ++ ++ /* CASE count non-zero: allocate buffers and acquire token for them */ ++ MARK(); ++ switch (reqbuf->type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (!(dev->format_tokens & token || ++ opener->format_token & token)) ++ /* only exclusive ownership for each stream */ ++ result = -EBUSY; ++ break; ++ default: ++ result = -EINVAL; ++ } ++ if (result < 0) ++ goto exit_reqbufs_unlock; ++ ++ if (has_other_owners(opener, dev) && dev->used_buffer_count > 0) { ++ /* allow 'allocation' of existing number of buffers */ ++ req_count = dev->used_buffer_count; ++ } else if (any_buffers_mapped(dev)) { ++ /* do not allow re-allocation if buffers are mapped */ ++ result = -EBUSY; ++ goto exit_reqbufs_unlock; ++ } ++ ++ MARK(); ++ opener->buffer_count = 0; ++ ++ if (req_count > dev->buffer_count) ++ req_count = dev->buffer_count; ++ ++ if (has_no_owners(dev)) { ++ result = allocate_buffers(dev, &dev->pix_format); ++ if (result < 0) ++ goto exit_reqbufs_unlock; ++ } ++ if (!dev->timeout_image && need_timeout_buffer(dev, token)) { ++ result = allocate_timeout_buffer(dev); ++ if (result < 0) ++ goto exit_reqbufs_unlock; ++ } ++ acquire_token(dev, opener, format, token); ++ ++ MARK(); ++ switch (opener->io_method) { ++ case V4L2L_IO_TIMEOUT: ++ dev->timeout_image_io = 0; ++ opener->buffer_count = req_count; ++ break; ++ default: ++ opener->io_method = V4L2L_IO_MMAP; ++ prepare_buffer_queue(dev, req_count); ++ dev->used_buffer_count = opener->buffer_count = req_count; ++ } ++exit_reqbufs_unlock: ++ mutex_unlock(&dev->image_mutex); ++ reqbuf->count = opener->buffer_count; ++ return result; ++} ++ ++/* returns buffer asked for; ++ * give app as many buffers as it wants, if it less than MAX, ++ * but map them in our inner buffers ++ * called on VIDIOC_QUERYBUF ++ */ ++static int 
vidioc_querybuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 type = buf->type; ++ u32 index = buf->index; ++ ++ if ((type != V4L2_BUF_TYPE_VIDEO_CAPTURE) && ++ (type != V4L2_BUF_TYPE_VIDEO_OUTPUT)) ++ return -EINVAL; ++ if (!is_allocated(opener, type, index)) ++ return -EINVAL; ++ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ *buf = dev->timeout_buffer.buffer; ++ buf->index = index; ++ } else ++ *buf = dev->buffers[index].buffer; ++ ++ buf->type = type; ++ ++ if (!(buf->flags & (V4L2_BUF_FLAG_DONE | V4L2_BUF_FLAG_QUEUED))) { ++ /* v4l2-compliance requires these to be zero */ ++ buf->sequence = 0; ++ buf->timestamp.tv_sec = buf->timestamp.tv_usec = 0; ++ } else if (V4L2_TYPE_IS_CAPTURE(type)) { ++ /* guess flags based on sequence values */ ++ if (buf->sequence >= opener->read_position) { ++ set_done(buf->flags); ++ } else if (buf->flags & V4L2_BUF_FLAG_DONE) { ++ set_queued(buf->flags); ++ } ++ } ++ dprintkrw("QUERYBUF(%s, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ V4L2_TYPE_IS_CAPTURE(type) ? "CAPTURE" : "OUTPUT", index, ++ BUFFER_DEBUG_FMT_ARGS(buf)); ++ return 0; ++} ++ ++static void buffer_written(struct v4l2_loopback_device *dev, ++ struct v4l2l_buffer *buf) ++{ ++ timer_delete_sync(&dev->sustain_timer); ++ timer_delete_sync(&dev->timeout_timer); ++ ++ spin_lock_bh(&dev->list_lock); ++ list_move_tail(&buf->list_head, &dev->outbufs_list); ++ spin_unlock_bh(&dev->list_lock); ++ ++ spin_lock_bh(&dev->lock); ++ dev->bufpos2index[v4l2l_mod64(dev->write_position, ++ dev->used_buffer_count)] = ++ buf->buffer.index; ++ ++dev->write_position; ++ dev->reread_count = 0; ++ ++ check_timers(dev); ++ spin_unlock_bh(&dev->lock); ++} ++ ++/* put buffer to queue ++ * called on VIDIOC_QBUF ++ */ ++static int vidioc_qbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ struct v4l2l_buffer *bufd; ++ u32 index = buf->index; ++ u32 type = buf->type; ++ ++ if (!is_allocated(opener, type, index)) ++ return -EINVAL; ++ bufd = &dev->buffers[index]; ++ ++ switch (buf->memory) { ++ case V4L2_MEMORY_MMAP: ++ if (!(bufd->buffer.flags & V4L2_BUF_FLAG_MAPPED)) ++ dprintkrw("QBUF() unmapped buffer [index=%u]\n", index); ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ set_queued(buf->flags); ++ return 0; ++ } ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ dprintkrw("QBUF(CAPTURE, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ index, BUFFER_DEBUG_FMT_ARGS(buf)); ++ set_queued(buf->flags); ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ dprintkrw("QBUF(OUTPUT, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ index, BUFFER_DEBUG_FMT_ARGS(buf)); ++ if (!(bufd->buffer.flags & V4L2_BUF_FLAG_TIMESTAMP_COPY) && ++ (buf->timestamp.tv_sec == 0 && ++ buf->timestamp.tv_usec == 0)) { ++ v4l2l_get_timestamp(&bufd->buffer); ++ } else { ++ bufd->buffer.timestamp = buf->timestamp; ++ bufd->buffer.flags |= V4L2_BUF_FLAG_TIMESTAMP_COPY; ++ bufd->buffer.flags &= ++ ~V4L2_BUF_FLAG_TIMESTAMP_MONOTONIC; ++ } ++ if (dev->pix_format_has_valid_sizeimage) { ++ if (buf->bytesused >= dev->pix_format.sizeimage) { ++ bufd->buffer.bytesused = ++ dev->pix_format.sizeimage; ++ } else { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 5, 0) ++ dev_warn_ratelimited( ++ &dev->vdev->dev, ++#else ++ dprintkrw( 
++#endif ++ "warning queued output buffer bytesused too small %u < %u\n", ++ buf->bytesused, ++ dev->pix_format.sizeimage); ++ bufd->buffer.bytesused = buf->bytesused; ++ } ++ } else { ++ bufd->buffer.bytesused = buf->bytesused; ++ } ++ bufd->buffer.sequence = dev->write_position; ++ set_queued(bufd->buffer.flags); ++ *buf = bufd->buffer; ++ buffer_written(dev, bufd); ++ set_done(bufd->buffer.flags); ++ wake_up_all(&dev->read_event); ++ break; ++ default: ++ return -EINVAL; ++ } ++ buf->type = type; ++ return 0; ++} ++ ++static int can_read(struct v4l2_loopback_device *dev, ++ struct v4l2_loopback_opener *opener) ++{ ++ int ret; ++ ++ spin_lock_bh(&dev->lock); ++ check_timers(dev); ++ ret = dev->write_position > opener->read_position || ++ dev->reread_count > opener->reread_count || dev->timeout_happened; ++ spin_unlock_bh(&dev->lock); ++ return ret; ++} ++ ++static int get_capture_buffer(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int pos, timeout_happened; ++ u32 index; ++ ++ if ((file->f_flags & O_NONBLOCK) && ++ (dev->write_position <= opener->read_position && ++ dev->reread_count <= opener->reread_count && ++ !dev->timeout_happened)) ++ return -EAGAIN; ++ wait_event_interruptible(dev->read_event, can_read(dev, opener)); ++ ++ spin_lock_bh(&dev->lock); ++ if (dev->write_position == opener->read_position) { ++ if (dev->reread_count > opener->reread_count + 2) ++ opener->reread_count = dev->reread_count - 1; ++ ++opener->reread_count; ++ pos = v4l2l_mod64(opener->read_position + ++ dev->used_buffer_count - 1, ++ dev->used_buffer_count); ++ } else { ++ opener->reread_count = 0; ++ if (dev->write_position > ++ opener->read_position + dev->used_buffer_count) ++ opener->read_position = dev->write_position - 1; ++ pos = v4l2l_mod64(opener->read_position, ++ dev->used_buffer_count); ++ ++opener->read_position; ++ } ++ timeout_happened = dev->timeout_happened && (dev->timeout_jiffies > 0); ++ dev->timeout_happened = 0; ++ spin_unlock_bh(&dev->lock); ++ ++ index = dev->bufpos2index[pos]; ++ if (timeout_happened) { ++ if (index >= dev->used_buffer_count) { ++ dprintkrw("get_capture_buffer() read position is at " ++ "an unallocated buffer [index=%u]\n", ++ index); ++ return -EFAULT; ++ } ++ /* although allocated on-demand, timeout_image is freed only ++ * in free_buffers(), so we don't need to worry about it being ++ * deallocated suddenly */ ++ memcpy(dev->image + dev->buffers[index].buffer.m.offset, ++ dev->timeout_image, dev->buffer_size); ++ } ++ return (int)index; ++} ++ ++/* put buffer to dequeue ++ * called on VIDIOC_DQBUF ++ */ ++static int vidioc_dqbuf(struct file *file, void *fh, struct v4l2_buffer *buf) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 type = buf->type; ++ int index; ++ struct v4l2l_buffer *bufd; ++ ++ if (buf->memory != V4L2_MEMORY_MMAP) ++ return -EINVAL; ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) { ++ *buf = dev->timeout_buffer.buffer; ++ buf->type = type; ++ unset_flags(buf->flags); ++ return 0; ++ } ++ if ((opener->buffer_count == 0) || ++ !(opener->format_token & token_from_type(type))) ++ return -EINVAL; ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ *buf = dev->buffers[index].buffer; ++ unset_flags(buf->flags); ++ break; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ 
spin_lock_bh(&dev->list_lock); ++ ++ bufd = list_first_entry_or_null(&dev->outbufs_list, ++ struct v4l2l_buffer, list_head); ++ if (bufd) ++ list_move_tail(&bufd->list_head, &dev->outbufs_list); ++ ++ spin_unlock_bh(&dev->list_lock); ++ if (!bufd) ++ return -EFAULT; ++ unset_flags(bufd->buffer.flags); ++ *buf = bufd->buffer; ++ break; ++ default: ++ return -EINVAL; ++ } ++ ++ buf->type = type; ++ dprintkrw("DQBUF(%s, index=%u) -> " BUFFER_DEBUG_FMT_STR, ++ V4L2_TYPE_IS_CAPTURE(type) ? "CAPTURE" : "OUTPUT", index, ++ BUFFER_DEBUG_FMT_ARGS(buf)); ++ return 0; ++} ++ ++/* ------------- STREAMING ------------------- */ ++ ++/* start streaming ++ * called on VIDIOC_STREAMON ++ */ ++static int vidioc_streamon(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = token_from_type(type); ++ ++ /* short-circuit when using timeout buffer set */ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) ++ return 0; ++ /* opener must have claimed (same) buffer set via REQBUFS */ ++ if (!opener->buffer_count || !(opener->format_token & token)) ++ return -EINVAL; ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (has_output_token(dev->stream_tokens) && !dev->keep_format) ++ return -EIO; ++ if (dev->stream_tokens & token) { ++ acquire_token(dev, opener, stream, token); ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (dev->stream_tokens & token) ++ acquire_token(dev, opener, stream, token); ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++/* stop streaming ++ * called on VIDIOC_STREAMOFF ++ */ ++static int vidioc_streamoff(struct file *file, void *fh, ++ enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ u32 token = token_from_type(type); ++ ++ /* short-circuit when using timeout buffer set */ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT) ++ return 0; ++ /* short-circuit when buffer set has no owner */ ++ if (dev->format_tokens & token) ++ return 0; ++ /* opener needs a claim to buffer set */ ++ if (!opener->format_token) ++ return -EBUSY; ++ if (opener->format_token & ~token) ++ return -EINVAL; ++ ++ switch (type) { ++ case V4L2_BUF_TYPE_VIDEO_OUTPUT: ++ if (opener->stream_token & token) ++ release_token(dev, opener, stream); ++ /* reset output queue */ ++ if (dev->used_buffer_count > 0) ++ prepare_buffer_queue(dev, dev->used_buffer_count); ++ return 0; ++ case V4L2_BUF_TYPE_VIDEO_CAPTURE: ++ if (opener->stream_token & token) { ++ release_token(dev, opener, stream); ++ client_usage_queue_event(dev->vdev); ++ } ++ return 0; ++ default: ++ return -EINVAL; ++ } ++} ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++static int vidiocgmbuf(struct file *file, void *fh, struct video_mbuf *p) ++{ ++ struct v4l2_loopback_device *dev; ++ MARK(); ++ ++ dev = v4l2loopback_getdevice(file); ++ p->frames = dev->buffer_count; ++ p->offsets[0] = 0; ++ p->offsets[1] = 0; ++ p->size = dev->buffer_size; ++ return 0; ++} ++#endif ++ ++static void client_usage_queue_event(struct video_device *vdev) ++{ ++ struct v4l2_event ev; ++ struct v4l2_loopback_device *dev; ++ ++ dev = container_of(vdev->v4l2_dev, struct v4l2_loopback_device, ++ v4l2_dev); ++ ++ memset(&ev, 0, sizeof(ev)); ++ ev.type = V4L2_EVENT_PRI_CLIENT_USAGE; ++ ((struct v4l2_event_client_usage *)&ev.u)->count = ++ !has_capture_token(dev->stream_tokens); 
++ ++ v4l2_event_queue(vdev, &ev); ++} ++ ++static int client_usage_ops_add(struct v4l2_subscribed_event *sev, ++ unsigned elems) ++{ ++ if (!(sev->flags & V4L2_EVENT_SUB_FL_SEND_INITIAL)) ++ return 0; ++ ++ client_usage_queue_event(sev->fh->vdev); ++ return 0; ++} ++ ++static void client_usage_ops_replace(struct v4l2_event *old, ++ const struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&old->u) = ++ *((struct v4l2_event_client_usage *)&new->u); ++} ++ ++static void client_usage_ops_merge(const struct v4l2_event *old, ++ struct v4l2_event *new) ++{ ++ *((struct v4l2_event_client_usage *)&new->u) = ++ *((struct v4l2_event_client_usage *)&old->u); ++} ++ ++const struct v4l2_subscribed_event_ops client_usage_ops = { ++ .add = client_usage_ops_add, ++ .replace = client_usage_ops_replace, ++ .merge = client_usage_ops_merge, ++}; ++ ++static int vidioc_subscribe_event(struct v4l2_fh *fh, ++ const struct v4l2_event_subscription *sub) ++{ ++ switch (sub->type) { ++ case V4L2_EVENT_CTRL: ++ return v4l2_ctrl_subscribe_event(fh, sub); ++ case V4L2_EVENT_PRI_CLIENT_USAGE: ++ return v4l2_event_subscribe(fh, sub, 0, &client_usage_ops); ++ } ++ ++ return -EINVAL; ++} ++ ++/* file operations */ ++static void vm_open(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ atomic_inc(&buf->use_count); ++ buf->buffer.flags |= V4L2_BUF_FLAG_MAPPED; ++} ++ ++static void vm_close(struct vm_area_struct *vma) ++{ ++ struct v4l2l_buffer *buf; ++ MARK(); ++ ++ buf = vma->vm_private_data; ++ if (atomic_dec_and_test(&buf->use_count)) ++ buf->buffer.flags &= ~V4L2_BUF_FLAG_MAPPED; ++} ++ ++static struct vm_operations_struct vm_ops = { ++ .open = vm_open, ++ .close = vm_close, ++}; ++ ++static int v4l2_loopback_mmap(struct file *file, struct vm_area_struct *vma) ++{ ++ u8 *addr; ++ unsigned long start, size, offset; ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ struct v4l2l_buffer *buffer = NULL; ++ int result = 0; ++ MARK(); ++ ++ offset = (unsigned long)vma->vm_pgoff << PAGE_SHIFT; ++ start = (unsigned long)vma->vm_start; ++ size = (unsigned long)(vma->vm_end - vma->vm_start); /* always != 0 */ ++ ++ /* ensure buffer size, count, and allocated image(s) are not altered by ++ * other file descriptors */ ++ result = mutex_lock_killable(&dev->image_mutex); ++ if (result < 0) ++ return result; ++ ++ if (size > dev->buffer_size) { ++ dprintk("mmap() attempt to map %lubytes when %ubytes are " ++ "allocated to buffers\n", ++ size, dev->buffer_size); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ if (offset % dev->buffer_size != 0) { ++ dprintk("mmap() offset does not match start of any buffer\n"); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ switch (opener->format_token) { ++ case V4L2L_TOKEN_TIMEOUT: ++ if (offset != (unsigned long)dev->buffer_size * MAX_BUFFERS) { ++ dprintk("mmap() incorrect offset for timeout image\n"); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ buffer = &dev->timeout_buffer; ++ addr = dev->timeout_image; ++ break; ++ default: ++ if (offset >= dev->image_size) { ++ dprintk("mmap() attempt to map beyond all buffers\n"); ++ result = -EINVAL; ++ goto exit_mmap_unlock; ++ } ++ u32 index = offset / dev->buffer_size; ++ buffer = &dev->buffers[index]; ++ addr = dev->image + offset; ++ break; ++ } ++ ++ while (size > 0) { ++ struct page *page = vmalloc_to_page(addr); ++ ++ result = vm_insert_page(vma, start, page); 
++ if (result < 0) ++ goto exit_mmap_unlock; ++ ++ start += PAGE_SIZE; ++ addr += PAGE_SIZE; ++ size -= PAGE_SIZE; ++ } ++ ++ vma->vm_ops = &vm_ops; ++ vma->vm_private_data = buffer; ++ ++ vm_open(vma); ++exit_mmap_unlock: ++ mutex_unlock(&dev->image_mutex); ++ return result; ++} ++ ++static unsigned int v4l2_loopback_poll(struct file *file, ++ struct poll_table_struct *pts) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ __poll_t req_events = poll_requested_events(pts); ++ int ret_mask = 0; ++ ++ /* call poll_wait in first call, regardless, to ensure that the ++ * wait-queue is not null */ ++ poll_wait(file, &dev->read_event, pts); ++ poll_wait(file, &opener->fh.wait, pts); ++ ++ if (req_events & POLLPRI) { ++ if (v4l2_event_pending(&opener->fh)) { ++ ret_mask |= POLLPRI; ++ if (!(req_events & DEFAULT_POLLMASK)) ++ return ret_mask; ++ } ++ } ++ ++ switch (opener->format_token) { ++ case V4L2L_TOKEN_OUTPUT: ++ if (opener->stream_token != 0 || ++ opener->io_method == V4L2L_IO_NONE) ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ case V4L2L_TOKEN_CAPTURE: ++ if ((opener->io_method == V4L2L_IO_NONE || ++ opener->stream_token != 0) && ++ can_read(dev, opener)) ++ ret_mask |= POLLIN | POLLWRNORM; ++ break; ++ case V4L2L_TOKEN_TIMEOUT: ++ ret_mask |= POLLOUT | POLLWRNORM; ++ break; ++ default: ++ break; ++ } ++ ++ return ret_mask; ++} ++ ++/* do not want to limit device opens, it can be as many readers as user want, ++ * writers are limited by means of setting writer field */ ++static int v4l2_loopback_open(struct file *file) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_loopback_opener *opener; ++ ++ dev = v4l2loopback_getdevice(file); ++ if (dev->open_count.counter >= dev->max_openers) ++ return -EBUSY; ++ /* kfree on close */ ++ opener = kzalloc(sizeof(*opener), GFP_KERNEL); ++ if (opener == NULL) ++ return -ENOMEM; ++ ++ atomic_inc(&dev->open_count); ++ if (dev->timeout_image_io && dev->format_tokens & V4L2L_TOKEN_TIMEOUT) ++ /* will clear timeout_image_io once buffer set acquired */ ++ opener->io_method = V4L2L_IO_TIMEOUT; ++ ++ v4l2_fh_init(&opener->fh, video_devdata(file)); ++ file->private_data = &opener->fh; ++ ++ v4l2_fh_add(&opener->fh); ++ dprintk("open() -> dev@%p with image@%p\n", dev, ++ dev ? dev->image : NULL); ++ return 0; ++} ++ ++static int v4l2_loopback_close(struct file *file) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(file->private_data); ++ int result = 0; ++ dprintk("close() -> dev@%p with image@%p\n", dev, ++ dev ? 
dev->image : NULL); ++ ++ if (opener->format_token) { ++ struct v4l2_requestbuffers reqbuf = { ++ .count = 0, .memory = V4L2_MEMORY_MMAP, .type = 0 ++ }; ++ switch (opener->format_token) { ++ case V4L2L_TOKEN_CAPTURE: ++ reqbuf.type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ break; ++ case V4L2L_TOKEN_OUTPUT: ++ case V4L2L_TOKEN_TIMEOUT: ++ reqbuf.type = V4L2_BUF_TYPE_VIDEO_OUTPUT; ++ break; ++ } ++ if (reqbuf.type) ++ result = vidioc_reqbufs(file, file->private_data, ++ &reqbuf); ++ if (result < 0) ++ dprintk("failed to free buffers REQBUFS(count=0) " ++ " returned %d\n", ++ result); ++ mutex_lock(&dev->image_mutex); ++ release_token(dev, opener, format); ++ mutex_unlock(&dev->image_mutex); ++ } ++ ++ if (atomic_dec_and_test(&dev->open_count)) { ++ timer_delete_sync(&dev->sustain_timer); ++ timer_delete_sync(&dev->timeout_timer); ++ if (!dev->keep_format) { ++ mutex_lock(&dev->image_mutex); ++ free_buffers(dev); ++ mutex_unlock(&dev->image_mutex); ++ } ++ } ++ ++ v4l2_fh_del(&opener->fh); ++ v4l2_fh_exit(&opener->fh); ++ ++ kfree(opener); ++ return 0; ++} ++ ++static int start_fileio(struct file *file, void *fh, enum v4l2_buf_type type) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_loopback_opener *opener = fh_to_opener(fh); ++ struct v4l2_requestbuffers reqbuf = { .count = dev->buffer_count, ++ .memory = V4L2_MEMORY_MMAP, ++ .type = type }; ++ int token = token_from_type(type); ++ int result; ++ ++ if (opener->format_token & V4L2L_TOKEN_TIMEOUT || ++ opener->format_token & ~token) ++ return -EBUSY; /* NOTE: -EBADF might be more informative */ ++ ++ /* short-circuit if already have stream token */ ++ if (opener->stream_token && opener->io_method == V4L2L_IO_FILE) ++ return 0; ++ ++ /* otherwise attempt to acquire stream token and assign IO method */ ++ if (!(dev->stream_tokens & token) || opener->io_method != V4L2L_IO_NONE) ++ return -EBUSY; ++ ++ result = vidioc_reqbufs(file, fh, &reqbuf); ++ if (result < 0) ++ return result; ++ result = vidioc_streamon(file, fh, type); ++ if (result < 0) ++ return result; ++ ++ opener->io_method = V4L2L_IO_FILE; ++ return 0; ++} ++ ++static ssize_t v4l2_loopback_read(struct file *file, char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_buffer *b; ++ int index, result; ++ ++ dprintkrw("read() %zu bytes\n", count); ++ result = start_fileio(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_CAPTURE); ++ if (result < 0) ++ return result; ++ ++ index = get_capture_buffer(file); ++ if (index < 0) ++ return index; ++ b = &dev->buffers[index].buffer; ++ if (count > b->bytesused) ++ count = b->bytesused; ++ if (copy_to_user((void *)buf, (void *)(dev->image + b->m.offset), ++ count)) { ++ printk(KERN_ERR "v4l2-loopback read() failed copy_to_user()\n"); ++ return -EFAULT; ++ } ++ return count; ++} ++ ++static ssize_t v4l2_loopback_write(struct file *file, const char __user *buf, ++ size_t count, loff_t *ppos) ++{ ++ struct v4l2_loopback_device *dev = v4l2loopback_getdevice(file); ++ struct v4l2_buffer *b; ++ int index, result; ++ ++ dprintkrw("write() %zu bytes\n", count); ++ result = start_fileio(file, file->private_data, ++ V4L2_BUF_TYPE_VIDEO_OUTPUT); ++ if (result < 0) ++ return result; ++ ++ if (count > dev->buffer_size) ++ count = dev->buffer_size; ++ index = v4l2l_mod64(dev->write_position, dev->used_buffer_count); ++ b = &dev->buffers[index].buffer; ++ ++ if (copy_from_user((void *)(dev->image + b->m.offset), (void *)buf, ++ count)) { ++ 
printk(KERN_ERR ++ "v4l2-loopback write() failed copy_from_user()\n"); ++ return -EFAULT; ++ } ++ b->bytesused = count; ++ ++ v4l2l_get_timestamp(b); ++ b->sequence = dev->write_position; ++ set_queued(b->flags); ++ buffer_written(dev, &dev->buffers[index]); ++ set_done(b->flags); ++ wake_up_all(&dev->read_event); ++ ++ return count; ++} ++ ++/* init functions */ ++/* frees buffers, if allocated */ ++static void free_buffers(struct v4l2_loopback_device *dev) ++{ ++ dprintk("free_buffers() with image@%p\n", dev->image); ++ if (!dev->image) ++ return; ++ if (!has_no_owners(dev) || any_buffers_mapped(dev)) ++ /* maybe an opener snuck in before image_mutex was acquired */ ++ printk(KERN_WARNING ++ "v4l2-loopback free_buffers() buffers of video device " ++ "#%u freed while still mapped to userspace\n", ++ dev->vdev->num); ++ vfree(dev->image); ++ dev->image = NULL; ++ dev->image_size = 0; ++ dev->buffer_size = 0; ++} ++ ++static void free_timeout_buffer(struct v4l2_loopback_device *dev) ++{ ++ dprintk("free_timeout_buffer() with timeout_image@%p\n", ++ dev->timeout_image); ++ if (!dev->timeout_image) ++ return; ++ ++ if ((dev->timeout_jiffies > 0 && !has_no_owners(dev)) || ++ dev->timeout_buffer.buffer.flags & V4L2_BUF_FLAG_MAPPED) ++ printk(KERN_WARNING ++ "v4l2-loopback free_timeout_buffer() timeout image " ++ "of device #%u freed while still mapped to userspace\n", ++ dev->vdev->num); ++ ++ vfree(dev->timeout_image); ++ dev->timeout_image = NULL; ++ dev->timeout_buffer_size = 0; ++} ++/* allocates buffers if no (other) openers are already using them */ ++static int allocate_buffers(struct v4l2_loopback_device *dev, ++ struct v4l2_pix_format *pix_format) ++{ ++ u32 buffer_size = PAGE_ALIGN(pix_format->sizeimage); ++ unsigned long image_size = ++ (unsigned long)buffer_size * (unsigned long)dev->buffer_count; ++ /* vfree on close file operation in case no open handles left */ ++ ++ if (buffer_size == 0 || dev->buffer_count == 0 || ++ buffer_size < pix_format->sizeimage) ++ return -EINVAL; ++ ++ if ((__LONG_MAX__ / buffer_size) < dev->buffer_count) ++ return -ENOSPC; ++ ++ dprintk("allocate_buffers() size %lubytes = %ubytes x %ubuffers\n", ++ image_size, buffer_size, dev->buffer_count); ++ if (dev->image) { ++ /* check that no buffers are expected in user-space */ ++ if (!has_no_owners(dev) || any_buffers_mapped(dev)) ++ return -EBUSY; ++ dprintk("allocate_buffers() existing size=%lubytes\n", ++ dev->image_size); ++ /* FIXME: prevent double allocation more intelligently! 
*/ ++ if (image_size == dev->image_size) { ++ dprintk("allocate_buffers() keep existing\n"); ++ return 0; ++ } ++ free_buffers(dev); ++ } ++ ++ /* FIXME: set buffers to 0 */ ++ dev->image = vmalloc(image_size); ++ if (dev->image == NULL) { ++ dev->buffer_size = dev->image_size = 0; ++ return -ENOMEM; ++ } ++ init_buffers(dev, pix_format->sizeimage, buffer_size); ++ dev->buffer_size = buffer_size; ++ dev->image_size = image_size; ++ dprintk("allocate_buffers() -> vmalloc'd %lubytes\n", dev->image_size); ++ return 0; ++} ++static int allocate_timeout_buffer(struct v4l2_loopback_device *dev) ++{ ++ /* device's `buffer_size` and `buffers` must be initialised in ++ * allocate_buffers() */ ++ ++ dprintk("allocate_timeout_buffer() size %ubytes\n", dev->buffer_size); ++ if (dev->buffer_size == 0) ++ return -EINVAL; ++ ++ if (dev->timeout_image) { ++ if (dev->timeout_buffer.buffer.flags & V4L2_BUF_FLAG_MAPPED) ++ return -EBUSY; ++ if (dev->buffer_size == dev->timeout_buffer_size) ++ return 0; ++ free_timeout_buffer(dev); ++ } ++ ++ dev->timeout_image = vzalloc(dev->buffer_size); ++ if (!dev->timeout_image) { ++ dev->timeout_buffer_size = 0; ++ return -ENOMEM; ++ } ++ dev->timeout_buffer_size = dev->buffer_size; ++ return 0; ++} ++/* init inner buffers, they are capture mode and flags are set as for capture ++ * mode buffers */ ++static void init_buffers(struct v4l2_loopback_device *dev, u32 bytes_used, ++ u32 buffer_size) ++{ ++ u32 i; ++ ++ for (i = 0; i < dev->buffer_count; ++i) { ++ struct v4l2_buffer *b = &dev->buffers[i].buffer; ++ b->index = i; ++ b->bytesused = bytes_used; ++ b->length = buffer_size; ++ b->field = V4L2_FIELD_NONE; ++ b->flags = 0; ++ b->m.offset = i * buffer_size; ++ b->memory = V4L2_MEMORY_MMAP; ++ b->sequence = 0; ++ b->timestamp.tv_sec = 0; ++ b->timestamp.tv_usec = 0; ++ b->type = V4L2_BUF_TYPE_VIDEO_CAPTURE; ++ ++ v4l2l_get_timestamp(b); ++ } ++ dev->timeout_buffer = dev->buffers[0]; ++ dev->timeout_buffer.buffer.m.offset = MAX_BUFFERS * buffer_size; ++} ++ ++/* fills and register video device */ ++static void init_vdev(struct video_device *vdev, int nr) ++{ ++#ifdef V4L2LOOPBACK_WITH_STD ++ vdev->tvnorms = V4L2_STD_ALL; ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ vdev->vfl_type = VFL_TYPE_VIDEO; ++ vdev->fops = &v4l2_loopback_fops; ++ vdev->ioctl_ops = &v4l2_loopback_ioctl_ops; ++ vdev->release = &video_device_release; ++ vdev->minor = -1; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 7, 0) ++ vdev->device_caps = V4L2_CAP_DEVICE_CAPS | V4L2_CAP_VIDEO_CAPTURE | ++ V4L2_CAP_VIDEO_OUTPUT | V4L2_CAP_READWRITE | ++ V4L2_CAP_STREAMING; ++#endif ++ ++ if (debug > 1) ++ vdev->dev_debug = V4L2_DEV_DEBUG_IOCTL | ++ V4L2_DEV_DEBUG_IOCTL_ARG; ++ ++ vdev->vfl_dir = VFL_DIR_M2M; ++} ++ ++/* init default capture parameters, only fps may be changed in future */ ++static void init_capture_param(struct v4l2_captureparm *capture_param) ++{ ++ capture_param->capability = V4L2_CAP_TIMEPERFRAME; /* since 2.16 */ ++ capture_param->capturemode = 0; ++ capture_param->extendedmode = 0; ++ capture_param->readbuffers = max_buffers; ++ capture_param->timeperframe.numerator = 1; ++ capture_param->timeperframe.denominator = V4L2LOOPBACK_FPS_DEFAULT; ++} ++ ++static void check_timers(struct v4l2_loopback_device *dev) ++{ ++ if (has_output_token(dev->stream_tokens)) ++ return; ++ ++ if (dev->timeout_jiffies > 0 && !timer_pending(&dev->timeout_timer)) ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ if (dev->sustain_framerate && !timer_pending(&dev->sustain_timer)) ++ 
mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies * 3 / 2); ++} ++#ifdef HAVE_TIMER_SETUP ++static void sustain_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = ++ container_of(t, struct v4l2_loopback_device, sustain_timer); ++#else ++static void sustain_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->sustain_framerate) { ++ dev->reread_count++; ++ dprintkrw("sustain_timer_clb() write_pos=%lld reread=%u\n", ++ (long long)dev->write_position, dev->reread_count); ++ if (dev->reread_count == 1) ++ mod_timer(&dev->sustain_timer, ++ jiffies + max(1UL, dev->frame_jiffies / 2)); ++ else ++ mod_timer(&dev->sustain_timer, ++ jiffies + dev->frame_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++#ifdef HAVE_TIMER_SETUP ++static void timeout_timer_clb(struct timer_list *t) ++{ ++ struct v4l2_loopback_device *dev = ++ container_of(t, struct v4l2_loopback_device, timeout_timer); ++#else ++static void timeout_timer_clb(unsigned long nr) ++{ ++ struct v4l2_loopback_device *dev = ++ idr_find(&v4l2loopback_index_idr, nr); ++#endif ++ spin_lock(&dev->lock); ++ if (dev->timeout_jiffies > 0) { ++ dev->timeout_happened = 1; ++ mod_timer(&dev->timeout_timer, jiffies + dev->timeout_jiffies); ++ wake_up_all(&dev->read_event); ++ } ++ spin_unlock(&dev->lock); ++} ++ ++/* init loopback main structure */ ++#define DEFAULT_FROM_CONF(confmember, default_condition, default_value) \ ++ ((conf) ? \ ++ ((conf->confmember default_condition) ? (default_value) : \ ++ (conf->confmember)) : \ ++ default_value) ++ ++static int v4l2_loopback_add(struct v4l2_loopback_config *conf, int *ret_nr) ++{ ++ struct v4l2_loopback_device *dev; ++ struct v4l2_ctrl_handler *hdl; ++ struct v4l2loopback_private *vdev_priv = NULL; ++ int err; ++ ++ u32 _width = V4L2LOOPBACK_SIZE_DEFAULT_WIDTH; ++ u32 _height = V4L2LOOPBACK_SIZE_DEFAULT_HEIGHT; ++ ++ u32 _min_width = DEFAULT_FROM_CONF(min_width, ++ < V4L2LOOPBACK_SIZE_MIN_WIDTH, ++ V4L2LOOPBACK_SIZE_MIN_WIDTH); ++ u32 _min_height = DEFAULT_FROM_CONF(min_height, ++ < V4L2LOOPBACK_SIZE_MIN_HEIGHT, ++ V4L2LOOPBACK_SIZE_MIN_HEIGHT); ++ u32 _max_width = DEFAULT_FROM_CONF(max_width, < _min_width, max_width); ++ u32 _max_height = ++ DEFAULT_FROM_CONF(max_height, < _min_height, max_height); ++ bool _announce_all_caps = (conf && conf->announce_all_caps >= 0) ? 
++ (bool)(conf->announce_all_caps) : ++ !(V4L2LOOPBACK_DEFAULT_EXCLUSIVECAPS); ++ int _max_buffers = DEFAULT_FROM_CONF(max_buffers, <= 0, max_buffers); ++ int _max_openers = DEFAULT_FROM_CONF(max_openers, <= 0, max_openers); ++ struct v4l2_format _fmt; ++ ++ int nr = -1; ++ ++ if (conf) { ++ const int output_nr = conf->output_nr; ++#ifdef SPLIT_DEVICES ++ const int capture_nr = conf->capture_nr; ++#else ++ const int capture_nr = output_nr; ++#endif ++ if (capture_nr >= 0 && output_nr == capture_nr) { ++ nr = output_nr; ++ } else if (capture_nr < 0 && output_nr < 0) { ++ nr = -1; ++ } else if (capture_nr < 0) { ++ nr = output_nr; ++ } else if (output_nr < 0) { ++ nr = capture_nr; ++ } else { ++ printk(KERN_ERR ++ "v4l2-loopback add() split OUTPUT and CAPTURE " ++ "devices not yet supported.\n"); ++ printk(KERN_INFO ++ "v4l2-loopback add() both devices must have the " ++ "same number (%d != %d).\n", ++ output_nr, capture_nr); ++ return -EINVAL; ++ } ++ } ++ ++ if (idr_find(&v4l2loopback_index_idr, nr)) ++ return -EEXIST; ++ ++ /* initialisation of a new device */ ++ dprintk("add() creating device #%d\n", nr); ++ dev = kzalloc(sizeof(*dev), GFP_KERNEL); ++ if (!dev) ++ return -ENOMEM; ++ ++ /* allocate id, if @id >= 0, we're requesting that specific id */ ++ if (nr >= 0) { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, nr, nr + 1, ++ GFP_KERNEL); ++ if (err == -ENOSPC) ++ err = -EEXIST; ++ } else { ++ err = idr_alloc(&v4l2loopback_index_idr, dev, 0, 0, GFP_KERNEL); ++ } ++ if (err < 0) ++ goto out_free_dev; ++ ++ /* register new device */ ++ MARK(); ++ nr = err; ++ ++ if (conf && conf->card_label[0]) { ++ snprintf(dev->card_label, sizeof(dev->card_label), "%s", ++ conf->card_label); ++ } else { ++ snprintf(dev->card_label, sizeof(dev->card_label), ++ "Dummy video device (0x%04X)", nr); ++ } ++ snprintf(dev->v4l2_dev.name, sizeof(dev->v4l2_dev.name), ++ "v4l2loopback-%03d", nr); ++ ++ err = v4l2_device_register(NULL, &dev->v4l2_dev); ++ if (err) ++ goto out_free_idr; ++ ++ /* initialise the _video_ device */ ++ MARK(); ++ err = -ENOMEM; ++ dev->vdev = video_device_alloc(); ++ if (dev->vdev == NULL) ++ goto out_unregister; ++ ++ vdev_priv = kzalloc(sizeof(struct v4l2loopback_private), GFP_KERNEL); ++ if (vdev_priv == NULL) ++ goto out_unregister; ++ ++ video_set_drvdata(dev->vdev, vdev_priv); ++ if (video_get_drvdata(dev->vdev) == NULL) ++ goto out_unregister; ++ ++ snprintf(dev->vdev->name, sizeof(dev->vdev->name), "%s", ++ dev->card_label); ++ vdev_priv->device_nr = nr; ++ init_vdev(dev->vdev, nr); ++ dev->vdev->v4l2_dev = &dev->v4l2_dev; ++ ++ /* initialise v4l2-loopback specific parameters */ ++ MARK(); ++ dev->announce_all_caps = _announce_all_caps; ++ dev->min_width = _min_width; ++ dev->min_height = _min_height; ++ dev->max_width = _max_width; ++ dev->max_height = _max_height; ++ dev->max_openers = _max_openers; ++ ++ /* set (initial) pixel and stream format */ ++ _width = clamp_val(_width, _min_width, _max_width); ++ _height = clamp_val(_height, _min_height, _max_height); ++ _fmt = (struct v4l2_format){ ++ .type = V4L2_BUF_TYPE_VIDEO_CAPTURE, ++ .fmt.pix = { .width = _width, ++ .height = _height, ++ .pixelformat = formats[0].fourcc, ++ .colorspace = V4L2_COLORSPACE_DEFAULT, ++ .field = V4L2_FIELD_NONE } ++ }; ++ ++ err = v4l2l_fill_format(&_fmt, _min_width, _max_width, _min_height, ++ _max_height); ++ if (err) ++ /* highly unexpected failure to assign default format */ ++ goto out_unregister; ++ dev->pix_format = _fmt.fmt.pix; ++ init_capture_param(&dev->capture_param); ++ 
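As a readability aid (illustrative only, not part of the patch), the DEFAULT_FROM_CONF() helper used above expands, for the max_openers call site, to roughly the following: a per-device value from the passed-in config wins unless it fails the "default_condition", in which case the module parameter is used.

/* Illustrative expansion of DEFAULT_FROM_CONF(max_openers, <= 0, max_openers): */
_max_openers = conf ? (conf->max_openers <= 0 ? max_openers
                                              : conf->max_openers)
                    : max_openers;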
set_timeperframe(dev, &dev->capture_param.timeperframe); ++ ++ /* ctrls parameters */ ++ dev->keep_format = 0; ++ dev->sustain_framerate = 0; ++ dev->timeout_jiffies = 0; ++ dev->timeout_image_io = 0; ++ ++ /* initialise OUTPUT and CAPTURE buffer values */ ++ dev->image = NULL; ++ dev->image_size = 0; ++ dev->buffer_count = _max_buffers; ++ dev->buffer_size = 0; ++ dev->used_buffer_count = 0; ++ INIT_LIST_HEAD(&dev->outbufs_list); ++ do { ++ u32 index; ++ for (index = 0; index < dev->buffer_count; ++index) ++ INIT_LIST_HEAD(&dev->buffers[index].list_head); ++ ++ } while (0); ++ memset(dev->bufpos2index, 0, sizeof(dev->bufpos2index)); ++ dev->write_position = 0; ++ ++ /* initialise synchronisation data */ ++ atomic_set(&dev->open_count, 0); ++ mutex_init(&dev->image_mutex); ++ spin_lock_init(&dev->lock); ++ spin_lock_init(&dev->list_lock); ++ init_waitqueue_head(&dev->read_event); ++ dev->format_tokens = V4L2L_TOKEN_MASK; ++ dev->stream_tokens = V4L2L_TOKEN_MASK; ++ ++ /* initialise sustain frame rate and timeout parameters, and timers */ ++ dev->reread_count = 0; ++ dev->timeout_image = NULL; ++ dev->timeout_happened = 0; ++#ifdef HAVE_TIMER_SETUP ++ timer_setup(&dev->sustain_timer, sustain_timer_clb, 0); ++ timer_setup(&dev->timeout_timer, timeout_timer_clb, 0); ++#else ++ setup_timer(&dev->sustain_timer, sustain_timer_clb, nr); ++ setup_timer(&dev->timeout_timer, timeout_timer_clb, nr); ++#endif ++ ++ /* initialise the control handler and add controls */ ++ MARK(); ++ hdl = &dev->ctrl_handler; ++ err = v4l2_ctrl_handler_init(hdl, 4); ++ if (err) ++ goto out_unregister; ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_keepformat, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_sustainframerate, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeout, NULL); ++ v4l2_ctrl_new_custom(hdl, &v4l2loopback_ctrl_timeoutimageio, NULL); ++ if (hdl->error) { ++ err = hdl->error; ++ goto out_free_handler; ++ } ++ dev->v4l2_dev.ctrl_handler = hdl; ++ ++ err = v4l2_ctrl_handler_setup(hdl); ++ if (err) ++ goto out_free_handler; ++ ++ /* register the device (creates /dev/video*) */ ++ MARK(); ++ if (video_register_device(dev->vdev, VFL_TYPE_VIDEO, nr) < 0) { ++ printk(KERN_ERR ++ "v4l2-loopback add() failed video_register_device()\n"); ++ err = -EFAULT; ++ goto out_free_device; ++ } ++ v4l2loopback_create_sysfs(dev->vdev); ++ /* NOTE: ambivalent if sysfs entries fail */ ++ ++ if (ret_nr) ++ *ret_nr = dev->vdev->num; ++ return 0; ++ ++out_free_device: ++ video_device_release(dev->vdev); ++out_free_handler: ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++out_unregister: ++ video_set_drvdata(dev->vdev, NULL); ++ if (vdev_priv != NULL) ++ kfree(vdev_priv); ++ v4l2_device_unregister(&dev->v4l2_dev); ++out_free_idr: ++ idr_remove(&v4l2loopback_index_idr, nr); ++out_free_dev: ++ kfree(dev); ++ return err; ++} ++ ++static void v4l2_loopback_remove(struct v4l2_loopback_device *dev) ++{ ++ int device_nr = v4l2loopback_get_vdev_nr(dev->vdev); ++ mutex_lock(&dev->image_mutex); ++ free_buffers(dev); ++ free_timeout_buffer(dev); ++ mutex_unlock(&dev->image_mutex); ++ v4l2loopback_remove_sysfs(dev->vdev); ++ v4l2_ctrl_handler_free(&dev->ctrl_handler); ++ kfree(video_get_drvdata(dev->vdev)); ++ video_unregister_device(dev->vdev); ++ v4l2_device_unregister(&dev->v4l2_dev); ++ idr_remove(&v4l2loopback_index_idr, device_nr); ++ kfree(dev); ++} ++ ++static long v4l2loopback_control_ioctl(struct file *file, unsigned int cmd, ++ unsigned long parm) ++{ ++ struct v4l2_loopback_device *dev; ++ struct 
v4l2_loopback_config conf; ++ struct v4l2_loopback_config *confptr = &conf; ++ int device_nr, capture_nr, output_nr; ++ int ret; ++ const __u32 version = V4L2LOOPBACK_VERSION_CODE; ++ ++ ret = mutex_lock_killable(&v4l2loopback_ctl_mutex); ++ if (ret) ++ return ret; ++ ++ ret = -EINVAL; ++ switch (cmd) { ++ default: ++ ret = -ENOSYS; ++ break; ++ /* add a v4l2loopback device (pair), based on the user-provided specs */ ++ case V4L2LOOPBACK_CTL_ADD: ++ case V4L2LOOPBACK_CTL_ADD_legacy: ++ if (parm) { ++ if ((ret = copy_from_user(&conf, (void *)parm, ++ sizeof(conf))) < 0) ++ break; ++ } else ++ confptr = NULL; ++ ret = v4l2_loopback_add(confptr, &device_nr); ++ if (ret >= 0) ++ ret = device_nr; ++ break; ++ /* remove a v4l2loopback device (both capture and output) */ ++ case V4L2LOOPBACK_CTL_REMOVE: ++ case V4L2LOOPBACK_CTL_REMOVE_legacy: ++ ret = v4l2loopback_lookup((__u32)parm, &dev); ++ if (ret >= 0 && dev) { ++ ret = -EBUSY; ++ if (dev->open_count.counter > 0) ++ break; ++ v4l2_loopback_remove(dev); ++ ret = 0; ++ }; ++ break; ++ /* get information for a loopback device. ++ * this is mostly about limits (which cannot be queried directly with VIDIOC_G_FMT and friends ++ */ ++ case V4L2LOOPBACK_CTL_QUERY: ++ case V4L2LOOPBACK_CTL_QUERY_legacy: ++ if (!parm) ++ break; ++ if ((ret = copy_from_user(&conf, (void *)parm, sizeof(conf))) < ++ 0) ++ break; ++ capture_nr = output_nr = conf.output_nr; ++#ifdef SPLIT_DEVICES ++ capture_nr = conf.capture_nr; ++#endif ++ device_nr = (output_nr < 0) ? capture_nr : output_nr; ++ MARK(); ++ /* get the device from either capture_nr or output_nr (whatever is valid) */ ++ if ((ret = v4l2loopback_lookup(device_nr, &dev)) < 0) ++ break; ++ MARK(); ++ /* if we got the device from output_nr and there is a valid capture_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != capture_nr) && (capture_nr >= 0) && ++ ((ret = v4l2loopback_lookup(capture_nr, 0)) < 0)) ++ break; ++ MARK(); ++ /* if otoh, we got the device from capture_nr and there is a valid output_nr, ++ * make sure that both refer to the same device (or bail out) ++ */ ++ if ((device_nr != output_nr) && (output_nr >= 0) && ++ ((ret = v4l2loopback_lookup(output_nr, 0)) < 0)) ++ break; ++ ++ /* v4l2_loopback_config identified a single device, so fetch the data */ ++ snprintf(conf.card_label, sizeof(conf.card_label), "%s", ++ dev->card_label); ++ ++ conf.output_nr = dev->vdev->num; ++#ifdef SPLIT_DEVICES ++ conf.capture_nr = dev->vdev->num; ++#endif ++ conf.min_width = dev->min_width; ++ conf.min_height = dev->min_height; ++ conf.max_width = dev->max_width; ++ conf.max_height = dev->max_height; ++ conf.announce_all_caps = dev->announce_all_caps; ++ conf.max_buffers = dev->buffer_count; ++ conf.max_openers = dev->max_openers; ++ conf.debug = debug; ++ MARK(); ++ if (copy_to_user((void *)parm, &conf, sizeof(conf))) { ++ ret = -EFAULT; ++ break; ++ } ++ ret = 0; ++ break; ++ case V4L2LOOPBACK_CTL_VERSION: ++ if (!parm) ++ break; ++ if (copy_to_user((void *)parm, &version, sizeof(version))) { ++ ret = -EFAULT; ++ break; ++ } ++ ret = 0; ++ break; ++ } ++ ++ mutex_unlock(&v4l2loopback_ctl_mutex); ++ MARK(); ++ return ret; ++} ++ ++/* LINUX KERNEL */ ++ ++static const struct file_operations v4l2loopback_ctl_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = nonseekable_open, ++ .unlocked_ioctl = v4l2loopback_control_ioctl, ++ .compat_ioctl = v4l2loopback_control_ioctl, ++ .llseek = noop_llseek, ++ // clang-format on ++}; ++ ++static struct miscdevice 
v4l2loopback_misc = { ++ // clang-format off ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "v4l2loopback", ++ .fops = &v4l2loopback_ctl_fops, ++ // clang-format on ++}; ++ ++static const struct v4l2_file_operations v4l2_loopback_fops = { ++ // clang-format off ++ .owner = THIS_MODULE, ++ .open = v4l2_loopback_open, ++ .release = v4l2_loopback_close, ++ .read = v4l2_loopback_read, ++ .write = v4l2_loopback_write, ++ .poll = v4l2_loopback_poll, ++ .mmap = v4l2_loopback_mmap, ++ .unlocked_ioctl = video_ioctl2, ++ // clang-format on ++}; ++ ++static const struct v4l2_ioctl_ops v4l2_loopback_ioctl_ops = { ++ // clang-format off ++ .vidioc_querycap = &vidioc_querycap, ++ .vidioc_enum_framesizes = &vidioc_enum_framesizes, ++ .vidioc_enum_frameintervals = &vidioc_enum_frameintervals, ++ ++ .vidioc_enum_output = &vidioc_enum_output, ++ .vidioc_g_output = &vidioc_g_output, ++ .vidioc_s_output = &vidioc_s_output, ++ ++ .vidioc_enum_input = &vidioc_enum_input, ++ .vidioc_g_input = &vidioc_g_input, ++ .vidioc_s_input = &vidioc_s_input, ++ ++ .vidioc_enum_fmt_vid_cap = &vidioc_enum_fmt_cap, ++ .vidioc_g_fmt_vid_cap = &vidioc_g_fmt_cap, ++ .vidioc_s_fmt_vid_cap = &vidioc_s_fmt_cap, ++ .vidioc_try_fmt_vid_cap = &vidioc_try_fmt_cap, ++ ++ .vidioc_enum_fmt_vid_out = &vidioc_enum_fmt_out, ++ .vidioc_s_fmt_vid_out = &vidioc_s_fmt_out, ++ .vidioc_g_fmt_vid_out = &vidioc_g_fmt_out, ++ .vidioc_try_fmt_vid_out = &vidioc_try_fmt_out, ++ ++#ifdef V4L2L_OVERLAY ++ .vidioc_s_fmt_vid_overlay = &vidioc_s_fmt_overlay, ++ .vidioc_g_fmt_vid_overlay = &vidioc_g_fmt_overlay, ++#endif ++ ++#ifdef V4L2LOOPBACK_WITH_STD ++ .vidioc_s_std = &vidioc_s_std, ++ .vidioc_g_std = &vidioc_g_std, ++ .vidioc_querystd = &vidioc_querystd, ++#endif /* V4L2LOOPBACK_WITH_STD */ ++ ++ .vidioc_g_parm = &vidioc_g_parm, ++ .vidioc_s_parm = &vidioc_s_parm, ++ ++ .vidioc_reqbufs = &vidioc_reqbufs, ++ .vidioc_querybuf = &vidioc_querybuf, ++ .vidioc_qbuf = &vidioc_qbuf, ++ .vidioc_dqbuf = &vidioc_dqbuf, ++ ++ .vidioc_streamon = &vidioc_streamon, ++ .vidioc_streamoff = &vidioc_streamoff, ++ ++#ifdef CONFIG_VIDEO_V4L1_COMPAT ++ .vidiocgmbuf = &vidiocgmbuf, ++#endif ++ ++ .vidioc_subscribe_event = &vidioc_subscribe_event, ++ .vidioc_unsubscribe_event = &v4l2_event_unsubscribe, ++ // clang-format on ++}; ++ ++static int free_device_cb(int id, void *ptr, void *data) ++{ ++ struct v4l2_loopback_device *dev = ptr; ++ v4l2_loopback_remove(dev); ++ return 0; ++} ++static void free_devices(void) ++{ ++ idr_for_each(&v4l2loopback_index_idr, &free_device_cb, NULL); ++ idr_destroy(&v4l2loopback_index_idr); ++} ++ ++static int __init v4l2loopback_init_module(void) ++{ ++ const u32 min_width = V4L2LOOPBACK_SIZE_MIN_WIDTH; ++ const u32 min_height = V4L2LOOPBACK_SIZE_MIN_HEIGHT; ++ int err; ++ int i; ++ MARK(); ++ ++ err = misc_register(&v4l2loopback_misc); ++ if (err < 0) ++ return err; ++ ++ if (devices < 0) { ++ devices = 1; ++ ++ /* try guessing the devices from the "video_nr" parameter */ ++ for (i = MAX_DEVICES - 1; i >= 0; i--) { ++ if (video_nr[i] >= 0) { ++ devices = i + 1; ++ break; ++ } ++ } ++ } ++ ++ if (devices > MAX_DEVICES) { ++ devices = MAX_DEVICES; ++ printk(KERN_INFO ++ "v4l2-loopback init() number of initial devices is " ++ "limited to: %d\n", ++ MAX_DEVICES); ++ } ++ ++ if (max_buffers > MAX_BUFFERS) { ++ max_buffers = MAX_BUFFERS; ++ printk(KERN_INFO ++ "v4l2-loopback init() number of buffers is limited " ++ "to: %d\n", ++ MAX_BUFFERS); ++ } ++ ++ if (max_openers < 0) { ++ printk(KERN_INFO ++ "v4l2-loopback init() allowing %d openers rather " ++ 
"than %d\n", ++ 2, max_openers); ++ max_openers = 2; ++ } ++ ++ if (max_width < min_width) { ++ max_width = V4L2LOOPBACK_SIZE_DEFAULT_MAX_WIDTH; ++ printk(KERN_INFO "v4l2-loopback init() using max_width %d\n", ++ max_width); ++ } ++ if (max_height < min_height) { ++ max_height = V4L2LOOPBACK_SIZE_DEFAULT_MAX_HEIGHT; ++ printk(KERN_INFO "v4l2-loopback init() using max_height %d\n", ++ max_height); ++ } ++ ++ for (i = 0; i < devices; i++) { ++ struct v4l2_loopback_config cfg = { ++ // clang-format off ++ .output_nr = video_nr[i], ++#ifdef SPLIT_DEVICES ++ .capture_nr = video_nr[i], ++#endif ++ .min_width = min_width, ++ .min_height = min_height, ++ .max_width = max_width, ++ .max_height = max_height, ++ .announce_all_caps = (!exclusive_caps[i]), ++ .max_buffers = max_buffers, ++ .max_openers = max_openers, ++ .debug = debug, ++ // clang-format on ++ }; ++ cfg.card_label[0] = 0; ++ if (card_label[i]) ++ snprintf(cfg.card_label, sizeof(cfg.card_label), "%s", ++ card_label[i]); ++ err = v4l2_loopback_add(&cfg, 0); ++ if (err) { ++ free_devices(); ++ goto error; ++ } ++ } ++ ++ dprintk("module installed\n"); ++ ++ printk(KERN_INFO "v4l2-loopback driver version %d.%d.%d%s loaded\n", ++ // clang-format off ++ (V4L2LOOPBACK_VERSION_CODE >> 16) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE >> 8) & 0xff, ++ (V4L2LOOPBACK_VERSION_CODE ) & 0xff, ++#ifdef SNAPSHOT_VERSION ++ " (" __stringify(SNAPSHOT_VERSION) ")" ++#else ++ "" ++#endif ++ ); ++ // clang-format on ++ ++ return 0; ++error: ++ misc_deregister(&v4l2loopback_misc); ++ return err; ++} ++ ++static void v4l2loopback_cleanup_module(void) ++{ ++ MARK(); ++ /* unregister the device -> it deletes /dev/video* */ ++ free_devices(); ++ /* and get rid of /dev/v4l2loopback */ ++ misc_deregister(&v4l2loopback_misc); ++ dprintk("module removed\n"); ++} ++ ++MODULE_ALIAS_MISCDEV(MISC_DYNAMIC_MINOR); ++ ++module_init(v4l2loopback_init_module); ++module_exit(v4l2loopback_cleanup_module); +diff --git a/drivers/media/v4l2-core/v4l2loopback.h b/drivers/media/v4l2-core/v4l2loopback.h +new file mode 100644 +index 000000000000..e48e0ce5949d +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback.h +@@ -0,0 +1,108 @@ ++/* SPDX-License-Identifier: GPL-2.0+ WITH Linux-syscall-note */ ++/* ++ * v4l2loopback.h ++ * ++ * Written by IOhannes m zmölnig, 7/1/20. ++ * ++ * Copyright 2020 by IOhannes m zmölnig. Redistribution of this file is ++ * permitted under the GNU General Public License. ++ */ ++#ifndef _V4L2LOOPBACK_H ++#define _V4L2LOOPBACK_H ++ ++#define V4L2LOOPBACK_VERSION_MAJOR 0 ++#define V4L2LOOPBACK_VERSION_MINOR 15 ++#define V4L2LOOPBACK_VERSION_BUGFIX 0 ++ ++/* /dev/v4l2loopback interface */ ++ ++struct v4l2_loopback_config { ++ /** ++ * the device-number (/dev/video) ++ * V4L2LOOPBACK_CTL_ADD: ++ * setting this to a value<0, will allocate an available one ++ * if nr>=0 and the device already exists, the ioctl will EEXIST ++ * if output_nr and capture_nr are the same, only a single device will be created ++ * NOTE: currently split-devices (where output_nr and capture_nr differ) ++ * are not implemented yet. ++ * until then, requesting different device-IDs will result in EINVAL. 
++ * ++ * V4L2LOOPBACK_CTL_QUERY: ++ * either both output_nr and capture_nr must refer to the same loopback, ++ * or one (and only one) of them must be -1 ++ * ++ */ ++ __s32 output_nr; ++ __s32 unused; /*capture_nr;*/ ++ ++ /** ++ * a nice name for your device ++ * if (*card_label)==0, an automatic name is assigned ++ */ ++ char card_label[32]; ++ ++ /** ++ * allowed frame size ++ * if too low, default values are used ++ */ ++ __u32 min_width; ++ __u32 max_width; ++ __u32 min_height; ++ __u32 max_height; ++ ++ /** ++ * number of buffers to allocate for the queue ++ * if set to <=0, default values are used ++ */ ++ __s32 max_buffers; ++ ++ /** ++ * how many consumers are allowed to open this device concurrently ++ * if set to <=0, default values are used ++ */ ++ __s32 max_openers; ++ ++ /** ++ * set the debugging level for this device ++ */ ++ __s32 debug; ++ ++ /** ++ * whether to announce OUTPUT/CAPTURE capabilities exclusively ++ * for this device or not ++ * (!exclusive_caps) ++ * NOTE: this is going to be removed once separate output/capture ++ * devices are implemented ++ */ ++ __s32 announce_all_caps; ++}; ++ ++#define V4L2LOOPBACK_CTL_IOCTLMAGIC '~' ++ ++/* a pointer to an (unsigned int) that - on success - will hold ++ * the version code of the v4l2loopback module ++ * as returned by KERNEL_VERSION(MAJOR, MINOR, BUGFIX) ++ */ ++#define V4L2LOOPBACK_CTL_VERSION _IOR(V4L2LOOPBACK_CTL_IOCTLMAGIC, 0, __u32) ++ ++/* a pointer to a (struct v4l2_loopback_config) that has all values you wish to impose on the ++ * to-be-created device set. ++ * if the ptr is NULL, a new device is created with default values at the driver's discretion. ++ * ++ * returns the device_nr of the OUTPUT device (which can be used with V4L2LOOPBACK_CTL_QUERY, ++ * to get more information on the device) ++ */ ++#define V4L2LOOPBACK_CTL_ADD \ ++ _IOW(V4L2LOOPBACK_CTL_IOCTLMAGIC, 1, struct v4l2_loopback_config) ++ ++/* the device-number (either CAPTURE or OUTPUT) associated with the loopback-device */ ++#define V4L2LOOPBACK_CTL_REMOVE _IOW(V4L2LOOPBACK_CTL_IOCTLMAGIC, 2, __u32) ++ ++/* a pointer to a (struct v4l2_loopback_config) that has output_nr and/or capture_nr set ++ * (the two values must either refer to video-devices associated with the same loopback device ++ * or exactly one of them must be <0 ++ */ ++#define V4L2LOOPBACK_CTL_QUERY \ ++ _IOWR(V4L2LOOPBACK_CTL_IOCTLMAGIC, 3, struct v4l2_loopback_config) ++ ++#endif /* _V4L2LOOPBACK_H */ +diff --git a/drivers/media/v4l2-core/v4l2loopback_formats.h b/drivers/media/v4l2-core/v4l2loopback_formats.h +new file mode 100644 +index 000000000000..d855a3796554 +--- /dev/null ++++ b/drivers/media/v4l2-core/v4l2loopback_formats.h +@@ -0,0 +1,445 @@ ++static const struct v4l2l_format formats[] = { ++#ifndef V4L2_PIX_FMT_VP9 ++#define V4L2_PIX_FMT_VP9 v4l2_fourcc('V', 'P', '9', '0') ++#endif ++#ifndef V4L2_PIX_FMT_HEVC ++#define V4L2_PIX_FMT_HEVC v4l2_fourcc('H', 'E', 'V', 'C') ++#endif ++ ++ /* here come the packed formats */ ++ { ++ .name = "32 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "32 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, le", ++ .fourcc = V4L2_PIX_FMT_BGR24, ++ .depth = 24, ++ .flags = 0, ++ }, ++ { ++ .name = "24 bpp RGB, be", ++ .fourcc = V4L2_PIX_FMT_RGB24, ++ .depth = 24, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_ABGR32 ++ { ++ .name = "32 bpp RGBA, le", ++ .fourcc = V4L2_PIX_FMT_ABGR32, ++ .depth = 32, ++ .flags = 0, ++ }, 
++#endif ++#ifdef V4L2_PIX_FMT_RGBA32 ++ { ++ .name = "32 bpp RGBA", ++ .fourcc = V4L2_PIX_FMT_RGBA32, ++ .depth = 32, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_RGB332 ++ { ++ .name = "8 bpp RGB-3-3-2", ++ .fourcc = V4L2_PIX_FMT_RGB332, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB332 */ ++#ifdef V4L2_PIX_FMT_RGB444 ++ { ++ .name = "16 bpp RGB (xxxxrrrr ggggbbbb)", ++ .fourcc = V4L2_PIX_FMT_RGB444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB444 */ ++#ifdef V4L2_PIX_FMT_RGB555 ++ { ++ .name = "16 bpp RGB-5-5-5", ++ .fourcc = V4L2_PIX_FMT_RGB555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555 */ ++#ifdef V4L2_PIX_FMT_RGB565 ++ { ++ .name = "16 bpp RGB-5-6-5", ++ .fourcc = V4L2_PIX_FMT_RGB565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565 */ ++#ifdef V4L2_PIX_FMT_RGB555X ++ { ++ .name = "16 bpp RGB-5-5-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB555X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB555X */ ++#ifdef V4L2_PIX_FMT_RGB565X ++ { ++ .name = "16 bpp RGB-5-6-5 BE", ++ .fourcc = V4L2_PIX_FMT_RGB565X, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_RGB565X */ ++#ifdef V4L2_PIX_FMT_BGR666 ++ { ++ .name = "18 bpp BGR-6-6-6", ++ .fourcc = V4L2_PIX_FMT_BGR666, ++ .depth = 18, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_BGR666 */ ++ { ++ .name = "4:2:2, packed, YUYV", ++ .fourcc = V4L2_PIX_FMT_YUYV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "4:2:2, packed, UYVY", ++ .fourcc = V4L2_PIX_FMT_UYVY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YVYU ++ { ++ .name = "4:2:2, packed YVYU", ++ .fourcc = V4L2_PIX_FMT_YVYU, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++#ifdef V4L2_PIX_FMT_VYUY ++ { ++ .name = "4:2:2, packed VYUY", ++ .fourcc = V4L2_PIX_FMT_VYUY, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif ++ { ++ .name = "4:2:2, packed YYUV", ++ .fourcc = V4L2_PIX_FMT_YYUV, ++ .depth = 16, ++ .flags = 0, ++ }, ++ { ++ .name = "YUV-8-8-8-8", ++ .fourcc = V4L2_PIX_FMT_YUV32, ++ .depth = 32, ++ .flags = 0, ++ }, ++ { ++ .name = "8 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_GREY, ++ .depth = 8, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_Y4 ++ { ++ .name = "4 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y4, ++ .depth = 4, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y4 */ ++#ifdef V4L2_PIX_FMT_Y6 ++ { ++ .name = "6 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y6, ++ .depth = 6, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y6 */ ++#ifdef V4L2_PIX_FMT_Y10 ++ { ++ .name = "10 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y10, ++ .depth = 10, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y10 */ ++#ifdef V4L2_PIX_FMT_Y12 ++ { ++ .name = "12 bpp Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y12, ++ .depth = 12, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_Y12 */ ++ { ++ .name = "16 bpp, Greyscale", ++ .fourcc = V4L2_PIX_FMT_Y16, ++ .depth = 16, ++ .flags = 0, ++ }, ++#ifdef V4L2_PIX_FMT_YUV444 ++ { ++ .name = "16 bpp xxxxyyyy uuuuvvvv", ++ .fourcc = V4L2_PIX_FMT_YUV444, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV444 */ ++#ifdef V4L2_PIX_FMT_YUV555 ++ { ++ .name = "16 bpp YUV-5-5-5", ++ .fourcc = V4L2_PIX_FMT_YUV555, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV555 */ ++#ifdef V4L2_PIX_FMT_YUV565 ++ { ++ .name = "16 bpp YUV-5-6-5", ++ .fourcc = V4L2_PIX_FMT_YUV565, ++ .depth = 16, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_YUV565 */ ++ ++/* bayer formats */ ++#ifdef V4L2_PIX_FMT_SRGGB8 ++ { ++ .name = "Bayer RGGB 8bit", ++ .fourcc = 
V4L2_PIX_FMT_SRGGB8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SRGGB8 */ ++#ifdef V4L2_PIX_FMT_SGRBG8 ++ { ++ .name = "Bayer GRBG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGRBG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGRBG8 */ ++#ifdef V4L2_PIX_FMT_SGBRG8 ++ { ++ .name = "Bayer GBRG 8bit", ++ .fourcc = V4L2_PIX_FMT_SGBRG8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SGBRG8 */ ++#ifdef V4L2_PIX_FMT_SBGGR8 ++ { ++ .name = "Bayer BA81 8bit", ++ .fourcc = V4L2_PIX_FMT_SBGGR8, ++ .depth = 8, ++ .flags = 0, ++ }, ++#endif /* V4L2_PIX_FMT_SBGGR8 */ ++ ++ /* here come the planar formats */ ++ { ++ .name = "4:1:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cr-Cb", ++ .fourcc = V4L2_PIX_FMT_YVU420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:1:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV410, ++ .depth = 9, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++ { ++ .name = "4:2:0, planar, Y-Cb-Cr", ++ .fourcc = V4L2_PIX_FMT_YUV420, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#ifdef V4L2_PIX_FMT_YUV422P ++ { ++ .name = "16 bpp YVU422 planar", ++ .fourcc = V4L2_PIX_FMT_YUV422P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV422P */ ++#ifdef V4L2_PIX_FMT_YUV411P ++ { ++ .name = "16 bpp YVU411 planar", ++ .fourcc = V4L2_PIX_FMT_YUV411P, ++ .depth = 16, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_YUV411P */ ++#ifdef V4L2_PIX_FMT_Y41P ++ { ++ .name = "12 bpp YUV 4:1:1", ++ .fourcc = V4L2_PIX_FMT_Y41P, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_Y41P */ ++#ifdef V4L2_PIX_FMT_NV12 ++ { ++ .name = "12 bpp Y/CbCr 4:2:0 ", ++ .fourcc = V4L2_PIX_FMT_NV12, ++ .depth = 12, ++ .flags = FORMAT_FLAGS_PLANAR, ++ }, ++#endif /* V4L2_PIX_FMT_NV12 */ ++ ++/* here come the compressed formats */ ++ ++#ifdef V4L2_PIX_FMT_MJPEG ++ { ++ .name = "Motion-JPEG", ++ .fourcc = V4L2_PIX_FMT_MJPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MJPEG */ ++#ifdef V4L2_PIX_FMT_JPEG ++ { ++ .name = "JFIF JPEG", ++ .fourcc = V4L2_PIX_FMT_JPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_JPEG */ ++#ifdef V4L2_PIX_FMT_DV ++ { ++ .name = "DV1394", ++ .fourcc = V4L2_PIX_FMT_DV, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_DV */ ++#ifdef V4L2_PIX_FMT_MPEG ++ { ++ .name = "MPEG-1/2/4 Multiplexed", ++ .fourcc = V4L2_PIX_FMT_MPEG, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG */ ++#ifdef V4L2_PIX_FMT_H264 ++ { ++ .name = "H264 with start codes", ++ .fourcc = V4L2_PIX_FMT_H264, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264 */ ++#ifdef V4L2_PIX_FMT_H264_NO_SC ++ { ++ .name = "H264 without start codes", ++ .fourcc = V4L2_PIX_FMT_H264_NO_SC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_NO_SC */ ++#ifdef V4L2_PIX_FMT_H264_MVC ++ { ++ .name = "H264 MVC", ++ .fourcc = V4L2_PIX_FMT_H264_MVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H264_MVC */ ++#ifdef V4L2_PIX_FMT_H263 ++ { ++ .name = "H263", ++ .fourcc = V4L2_PIX_FMT_H263, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_H263 */ ++#ifdef V4L2_PIX_FMT_MPEG1 ++ { ++ .name = "MPEG-1 ES", ++ .fourcc = 
V4L2_PIX_FMT_MPEG1, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG1 */ ++#ifdef V4L2_PIX_FMT_MPEG2 ++ { ++ .name = "MPEG-2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG2, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG2 */ ++#ifdef V4L2_PIX_FMT_MPEG4 ++ { ++ .name = "MPEG-4 part 2 ES", ++ .fourcc = V4L2_PIX_FMT_MPEG4, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_MPEG4 */ ++#ifdef V4L2_PIX_FMT_XVID ++ { ++ .name = "Xvid", ++ .fourcc = V4L2_PIX_FMT_XVID, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_XVID */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_G ++ { ++ .name = "SMPTE 421M Annex G compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_G, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_G */ ++#ifdef V4L2_PIX_FMT_VC1_ANNEX_L ++ { ++ .name = "SMPTE 421M Annex L compliant stream", ++ .fourcc = V4L2_PIX_FMT_VC1_ANNEX_L, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VC1_ANNEX_L */ ++#ifdef V4L2_PIX_FMT_VP8 ++ { ++ .name = "VP8", ++ .fourcc = V4L2_PIX_FMT_VP8, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP8 */ ++#ifdef V4L2_PIX_FMT_VP9 ++ { ++ .name = "VP9", ++ .fourcc = V4L2_PIX_FMT_VP9, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_VP9 */ ++#ifdef V4L2_PIX_FMT_HEVC ++ { ++ .name = "HEVC", ++ .fourcc = V4L2_PIX_FMT_HEVC, ++ .depth = 32, ++ .flags = FORMAT_FLAGS_COMPRESSED, ++ }, ++#endif /* V4L2_PIX_FMT_HEVC */ ++}; +diff --git a/drivers/pci/controller/Makefile b/drivers/pci/controller/Makefile +index 038ccbd9e3ba..de5e4f5145af 100644 +--- a/drivers/pci/controller/Makefile ++++ b/drivers/pci/controller/Makefile +@@ -1,4 +1,10 @@ + # SPDX-License-Identifier: GPL-2.0 ++ifdef CONFIG_X86_64 ++ifdef CONFIG_SATA_AHCI ++obj-y += intel-nvme-remap.o ++endif ++endif ++ + obj-$(CONFIG_PCIE_CADENCE) += cadence/ + obj-$(CONFIG_PCI_FTPCI100) += pci-ftpci100.o + obj-$(CONFIG_PCI_IXP4XX) += pci-ixp4xx.o +diff --git a/drivers/pci/controller/intel-nvme-remap.c b/drivers/pci/controller/intel-nvme-remap.c +new file mode 100644 +index 000000000000..e105e6f5cc91 +--- /dev/null ++++ b/drivers/pci/controller/intel-nvme-remap.c +@@ -0,0 +1,462 @@ ++// SPDX-License-Identifier: GPL-2.0 ++/* ++ * Intel remapped NVMe device support. ++ * ++ * Copyright (c) 2019 Endless Mobile, Inc. ++ * Author: Daniel Drake ++ * ++ * Some products ship by default with the SATA controller in "RAID" or ++ * "Intel RST Premium With Intel Optane System Acceleration" mode. Under this ++ * mode, which we refer to as "remapped NVMe" mode, any installed NVMe ++ * devices disappear from the PCI bus, and instead their I/O memory becomes ++ * available within the AHCI device BARs. ++ * ++ * This scheme is understood to be a way of avoiding usage of the standard ++ * Windows NVMe driver under that OS, instead mandating usage of Intel's ++ * driver instead, which has better power management, and presumably offers ++ * some RAID/disk-caching solutions too. ++ * ++ * Here in this driver, we support the remapped NVMe mode by claiming the ++ * AHCI device and creating a fake PCIe root port. On the new bus, the ++ * original AHCI device is exposed with only minor tweaks. Then, fake PCI ++ * devices corresponding to the remapped NVMe devices are created. The usual ++ * ahci and nvme drivers are then expected to bind to these devices and ++ * operate as normal. 
++ * ++ * The PCI configuration space for the NVMe devices is completely ++ * unavailable, so we fake a minimal one and hope for the best. ++ * ++ * Interrupts are shared between the AHCI and NVMe devices. For simplicity, ++ * we only support the legacy interrupt here, although MSI support ++ * could potentially be added later. ++ */ ++ ++#define MODULE_NAME "intel-nvme-remap" ++ ++#include ++#include ++#include ++#include ++#include ++ ++#define AHCI_PCI_BAR_STANDARD 5 ++ ++struct nvme_remap_dev { ++ struct pci_dev *dev; /* AHCI device */ ++ struct pci_bus *bus; /* our fake PCI bus */ ++ struct pci_sysdata sysdata; ++ int irq_base; /* our fake interrupts */ ++ ++ /* ++ * When we detect an all-ones write to a BAR register, this flag ++ * is set, so that we return the BAR size on the next read (a ++ * standard PCI behaviour). ++ * This includes the assumption that an all-ones BAR write is ++ * immediately followed by a read of the same register. ++ */ ++ bool bar_sizing; ++ ++ /* ++ * Resources copied from the AHCI device, to be regarded as ++ * resources on our fake bus. ++ */ ++ struct resource ahci_resources[PCI_NUM_RESOURCES]; ++ ++ /* Resources corresponding to the NVMe devices. */ ++ struct resource remapped_dev_mem[AHCI_MAX_REMAP]; ++ ++ /* Number of remapped NVMe devices found. */ ++ int num_remapped_devices; ++}; ++ ++static inline struct nvme_remap_dev *nrdev_from_bus(struct pci_bus *bus) ++{ ++ return container_of(bus->sysdata, struct nvme_remap_dev, sysdata); ++} ++ ++ ++/******** PCI configuration space **********/ ++ ++/* ++ * Helper macros for tweaking returned contents of PCI configuration space. ++ * ++ * value contains len bytes of data read from reg. ++ * If fixup_reg is included in that range, fix up the contents of that ++ * register to fixed_value. ++ */ ++#define NR_FIX8(fixup_reg, fixed_value) do { \ ++ if (reg <= fixup_reg && fixup_reg < reg + len) \ ++ ((u8 *) value)[fixup_reg - reg] = (u8) (fixed_value); \ ++ } while (0) ++ ++#define NR_FIX16(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ } while (0) ++ ++#define NR_FIX24(fixup_reg, fixed_value) do { \ ++ NR_FIX8(fixup_reg, fixed_value); \ ++ NR_FIX8(fixup_reg + 1, fixed_value >> 8); \ ++ NR_FIX8(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++#define NR_FIX32(fixup_reg, fixed_value) do { \ ++ NR_FIX16(fixup_reg, (u16) fixed_value); \ ++ NR_FIX16(fixup_reg + 2, fixed_value >> 16); \ ++ } while (0) ++ ++/* ++ * Read PCI config space of the slot 0 (AHCI) device. ++ * We pass through the read request to the underlying device, but ++ * tweak the results in some cases. ++ */ ++static int nvme_remap_pci_read_slot0(struct pci_bus *bus, int reg, ++ int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ int ret; ++ ++ ret = ahci_dev_bus->ops->read(ahci_dev_bus, nrdev->dev->devfn, ++ reg, len, value); ++ if (ret) ++ return ret; ++ ++ /* ++ * Adjust the device class, to prevent this driver from attempting to ++ * additionally probe the device we're simulating here. ++ */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_SATA_AHCI); ++ ++ /* ++ * Unset interrupt pin, otherwise ACPI tries to find routing ++ * info for our virtual IRQ, fails, and complains. ++ */ ++ NR_FIX8(PCI_INTERRUPT_PIN, 0); ++ ++ /* ++ * Truncate the AHCI BAR to not include the region that covers the ++ * hidden devices. 
This will cause the ahci driver to successfully ++ * probe th new device (instead of handing it over to this driver). ++ */ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_5, ~(SZ_16K - 1)); ++ nrdev->bar_sizing = false; ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* ++ * Read PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we provide a minimal, ++ * fake config space instead. ++ */ ++static int nvme_remap_pci_read_remapped(struct pci_bus *bus, unsigned int port, ++ int reg, int len, u32 *value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct resource *remapped_mem; ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ *value = 0; ++ remapped_mem = &nrdev->remapped_dev_mem[port - 1]; ++ ++ /* Set a Vendor ID, otherwise Linux assumes no device is present */ ++ NR_FIX16(PCI_VENDOR_ID, PCI_VENDOR_ID_INTEL); ++ ++ /* Always appear on & bus mastering */ ++ NR_FIX16(PCI_COMMAND, PCI_COMMAND_MEMORY | PCI_COMMAND_MASTER); ++ ++ /* Set class so that nvme driver probes us */ ++ NR_FIX24(PCI_CLASS_PROG, PCI_CLASS_STORAGE_EXPRESS); ++ ++ if (nrdev->bar_sizing) { ++ NR_FIX32(PCI_BASE_ADDRESS_0, ++ ~(resource_size(remapped_mem) - 1)); ++ nrdev->bar_sizing = false; ++ } else { ++ resource_size_t mem_start = remapped_mem->start; ++ ++ mem_start |= PCI_BASE_ADDRESS_MEM_TYPE_64; ++ NR_FIX32(PCI_BASE_ADDRESS_0, mem_start); ++ mem_start >>= 32; ++ NR_FIX32(PCI_BASE_ADDRESS_1, mem_start); ++ } ++ ++ return PCIBIOS_SUCCESSFUL; ++} ++ ++/* Read PCI configuration space. */ ++static int nvme_remap_pci_read(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 *value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_read_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_read_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++/* ++ * Write PCI config space of the slot 0 (AHCI) device. ++ * Apart from the special case of BAR sizing, we disable all writes. ++ * Otherwise, the ahci driver could make changes (e.g. unset PCI bus master) ++ * that would affect the operation of the NVMe devices. ++ */ ++static int nvme_remap_pci_write_slot0(struct pci_bus *bus, int reg, ++ int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ struct pci_bus *ahci_dev_bus = nrdev->dev->bus; ++ ++ if (reg >= PCI_BASE_ADDRESS_0 && reg <= PCI_BASE_ADDRESS_5) { ++ /* ++ * Writing all-ones to a BAR means that the size of the ++ * memory region is being checked. Flag this so that we can ++ * reply with an appropriate size on the next read. ++ */ ++ if (value == ~0) ++ nrdev->bar_sizing = true; ++ ++ return ahci_dev_bus->ops->write(ahci_dev_bus, ++ nrdev->dev->devfn, ++ reg, len, value); ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* ++ * Write PCI config space of a remapped device. ++ * Since the original PCI config space is inaccessible, we reject all ++ * writes, except for the special case of BAR probing. ++ */ ++static int nvme_remap_pci_write_remapped(struct pci_bus *bus, ++ unsigned int port, ++ int reg, int len, u32 value) ++{ ++ struct nvme_remap_dev *nrdev = nrdev_from_bus(bus); ++ ++ if (port > nrdev->num_remapped_devices) ++ return PCIBIOS_DEVICE_NOT_FOUND; ++ ++ /* ++ * Writing all-ones to a BAR means that the size of the memory ++ * region is being checked. Flag this so that we can reply with ++ * an appropriate size on the next read. 
++ */ ++ if (value == ~0 && reg >= PCI_BASE_ADDRESS_0 ++ && reg <= PCI_BASE_ADDRESS_5) { ++ nrdev->bar_sizing = true; ++ return PCIBIOS_SUCCESSFUL; ++ } ++ ++ return PCIBIOS_SET_FAILED; ++} ++ ++/* Write PCI configuration space. */ ++static int nvme_remap_pci_write(struct pci_bus *bus, unsigned int devfn, ++ int reg, int len, u32 value) ++{ ++ if (PCI_SLOT(devfn) == 0) ++ return nvme_remap_pci_write_slot0(bus, reg, len, value); ++ else ++ return nvme_remap_pci_write_remapped(bus, PCI_SLOT(devfn), ++ reg, len, value); ++} ++ ++static struct pci_ops nvme_remap_pci_ops = { ++ .read = nvme_remap_pci_read, ++ .write = nvme_remap_pci_write, ++}; ++ ++ ++/******** Initialization & exit **********/ ++ ++/* ++ * Find a PCI domain ID to use for our fake bus. ++ * Start at 0x10000 to not clash with ACPI _SEG domains (16 bits). ++ */ ++static int find_free_domain(void) ++{ ++ int domain = 0xffff; ++ struct pci_bus *bus = NULL; ++ ++ while ((bus = pci_find_next_bus(bus)) != NULL) ++ domain = max_t(int, domain, pci_domain_nr(bus)); ++ ++ return domain + 1; ++} ++ ++static int find_remapped_devices(struct nvme_remap_dev *nrdev, ++ struct list_head *resources) ++{ ++ void __iomem *mmio; ++ int i, count = 0; ++ u32 cap; ++ ++ mmio = pcim_iomap(nrdev->dev, AHCI_PCI_BAR_STANDARD, ++ pci_resource_len(nrdev->dev, ++ AHCI_PCI_BAR_STANDARD)); ++ if (!mmio) ++ return -ENODEV; ++ ++ /* Check if this device might have remapped nvme devices. */ ++ if (pci_resource_len(nrdev->dev, AHCI_PCI_BAR_STANDARD) < SZ_512K || ++ !(readl(mmio + AHCI_VSCAP) & 1)) ++ return -ENODEV; ++ ++ cap = readq(mmio + AHCI_REMAP_CAP); ++ for (i = AHCI_MAX_REMAP-1; i >= 0; i--) { ++ struct resource *remapped_mem; ++ ++ if ((cap & (1 << i)) == 0) ++ continue; ++ if (readl(mmio + ahci_remap_dcc(i)) ++ != PCI_CLASS_STORAGE_EXPRESS) ++ continue; ++ ++ /* We've found a remapped device */ ++ remapped_mem = &nrdev->remapped_dev_mem[count++]; ++ remapped_mem->start = ++ pci_resource_start(nrdev->dev, AHCI_PCI_BAR_STANDARD) ++ + ahci_remap_base(i); ++ remapped_mem->end = remapped_mem->start ++ + AHCI_REMAP_N_SIZE - 1; ++ remapped_mem->flags = IORESOURCE_MEM | IORESOURCE_PCI_FIXED; ++ pci_add_resource(resources, remapped_mem); ++ } ++ ++ pcim_iounmap(nrdev->dev, mmio); ++ ++ if (count == 0) ++ return -ENODEV; ++ ++ nrdev->num_remapped_devices = count; ++ dev_info(&nrdev->dev->dev, "Found %d remapped NVMe devices\n", ++ nrdev->num_remapped_devices); ++ return 0; ++} ++ ++static void nvme_remap_remove_root_bus(void *data) ++{ ++ struct pci_bus *bus = data; ++ ++ pci_stop_root_bus(bus); ++ pci_remove_root_bus(bus); ++} ++ ++static int nvme_remap_probe(struct pci_dev *dev, ++ const struct pci_device_id *id) ++{ ++ struct nvme_remap_dev *nrdev; ++ LIST_HEAD(resources); ++ int i; ++ int ret; ++ struct pci_dev *child; ++ ++ nrdev = devm_kzalloc(&dev->dev, sizeof(*nrdev), GFP_KERNEL); ++ nrdev->sysdata.domain = find_free_domain(); ++ nrdev->sysdata.nvme_remap_dev = dev; ++ nrdev->dev = dev; ++ pci_set_drvdata(dev, nrdev); ++ ++ ret = pcim_enable_device(dev); ++ if (ret < 0) ++ return ret; ++ ++ pci_set_master(dev); ++ ++ ret = find_remapped_devices(nrdev, &resources); ++ if (ret) ++ return ret; ++ ++ /* Add resources from the original AHCI device */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &dev->resource[i]; ++ ++ if (res->start) { ++ struct resource *nr_res = &nrdev->ahci_resources[i]; ++ ++ nr_res->start = res->start; ++ nr_res->end = res->end; ++ nr_res->flags = res->flags; ++ pci_add_resource(&resources, nr_res); ++ } ++ } ++ 
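For context (illustrative aside, not part of the patch): the all-ones handshake that the config-space read/write handlers above emulate is the standard PCI BAR-sizing sequence. A sketch with the usual kernel config accessors follows; pdev is a hypothetical struct pci_dev pointer.

/* Illustrative only: the BAR-sizing sequence the handlers above mimic. */
u32 val, size;
pci_write_config_dword(pdev, PCI_BASE_ADDRESS_0, ~0);   /* all-ones write */
pci_read_config_dword(pdev, PCI_BASE_ADDRESS_0, &val);  /* size mask comes back */
size = ~(val & PCI_BASE_ADDRESS_MEM_MASK) + 1;          /* decoded region size */
/* hence returning ~(resource_size(remapped_mem) - 1) on the sizing read makes
 * the PCI core see a BAR of exactly resource_size(remapped_mem). */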
++ /* Create virtual interrupts */ ++ nrdev->irq_base = devm_irq_alloc_descs(&dev->dev, -1, 0, ++ nrdev->num_remapped_devices + 1, ++ 0); ++ if (nrdev->irq_base < 0) ++ return nrdev->irq_base; ++ ++ /* Create and populate PCI bus */ ++ nrdev->bus = pci_create_root_bus(&dev->dev, 0, &nvme_remap_pci_ops, ++ &nrdev->sysdata, &resources); ++ if (!nrdev->bus) ++ return -ENODEV; ++ ++ if (devm_add_action_or_reset(&dev->dev, nvme_remap_remove_root_bus, ++ nrdev->bus)) ++ return -ENOMEM; ++ ++ /* We don't support sharing MSI interrupts between these devices */ ++ nrdev->bus->bus_flags |= PCI_BUS_FLAGS_NO_MSI; ++ ++ pci_scan_child_bus(nrdev->bus); ++ ++ list_for_each_entry(child, &nrdev->bus->devices, bus_list) { ++ /* ++ * Prevent PCI core from trying to move memory BARs around. ++ * The hidden NVMe devices are at fixed locations. ++ */ ++ for (i = 0; i < PCI_NUM_RESOURCES; i++) { ++ struct resource *res = &child->resource[i]; ++ ++ if (res->flags & IORESOURCE_MEM) ++ res->flags |= IORESOURCE_PCI_FIXED; ++ } ++ ++ /* Share the legacy IRQ between all devices */ ++ child->irq = dev->irq; ++ } ++ ++ pci_assign_unassigned_bus_resources(nrdev->bus); ++ pci_bus_add_devices(nrdev->bus); ++ ++ return 0; ++} ++ ++static const struct pci_device_id nvme_remap_ids[] = { ++ /* ++ * Match all Intel RAID controllers. ++ * ++ * There's overlap here with the set of devices detected by the ahci ++ * driver, but ahci will only successfully probe when there ++ * *aren't* any remapped NVMe devices, and this driver will only ++ * successfully probe when there *are* remapped NVMe devices that ++ * need handling. ++ */ ++ { ++ PCI_VDEVICE(INTEL, PCI_ANY_ID), ++ .class = PCI_CLASS_STORAGE_RAID << 8, ++ .class_mask = 0xffffff00, ++ }, ++ {0,} ++}; ++MODULE_DEVICE_TABLE(pci, nvme_remap_ids); ++ ++static struct pci_driver nvme_remap_drv = { ++ .name = MODULE_NAME, ++ .id_table = nvme_remap_ids, ++ .probe = nvme_remap_probe, ++}; ++module_pci_driver(nvme_remap_drv); ++ ++MODULE_AUTHOR("Daniel Drake "); ++MODULE_LICENSE("GPL v2"); +diff --git a/drivers/pci/quirks.c b/drivers/pci/quirks.c +index d97335a40193..acab5556a354 100644 +--- a/drivers/pci/quirks.c ++++ b/drivers/pci/quirks.c +@@ -3745,6 +3745,106 @@ static void quirk_no_bus_reset(struct pci_dev *dev) + dev->dev_flags |= PCI_DEV_FLAGS_NO_BUS_RESET; + } + ++static bool acs_on_downstream; ++static bool acs_on_multifunction; ++ ++#define NUM_ACS_IDS 16 ++struct acs_on_id { ++ unsigned short vendor; ++ unsigned short device; ++}; ++static struct acs_on_id acs_on_ids[NUM_ACS_IDS]; ++static u8 max_acs_id; ++ ++static __init int pcie_acs_override_setup(char *p) ++{ ++ if (!p) ++ return -EINVAL; ++ ++ while (*p) { ++ if (!strncmp(p, "downstream", 10)) ++ acs_on_downstream = true; ++ if (!strncmp(p, "multifunction", 13)) ++ acs_on_multifunction = true; ++ if (!strncmp(p, "id:", 3)) { ++ char opt[5]; ++ int ret; ++ long val; ++ ++ if (max_acs_id >= NUM_ACS_IDS - 1) { ++ pr_warn("Out of PCIe ACS override slots (%d)\n", ++ NUM_ACS_IDS); ++ goto next; ++ } ++ ++ p += 3; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].vendor = val; ++ ++ p += strcspn(p, ":"); ++ if (*p != ':') { ++ pr_warn("PCIe ACS invalid ID\n"); ++ goto next; ++ } ++ ++ p++; ++ snprintf(opt, 5, "%s", p); ++ ret = kstrtol(opt, 16, &val); ++ if (ret) { ++ pr_warn("PCIe ACS ID parse error %d\n", ret); ++ goto next; ++ } ++ acs_on_ids[max_acs_id].device = val; ++ max_acs_id++; ++ } ++next: ++ p += 
strcspn(p, ","); ++ if (*p == ',') ++ p++; ++ } ++ ++ if (acs_on_downstream || acs_on_multifunction || max_acs_id) ++ pr_warn("Warning: PCIe ACS overrides enabled; This may allow non-IOMMU protected peer-to-peer DMA\n"); ++ ++ return 0; ++} ++early_param("pcie_acs_override", pcie_acs_override_setup); ++ ++static int pcie_acs_overrides(struct pci_dev *dev, u16 acs_flags) ++{ ++ int i; ++ ++ /* Never override ACS for legacy devices or devices with ACS caps */ ++ if (!pci_is_pcie(dev) || ++ pci_find_ext_capability(dev, PCI_EXT_CAP_ID_ACS)) ++ return -ENOTTY; ++ ++ for (i = 0; i < max_acs_id; i++) ++ if (acs_on_ids[i].vendor == dev->vendor && ++ acs_on_ids[i].device == dev->device) ++ return 1; ++ ++ switch (pci_pcie_type(dev)) { ++ case PCI_EXP_TYPE_DOWNSTREAM: ++ case PCI_EXP_TYPE_ROOT_PORT: ++ if (acs_on_downstream) ++ return 1; ++ break; ++ case PCI_EXP_TYPE_ENDPOINT: ++ case PCI_EXP_TYPE_UPSTREAM: ++ case PCI_EXP_TYPE_LEG_END: ++ case PCI_EXP_TYPE_RC_END: ++ if (acs_on_multifunction && dev->multifunction) ++ return 1; ++ } ++ ++ return -ENOTTY; ++} + /* + * Some NVIDIA GPU devices do not work with bus reset, SBR needs to be + * prevented for those affected devices. +@@ -5192,6 +5292,7 @@ static const struct pci_dev_acs_enabled { + { PCI_VENDOR_ID_ZHAOXIN, PCI_ANY_ID, pci_quirk_zhaoxin_pcie_ports_acs }, + /* Wangxun nics */ + { PCI_VENDOR_ID_WANGXUN, PCI_ANY_ID, pci_quirk_wangxun_nic_acs }, ++ { PCI_ANY_ID, PCI_ANY_ID, pcie_acs_overrides }, + { 0 } + }; + +diff --git a/drivers/scsi/Kconfig b/drivers/scsi/Kconfig +index 5522310bab8d..9e1c4634eb7b 100644 +--- a/drivers/scsi/Kconfig ++++ b/drivers/scsi/Kconfig +@@ -1524,4 +1524,6 @@ endif # SCSI_LOWLEVEL + + source "drivers/scsi/device_handler/Kconfig" + ++source "drivers/scsi/vhba/Kconfig" ++ + endmenu +diff --git a/drivers/scsi/Makefile b/drivers/scsi/Makefile +index 16de3e41f94c..4e88f6e3e67b 100644 +--- a/drivers/scsi/Makefile ++++ b/drivers/scsi/Makefile +@@ -152,6 +152,7 @@ obj-$(CONFIG_CHR_DEV_SCH) += ch.o + obj-$(CONFIG_SCSI_ENCLOSURE) += ses.o + + obj-$(CONFIG_SCSI_HISI_SAS) += hisi_sas/ ++obj-$(CONFIG_VHBA) += vhba/ + + # This goes last, so that "real" scsi devices probe earlier + obj-$(CONFIG_SCSI_DEBUG) += scsi_debug.o +diff --git a/drivers/scsi/vhba/Kconfig b/drivers/scsi/vhba/Kconfig +new file mode 100644 +index 000000000000..e70a381fe3df +--- /dev/null ++++ b/drivers/scsi/vhba/Kconfig +@@ -0,0 +1,9 @@ ++config VHBA ++ tristate "Virtual (SCSI) Host Bus Adapter" ++ depends on SCSI ++ help ++ This is the in-kernel part of CDEmu, a CD/DVD-ROM device ++ emulator. ++ ++ This driver can also be built as a module. If so, the module ++ will be called vhba. +diff --git a/drivers/scsi/vhba/Makefile b/drivers/scsi/vhba/Makefile +new file mode 100644 +index 000000000000..2d7524b66199 +--- /dev/null ++++ b/drivers/scsi/vhba/Makefile +@@ -0,0 +1,4 @@ ++VHBA_VERSION := 20240917 ++ ++obj-$(CONFIG_VHBA) += vhba.o ++ccflags-y := -DVHBA_VERSION=\"$(VHBA_VERSION)\" -Werror +diff --git a/drivers/scsi/vhba/vhba.c b/drivers/scsi/vhba/vhba.c +new file mode 100644 +index 000000000000..878a3be0ba2b +--- /dev/null ++++ b/drivers/scsi/vhba/vhba.c +@@ -0,0 +1,1132 @@ ++/* ++ * vhba.c ++ * ++ * Copyright (C) 2007-2012 Chia-I Wu ++ * ++ * This program is free software; you can redistribute it and/or modify ++ * it under the terms of the GNU General Public License as published by ++ * the Free Software Foundation; either version 2 of the License, or ++ * (at your option) any later version. 
++ * ++ * This program is distributed in the hope that it will be useful, ++ * but WITHOUT ANY WARRANTY; without even the implied warranty of ++ * MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the ++ * GNU General Public License for more details. ++ * ++ * You should have received a copy of the GNU General Public License along ++ * with this program; if not, write to the Free Software Foundation, Inc., ++ * 51 Franklin Street, Fifth Floor, Boston, MA 02110-1301 USA. ++ */ ++ ++#define pr_fmt(fmt) "vhba: " fmt ++ ++#include ++ ++#include ++#include ++#include ++#include ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 11, 0) ++#include ++#else ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#ifdef CONFIG_COMPAT ++#include ++#endif ++#include ++#include ++#include ++#include ++#include ++#include ++ ++ ++MODULE_AUTHOR("Chia-I Wu"); ++MODULE_VERSION(VHBA_VERSION); ++MODULE_DESCRIPTION("Virtual SCSI HBA"); ++MODULE_LICENSE("GPL"); ++ ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 15, 0) ++#define sdev_dbg(sdev, fmt, a...) \ ++ dev_dbg(&(sdev)->sdev_gendev, fmt, ##a) ++#define scmd_dbg(scmd, fmt, a...) \ ++ dev_dbg(&(scmd)->device->sdev_gendev, fmt, ##a) ++#endif ++ ++#define VHBA_MAX_SECTORS_PER_IO 256 ++#define VHBA_MAX_BUS 16 ++#define VHBA_MAX_ID 16 ++#define VHBA_MAX_DEVICES (VHBA_MAX_BUS * (VHBA_MAX_ID-1)) ++#define VHBA_KBUF_SIZE PAGE_SIZE ++ ++#define DATA_TO_DEVICE(dir) ((dir) == DMA_TO_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++#define DATA_FROM_DEVICE(dir) ((dir) == DMA_FROM_DEVICE || (dir) == DMA_BIDIRECTIONAL) ++ ++ ++static int vhba_can_queue = 32; ++module_param_named(can_queue, vhba_can_queue, int, 0); ++ ++ ++enum vhba_req_state { ++ VHBA_REQ_FREE, ++ VHBA_REQ_PENDING, ++ VHBA_REQ_READING, ++ VHBA_REQ_SENT, ++ VHBA_REQ_WRITING, ++}; ++ ++struct vhba_command { ++ struct scsi_cmnd *cmd; ++ /* metatags are per-host. not to be confused with ++ queue tags that are usually per-lun */ ++ unsigned long metatag; ++ int status; ++ struct list_head entry; ++}; ++ ++struct vhba_device { ++ unsigned int num; ++ spinlock_t cmd_lock; ++ struct list_head cmd_list; ++ wait_queue_head_t cmd_wq; ++ atomic_t refcnt; ++ ++ unsigned char *kbuf; ++ size_t kbuf_size; ++}; ++ ++struct vhba_host { ++ struct Scsi_Host *shost; ++ spinlock_t cmd_lock; ++ int cmd_next; ++ struct vhba_command *commands; ++ spinlock_t dev_lock; ++ struct vhba_device *devices[VHBA_MAX_DEVICES]; ++ int num_devices; ++ DECLARE_BITMAP(chgmap, VHBA_MAX_DEVICES); ++ int chgtype[VHBA_MAX_DEVICES]; ++ struct work_struct scan_devices; ++}; ++ ++#define MAX_COMMAND_SIZE 16 ++ ++struct vhba_request { ++ __u32 metatag; ++ __u32 lun; ++ __u8 cdb[MAX_COMMAND_SIZE]; ++ __u8 cdb_len; ++ __u32 data_len; ++}; ++ ++struct vhba_response { ++ __u32 metatag; ++ __u32 status; ++ __u32 data_len; ++}; ++ ++ ++ ++static struct vhba_command *vhba_alloc_command (void); ++static void vhba_free_command (struct vhba_command *vcmd); ++ ++static struct platform_device vhba_platform_device; ++ ++ ++ ++/* These functions define a symmetric 1:1 mapping between device numbers and ++ the bus and id. We have reserved the last id per bus for the host itself. 
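A quick worked example of this mapping, added here for illustration (it is not part of the driver comment): with VHBA_MAX_ID = 16 each bus carries 15 usable ids, so the helpers defined just below behave like

    devnum_to_bus_and_id(17, &bus, &id)   ->  bus = 17 / 15 = 1, id = 17 % 15 = 2
    bus_and_id_to_devnum(1, 2)            ->  1 * 15 + 2     = 17

i.e. they invert each other, and id 15 (the slot reserved for the host) is never handed out to a device; VHBA_MAX_DEVICES therefore works out to 16 * 15 = 240.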
*/ ++static void devnum_to_bus_and_id(unsigned int devnum, unsigned int *bus, unsigned int *id) ++{ ++ *bus = devnum / (VHBA_MAX_ID-1); ++ *id = devnum % (VHBA_MAX_ID-1); ++} ++ ++static unsigned int bus_and_id_to_devnum(unsigned int bus, unsigned int id) ++{ ++ return (bus * (VHBA_MAX_ID-1)) + id; ++} ++ ++static struct vhba_device *vhba_device_alloc (void) ++{ ++ struct vhba_device *vdev; ++ ++ vdev = kzalloc(sizeof(struct vhba_device), GFP_KERNEL); ++ if (!vdev) { ++ return NULL; ++ } ++ ++ spin_lock_init(&vdev->cmd_lock); ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ init_waitqueue_head(&vdev->cmd_wq); ++ atomic_set(&vdev->refcnt, 1); ++ ++ vdev->kbuf = NULL; ++ vdev->kbuf_size = 0; ++ ++ return vdev; ++} ++ ++static void vhba_device_put (struct vhba_device *vdev) ++{ ++ if (atomic_dec_and_test(&vdev->refcnt)) { ++ kfree(vdev); ++ } ++} ++ ++static struct vhba_device *vhba_device_get (struct vhba_device *vdev) ++{ ++ atomic_inc(&vdev->refcnt); ++ ++ return vdev; ++} ++ ++static int vhba_device_queue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vcmd = vhba_alloc_command(); ++ if (!vcmd) { ++ return SCSI_MLQUEUE_HOST_BUSY; ++ } ++ ++ vcmd->cmd = cmd; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ vcmd->metatag = scsi_cmd_to_rq(vcmd->cmd)->tag; ++#else ++ vcmd->metatag = vcmd->cmd->request->tag; ++#endif ++ list_add_tail(&vcmd->entry, &vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ wake_up_interruptible(&vdev->cmd_wq); ++ ++ return 0; ++} ++ ++static int vhba_device_dequeue (struct vhba_device *vdev, struct scsi_cmnd *cmd) ++{ ++ struct vhba_command *vcmd; ++ int retval; ++ unsigned long flags; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->cmd == cmd) { ++ list_del_init(&vcmd->entry); ++ break; ++ } ++ } ++ ++ /* command not found */ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ return SUCCESS; ++ } ++ ++ while (vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ scmd_dbg(cmd, "wait for I/O before aborting\n"); ++ schedule_timeout(1); ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ retval = (vcmd->status == VHBA_REQ_SENT) ? 
FAILED : SUCCESS; ++ ++ vhba_free_command(vcmd); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return retval; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++static int vhba_slave_alloc(struct scsi_device *sdev) ++{ ++ struct Scsi_Host *shost = sdev->host; ++ ++ sdev_dbg(sdev, "enabling tagging (queue depth: %i).\n", sdev->queue_depth); ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 17, 0) ++ if (!shost_use_blk_mq(shost) && shost->bqt) { ++#else ++ if (shost->bqt) { ++#endif ++ blk_queue_init_tags(sdev->request_queue, sdev->queue_depth, shost->bqt); ++ } ++ scsi_adjust_queue_depth(sdev, 0, sdev->queue_depth); ++ ++ return 0; ++} ++#endif ++ ++static void vhba_scan_devices_add (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (!sdev) { ++ scsi_add_device(vhost->shost, bus, id, 0); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to add an already-existing device %d:%d:0!\n", bus, id); ++ scsi_device_put(sdev); ++ } ++} ++ ++static void vhba_scan_devices_remove (struct vhba_host *vhost, int bus, int id) ++{ ++ struct scsi_device *sdev; ++ ++ sdev = scsi_device_lookup(vhost->shost, bus, id, 0); ++ if (sdev) { ++ scsi_remove_device(sdev); ++ scsi_device_put(sdev); ++ } else { ++ dev_warn(&vhost->shost->shost_gendev, "tried to remove non-existing device %d:%d:0!\n", bus, id); ++ } ++} ++ ++static void vhba_scan_devices (struct work_struct *work) ++{ ++ struct vhba_host *vhost = container_of(work, struct vhba_host, scan_devices); ++ unsigned long flags; ++ int change, exists; ++ unsigned int devnum; ++ unsigned int bus, id; ++ ++ for (;;) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ ++ devnum = find_first_bit(vhost->chgmap, VHBA_MAX_DEVICES); ++ if (devnum >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ break; ++ } ++ change = vhost->chgtype[devnum]; ++ exists = vhost->devices[devnum] != NULL; ++ ++ vhost->chgtype[devnum] = 0; ++ clear_bit(devnum, vhost->chgmap); ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ devnum_to_bus_and_id(devnum, &bus, &id); ++ ++ if (change < 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to remove target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ } else if (change > 0) { ++ dev_dbg(&vhost->shost->shost_gendev, "trying to add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* quick sequence of add/remove or remove/add; we determine ++ which one it was by checking if device structure exists */ ++ if (exists) { ++ /* remove followed by add: remove and (re)add */ ++ dev_dbg(&vhost->shost->shost_gendev, "trying to (re)add target %d:%d:0\n", bus, id); ++ vhba_scan_devices_remove(vhost, bus, id); ++ vhba_scan_devices_add(vhost, bus, id); ++ } else { ++ /* add followed by remove: no-op */ ++ dev_dbg(&vhost->shost->shost_gendev, "no-op for target %d:%d:0\n", bus, id); ++ } ++ } ++ } ++} ++ ++static int vhba_add_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned int devnum; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ vhba_device_get(vdev); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ if (vhost->num_devices >= VHBA_MAX_DEVICES) { ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ vhba_device_put(vdev); ++ return -EBUSY; ++ } ++ ++ for (devnum = 0; devnum < VHBA_MAX_DEVICES; devnum++) { ++ if (vhost->devices[devnum] == NULL) { ++ vdev->num = devnum; ++ 
vhost->devices[devnum] = vdev; ++ vhost->num_devices++; ++ set_bit(devnum, vhost->chgmap); ++ vhost->chgtype[devnum]++; ++ break; ++ } ++ } ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static int vhba_remove_device (struct vhba_device *vdev) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ set_bit(vdev->num, vhost->chgmap); ++ vhost->chgtype[vdev->num]--; ++ vhost->devices[vdev->num] = NULL; ++ vhost->num_devices--; ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ ++ vhba_device_put(vdev); ++ ++ schedule_work(&vhost->scan_devices); ++ ++ return 0; ++} ++ ++static struct vhba_device *vhba_lookup_device (int devnum) ++{ ++ struct vhba_host *vhost; ++ struct vhba_device *vdev = NULL; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ if (likely(devnum < VHBA_MAX_DEVICES)) { ++ spin_lock_irqsave(&vhost->dev_lock, flags); ++ vdev = vhost->devices[devnum]; ++ if (vdev) { ++ vdev = vhba_device_get(vdev); ++ } ++ ++ spin_unlock_irqrestore(&vhost->dev_lock, flags); ++ } ++ ++ return vdev; ++} ++ ++static struct vhba_command *vhba_alloc_command (void) ++{ ++ struct vhba_host *vhost; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ int i; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ ++ vcmd = vhost->commands + vhost->cmd_next++; ++ if (vcmd->status != VHBA_REQ_FREE) { ++ for (i = 0; i < vhba_can_queue; i++) { ++ vcmd = vhost->commands + i; ++ ++ if (vcmd->status == VHBA_REQ_FREE) { ++ vhost->cmd_next = i + 1; ++ break; ++ } ++ } ++ ++ if (i == vhba_can_queue) { ++ vcmd = NULL; ++ } ++ } ++ ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ vhost->cmd_next %= vhba_can_queue; ++ ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++ ++ return vcmd; ++} ++ ++static void vhba_free_command (struct vhba_command *vcmd) ++{ ++ struct vhba_host *vhost; ++ unsigned long flags; ++ ++ vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ spin_lock_irqsave(&vhost->cmd_lock, flags); ++ vcmd->status = VHBA_REQ_FREE; ++ spin_unlock_irqrestore(&vhost->cmd_lock, flags); ++} ++ ++static int vhba_queuecommand (struct Scsi_Host *shost, struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ unsigned int devnum; ++ ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 15, 0) ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, scsi_cmd_to_rq(cmd)->tag); ++#else ++ scmd_dbg(cmd, "queue %p tag %i\n", cmd, cmd->request->tag); ++#endif ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (!vdev) { ++ scmd_dbg(cmd, "no such device\n"); ++ ++ cmd->result = DID_NO_CONNECT << 16; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(cmd); ++#else ++ cmd->scsi_done(cmd); ++#endif ++ ++ return 0; ++ } ++ ++ retval = vhba_device_queue(vdev, cmd); ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_abort (struct scsi_cmnd *cmd) ++{ ++ struct vhba_device *vdev; ++ int retval = SUCCESS; ++ unsigned int devnum; ++ ++ scmd_dbg(cmd, "abort %p\n", cmd); ++ ++ devnum = bus_and_id_to_devnum(cmd->device->channel, cmd->device->id); ++ vdev = vhba_lookup_device(devnum); ++ if (vdev) { ++ retval = vhba_device_dequeue(vdev, cmd); ++ vhba_device_put(vdev); ++ } else { ++ cmd->result = DID_NO_CONNECT << 16; ++ } ++ ++ return 
retval; ++} ++ ++static struct scsi_host_template vhba_template = { ++ .module = THIS_MODULE, ++ .name = "vhba", ++ .proc_name = "vhba", ++ .queuecommand = vhba_queuecommand, ++ .eh_abort_handler = vhba_abort, ++ .this_id = -1, ++ .max_sectors = VHBA_MAX_SECTORS_PER_IO, ++ .sg_tablesize = 256, ++#if LINUX_VERSION_CODE < KERNEL_VERSION(3, 19, 0) ++ .slave_alloc = vhba_slave_alloc, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(4, 0, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(6, 14, 0) ++ .tag_alloc_policy = BLK_TAG_ALLOC_RR, ++#else ++ .tag_alloc_policy_rr = true, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(3, 19, 0) && LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ .use_blk_tags = 1, ++#endif ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 0, 0) ++ .max_segment_size = VHBA_KBUF_SIZE, ++#endif ++}; ++ ++static ssize_t do_request (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, char __user *buf, size_t buf_len) ++{ ++ struct vhba_request vreq; ++ ssize_t ret; ++ ++ scmd_dbg(cmd, "request %lu (%p), cdb 0x%x, bufflen %d, sg count %d\n", ++ metatag, cmd, cmd->cmnd[0], scsi_bufflen(cmd), scsi_sg_count(cmd)); ++ ++ ret = sizeof(vreq); ++ if (DATA_TO_DEVICE(cmd->sc_data_direction)) { ++ ret += scsi_bufflen(cmd); ++ } ++ ++ if (ret > buf_len) { ++ scmd_dbg(cmd, "buffer too small (%zd < %zd) for a request\n", buf_len, ret); ++ return -EIO; ++ } ++ ++ vreq.metatag = metatag; ++ vreq.lun = cmd->device->lun; ++ memcpy(vreq.cdb, cmd->cmnd, MAX_COMMAND_SIZE); ++ vreq.cdb_len = cmd->cmd_len; ++ vreq.data_len = scsi_bufflen(cmd); ++ ++ if (copy_to_user(buf, &vreq, sizeof(vreq))) { ++ return -EFAULT; ++ } ++ ++ if (DATA_TO_DEVICE(cmd->sc_data_direction) && vreq.data_len) { ++ buf += sizeof(vreq); ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char *kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist *sg; ++ int i; ++ ++ uaddr = (unsigned char *) buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = sg->length; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(vdev->kbuf, kaddr + sg->offset, len); ++ kunmap_atomic(kaddr); ++ ++ if (copy_to_user(uaddr, vdev->kbuf, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ } ++ } else { ++ if (copy_to_user(buf, scsi_sglist(cmd), vreq.data_len)) { ++ return -EFAULT; ++ } ++ } ++ } ++ ++ return ret; ++} ++ ++static ssize_t do_response (struct vhba_device *vdev, unsigned long metatag, struct scsi_cmnd *cmd, const char __user *buf, size_t buf_len, struct vhba_response *res) ++{ ++ ssize_t ret = 0; ++ ++ scmd_dbg(cmd, "response %lu (%p), status %x, data len %d, sg count %d\n", ++ metatag, cmd, res->status, res->data_len, scsi_sg_count(cmd)); ++ ++ if (res->status) { ++ if (res->data_len > SCSI_SENSE_BUFFERSIZE) { ++ scmd_dbg(cmd, "truncate sense (%d < %d)", SCSI_SENSE_BUFFERSIZE, res->data_len); ++ res->data_len = SCSI_SENSE_BUFFERSIZE; ++ } ++ ++ if (copy_from_user(cmd->sense_buffer, buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ cmd->result = res->status; ++ ++ ret += res->data_len; ++ } else if (DATA_FROM_DEVICE(cmd->sc_data_direction) && scsi_bufflen(cmd)) { ++ size_t to_read; ++ ++ if (res->data_len > scsi_bufflen(cmd)) { ++ scmd_dbg(cmd, "truncate data (%d < %d)\n", scsi_bufflen(cmd), res->data_len); ++ res->data_len = scsi_bufflen(cmd); ++ } ++ ++ to_read = res->data_len; ++ ++ if (scsi_sg_count(cmd)) { ++ unsigned char 
*kaddr, *uaddr; ++ struct scatterlist *sglist = scsi_sglist(cmd); ++ struct scatterlist *sg; ++ int i; ++ ++ uaddr = (unsigned char *)buf; ++ ++ for_each_sg(sglist, sg, scsi_sg_count(cmd), i) { ++ size_t len = (sg->length < to_read) ? sg->length : to_read; ++ ++ if (len > vdev->kbuf_size) { ++ scmd_dbg(cmd, "segment size (%zu) exceeds kbuf size (%zu)!", len, vdev->kbuf_size); ++ len = vdev->kbuf_size; ++ } ++ ++ if (copy_from_user(vdev->kbuf, uaddr, len)) { ++ return -EFAULT; ++ } ++ uaddr += len; ++ ++ kaddr = kmap_atomic(sg_page(sg)); ++ memcpy(kaddr + sg->offset, vdev->kbuf, len); ++ kunmap_atomic(kaddr); ++ ++ to_read -= len; ++ if (to_read == 0) { ++ break; ++ } ++ } ++ } else { ++ if (copy_from_user(scsi_sglist(cmd), buf, res->data_len)) { ++ return -EFAULT; ++ } ++ ++ to_read -= res->data_len; ++ } ++ ++ scsi_set_resid(cmd, to_read); ++ ++ ret += res->data_len - to_read; ++ } ++ ++ return ret; ++} ++ ++static struct vhba_command *next_command (struct vhba_device *vdev) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->status == VHBA_REQ_PENDING) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *match_command (struct vhba_device *vdev, __u32 metatag) ++{ ++ struct vhba_command *vcmd; ++ ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ if (vcmd->metatag == metatag) { ++ break; ++ } ++ } ++ ++ if (&vcmd->entry == &vdev->cmd_list) { ++ vcmd = NULL; ++ } ++ ++ return vcmd; ++} ++ ++static struct vhba_command *wait_command (struct vhba_device *vdev, unsigned long flags) ++{ ++ struct vhba_command *vcmd; ++ DEFINE_WAIT(wait); ++ ++ while (!(vcmd = next_command(vdev))) { ++ if (signal_pending(current)) { ++ break; ++ } ++ ++ prepare_to_wait(&vdev->cmd_wq, &wait, TASK_INTERRUPTIBLE); ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ schedule(); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ } ++ ++ finish_wait(&vdev->cmd_wq, &wait); ++ if (vcmd) { ++ vcmd->status = VHBA_REQ_READING; ++ } ++ ++ return vcmd; ++} ++ ++static ssize_t vhba_ctl_read (struct file *file, char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ ssize_t ret; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ /* Get next command */ ++ if (file->f_flags & O_NONBLOCK) { ++ /* Non-blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = next_command(vdev); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -EWOULDBLOCK; ++ } ++ } else { ++ /* Blocking variant */ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = wait_command(vdev, flags); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ if (!vcmd) { ++ return -ERESTARTSYS; ++ } ++ } ++ ++ ret = do_request(vdev, vcmd->metatag, vcmd->cmd, buf, buf_len); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++ vcmd->status = VHBA_REQ_SENT; ++ *offset += ret; ++ } else { ++ vcmd->status = VHBA_REQ_PENDING; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static ssize_t vhba_ctl_write (struct file *file, const char __user *buf, size_t buf_len, loff_t *offset) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ struct vhba_response res; ++ ssize_t ret; ++ unsigned long flags; ++ ++ if (buf_len < sizeof(res)) { ++ return -EIO; ++ } ++ ++ if (copy_from_user(&res, buf, sizeof(res))) { ++ return -EFAULT; ++ } ++ ++ 
vdev = file->private_data; ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ vcmd = match_command(vdev, res.metatag); ++ if (!vcmd || vcmd->status != VHBA_REQ_SENT) { ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ pr_debug("ctl dev #%u not expecting response\n", vdev->num); ++ return -EIO; ++ } ++ vcmd->status = VHBA_REQ_WRITING; ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ ret = do_response(vdev, vcmd->metatag, vcmd->cmd, buf + sizeof(res), buf_len - sizeof(res), &res); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (ret >= 0) { ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ ret += sizeof(res); ++ ++ /* don't compete with vhba_device_dequeue */ ++ if (!list_empty(&vcmd->entry)) { ++ list_del_init(&vcmd->entry); ++ vhba_free_command(vcmd); ++ } ++ } else { ++ vcmd->status = VHBA_REQ_SENT; ++ } ++ ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return ret; ++} ++ ++static long vhba_ctl_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ struct vhba_device *vdev = file->private_data; ++ struct vhba_host *vhost = platform_get_drvdata(&vhba_platform_device); ++ ++ switch (cmd) { ++ case 0xBEEF001: { ++ unsigned int ident[4]; /* host, channel, id, lun */ ++ ++ ident[0] = vhost->shost->host_no; ++ devnum_to_bus_and_id(vdev->num, &ident[1], &ident[2]); ++ ident[3] = 0; /* lun */ ++ ++ if (copy_to_user((void *) arg, ident, sizeof(ident))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ case 0xBEEF002: { ++ unsigned int devnum = vdev->num; ++ ++ if (copy_to_user((void *) arg, &devnum, sizeof(devnum))) { ++ return -EFAULT; ++ } ++ ++ return 0; ++ } ++ } ++ ++ return -ENOTTY; ++} ++ ++#ifdef CONFIG_COMPAT ++static long vhba_ctl_compat_ioctl (struct file *file, unsigned int cmd, unsigned long arg) ++{ ++ unsigned long compat_arg = (unsigned long)compat_ptr(arg); ++ return vhba_ctl_ioctl(file, cmd, compat_arg); ++} ++#endif ++ ++static unsigned int vhba_ctl_poll (struct file *file, poll_table *wait) ++{ ++ struct vhba_device *vdev = file->private_data; ++ unsigned int mask = 0; ++ unsigned long flags; ++ ++ poll_wait(file, &vdev->cmd_wq, wait); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ if (next_command(vdev)) { ++ mask |= POLLIN | POLLRDNORM; ++ } ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ return mask; ++} ++ ++static int vhba_ctl_open (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ int retval; ++ ++ pr_debug("ctl dev open\n"); ++ ++ /* check if vhba is probed */ ++ if (!platform_get_drvdata(&vhba_platform_device)) { ++ return -ENODEV; ++ } ++ ++ vdev = vhba_device_alloc(); ++ if (!vdev) { ++ return -ENOMEM; ++ } ++ ++ vdev->kbuf_size = VHBA_KBUF_SIZE; ++ vdev->kbuf = kzalloc(vdev->kbuf_size, GFP_KERNEL); ++ if (!vdev->kbuf) { ++ return -ENOMEM; ++ } ++ ++ if (!(retval = vhba_add_device(vdev))) { ++ file->private_data = vdev; ++ } ++ ++ vhba_device_put(vdev); ++ ++ return retval; ++} ++ ++static int vhba_ctl_release (struct inode *inode, struct file *file) ++{ ++ struct vhba_device *vdev; ++ struct vhba_command *vcmd; ++ unsigned long flags; ++ ++ vdev = file->private_data; ++ ++ pr_debug("ctl dev release\n"); ++ ++ vhba_device_get(vdev); ++ vhba_remove_device(vdev); ++ ++ spin_lock_irqsave(&vdev->cmd_lock, flags); ++ list_for_each_entry(vcmd, &vdev->cmd_list, entry) { ++ WARN_ON(vcmd->status == VHBA_REQ_READING || vcmd->status == VHBA_REQ_WRITING); ++ ++ scmd_dbg(vcmd->cmd, "device released with command %lu 
(%p)\n", vcmd->metatag, vcmd->cmd); ++ vcmd->cmd->result = DID_NO_CONNECT << 16; ++#if LINUX_VERSION_CODE >= KERNEL_VERSION(5, 16, 0) ++ scsi_done(vcmd->cmd); ++#else ++ vcmd->cmd->scsi_done(vcmd->cmd); ++#endif ++ vhba_free_command(vcmd); ++ } ++ INIT_LIST_HEAD(&vdev->cmd_list); ++ spin_unlock_irqrestore(&vdev->cmd_lock, flags); ++ ++ kfree(vdev->kbuf); ++ vdev->kbuf = NULL; ++ ++ vhba_device_put(vdev); ++ ++ return 0; ++} ++ ++static struct file_operations vhba_ctl_fops = { ++ .owner = THIS_MODULE, ++ .open = vhba_ctl_open, ++ .release = vhba_ctl_release, ++ .read = vhba_ctl_read, ++ .write = vhba_ctl_write, ++ .poll = vhba_ctl_poll, ++ .unlocked_ioctl = vhba_ctl_ioctl, ++#ifdef CONFIG_COMPAT ++ .compat_ioctl = vhba_ctl_compat_ioctl, ++#endif ++}; ++ ++static struct miscdevice vhba_miscdev = { ++ .minor = MISC_DYNAMIC_MINOR, ++ .name = "vhba_ctl", ++ .fops = &vhba_ctl_fops, ++}; ++ ++static int vhba_probe (struct platform_device *pdev) ++{ ++ struct Scsi_Host *shost; ++ struct vhba_host *vhost; ++ int i; ++ ++ vhba_can_queue = clamp(vhba_can_queue, 1, 256); ++ ++ shost = scsi_host_alloc(&vhba_template, sizeof(struct vhba_host)); ++ if (!shost) { ++ return -ENOMEM; ++ } ++ ++ shost->max_channel = VHBA_MAX_BUS-1; ++ shost->max_id = VHBA_MAX_ID; ++ /* we don't support lun > 0 */ ++ shost->max_lun = 1; ++ shost->max_cmd_len = MAX_COMMAND_SIZE; ++ shost->can_queue = vhba_can_queue; ++ shost->cmd_per_lun = vhba_can_queue; ++ ++ vhost = (struct vhba_host *)shost->hostdata; ++ memset(vhost, 0, sizeof(struct vhba_host)); ++ ++ vhost->shost = shost; ++ vhost->num_devices = 0; ++ spin_lock_init(&vhost->dev_lock); ++ spin_lock_init(&vhost->cmd_lock); ++ INIT_WORK(&vhost->scan_devices, vhba_scan_devices); ++ vhost->cmd_next = 0; ++ vhost->commands = kzalloc(vhba_can_queue * sizeof(struct vhba_command), GFP_KERNEL); ++ if (!vhost->commands) { ++ return -ENOMEM; ++ } ++ ++ for (i = 0; i < vhba_can_queue; i++) { ++ vhost->commands[i].status = VHBA_REQ_FREE; ++ } ++ ++ platform_set_drvdata(pdev, vhost); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(4, 4, 0) ++ i = scsi_init_shared_tag_map(shost, vhba_can_queue); ++ if (i) return i; ++#endif ++ ++ if (scsi_add_host(shost, &pdev->dev)) { ++ scsi_host_put(shost); ++ return -ENOMEM; ++ } ++ ++ return 0; ++} ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) ++static int vhba_remove (struct platform_device *pdev) ++#else ++static void vhba_remove (struct platform_device *pdev) ++#endif ++{ ++ struct vhba_host *vhost; ++ struct Scsi_Host *shost; ++ ++ vhost = platform_get_drvdata(pdev); ++ shost = vhost->shost; ++ ++ scsi_remove_host(shost); ++ scsi_host_put(shost); ++ ++ kfree(vhost->commands); ++ ++#if LINUX_VERSION_CODE < KERNEL_VERSION(6, 11, 0) ++ return 0; ++#endif ++} ++ ++static void vhba_release (struct device * dev) ++{ ++ return; ++} ++ ++static struct platform_device vhba_platform_device = { ++ .name = "vhba", ++ .id = -1, ++ .dev = { ++ .release = vhba_release, ++ }, ++}; ++ ++static struct platform_driver vhba_platform_driver = { ++ .driver = { ++ .owner = THIS_MODULE, ++ .name = "vhba", ++ }, ++ .probe = vhba_probe, ++ .remove = vhba_remove, ++}; ++ ++static int __init vhba_init (void) ++{ ++ int ret; ++ ++ ret = platform_device_register(&vhba_platform_device); ++ if (ret < 0) { ++ return ret; ++ } ++ ++ ret = platform_driver_register(&vhba_platform_driver); ++ if (ret < 0) { ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ ret = misc_register(&vhba_miscdev); ++ if (ret < 0) { ++ 
platform_driver_unregister(&vhba_platform_driver); ++ platform_device_unregister(&vhba_platform_device); ++ return ret; ++ } ++ ++ return 0; ++} ++ ++static void __exit vhba_exit(void) ++{ ++ misc_deregister(&vhba_miscdev); ++ platform_driver_unregister(&vhba_platform_driver); ++ platform_device_unregister(&vhba_platform_device); ++} ++ ++module_init(vhba_init); ++module_exit(vhba_exit); ++ +diff --git a/include/linux/mm.h b/include/linux/mm.h +index 1ae97a0b8ec7..db640e1b17ec 100644 +--- a/include/linux/mm.h ++++ b/include/linux/mm.h +@@ -194,6 +194,14 @@ static inline void __mm_zero_struct_page(struct page *page) + + extern int sysctl_max_map_count; + ++extern bool sysctl_workingset_protection; ++extern u8 sysctl_anon_min_ratio; ++extern u8 sysctl_clean_low_ratio; ++extern u8 sysctl_clean_min_ratio; ++int vm_workingset_protection_update_handler( ++ const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos); ++ + extern unsigned long sysctl_user_reserve_kbytes; + extern unsigned long sysctl_admin_reserve_kbytes; + +diff --git a/include/linux/pagemap.h b/include/linux/pagemap.h +index 12a12dae727d..b460a691b357 100644 +--- a/include/linux/pagemap.h ++++ b/include/linux/pagemap.h +@@ -1337,7 +1337,7 @@ struct readahead_control { + ._index = i, \ + } + +-#define VM_READAHEAD_PAGES (SZ_128K / PAGE_SIZE) ++#define VM_READAHEAD_PAGES (SZ_8M / PAGE_SIZE) + + void page_cache_ra_unbounded(struct readahead_control *, + unsigned long nr_to_read, unsigned long lookahead_count); +diff --git a/include/linux/user_namespace.h b/include/linux/user_namespace.h +index a0bb6d012137..93129fea552e 100644 +--- a/include/linux/user_namespace.h ++++ b/include/linux/user_namespace.h +@@ -168,6 +168,8 @@ static inline void set_userns_rlimit_max(struct user_namespace *ns, + + #ifdef CONFIG_USER_NS + ++extern int unprivileged_userns_clone; ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + if (ns) +@@ -201,6 +203,8 @@ extern bool current_in_userns(const struct user_namespace *target_ns); + struct ns_common *ns_get_owner(struct ns_common *ns); + #else + ++#define unprivileged_userns_clone 0 ++ + static inline struct user_namespace *get_user_ns(struct user_namespace *ns) + { + return &init_user_ns; +diff --git a/init/Kconfig b/init/Kconfig +index d811cad02a75..e4b7a7062838 100644 +--- a/init/Kconfig ++++ b/init/Kconfig +@@ -171,6 +171,10 @@ config THREAD_INFO_IN_TASK + + menu "General setup" + ++config CACHY ++ bool "Some kernel tweaks by CachyOS" ++ default y ++ + config BROKEN + bool + help +@@ -1375,6 +1379,22 @@ config USER_NS + + If unsure, say N. + ++config USER_NS_UNPRIVILEGED ++ bool "Allow unprivileged users to create namespaces" ++ default y ++ depends on USER_NS ++ help ++ When disabled, unprivileged users will not be able to create ++ new namespaces. Allowing users to create their own namespaces ++ has been part of several recent local privilege escalation ++ exploits, so if you need user namespaces but are ++ paranoid^Wsecurity-conscious you want to disable this. ++ ++ This setting can be overridden at runtime via the ++ kernel.unprivileged_userns_clone sysctl. ++ ++ If unsure, say Y. ++ + config PID_NS + bool "PID Namespaces" + default y +@@ -1524,6 +1544,12 @@ config CC_OPTIMIZE_FOR_PERFORMANCE + with the "-O2" compiler flag for best performance and most + helpful compile-time warnings. 
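The USER_NS_UNPRIVILEGED help text above notes that the default can be changed at runtime through the kernel.unprivileged_userns_clone sysctl this patch registers. As a minimal userspace sketch (not part of the patch, assuming procfs is mounted at /proc and the caller may write the file), the knob can be flipped like this:

    #include <stdio.h>

    int main(void)
    {
            /* 1 allows unprivileged user-namespace creation, 0 restricts it
             * to CAP_SYS_ADMIN (see the copy_process()/ksys_unshare() checks
             * added further down in this patch). */
            FILE *f = fopen("/proc/sys/kernel/unprivileged_userns_clone", "w");

            if (!f) {
                    perror("unprivileged_userns_clone");
                    return 1;
            }
            fputs("0\n", f);
            return fclose(f) ? 1 : 0;
    }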
+ ++config CC_OPTIMIZE_FOR_PERFORMANCE_O3 ++ bool "Optimize more for performance (-O3)" ++ help ++ Choosing this option will pass "-O3" to your compiler to optimize ++ the kernel yet more for performance. ++ + config CC_OPTIMIZE_FOR_SIZE + bool "Optimize for size (-Os)" + help +diff --git a/kernel/Kconfig.hz b/kernel/Kconfig.hz +index ce1435cb08b1..e1359db5561e 100644 +--- a/kernel/Kconfig.hz ++++ b/kernel/Kconfig.hz +@@ -40,6 +40,27 @@ choice + on SMP and NUMA systems and exactly dividing by both PAL and + NTSC frame rates for video and multimedia work. + ++ config HZ_500 ++ bool "500 HZ" ++ help ++ 500 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_600 ++ bool "600 HZ" ++ help ++ 600 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ ++ config HZ_750 ++ bool "750 HZ" ++ help ++ 750 Hz is a balanced timer frequency. Provides fast interactivity ++ on desktops with good smoothness without increasing CPU power ++ consumption and sacrificing the battery life on laptops. ++ + config HZ_1000 + bool "1000 HZ" + help +@@ -53,6 +74,9 @@ config HZ + default 100 if HZ_100 + default 250 if HZ_250 + default 300 if HZ_300 ++ default 500 if HZ_500 ++ default 600 if HZ_600 ++ default 750 if HZ_750 + default 1000 if HZ_1000 + + config SCHED_HRTICK +diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt +index 54ea59ff8fbe..18f87e0dd137 100644 +--- a/kernel/Kconfig.preempt ++++ b/kernel/Kconfig.preempt +@@ -88,7 +88,7 @@ endchoice + + config PREEMPT_RT + bool "Fully Preemptible Kernel (Real-Time)" +- depends on EXPERT && ARCH_SUPPORTS_RT && !COMPILE_TEST ++ depends on ARCH_SUPPORTS_RT && !COMPILE_TEST + select PREEMPTION + help + This option turns the kernel into a real-time kernel by replacing +diff --git a/kernel/fork.c b/kernel/fork.c +index af673856499d..d91fa2d9bce1 100644 +--- a/kernel/fork.c ++++ b/kernel/fork.c +@@ -107,6 +107,10 @@ + #include + #include + ++#ifdef CONFIG_USER_NS ++#include ++#endif ++ + #include + #include + #include +@@ -1938,6 +1942,10 @@ __latent_entropy struct task_struct *copy_process( + if ((clone_flags & (CLONE_NEWUSER|CLONE_FS)) == (CLONE_NEWUSER|CLONE_FS)) + return ERR_PTR(-EINVAL); + ++ if ((clone_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) ++ if (!capable(CAP_SYS_ADMIN)) ++ return ERR_PTR(-EPERM); ++ + /* + * Thread groups must share signals as well, and detached threads + * can only be started up within the thread group. 
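The effect of the copy_process() gate in the hunk above (and of the matching ksys_unshare() check in the next hunk) can be observed from userspace. A rough sketch, not part of the patch: with kernel.unprivileged_userns_clone set to 0 and no CAP_SYS_ADMIN, the call below is expected to fail with EPERM, while with the default of 1 it succeeds.

    #define _GNU_SOURCE
    #include <sched.h>
    #include <errno.h>
    #include <stdio.h>
    #include <string.h>

    int main(void)
    {
            if (unshare(CLONE_NEWUSER) == -1) {
                    /* EPERM here indicates the unprivileged_userns_clone policy
                     * (or another restriction) blocked the new user namespace. */
                    printf("unshare(CLONE_NEWUSER): %s\n", strerror(errno));
                    return 1;
            }
            printf("entered a new user namespace\n");
            return 0;
    }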
+@@ -3105,6 +3113,12 @@ int ksys_unshare(unsigned long unshare_flags) + if (unshare_flags & CLONE_NEWNS) + unshare_flags |= CLONE_FS; + ++ if ((unshare_flags & CLONE_NEWUSER) && !unprivileged_userns_clone) { ++ err = -EPERM; ++ if (!capable(CAP_SYS_ADMIN)) ++ goto bad_unshare_out; ++ } ++ + err = check_unshare_flags(unshare_flags); + if (err) + goto bad_unshare_out; +diff --git a/kernel/locking/rwsem.c b/kernel/locking/rwsem.c +index 24df4d98f7d2..1d5923996fa5 100644 +--- a/kernel/locking/rwsem.c ++++ b/kernel/locking/rwsem.c +@@ -746,6 +746,7 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + struct task_struct *new, *owner; + unsigned long flags, new_flags; + enum owner_state state; ++ int i = 0; + + lockdep_assert_preemption_disabled(); + +@@ -782,7 +783,8 @@ rwsem_spin_on_owner(struct rw_semaphore *sem) + break; + } + +- cpu_relax(); ++ if (i++ > 1000) ++ cpu_relax(); + } + + return state; +diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c +index b173a059315c..226a96cd2536 100644 +--- a/kernel/sched/fair.c ++++ b/kernel/sched/fair.c +@@ -76,10 +76,19 @@ unsigned int sysctl_sched_tunable_scaling = SCHED_TUNABLESCALING_LOG; + * + * (default: 0.70 msec * (1 + ilog(ncpus)), units: nanoseconds) + */ ++#ifdef CONFIG_CACHY ++unsigned int sysctl_sched_base_slice = 350000ULL; ++static unsigned int normalized_sysctl_sched_base_slice = 350000ULL; ++#else + unsigned int sysctl_sched_base_slice = 700000ULL; + static unsigned int normalized_sysctl_sched_base_slice = 700000ULL; ++#endif /* CONFIG_CACHY */ + ++#ifdef CONFIG_CACHY ++__read_mostly unsigned int sysctl_sched_migration_cost = 300000UL; ++#else + __read_mostly unsigned int sysctl_sched_migration_cost = 500000UL; ++#endif + + static int __init setup_sched_thermal_decay_shift(char *str) + { +@@ -122,8 +131,12 @@ int __weak arch_asym_cpu_priority(int cpu) + * + * (default: 5 msec, units: microseconds) + */ ++#ifdef CONFIG_CACHY ++static unsigned int sysctl_sched_cfs_bandwidth_slice = 3000UL; ++#else + static unsigned int sysctl_sched_cfs_bandwidth_slice = 5000UL; + #endif ++#endif + + #ifdef CONFIG_NUMA_BALANCING + /* Restrict the NUMA promotion throughput (MB/s) for each target node. 
*/ +diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h +index be9745d104f7..4ee277cb92b9 100644 +--- a/kernel/sched/sched.h ++++ b/kernel/sched/sched.h +@@ -2769,7 +2769,7 @@ extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); + + extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); + +-#ifdef CONFIG_PREEMPT_RT ++#if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) + # define SCHED_NR_MIGRATE_BREAK 8 + #else + # define SCHED_NR_MIGRATE_BREAK 32 +diff --git a/kernel/sysctl.c b/kernel/sysctl.c +index cb6196e3fa99..cc5bf841e3fe 100644 +--- a/kernel/sysctl.c ++++ b/kernel/sysctl.c +@@ -23,6 +23,10 @@ + #include + #include + ++#ifdef CONFIG_USER_NS ++#include ++#endif ++ + /* shared constants to be used in various sysctls */ + const int sysctl_vals[] = { 0, 1, 2, 3, 4, 100, 200, 1000, 3000, INT_MAX, 65535, -1 }; + EXPORT_SYMBOL(sysctl_vals); +@@ -1455,6 +1459,15 @@ int proc_do_static_key(const struct ctl_table *table, int write, + } + + static const struct ctl_table sysctl_subsys_table[] = { ++#ifdef CONFIG_USER_NS ++ { ++ .procname = "unprivileged_userns_clone", ++ .data = &unprivileged_userns_clone, ++ .maxlen = sizeof(int), ++ .mode = 0644, ++ .proc_handler = proc_dointvec, ++ }, ++#endif + #ifdef CONFIG_PROC_SYSCTL + { + .procname = "sysctl_writes_strict", +diff --git a/kernel/user_namespace.c b/kernel/user_namespace.c +index 682f40d5632d..434a25f7b2ed 100644 +--- a/kernel/user_namespace.c ++++ b/kernel/user_namespace.c +@@ -22,6 +22,13 @@ + #include + #include + ++/* sysctl */ ++#ifdef CONFIG_USER_NS_UNPRIVILEGED ++int unprivileged_userns_clone = 1; ++#else ++int unprivileged_userns_clone; ++#endif ++ + static struct kmem_cache *user_ns_cachep __ro_after_init; + static DEFINE_MUTEX(userns_state_mutex); + +diff --git a/mm/Kconfig b/mm/Kconfig +index e443fe8cd6cf..d3148d9d335d 100644 +--- a/mm/Kconfig ++++ b/mm/Kconfig +@@ -462,6 +462,69 @@ config ARCH_WANT_OPTIMIZE_HUGETLB_VMEMMAP + config ARCH_WANT_HUGETLB_VMEMMAP_PREINIT + bool + ++config ANON_MIN_RATIO ++ int "Default value for vm.anon_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 1 ++ help ++ This option sets the default value for vm.anon_min_ratio sysctl knob. ++ ++ The vm.anon_min_ratio sysctl knob provides *hard* protection of ++ anonymous pages. The anonymous pages on the current node won't be ++ reclaimed under any conditions when their amount is below ++ vm.anon_min_ratio. This knob may be used to prevent excessive swap ++ thrashing when anonymous memory is low (for example, when memory is ++ going to be overfilled by compressed data of zram module). ++ ++ Setting this value too high (close to MemTotal) can result in ++ inability to swap and can lead to early OOM under memory pressure. ++ ++config CLEAN_LOW_RATIO ++ int "Default value for vm.clean_low_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 15 ++ help ++ This option sets the default value for vm.clean_low_ratio sysctl knob. ++ ++ The vm.clean_low_ratio sysctl knob provides *best-effort* ++ protection of clean file pages. The file pages on the current node ++ won't be reclaimed under memory pressure when the amount of clean file ++ pages is below vm.clean_low_ratio *unless* we threaten to OOM. ++ Protection of clean file pages using this knob may be used when ++ swapping is still possible to ++ - prevent disk I/O thrashing under memory pressure; ++ - improve performance in disk cache-bound tasks under memory ++ pressure. 
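To make the percentages above concrete (an illustrative calculation, not part of the upstream help text): prepare_workingset_protection() in the mm/vmscan.c hunk further down converts each ratio into a per-node page threshold as

    threshold = node_totalram_pages * ratio / 100

so on a 16 GiB node with 4 KiB pages (4,194,304 pages), vm.clean_low_ratio = 15 protects roughly 629,145 pages, about 2.4 GiB of clean file cache, and vm.anon_min_ratio = 1 protects roughly 41,943 pages, about 164 MiB of anonymous memory.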
++ ++ Setting it to a high value may result in a early eviction of anonymous ++ pages into the swap space by attempting to hold the protected amount ++ of clean file pages in memory. ++ ++config CLEAN_MIN_RATIO ++ int "Default value for vm.clean_min_ratio" ++ depends on SYSCTL ++ range 0 100 ++ default 4 ++ help ++ This option sets the default value for vm.clean_min_ratio sysctl knob. ++ ++ The vm.clean_min_ratio sysctl knob provides *hard* protection of ++ clean file pages. The file pages on the current node won't be ++ reclaimed under memory pressure when the amount of clean file pages is ++ below vm.clean_min_ratio. Hard protection of clean file pages using ++ this knob may be used to ++ - prevent disk I/O thrashing under memory pressure even with no free ++ swap space; ++ - improve performance in disk cache-bound tasks under memory ++ pressure; ++ - avoid high latency and prevent livelock in near-OOM conditions. ++ ++ Setting it to a high value may result in a early out-of-memory condition ++ due to the inability to reclaim the protected amount of clean file pages ++ when other types of pages cannot be reclaimed. ++ + config HAVE_MEMBLOCK_PHYS_MAP + bool + +@@ -658,7 +721,7 @@ config COMPACTION + config COMPACT_UNEVICTABLE_DEFAULT + int + depends on COMPACTION +- default 0 if PREEMPT_RT ++ default 0 if PREEMPT_RT || CACHY + default 1 + + # +diff --git a/mm/compaction.c b/mm/compaction.c +index bf021b31c7ec..cd1c1ece9888 100644 +--- a/mm/compaction.c ++++ b/mm/compaction.c +@@ -1887,7 +1887,11 @@ static int sysctl_compact_unevictable_allowed __read_mostly = CONFIG_COMPACT_UNE + * aggressively the kernel should compact memory in the + * background. It takes values in the range [0, 100]. + */ ++#ifdef CONFIG_CACHY ++static unsigned int __read_mostly sysctl_compaction_proactiveness; ++#else + static unsigned int __read_mostly sysctl_compaction_proactiveness = 20; ++#endif + static int sysctl_extfrag_threshold = 500; + static int __read_mostly sysctl_compact_memory; + +diff --git a/mm/huge_memory.c b/mm/huge_memory.c +index 9c38a95e9f09..4bc77b92d649 100644 +--- a/mm/huge_memory.c ++++ b/mm/huge_memory.c +@@ -63,7 +63,11 @@ unsigned long transparent_hugepage_flags __read_mostly = + #ifdef CONFIG_TRANSPARENT_HUGEPAGE_MADVISE + (1<> (20 - PAGE_SHIFT); + + /* Use a smaller cluster for small-memory machines */ +@@ -1103,6 +1107,7 @@ void __init swap_setup(void) + page_cluster = 2; + else + page_cluster = 3; ++#endif /* CONFIG_CACHY */ + /* + * Right now other parts of the system means that we + * _really_ don't want to cluster much more +diff --git a/mm/util.c b/mm/util.c +index f814e6a59ab1..a84d4f4a6195 100644 +--- a/mm/util.c ++++ b/mm/util.c +@@ -858,6 +858,40 @@ static const struct ctl_table util_sysctl_table[] = { + .mode = 0644, + .proc_handler = proc_doulongvec_minmax, + }, ++ { ++ .procname = "workingset_protection", ++ .data = &sysctl_workingset_protection, ++ .maxlen = sizeof(bool), ++ .mode = 0644, ++ .proc_handler = &proc_dobool, ++ }, ++ { ++ .procname = "anon_min_ratio", ++ .data = &sysctl_anon_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_low_ratio", ++ .data = &sysctl_clean_low_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, ++ { ++ .procname = "clean_min_ratio", ++ .data = 
&sysctl_clean_min_ratio, ++ .maxlen = sizeof(u8), ++ .mode = 0644, ++ .proc_handler = &vm_workingset_protection_update_handler, ++ .extra1 = SYSCTL_ZERO, ++ .extra2 = SYSCTL_ONE_HUNDRED, ++ }, + }; + + static int __init init_vm_util_sysctls(void) +diff --git a/mm/vmpressure.c b/mm/vmpressure.c +index c197ed47bcc4..1b359dcc88c4 100644 +--- a/mm/vmpressure.c ++++ b/mm/vmpressure.c +@@ -43,7 +43,11 @@ static const unsigned long vmpressure_win = SWAP_CLUSTER_MAX * 16; + * essence, they are percents: the higher the value, the more number + * unsuccessful reclaims there were. + */ ++#ifdef CONFIG_CACHY ++static const unsigned int vmpressure_level_med = 65; ++#else + static const unsigned int vmpressure_level_med = 60; ++#endif + static const unsigned int vmpressure_level_critical = 95; + + /* +diff --git a/mm/vmscan.c b/mm/vmscan.c +index a48aec8bfd92..e2c3f8712bbb 100644 +--- a/mm/vmscan.c ++++ b/mm/vmscan.c +@@ -147,6 +147,15 @@ struct scan_control { + /* The file folios on the current node are dangerously low */ + unsigned int file_is_tiny:1; + ++ /* The anonymous pages on the current node are below vm.anon_min_ratio */ ++ unsigned int anon_below_min:1; ++ ++ /* The clean file pages on the current node are below vm.clean_low_ratio */ ++ unsigned int clean_below_low:1; ++ ++ /* The clean file pages on the current node are below vm.clean_min_ratio */ ++ unsigned int clean_below_min:1; ++ + /* Always discard instead of demoting to lower tier memory */ + unsigned int no_demotion:1; + +@@ -196,10 +205,23 @@ struct scan_control { + #define prefetchw_prev_lru_folio(_folio, _base, _field) do { } while (0) + #endif + ++bool sysctl_workingset_protection __read_mostly = true; ++u8 sysctl_anon_min_ratio __read_mostly = CONFIG_ANON_MIN_RATIO; ++u8 sysctl_clean_low_ratio __read_mostly = CONFIG_CLEAN_LOW_RATIO; ++u8 sysctl_clean_min_ratio __read_mostly = CONFIG_CLEAN_MIN_RATIO; ++static u64 sysctl_anon_min_ratio_kb __read_mostly = 0; ++static u64 sysctl_clean_low_ratio_kb __read_mostly = 0; ++static u64 sysctl_clean_min_ratio_kb __read_mostly = 0; ++static u64 workingset_protection_prev_totalram __read_mostly = 0; ++ + /* + * From 0 .. MAX_SWAPPINESS. Higher means more swappy. + */ ++#ifdef CONFIG_CACHY ++int vm_swappiness = 100; ++#else + int vm_swappiness = 60; ++#endif + + #ifdef CONFIG_MEMCG + +@@ -1157,6 +1179,10 @@ static unsigned int shrink_folio_list(struct list_head *folio_list, + if (!sc->may_unmap && folio_mapped(folio)) + goto keep_locked; + ++ if (folio_is_file_lru(folio) ? sc->clean_below_min : ++ (sc->anon_below_min && !sc->clean_below_min)) ++ goto keep_locked; ++ + /* + * The number of dirty pages determines if a node is marked + * reclaim_congested. kswapd will stall and start writing +@@ -2606,6 +2632,15 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + goto out; + } + ++ /* ++ * Force-scan anon if clean file pages is under vm.clean_low_ratio ++ * or vm.clean_min_ratio. ++ */ ++ if (sc->clean_below_low || sc->clean_below_min) { ++ scan_balance = SCAN_ANON; ++ goto out; ++ } ++ + /* + * If there is enough inactive page cache, we do not reclaim + * anything from the anonymous working right now to make sure +@@ -2664,6 +2699,14 @@ static void get_scan_count(struct lruvec *lruvec, struct scan_control *sc, + BUG(); + } + ++ /* ++ * Hard protection of the working set. ++ * Don't reclaim anon/file pages when the amount is ++ * below the watermark of the same type. ++ */ ++ if (file ? 
sc->clean_below_min : sc->anon_below_min) ++ scan = 0; ++ + nr[lru] = scan; + } + } +@@ -2684,6 +2727,96 @@ static bool can_age_anon_pages(struct lruvec *lruvec, + lruvec_memcg(lruvec)); + } + ++int vm_workingset_protection_update_handler(const struct ctl_table *table, int write, ++ void __user *buffer, size_t *lenp, loff_t *ppos) ++{ ++ int ret = proc_dou8vec_minmax(table, write, buffer, lenp, ppos); ++ if (ret || !write) ++ return ret; ++ ++ workingset_protection_prev_totalram = 0; ++ ++ return 0; ++} ++ ++static void prepare_workingset_protection(pg_data_t *pgdat, struct scan_control *sc) ++{ ++ unsigned long node_mem_total; ++ struct sysinfo i; ++ ++ if (!(sysctl_workingset_protection)) { ++ sc->anon_below_min = 0; ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ return; ++ } ++ ++ if (likely(sysctl_anon_min_ratio || ++ sysctl_clean_low_ratio || ++ sysctl_clean_min_ratio)) { ++#ifdef CONFIG_NUMA ++ si_meminfo_node(&i, pgdat->node_id); ++#else //CONFIG_NUMA ++ si_meminfo(&i); ++#endif //CONFIG_NUMA ++ node_mem_total = i.totalram; ++ ++ if (unlikely(workingset_protection_prev_totalram != node_mem_total)) { ++ sysctl_anon_min_ratio_kb = ++ node_mem_total * sysctl_anon_min_ratio / 100; ++ sysctl_clean_low_ratio_kb = ++ node_mem_total * sysctl_clean_low_ratio / 100; ++ sysctl_clean_min_ratio_kb = ++ node_mem_total * sysctl_clean_min_ratio / 100; ++ workingset_protection_prev_totalram = node_mem_total; ++ } ++ } ++ ++ /* ++ * Check the number of anonymous pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_anon_min_ratio) { ++ unsigned long reclaimable_anon; ++ ++ reclaimable_anon = ++ node_page_state(pgdat, NR_ACTIVE_ANON) + ++ node_page_state(pgdat, NR_INACTIVE_ANON) + ++ node_page_state(pgdat, NR_ISOLATED_ANON); ++ ++ sc->anon_below_min = reclaimable_anon < sysctl_anon_min_ratio_kb; ++ } else ++ sc->anon_below_min = 0; ++ ++ /* ++ * Check the number of clean file pages to protect them from ++ * reclaiming if their amount is below the specified. ++ */ ++ if (sysctl_clean_low_ratio || sysctl_clean_min_ratio) { ++ unsigned long reclaimable_file, dirty, clean; ++ ++ reclaimable_file = ++ node_page_state(pgdat, NR_ACTIVE_FILE) + ++ node_page_state(pgdat, NR_INACTIVE_FILE) + ++ node_page_state(pgdat, NR_ISOLATED_FILE); ++ dirty = node_page_state(pgdat, NR_FILE_DIRTY); ++ /* ++ * node_page_state() sum can go out of sync since ++ * all the values are not read at once. 
++ */ ++ if (likely(reclaimable_file > dirty)) ++ clean = reclaimable_file - dirty; ++ else ++ clean = 0; ++ ++ sc->clean_below_low = clean < sysctl_clean_low_ratio_kb; ++ sc->clean_below_min = clean < sysctl_clean_min_ratio_kb; ++ } else { ++ sc->clean_below_low = 0; ++ sc->clean_below_min = 0; ++ } ++} ++ + #ifdef CONFIG_LRU_GEN + + #ifdef CONFIG_LRU_GEN_ENABLED +@@ -4667,11 +4800,21 @@ static int get_tier_idx(struct lruvec *lruvec, int type) + return tier - 1; + } + +-static int get_type_to_scan(struct lruvec *lruvec, int swappiness) ++static int get_type_to_scan(struct lruvec *lruvec, struct scan_control *sc, int swappiness) + { + struct ctrl_pos sp, pv; + +- if (swappiness <= MIN_SWAPPINESS + 1) ++ if (swappiness == MIN_SWAPPINESS) ++ return LRU_GEN_FILE; ++ ++ if (sc->clean_below_min) ++ return LRU_GEN_ANON; ++ if (sc->anon_below_min) ++ return LRU_GEN_FILE; ++ if (sc->clean_below_low) ++ return LRU_GEN_ANON; ++ ++ if (swappiness == MIN_SWAPPINESS + 1) + return LRU_GEN_FILE; + + if (swappiness >= MAX_SWAPPINESS) +@@ -4691,7 +4834,7 @@ static int isolate_folios(unsigned long nr_to_scan, struct lruvec *lruvec, + int *type_scanned, struct list_head *list) + { + int i; +- int type = get_type_to_scan(lruvec, swappiness); ++ int type = get_type_to_scan(lruvec, sc, swappiness); + + for_each_evictable_type(i, swappiness) { + int scanned; +@@ -4937,6 +5080,12 @@ static int shrink_one(struct lruvec *lruvec, struct scan_control *sc) + struct mem_cgroup *memcg = lruvec_memcg(lruvec); + struct pglist_data *pgdat = lruvec_pgdat(lruvec); + ++ prepare_workingset_protection(pgdat, sc); ++ ++ if (sysctl_workingset_protection && sc->clean_below_min && ++ !can_reclaim_anon_pages(memcg, pgdat->node_id, sc)) ++ return 0; ++ + /* lru_gen_age_node() called mem_cgroup_calculate_protection() */ + if (mem_cgroup_below_min(NULL, memcg)) + return MEMCG_LRU_YOUNG; +@@ -6089,6 +6238,8 @@ static void shrink_node(pg_data_t *pgdat, struct scan_control *sc) + + prepare_scan_control(pgdat, sc); + ++ prepare_workingset_protection(pgdat, sc); ++ + shrink_node_memcgs(pgdat, sc); + + flush_reclaim_state(sc); +diff --git a/scripts/Makefile.thinlto b/scripts/Makefile.thinlto +new file mode 100644 +index 000000000000..ec98fa2ead3b +--- /dev/null ++++ b/scripts/Makefile.thinlto +@@ -0,0 +1,38 @@ ++PHONY := __default ++__default: ++ ++include include/config/auto.conf ++include $(srctree)/scripts/Kbuild.include ++include $(srctree)/scripts/Makefile.lib ++ ++native-objs := $(patsubst %.o,%.thinlto-native.o,$(call read-file, vmlinux.thinlto-index)) ++ ++__default: $(native-objs) ++ ++# Generate .thinlto-native.o (obj) from .o (bitcode) and .thinlto.bc (summary) files ++# --------------------------------------------------------------------------- ++quiet_cmd_cc_o_bc = CC $(quiet_modtag) $@ ++ cmd_cc_o_bc = \ ++ $(CC) $(_c_flags) -fno-lto -Wno-unused-command-line-argument \ ++ -fthinlto-index=$(word 2, $^) -c -o $@ $< ++ ++targets += $(native-objs) ++$(native-objs): %.thinlto-native.o: %.o %.o.thinlto.bc FORCE ++ $(call if_changed,cc_o_bc) ++ ++# Add FORCE to the prerequisites of a target to force it to be always rebuilt. ++# --------------------------------------------------------------------------- ++ ++PHONY += FORCE ++FORCE: ++ ++# Read all saved command lines and dependencies for the $(targets) we ++# may be building above, using $(if_changed{,_dep}). As an ++# optimization, we don't need to read them if the target does not ++# exist, we will rebuild anyway in that case. 
++ ++existing-targets := $(wildcard $(sort $(targets))) ++ ++-include $(foreach f,$(existing-targets),$(dir $(f)).$(notdir $(f)).cmd) ++ ++.PHONY: $(PHONY) +diff --git a/scripts/Makefile.vmlinux_a b/scripts/Makefile.vmlinux_a +new file mode 100644 +index 000000000000..73c9545de7cf +--- /dev/null ++++ b/scripts/Makefile.vmlinux_a +@@ -0,0 +1,83 @@ ++# SPDX-License-Identifier: GPL-2.0-only ++ ++PHONY := __default ++__default: vmlinux.a ++ ++include include/config/auto.conf ++include $(srctree)/scripts/Kbuild.include ++include $(srctree)/scripts/Makefile.lib ++ ++# Link of built-in-fixup.a ++# --------------------------------------------------------------------------- ++ ++# '$(AR) mPi' needs 'T' to workaround the bug of llvm-ar <= 14 ++quiet_cmd_ar_builtin_fixup = AR $@ ++ cmd_ar_builtin_fixup = \ ++ rm -f $@; \ ++ $(AR) cDPrST $@ $(KBUILD_VMLINUX_OBJS); \ ++ $(AR) mPiT $$($(AR) t $@ | sed -n 1p) $@ $$($(AR) t $@ | grep -F -f $(srctree)/scripts/head-object-list.txt) ++ ++targets += built-in-fixup.a ++built-in-fixup.a: $(KBUILD_VMLINUX_OBJS) scripts/head-object-list.txt FORCE ++ $(call if_changed,ar_builtin_fixup) ++ ++ifdef CONFIG_LTO_CLANG_THIN_DIST ++ ++quiet_cmd_builtin.order = GEN $@ ++ cmd_builtin.order = $(AR) t $< > $@ ++ ++targets += builtin.order ++builtin.order: built-in-fixup.a FORCE ++ $(call if_changed,builtin.order) ++ ++quiet_cmd_ld_thinlto_index = LD $@ ++ cmd_ld_thinlto_index = \ ++ $(LD) $(KBUILD_LDFLAGS) -r --thinlto-index-only=$@ @$< ++ ++targets += vmlinux.thinlto-index ++vmlinux.thinlto-index: builtin.order FORCE ++ $(call if_changed,ld_thinlto_index) ++ ++quiet_cmd_ar_vmlinux.a = GEN $@ ++ cmd_ar_vmlinux.a = \ ++ rm -f $@; \ ++ while read -r obj; do \ ++ if grep -q $${obj} $(word 2, $^); then \ ++ echo $${obj%.o}.thinlto-native.o; \ ++ else \ ++ echo $${obj}; \ ++ fi; \ ++ done < $< | xargs $(AR) cDPrS $@ ++ ++targets += vmlinux.a ++vmlinux.a: builtin.order vmlinux.thinlto-index FORCE ++ $(Q)$(MAKE) -f $(srctree)/scripts/Makefile.thinlto ++ $(call if_changed,ar_vmlinux.a) ++ ++else ++ ++# vmlinux.a ++# --------------------------------------------------------------------------- ++ ++targets += vmlinux.a ++vmlinux.a: built-in-fixup.a FORCE ++ $(call if_changed,copy) ++ ++endif ++ ++# Add FORCE to the prerequisites of a target to force it to be always rebuilt. ++# --------------------------------------------------------------------------- ++ ++PHONY += FORCE ++FORCE: ++ ++# Read all saved command lines and dependencies for the $(targets) we ++# may be building above, using $(if_changed{,_dep}). As an ++# optimization, we don't need to read them if the target does not ++# exist, we will rebuild anyway in that case. 
++ ++existing-targets := $(wildcard $(sort $(targets))) ++ ++-include $(foreach f,$(existing-targets),$(dir $(f)).$(notdir $(f)).cmd) ++ ++.PHONY: $(PHONY) +diff --git a/scripts/mod/modpost.c b/scripts/mod/modpost.c +index 5ca7c268294e..8b01746c9ce6 100644 +--- a/scripts/mod/modpost.c ++++ b/scripts/mod/modpost.c +@@ -1473,13 +1473,22 @@ static void extract_crcs_for_object(const char *object, struct module *mod) + char cmd_file[PATH_MAX]; + char *buf, *p; + const char *base; +- int dirlen, ret; ++ int dirlen, baselen_without_suffix, ret; + + base = get_basename(object); + dirlen = base - object; + +- ret = snprintf(cmd_file, sizeof(cmd_file), "%.*s.%s.cmd", +- dirlen, object, base); ++ baselen_without_suffix = strlen(object) - dirlen - strlen(".o"); ++ ++ /* ++ * When CONFIG_LTO_CLANG_THIN_DIST=y, the ELF is *.thinlto-native.o ++ * but the symbol CRCs are recorded in *.o.cmd file. ++ */ ++ if (strends(object, ".thinlto-native.o")) ++ baselen_without_suffix -= strlen(".thinlto-native"); ++ ++ ret = snprintf(cmd_file, sizeof(cmd_file), "%.*s.%.*s.o.cmd", ++ dirlen, object, baselen_without_suffix, base); + if (ret >= sizeof(cmd_file)) { + error("%s: too long path was truncated\n", cmd_file); + return; +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0004-fixes.patch b/sys-kernel/git-sources/0004-fixes.patch new file mode 100644 index 0000000..1f68361 --- /dev/null +++ b/sys-kernel/git-sources/0004-fixes.patch @@ -0,0 +1,107 @@ +From 3a2358a5db595bd3797db3e5d65cd01863f42b94 Mon Sep 17 00:00:00 2001 +From: Eric Naim +Date: Mon, 1 Sep 2025 09:38:55 +0800 +Subject: [PATCH 4/4] fixes + +Signed-off-by: Eric Naim +--- + drivers/gpu/drm/drm_atomic_uapi.c | 23 ++++++++++++----------- + include/linux/btf.h | 2 +- + net/ipv4/route.c | 7 ++++++- + scripts/package/PKGBUILD | 5 +++++ + 4 files changed, 24 insertions(+), 13 deletions(-) + +diff --git a/drivers/gpu/drm/drm_atomic_uapi.c b/drivers/gpu/drm/drm_atomic_uapi.c +index ecc73d52bfae..85dbdaa4a2e2 100644 +--- a/drivers/gpu/drm/drm_atomic_uapi.c ++++ b/drivers/gpu/drm/drm_atomic_uapi.c +@@ -1078,19 +1078,20 @@ int drm_atomic_set_property(struct drm_atomic_state *state, + } + + if (async_flip) { +- /* check if the prop does a nop change */ +- if ((prop != config->prop_fb_id && +- prop != config->prop_in_fence_fd && +- prop != config->prop_fb_damage_clips)) { +- ret = drm_atomic_plane_get_property(plane, plane_state, +- prop, &old_val); +- ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); +- } ++ /* no-op changes are always allowed */ ++ ret = drm_atomic_plane_get_property(plane, plane_state, ++ prop, &old_val); ++ ret = drm_atomic_check_prop_changes(ret, old_val, prop_value, prop); + +- /* ask the driver if this non-primary plane is supported */ +- if (plane->type != DRM_PLANE_TYPE_PRIMARY) { +- ret = -EINVAL; ++ /* fail everything that isn't no-op or a pure flip */ ++ if (ret && prop != config->prop_fb_id && ++ prop != config->prop_in_fence_fd && ++ prop != config->prop_fb_damage_clips) { ++ break; ++ } + ++ if (ret && plane->type != DRM_PLANE_TYPE_PRIMARY) { ++ /* ask the driver if this non-primary plane is supported */ + if (plane_funcs && plane_funcs->atomic_async_check) + ret = plane_funcs->atomic_async_check(plane, state, true); + +diff --git a/include/linux/btf.h b/include/linux/btf.h +index 9eda6b113f9b..f06976ffb63f 100644 +--- a/include/linux/btf.h ++++ b/include/linux/btf.h +@@ -86,7 +86,7 @@ + * as to avoid issues such as the compiler inlining or eliding either a static + * kfunc, or a global kfunc in an LTO build. 
+ */ +-#define __bpf_kfunc __used __retain noinline ++#define __bpf_kfunc __used __retain __noclone noinline + + #define __bpf_kfunc_start_defs() \ + __diag_push(); \ +diff --git a/net/ipv4/route.c b/net/ipv4/route.c +index baa43e5966b1..05a5d185807a 100644 +--- a/net/ipv4/route.c ++++ b/net/ipv4/route.c +@@ -2592,6 +2592,7 @@ static struct rtable *__mkroute_output(const struct fib_result *res, + do_cache = true; + if (type == RTN_BROADCAST) { + flags |= RTCF_BROADCAST | RTCF_LOCAL; ++ fi = NULL; + } else if (type == RTN_MULTICAST) { + flags |= RTCF_MULTICAST | RTCF_LOCAL; + if (!ip_check_mc_rcu(in_dev, fl4->daddr, fl4->saddr, +@@ -2661,8 +2662,12 @@ static struct rtable *__mkroute_output(const struct fib_result *res, + rth->dst.output = ip_mc_output; + RT_CACHE_STAT_INC(out_slow_mc); + } ++ if (type == RTN_BROADCAST && res->fi) { ++ /* ensure MTU value for broadcast routes is retained */ ++ ip_dst_init_metrics(&rth->dst, res->fi->fib_metrics); ++ } + #ifdef CONFIG_IP_MROUTE +- if (type == RTN_MULTICAST) { ++ else if (type == RTN_MULTICAST) { + if (IN_DEV_MFORWARD(in_dev) && + !ipv4_is_local_multicast(fl4->daddr)) { + rth->dst.input = ip_mr_input; +diff --git a/scripts/package/PKGBUILD b/scripts/package/PKGBUILD +index 452374d63c24..08f80d7c5df0 100644 +--- a/scripts/package/PKGBUILD ++++ b/scripts/package/PKGBUILD +@@ -90,6 +90,11 @@ _package-headers() { + "${srctree}/scripts/package/install-extmod-build" "${builddir}" + fi + ++ # required when DEBUG_INFO_BTF_MODULES is enabled ++ if [ -f tools/bpf/resolve_btfids/resolve_btfids ]; then ++ install -Dt "$builddir/tools/bpf/resolve_btfids" tools/bpf/resolve_btfids/resolve_btfids ++ fi ++ + echo "Installing System.map and config..." + mkdir -p "${builddir}" + cp System.map "${builddir}/System.map" +-- +2.51.0 + diff --git a/sys-kernel/git-sources/0005-sched-ext.patch b/sys-kernel/git-sources/0005-sched-ext.patch deleted file mode 100644 index e14973c..0000000 --- a/sys-kernel/git-sources/0005-sched-ext.patch +++ /dev/null @@ -1,21992 +0,0 @@ -From a202d5f9500a682f40f4ba89dfeeae27177a56af Mon Sep 17 00:00:00 2001 -From: Piotr Gorski -Date: Fri, 7 Jun 2024 08:41:45 +0200 -Subject: [PATCH] sched-ext - -Signed-off-by: Piotr Gorski ---- - .github/workflows/run-schedulers | 73 + - .github/workflows/sched-ext.config | 34 + - .github/workflows/test-kernel.yml | 56 + - .gitignore | 3 + - Documentation/bpf/libbpf/libbpf_overview.rst | 8 + - Documentation/bpf/standardization/abi.rst | 3 + - .../bpf/standardization/instruction-set.rst | 261 +- - Documentation/scheduler/index.rst | 1 + - Documentation/scheduler/sched-ext.rst | 314 + - MAINTAINERS | 13 + - Makefile | 8 +- - arch/riscv/Kconfig | 12 + - arch/riscv/net/bpf_jit.h | 51 + - arch/riscv/net/bpf_jit_comp32.c | 3 +- - arch/riscv/net/bpf_jit_comp64.c | 21 +- - drivers/isdn/mISDN/dsp_blowfish.c | 5 - - drivers/net/ethernet/8390/ne2k-pci.c | 11 - - drivers/net/ethernet/adaptec/starfire.c | 8 - - .../net/ethernet/cavium/liquidio/lio_main.c | 6 - - .../ethernet/cavium/liquidio/octeon_droq.c | 5 - - drivers/net/ethernet/mellanox/mlx4/main.c | 6 - - drivers/net/usb/lan78xx.c | 5 - - drivers/net/usb/smsc75xx.c | 5 - - drivers/tty/sysrq.c | 1 + - include/asm-generic/vmlinux.lds.h | 1 + - include/linux/bpf.h | 13 +- - include/linux/cgroup-defs.h | 8 + - include/linux/cgroup.h | 5 +- - include/linux/filter.h | 2 +- - include/linux/sched.h | 5 + - include/linux/sched/ext.h | 210 + - include/linux/sched/task.h | 3 +- - include/linux/skbuff.h | 68 +- - include/net/inet_frag.h | 4 +- - 
include/trace/events/sched_ext.h | 32 + - include/uapi/linux/bpf.h | 15 +- - include/uapi/linux/sched.h | 1 + - init/Kconfig | 5 + - init/init_task.c | 12 + - kernel/Kconfig.preempt | 24 +- - kernel/bpf/bpf_local_storage.c | 4 +- - kernel/bpf/bpf_struct_ops.c | 75 +- - kernel/bpf/helpers.c | 119 + - kernel/bpf/syscall.c | 34 +- - kernel/cgroup/cgroup.c | 97 +- - kernel/fork.c | 17 +- - kernel/sched/build_policy.c | 9 + - kernel/sched/core.c | 316 +- - kernel/sched/cpufreq_schedutil.c | 50 +- - kernel/sched/debug.c | 3 + - kernel/sched/ext.c | 6973 +++++++++++++++++ - kernel/sched/ext.h | 143 + - kernel/sched/fair.c | 21 +- - kernel/sched/idle.c | 2 + - kernel/sched/sched.h | 116 +- - lib/dump_stack.c | 1 + - lib/test_bpf.c | 1 + - net/bpf/bpf_dummy_struct_ops.c | 4 +- - net/bridge/netfilter/nf_conntrack_bridge.c | 6 +- - net/core/dev.c | 2 +- - net/core/filter.c | 62 +- - net/core/sock.c | 19 +- - net/ieee802154/6lowpan/reassembly.c | 2 +- - net/ipv4/bpf_tcp_ca.c | 6 +- - net/ipv4/inet_fragment.c | 2 +- - net/ipv4/ip_fragment.c | 2 +- - net/ipv4/ip_output.c | 14 +- - net/ipv4/raw.c | 2 +- - net/ipv4/tcp_ipv4.c | 2 + - net/ipv4/tcp_output.c | 14 +- - net/ipv6/ip6_output.c | 11 +- - net/ipv6/netfilter.c | 6 +- - net/ipv6/netfilter/nf_conntrack_reasm.c | 2 +- - net/ipv6/raw.c | 2 +- - net/ipv6/reassembly.c | 2 +- - net/ipv6/tcp_ipv6.c | 12 +- - net/netfilter/nf_conntrack_bpf.c | 68 +- - net/packet/af_packet.c | 7 +- - net/sched/act_bpf.c | 4 +- - net/sched/cls_bpf.c | 4 +- - samples/bpf/cpustat_kern.c | 3 +- - scripts/Makefile.btf | 4 +- - tools/Makefile | 10 +- - .../bpf/bpftool/Documentation/bpftool-btf.rst | 6 +- - tools/bpf/bpftool/Makefile | 3 +- - tools/bpf/bpftool/bash-completion/bpftool | 3 + - tools/bpf/bpftool/btf.c | 138 +- - tools/bpf/bpftool/common.c | 2 +- - tools/bpf/bpftool/skeleton/pid_iter.bpf.c | 7 +- - tools/bpf/bpftool/skeleton/profiler.bpf.c | 14 +- - tools/include/uapi/linux/bpf.h | 15 +- - tools/lib/bpf/libbpf.c | 25 +- - tools/lib/bpf/libbpf.h | 5 +- - tools/lib/bpf/libbpf_internal.h | 10 +- - tools/sched_ext/.gitignore | 2 + - tools/sched_ext/Makefile | 246 + - tools/sched_ext/README.md | 270 + - .../sched_ext/include/bpf-compat/gnu/stubs.h | 11 + - tools/sched_ext/include/scx/common.bpf.h | 349 + - tools/sched_ext/include/scx/common.h | 71 + - tools/sched_ext/include/scx/compat.bpf.h | 120 + - tools/sched_ext/include/scx/compat.h | 208 + - tools/sched_ext/include/scx/user_exit_info.h | 111 + - tools/sched_ext/scx_central.bpf.c | 362 + - tools/sched_ext/scx_central.c | 135 + - tools/sched_ext/scx_flatcg.bpf.c | 939 +++ - tools/sched_ext/scx_flatcg.c | 233 + - tools/sched_ext/scx_flatcg.h | 51 + - tools/sched_ext/scx_qmap.bpf.c | 728 ++ - tools/sched_ext/scx_qmap.c | 154 + - tools/sched_ext/scx_show_state.py | 39 + - tools/sched_ext/scx_simple.bpf.c | 157 + - tools/sched_ext/scx_simple.c | 107 + - .../bpf/bpf_test_no_cfi/bpf_test_no_cfi.c | 4 +- - .../selftests/bpf/bpf_testmod/bpf_testmod.c | 6 +- - tools/testing/selftests/bpf/config | 1 + - tools/testing/selftests/bpf/network_helpers.c | 32 +- - tools/testing/selftests/bpf/network_helpers.h | 8 +- - .../testing/selftests/bpf/prog_tests/bpf_nf.c | 7 + - .../selftests/bpf/prog_tests/bpf_tcp_ca.c | 92 +- - .../bpf/prog_tests/bpf_verif_scale.c | 6 - - .../selftests/bpf/prog_tests/ctx_rewrite.c | 10 +- - .../bpf/prog_tests/sockopt_inherit.c | 2 +- - .../selftests/bpf/prog_tests/tc_redirect.c | 3 - - .../bpf/prog_tests/test_struct_ops_module.c | 57 + - .../selftests/bpf/prog_tests/verifier.c | 2 + - 
.../bpf/progs/bpf_iter_bpf_array_map.c | 6 - - .../bpf/progs/bpf_iter_bpf_percpu_array_map.c | 6 - - .../selftests/bpf/progs/struct_ops_detach.c | 10 + - .../testing/selftests/bpf/progs/test_bpf_nf.c | 108 + - .../selftests/bpf/progs/test_sockmap_kern.h | 20 +- - .../selftests/bpf/progs/test_tc_dtime.c | 39 +- - .../selftests/bpf/progs/verifier_bits_iter.c | 153 + - tools/testing/selftests/bpf/test_sockmap.c | 136 +- - .../bpf/test_tcp_check_syncookie_user.c | 4 +- - tools/testing/selftests/bpf/test_verifier.c | 5 - - tools/testing/selftests/sched_ext/.gitignore | 6 + - tools/testing/selftests/sched_ext/Makefile | 218 + - tools/testing/selftests/sched_ext/config | 9 + - .../selftests/sched_ext/create_dsq.bpf.c | 58 + - .../testing/selftests/sched_ext/create_dsq.c | 57 + - .../sched_ext/ddsp_bogus_dsq_fail.bpf.c | 42 + - .../selftests/sched_ext/ddsp_bogus_dsq_fail.c | 57 + - .../sched_ext/ddsp_vtimelocal_fail.bpf.c | 39 + - .../sched_ext/ddsp_vtimelocal_fail.c | 56 + - .../selftests/sched_ext/dsp_local_on.bpf.c | 65 + - .../selftests/sched_ext/dsp_local_on.c | 58 + - .../sched_ext/enq_last_no_enq_fails.bpf.c | 21 + - .../sched_ext/enq_last_no_enq_fails.c | 60 + - .../sched_ext/enq_select_cpu_fails.bpf.c | 43 + - .../sched_ext/enq_select_cpu_fails.c | 61 + - tools/testing/selftests/sched_ext/exit.bpf.c | 84 + - tools/testing/selftests/sched_ext/exit.c | 55 + - tools/testing/selftests/sched_ext/exit_test.h | 20 + - .../testing/selftests/sched_ext/hotplug.bpf.c | 61 + - tools/testing/selftests/sched_ext/hotplug.c | 168 + - .../selftests/sched_ext/hotplug_test.h | 15 + - .../sched_ext/init_enable_count.bpf.c | 53 + - .../selftests/sched_ext/init_enable_count.c | 166 + - .../testing/selftests/sched_ext/maximal.bpf.c | 164 + - tools/testing/selftests/sched_ext/maximal.c | 51 + - .../selftests/sched_ext/maybe_null.bpf.c | 26 + - .../testing/selftests/sched_ext/maybe_null.c | 40 + - .../selftests/sched_ext/maybe_null_fail.bpf.c | 25 + - .../testing/selftests/sched_ext/minimal.bpf.c | 21 + - tools/testing/selftests/sched_ext/minimal.c | 58 + - .../selftests/sched_ext/prog_run.bpf.c | 32 + - tools/testing/selftests/sched_ext/prog_run.c | 78 + - .../testing/selftests/sched_ext/reload_loop.c | 75 + - tools/testing/selftests/sched_ext/runner.c | 201 + - tools/testing/selftests/sched_ext/scx_test.h | 131 + - .../selftests/sched_ext/select_cpu_dfl.bpf.c | 40 + - .../selftests/sched_ext/select_cpu_dfl.c | 72 + - .../sched_ext/select_cpu_dfl_nodispatch.bpf.c | 89 + - .../sched_ext/select_cpu_dfl_nodispatch.c | 72 + - .../sched_ext/select_cpu_dispatch.bpf.c | 41 + - .../selftests/sched_ext/select_cpu_dispatch.c | 70 + - .../select_cpu_dispatch_bad_dsq.bpf.c | 37 + - .../sched_ext/select_cpu_dispatch_bad_dsq.c | 56 + - .../select_cpu_dispatch_dbl_dsp.bpf.c | 38 + - .../sched_ext/select_cpu_dispatch_dbl_dsp.c | 56 + - .../sched_ext/select_cpu_vtime.bpf.c | 92 + - .../selftests/sched_ext/select_cpu_vtime.c | 59 + - .../selftests/sched_ext/test_example.c | 49 + - tools/testing/selftests/sched_ext/util.c | 71 + - tools/testing/selftests/sched_ext/util.h | 13 + - 186 files changed, 17378 insertions(+), 663 deletions(-) - create mode 100755 .github/workflows/run-schedulers - create mode 100644 .github/workflows/sched-ext.config - create mode 100644 .github/workflows/test-kernel.yml - create mode 100644 Documentation/scheduler/sched-ext.rst - create mode 100644 include/linux/sched/ext.h - create mode 100644 include/trace/events/sched_ext.h - create mode 100644 kernel/sched/ext.c - create mode 100644 
kernel/sched/ext.h - create mode 100644 tools/sched_ext/.gitignore - create mode 100644 tools/sched_ext/Makefile - create mode 100644 tools/sched_ext/README.md - create mode 100644 tools/sched_ext/include/bpf-compat/gnu/stubs.h - create mode 100644 tools/sched_ext/include/scx/common.bpf.h - create mode 100644 tools/sched_ext/include/scx/common.h - create mode 100644 tools/sched_ext/include/scx/compat.bpf.h - create mode 100644 tools/sched_ext/include/scx/compat.h - create mode 100644 tools/sched_ext/include/scx/user_exit_info.h - create mode 100644 tools/sched_ext/scx_central.bpf.c - create mode 100644 tools/sched_ext/scx_central.c - create mode 100644 tools/sched_ext/scx_flatcg.bpf.c - create mode 100644 tools/sched_ext/scx_flatcg.c - create mode 100644 tools/sched_ext/scx_flatcg.h - create mode 100644 tools/sched_ext/scx_qmap.bpf.c - create mode 100644 tools/sched_ext/scx_qmap.c - create mode 100644 tools/sched_ext/scx_show_state.py - create mode 100644 tools/sched_ext/scx_simple.bpf.c - create mode 100644 tools/sched_ext/scx_simple.c - create mode 100644 tools/testing/selftests/bpf/progs/struct_ops_detach.c - create mode 100644 tools/testing/selftests/bpf/progs/verifier_bits_iter.c - create mode 100644 tools/testing/selftests/sched_ext/.gitignore - create mode 100644 tools/testing/selftests/sched_ext/Makefile - create mode 100644 tools/testing/selftests/sched_ext/config - create mode 100644 tools/testing/selftests/sched_ext/create_dsq.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/create_dsq.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c - create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/dsp_local_on.c - create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c - create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/enq_select_cpu_fails.c - create mode 100644 tools/testing/selftests/sched_ext/exit.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/exit.c - create mode 100644 tools/testing/selftests/sched_ext/exit_test.h - create mode 100644 tools/testing/selftests/sched_ext/hotplug.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/hotplug.c - create mode 100644 tools/testing/selftests/sched_ext/hotplug_test.h - create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/init_enable_count.c - create mode 100644 tools/testing/selftests/sched_ext/maximal.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maximal.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null.c - create mode 100644 tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/minimal.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/minimal.c - create mode 100644 tools/testing/selftests/sched_ext/prog_run.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/prog_run.c - create mode 100644 
tools/testing/selftests/sched_ext/reload_loop.c - create mode 100644 tools/testing/selftests/sched_ext/runner.c - create mode 100644 tools/testing/selftests/sched_ext/scx_test.h - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c - create mode 100644 tools/testing/selftests/sched_ext/select_cpu_vtime.c - create mode 100644 tools/testing/selftests/sched_ext/test_example.c - create mode 100644 tools/testing/selftests/sched_ext/util.c - create mode 100644 tools/testing/selftests/sched_ext/util.h - -diff --git a/.github/workflows/run-schedulers b/.github/workflows/run-schedulers -new file mode 100755 -index 000000000000..fc1f92270f59 ---- /dev/null -+++ b/.github/workflows/run-schedulers -@@ -0,0 +1,73 @@ -+#!/bin/bash -+# -+# Run sched-ext scheduler for TIMEOUT seconds inside virtme-ng and catch -+# potential errors, then unload the scheduler and return the exit status. -+ -+# Maximum time for each scheduler run. -+TEST_TIMEOUT=30 -+ -+# Maximum timeout for the guest used for each scheduler run (this is used to -+# hard-shutdown the guest in case of system hangs). -+GUEST_TIMEOUT=60 -+ -+# Check if virtme-ng is available. -+if [ ! -x `which vng` ]; then -+ echo "vng not found, please install virtme-ng to enable testing" -+ exit 1 -+fi -+ -+function runtest() { -+ local bin="${1}" -+ -+ if [ -z "${bin}" ]; then -+ echo "No binary passed to runtest" -+ exit 1 -+ fi -+ -+ if ! [ -f "${bin}" ]; then -+ echo "Binary ${bin} was not a regular file" -+ exit 1 -+ fi -+ -+ rm -f /tmp/output -+ (timeout --foreground --preserve-status ${GUEST_TIMEOUT} \ -+ vng --force-9p --verbose -- \ -+ "timeout --foreground --preserve-status ${TEST_TIMEOUT} ${bin}" \ -+ 2>&1 `_ -+`RFC8174 `_ -+when, and only when, they appear in all capitals, as shown here. -+ - For brevity and consistency, this document refers to families - of types using a shorthand syntax and refers to several expository, - mnemonic functions when describing the semantics of instructions. -@@ -25,7 +32,7 @@ Types - This document refers to integer types with the notation `SN` to specify - a type's signedness (`S`) and bit width (`N`), respectively. - --.. table:: Meaning of signedness notation. -+.. table:: Meaning of signedness notation - - ==== ========= - S Meaning -@@ -34,7 +41,7 @@ a type's signedness (`S`) and bit width (`N`), respectively. - s signed - ==== ========= - --.. table:: Meaning of bit-width notation. -+.. table:: Meaning of bit-width notation - - ===== ========= - N Bit width -@@ -106,9 +113,9 @@ Conformance groups - - An implementation does not need to support all instructions specified in this - document (e.g., deprecated instructions). Instead, a number of conformance --groups are specified. 
An implementation must support the base32 conformance --group and may support additional conformance groups, where supporting a --conformance group means it must support all instructions in that conformance -+groups are specified. An implementation MUST support the base32 conformance -+group and MAY support additional conformance groups, where supporting a -+conformance group means it MUST support all instructions in that conformance - group. - - The use of named conformance groups enables interoperability between a runtime -@@ -209,7 +216,7 @@ For example:: - 07 1 0 00 00 11 22 33 44 r1 += 0x11223344 // big - - Note that most instructions do not use all of the fields. --Unused fields shall be cleared to zero. -+Unused fields SHALL be cleared to zero. - - Wide instruction encoding - -------------------------- -@@ -256,18 +263,20 @@ Instruction classes - - The three least significant bits of the 'opcode' field store the instruction class: - --===== ===== =============================== =================================== --class value description reference --===== ===== =============================== =================================== --LD 0x0 non-standard load operations `Load and store instructions`_ --LDX 0x1 load into register operations `Load and store instructions`_ --ST 0x2 store from immediate operations `Load and store instructions`_ --STX 0x3 store from register operations `Load and store instructions`_ --ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_ --JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_ --JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_ --ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_ --===== ===== =============================== =================================== -+.. table:: Instruction class -+ -+ ===== ===== =============================== =================================== -+ class value description reference -+ ===== ===== =============================== =================================== -+ LD 0x0 non-standard load operations `Load and store instructions`_ -+ LDX 0x1 load into register operations `Load and store instructions`_ -+ ST 0x2 store from immediate operations `Load and store instructions`_ -+ STX 0x3 store from register operations `Load and store instructions`_ -+ ALU 0x4 32-bit arithmetic operations `Arithmetic and jump instructions`_ -+ JMP 0x5 64-bit jump operations `Arithmetic and jump instructions`_ -+ JMP32 0x6 32-bit jump operations `Arithmetic and jump instructions`_ -+ ALU64 0x7 64-bit arithmetic operations `Arithmetic and jump instructions`_ -+ ===== ===== =============================== =================================== - - Arithmetic and jump instructions - ================================ -@@ -285,12 +294,14 @@ For arithmetic and jump instructions (``ALU``, ``ALU64``, ``JMP`` and - **s (source)** - the source operand location, which unless otherwise specified is one of: - -- ====== ===== ============================================== -- source value description -- ====== ===== ============================================== -- K 0 use 32-bit 'imm' value as source operand -- X 1 use 'src_reg' register value as source operand -- ====== ===== ============================================== -+ .. 
table:: Source operand location -+ -+ ====== ===== ============================================== -+ source value description -+ ====== ===== ============================================== -+ K 0 use 32-bit 'imm' value as source operand -+ X 1 use 'src_reg' register value as source operand -+ ====== ===== ============================================== - - **instruction class** - the instruction class (see `Instruction classes`_) -@@ -305,27 +316,29 @@ The 'code' field encodes the operation as below, where 'src' refers to the - the source operand and 'dst' refers to the value of the destination - register. - --===== ===== ======= ========================================================== --name code offset description --===== ===== ======= ========================================================== --ADD 0x0 0 dst += src --SUB 0x1 0 dst -= src --MUL 0x2 0 dst \*= src --DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0 --SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0 --OR 0x4 0 dst \|= src --AND 0x5 0 dst &= src --LSH 0x6 0 dst <<= (src & mask) --RSH 0x7 0 dst >>= (src & mask) --NEG 0x8 0 dst = -dst --MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst --SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst --XOR 0xa 0 dst ^= src --MOV 0xb 0 dst = src --MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src --ARSH 0xc 0 :term:`sign extending` dst >>= (src & mask) --END 0xd 0 byte swap operations (see `Byte swap instructions`_ below) --===== ===== ======= ========================================================== -+.. table:: Arithmetic instructions -+ -+ ===== ===== ======= ========================================================== -+ name code offset description -+ ===== ===== ======= ========================================================== -+ ADD 0x0 0 dst += src -+ SUB 0x1 0 dst -= src -+ MUL 0x2 0 dst \*= src -+ DIV 0x3 0 dst = (src != 0) ? (dst / src) : 0 -+ SDIV 0x3 1 dst = (src != 0) ? (dst s/ src) : 0 -+ OR 0x4 0 dst \|= src -+ AND 0x5 0 dst &= src -+ LSH 0x6 0 dst <<= (src & mask) -+ RSH 0x7 0 dst >>= (src & mask) -+ NEG 0x8 0 dst = -dst -+ MOD 0x9 0 dst = (src != 0) ? (dst % src) : dst -+ SMOD 0x9 1 dst = (src != 0) ? (dst s% src) : dst -+ XOR 0xa 0 dst ^= src -+ MOV 0xb 0 dst = src -+ MOVSX 0xb 8/16/32 dst = (s8,s16,s32)src -+ ARSH 0xc 0 :term:`sign extending` dst >>= (src & mask) -+ END 0xd 0 byte swap operations (see `Byte swap instructions`_ below) -+ ===== ===== ======= ========================================================== - - Underflow and overflow are allowed during arithmetic operations, meaning - the 64-bit or 32-bit value will wrap. If BPF program execution would -@@ -374,7 +387,7 @@ interpreted as a 64-bit signed value. - Note that there are varying definitions of the signed modulo operation - when the dividend or divisor are negative, where implementations often - vary by language such that Python, Ruby, etc. differ from C, Go, Java, --etc. This specification requires that signed modulo use truncated division -+etc. This specification requires that signed modulo MUST use truncated division - (where -13 % 3 == -1) as implemented in C, Go, etc.:: - - a % n = a - n * trunc(a / n) -@@ -386,6 +399,19 @@ The ``MOVSX`` instruction does a move operation with sign extension. - operands into 64-bit operands. Unlike other arithmetic instructions, - ``MOVSX`` is only defined for register source operands (``X``). 
- -+``{MOV, K, ALU64}`` means:: -+ -+ dst = (s64)imm -+ -+``{MOV, X, ALU}`` means:: -+ -+ dst = (u32)src -+ -+``{MOVSX, X, ALU}`` with 'offset' 8 means:: -+ -+ dst = (u32)(s32)(s8)src -+ -+ - The ``NEG`` instruction is only defined when the source bit is clear - (``K``). - -@@ -404,15 +430,17 @@ only and do not use a separate source register or immediate value. - For ``ALU``, the 1-bit source operand field in the opcode is used to - select what byte order the operation converts from or to. For - ``ALU64``, the 1-bit source operand field in the opcode is reserved --and must be set to 0. -+and MUST be set to 0. - --===== ======== ===== ================================================= --class source value description --===== ======== ===== ================================================= --ALU TO_LE 0 convert between host byte order and little endian --ALU TO_BE 1 convert between host byte order and big endian --ALU64 Reserved 0 do byte swap unconditionally --===== ======== ===== ================================================= -+.. table:: Byte swap instructions -+ -+ ===== ======== ===== ================================================= -+ class source value description -+ ===== ======== ===== ================================================= -+ ALU TO_LE 0 convert between host byte order and little endian -+ ALU TO_BE 1 convert between host byte order and big endian -+ ALU64 Reserved 0 do byte swap unconditionally -+ ===== ======== ===== ================================================= - - The 'imm' field encodes the width of the swap operations. The following widths - are supported: 16, 32 and 64. Width 64 operations belong to the base64 -@@ -448,27 +476,29 @@ otherwise identical operations, and indicates the base64 conformance - group unless otherwise specified. - The 'code' field encodes the operation as below: - --======== ===== ======= ================================= =================================================== --code value src_reg description notes --======== ===== ======= ================================= =================================================== --JA 0x0 0x0 PC += offset {JA, K, JMP} only --JA 0x0 0x0 PC += imm {JA, K, JMP32} only --JEQ 0x1 any PC += offset if dst == src --JGT 0x2 any PC += offset if dst > src unsigned --JGE 0x3 any PC += offset if dst >= src unsigned --JSET 0x4 any PC += offset if dst & src --JNE 0x5 any PC += offset if dst != src --JSGT 0x6 any PC += offset if dst > src signed --JSGE 0x7 any PC += offset if dst >= src signed --CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ --CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ --CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ --EXIT 0x9 0x0 return {CALL, K, JMP} only --JLT 0xa any PC += offset if dst < src unsigned --JLE 0xb any PC += offset if dst <= src unsigned --JSLT 0xc any PC += offset if dst < src signed --JSLE 0xd any PC += offset if dst <= src signed --======== ===== ======= ================================= =================================================== -+.. 
table:: Jump instructions -+ -+ ======== ===== ======= ================================= =================================================== -+ code value src_reg description notes -+ ======== ===== ======= ================================= =================================================== -+ JA 0x0 0x0 PC += offset {JA, K, JMP} only -+ JA 0x0 0x0 PC += imm {JA, K, JMP32} only -+ JEQ 0x1 any PC += offset if dst == src -+ JGT 0x2 any PC += offset if dst > src unsigned -+ JGE 0x3 any PC += offset if dst >= src unsigned -+ JSET 0x4 any PC += offset if dst & src -+ JNE 0x5 any PC += offset if dst != src -+ JSGT 0x6 any PC += offset if dst > src signed -+ JSGE 0x7 any PC += offset if dst >= src signed -+ CALL 0x8 0x0 call helper function by static ID {CALL, K, JMP} only, see `Helper functions`_ -+ CALL 0x8 0x1 call PC += imm {CALL, K, JMP} only, see `Program-local functions`_ -+ CALL 0x8 0x2 call helper function by BTF ID {CALL, K, JMP} only, see `Helper functions`_ -+ EXIT 0x9 0x0 return {CALL, K, JMP} only -+ JLT 0xa any PC += offset if dst < src unsigned -+ JLE 0xb any PC += offset if dst <= src unsigned -+ JSLT 0xc any PC += offset if dst < src signed -+ JSLE 0xd any PC += offset if dst <= src signed -+ ======== ===== ======= ================================= =================================================== - - where 'PC' denotes the program counter, and the offset to increment by - is in units of 64-bit instructions relative to the instruction following -@@ -476,9 +506,6 @@ the jump instruction. Thus 'PC += 1' skips execution of the next - instruction if it's a basic instruction or results in undefined behavior - if the next instruction is a 128-bit wide instruction. - --The BPF program needs to store the return value into register R0 before doing an --``EXIT``. -- - Example: - - ``{JSGE, X, JMP32}`` means:: -@@ -487,6 +514,10 @@ Example: - - where 's>=' indicates a signed '>=' comparison. - -+``{JLE, K, JMP}`` means:: -+ -+ if dst <= (u64)(s64)imm goto +offset -+ - ``{JA, K, JMP32}`` means:: - - gotol +imm -@@ -515,14 +546,16 @@ for each program type, but static IDs are unique across all program types. - - Platforms that support the BPF Type Format (BTF) support identifying - a helper function by a BTF ID encoded in the 'imm' field, where the BTF ID --identifies the helper name and type. -+identifies the helper name and type. Further documentation of BTF -+is outside the scope of this document and is left for future work. - - Program-local functions - ~~~~~~~~~~~~~~~~~~~~~~~ - Program-local functions are functions exposed by the same BPF program as the --caller, and are referenced by offset from the call instruction, similar to --``JA``. The offset is encoded in the 'imm' field of the call instruction. --An ``EXIT`` within the program-local function will return to the caller. -+caller, and are referenced by offset from the instruction following the call -+instruction, similar to ``JA``. The offset is encoded in the 'imm' field of -+the call instruction. An ``EXIT`` within the program-local function will -+return to the caller. - - Load and store instructions - =========================== -@@ -537,6 +570,8 @@ For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the - **mode** - The mode modifier is one of: - -+ .. 
table:: Mode modifier -+ - ============= ===== ==================================== ============= - mode modifier value description reference - ============= ===== ==================================== ============= -@@ -551,6 +586,8 @@ For load and store instructions (``LD``, ``LDX``, ``ST``, and ``STX``), the - **sz (size)** - The size modifier is one of: - -+ .. table:: Size modifier -+ - ==== ===== ===================== - size value description - ==== ===== ===================== -@@ -619,14 +656,16 @@ The 'imm' field is used to encode the actual atomic operation. - Simple atomic operation use a subset of the values defined to encode - arithmetic operations in the 'imm' field to encode the atomic operation: - --======== ===== =========== --imm value description --======== ===== =========== --ADD 0x00 atomic add --OR 0x40 atomic or --AND 0x50 atomic and --XOR 0xa0 atomic xor --======== ===== =========== -+.. table:: Simple atomic operations -+ -+ ======== ===== =========== -+ imm value description -+ ======== ===== =========== -+ ADD 0x00 atomic add -+ OR 0x40 atomic or -+ AND 0x50 atomic and -+ XOR 0xa0 atomic xor -+ ======== ===== =========== - - - ``{ATOMIC, W, STX}`` with 'imm' = ADD means:: -@@ -640,13 +679,15 @@ XOR 0xa0 atomic xor - In addition to the simple atomic operations, there also is a modifier and - two complex atomic operations: - --=========== ================ =========================== --imm value description --=========== ================ =========================== --FETCH 0x01 modifier: return old value --XCHG 0xe0 | FETCH atomic exchange --CMPXCHG 0xf0 | FETCH atomic compare and exchange --=========== ================ =========================== -+.. table:: Complex atomic operations -+ -+ =========== ================ =========================== -+ imm value description -+ =========== ================ =========================== -+ FETCH 0x01 modifier: return old value -+ XCHG 0xe0 | FETCH atomic exchange -+ CMPXCHG 0xf0 | FETCH atomic compare and exchange -+ =========== ================ =========================== - - The ``FETCH`` modifier is optional for simple atomic operations, and - always set for the complex atomic operations. If the ``FETCH`` flag -@@ -673,17 +714,19 @@ The following table defines a set of ``{IMM, DW, LD}`` instructions - with opcode subtypes in the 'src_reg' field, using new terms such as "map" - defined further below: - --======= ========================================= =========== ============== --src_reg pseudocode imm type dst type --======= ========================================= =========== ============== --0x0 dst = (next_imm << 32) | imm integer integer --0x1 dst = map_by_fd(imm) map fd map --0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address --0x3 dst = var_addr(imm) variable id data address --0x4 dst = code_addr(imm) integer code address --0x5 dst = map_by_idx(imm) map index map --0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address --======= ========================================= =========== ============== -+.. 
table:: 64-bit immediate instructions -+ -+ ======= ========================================= =========== ============== -+ src_reg pseudocode imm type dst type -+ ======= ========================================= =========== ============== -+ 0x0 dst = (next_imm << 32) | imm integer integer -+ 0x1 dst = map_by_fd(imm) map fd map -+ 0x2 dst = map_val(map_by_fd(imm)) + next_imm map fd data address -+ 0x3 dst = var_addr(imm) variable id data address -+ 0x4 dst = code_addr(imm) integer code address -+ 0x5 dst = map_by_idx(imm) map index map -+ 0x6 dst = map_val(map_by_idx(imm)) + next_imm map index data address -+ ======= ========================================= =========== ============== - - where - -@@ -725,5 +768,5 @@ carried over from classic BPF. These instructions used an instruction - class of ``LD``, a size modifier of ``W``, ``H``, or ``B``, and a - mode modifier of ``ABS`` or ``IND``. The 'dst_reg' and 'offset' fields were - set to zero, and 'src_reg' was set to zero for ``ABS``. However, these --instructions are deprecated and should no longer be used. All legacy packet -+instructions are deprecated and SHOULD no longer be used. All legacy packet - access instructions belong to the "packet" conformance group. -diff --git a/Documentation/scheduler/index.rst b/Documentation/scheduler/index.rst -index 43bd8a145b7a..0611dc3dda8e 100644 ---- a/Documentation/scheduler/index.rst -+++ b/Documentation/scheduler/index.rst -@@ -20,6 +20,7 @@ Scheduler - sched-nice-design - sched-rt-group - sched-stats -+ sched-ext - sched-debug - - text_files -diff --git a/Documentation/scheduler/sched-ext.rst b/Documentation/scheduler/sched-ext.rst -new file mode 100644 -index 000000000000..497eeaa5ecbe ---- /dev/null -+++ b/Documentation/scheduler/sched-ext.rst -@@ -0,0 +1,314 @@ -+========================== -+Extensible Scheduler Class -+========================== -+ -+sched_ext is a scheduler class whose behavior can be defined by a set of BPF -+programs - the BPF scheduler. -+ -+* sched_ext exports a full scheduling interface so that any scheduling -+ algorithm can be implemented on top. -+ -+* The BPF scheduler can group CPUs however it sees fit and schedule them -+ together, as tasks aren't tied to specific CPUs at the time of wakeup. -+ -+* The BPF scheduler can be turned on and off dynamically anytime. -+ -+* The system integrity is maintained no matter what the BPF scheduler does. -+ The default scheduling behavior is restored anytime an error is detected, -+ a runnable task stalls, or on invoking the SysRq key sequence -+ :kbd:`SysRq-S`. -+ -+* When the BPF scheduler triggers an error, debug information is dumped to -+ aid debugging. The debug dump is passed to and printed out by the -+ scheduler binary. The debug dump can also be accessed through the -+ `sched_ext_dump` tracepoint. The SysRq key sequence :kbd:`SysRq-D` -+ triggers a debug dump. This doesn't terminate the BPF scheduler and can -+ only be read through the tracepoint. -+ -+Switching to and from sched_ext -+=============================== -+ -+``CONFIG_SCHED_CLASS_EXT`` is the config option to enable sched_ext and -+``tools/sched_ext`` contains the example schedulers. The following config -+options should be enabled to use sched_ext: -+ -+.. 
code-block:: none -+ -+ CONFIG_BPF=y -+ CONFIG_SCHED_CLASS_EXT=y -+ CONFIG_BPF_SYSCALL=y -+ CONFIG_BPF_JIT=y -+ CONFIG_DEBUG_INFO_BTF=y -+ CONFIG_BPF_JIT_ALWAYS_ON=y -+ CONFIG_BPF_JIT_DEFAULT_ON=y -+ CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+ CONFIG_PAHOLE_HAS_BTF_TAG=y -+ -+sched_ext is used only when the BPF scheduler is loaded and running. -+ -+If a task explicitly sets its scheduling policy to ``SCHED_EXT``, it will be -+treated as ``SCHED_NORMAL`` and scheduled by CFS until the BPF scheduler is -+loaded. On load, such tasks will be switched to and scheduled by sched_ext. -+ -+The BPF scheduler can choose to schedule all normal and lower class tasks by -+calling ``scx_bpf_switch_all()`` from its ``init()`` operation. In this -+case, all ``SCHED_NORMAL``, ``SCHED_BATCH``, ``SCHED_IDLE`` and -+``SCHED_EXT`` tasks are scheduled by sched_ext. In the example schedulers, -+this mode can be selected with the ``-a`` option. -+ -+Terminating the sched_ext scheduler program, triggering :kbd:`SysRq-S`, or -+detection of any internal error including stalled runnable tasks aborts the -+BPF scheduler and reverts all tasks back to CFS. -+ -+.. code-block:: none -+ -+ # make -j16 -C tools/sched_ext -+ # tools/sched_ext/scx_simple -+ local=0 global=3 -+ local=5 global=24 -+ local=9 global=44 -+ local=13 global=56 -+ local=17 global=72 -+ ^CEXIT: BPF scheduler unregistered -+ -+The current status of the BPF scheduler can be determined as follows: -+ -+.. code-block:: none -+ -+ # cat /sys/kernel/sched_ext/state -+ enabled -+ # cat /sys/kernel/sched_ext/root/ops -+ simple -+ -+``tools/sched_ext/scx_show_state.py`` is a drgn script which shows more -+detailed information: -+ -+.. code-block:: none -+ -+ # tools/sched_ext/scx_show_state.py -+ ops : simple -+ enabled : 1 -+ switching_all : 1 -+ switched_all : 1 -+ enable_state : enabled (2) -+ bypass_depth : 0 -+ nr_rejected : 0 -+ -+If ``CONFIG_SCHED_DEBUG`` is set, whether a given task is on sched_ext can -+be determined as follows: -+ -+.. code-block:: none -+ -+ # grep ext /proc/self/sched -+ ext.enabled : 1 -+ -+The Basics -+========== -+ -+Userspace can implement an arbitrary BPF scheduler by loading a set of BPF -+programs that implement ``struct sched_ext_ops``. The only mandatory field -+is ``ops.name`` which must be a valid BPF object name. All operations are -+optional. The following modified excerpt is from -+``tools/sched/scx_simple.bpf.c`` showing a minimal global FIFO scheduler. -+ -+.. code-block:: c -+ -+ /* -+ * Decide which CPU a task should be migrated to before being -+ * enqueued (either at wakeup, fork time, or exec time). If an -+ * idle core is found by the default ops.select_cpu() implementation, -+ * then dispatch the task directly to SCX_DSQ_LOCAL and skip the -+ * ops.enqueue() callback. -+ * -+ * Note that this implementation has exactly the same behavior as the -+ * default ops.select_cpu implementation. The behavior of the scheduler -+ * would be exactly same if the implementation just didn't define the -+ * simple_select_cpu() struct_ops prog. -+ */ -+ s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+ { -+ s32 cpu; -+ /* Need to initialize or the BPF verifier will reject the program */ -+ bool direct = false; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &direct); -+ -+ if (direct) -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ -+ return cpu; -+ } -+ -+ /* -+ * Do a direct dispatch of a task to the global DSQ. 
This ops.enqueue() -+ * callback will only be invoked if we failed to find a core to dispatch -+ * to in ops.select_cpu() above. -+ * -+ * Note that this implementation has exactly the same behavior as the -+ * default ops.enqueue implementation, which just dispatches the task -+ * to SCX_DSQ_GLOBAL. The behavior of the scheduler would be exactly same -+ * if the implementation just didn't define the simple_enqueue struct_ops -+ * prog. -+ */ -+ void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+ { -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ -+ s32 BPF_STRUCT_OPS(simple_init) -+ { -+ /* -+ * All SCHED_OTHER, SCHED_IDLE, and SCHED_BATCH tasks should -+ * use sched_ext. -+ */ -+ scx_bpf_switch_all(); -+ return 0; -+ } -+ -+ void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+ { -+ exit_type = ei->type; -+ } -+ -+ SEC(".struct_ops") -+ struct sched_ext_ops simple_ops = { -+ .select_cpu = (void *)simple_select_cpu, -+ .enqueue = (void *)simple_enqueue, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple", -+ }; -+ -+Dispatch Queues -+--------------- -+ -+To match the impedance between the scheduler core and the BPF scheduler, -+sched_ext uses DSQs (dispatch queues) which can operate as both a FIFO and a -+priority queue. By default, there is one global FIFO (``SCX_DSQ_GLOBAL``), -+and one local dsq per CPU (``SCX_DSQ_LOCAL``). The BPF scheduler can manage -+an arbitrary number of dsq's using ``scx_bpf_create_dsq()`` and -+``scx_bpf_destroy_dsq()``. -+ -+A CPU always executes a task from its local DSQ. A task is "dispatched" to a -+DSQ. A non-local DSQ is "consumed" to transfer a task to the consuming CPU's -+local DSQ. -+ -+When a CPU is looking for the next task to run, if the local DSQ is not -+empty, the first task is picked. Otherwise, the CPU tries to consume the -+global DSQ. If that doesn't yield a runnable task either, ``ops.dispatch()`` -+is invoked. -+ -+Scheduling Cycle -+---------------- -+ -+The following briefly shows how a waking task is scheduled and executed. -+ -+1. When a task is waking up, ``ops.select_cpu()`` is the first operation -+ invoked. This serves two purposes. First, CPU selection optimization -+ hint. Second, waking up the selected CPU if idle. -+ -+ The CPU selected by ``ops.select_cpu()`` is an optimization hint and not -+ binding. The actual decision is made at the last step of scheduling. -+ However, there is a small performance gain if the CPU -+ ``ops.select_cpu()`` returns matches the CPU the task eventually runs on. -+ -+ A side-effect of selecting a CPU is waking it up from idle. While a BPF -+ scheduler can wake up any cpu using the ``scx_bpf_kick_cpu()`` helper, -+ using ``ops.select_cpu()`` judiciously can be simpler and more efficient. -+ -+ A task can be immediately dispatched to a DSQ from ``ops.select_cpu()`` by -+ calling ``scx_bpf_dispatch()``. If the task is dispatched to -+ ``SCX_DSQ_LOCAL`` from ``ops.select_cpu()``, it will be dispatched to the -+ local DSQ of whichever CPU is returned from ``ops.select_cpu()``. -+ Additionally, dispatching directly from ``ops.select_cpu()`` will cause the -+ ``ops.enqueue()`` callback to be skipped. -+ -+ Note that the scheduler core will ignore an invalid CPU selection, for -+ example, if it's outside the allowed cpumask of the task. -+ -+2. Once the target CPU is selected, ``ops.enqueue()`` is invoked (unless the -+ task was dispatched directly from ``ops.select_cpu()``). 
``ops.enqueue()`` -+ can make one of the following decisions: -+ -+ * Immediately dispatch the task to either the global or local DSQ by -+ calling ``scx_bpf_dispatch()`` with ``SCX_DSQ_GLOBAL`` or -+ ``SCX_DSQ_LOCAL``, respectively. -+ -+ * Immediately dispatch the task to a custom DSQ by calling -+ ``scx_bpf_dispatch()`` with a DSQ ID which is smaller than 2^63. -+ -+ * Queue the task on the BPF side. -+ -+3. When a CPU is ready to schedule, it first looks at its local DSQ. If -+ empty, it then looks at the global DSQ. If there still isn't a task to -+ run, ``ops.dispatch()`` is invoked which can use the following two -+ functions to populate the local DSQ. -+ -+ * ``scx_bpf_dispatch()`` dispatches a task to a DSQ. Any target DSQ can -+ be used - ``SCX_DSQ_LOCAL``, ``SCX_DSQ_LOCAL_ON | cpu``, -+ ``SCX_DSQ_GLOBAL`` or a custom DSQ. While ``scx_bpf_dispatch()`` -+ currently can't be called with BPF locks held, this is being worked on -+ and will be supported. ``scx_bpf_dispatch()`` schedules dispatching -+ rather than performing them immediately. There can be up to -+ ``ops.dispatch_max_batch`` pending tasks. -+ -+ * ``scx_bpf_consume()`` tranfers a task from the specified non-local DSQ -+ to the dispatching DSQ. This function cannot be called with any BPF -+ locks held. ``scx_bpf_consume()`` flushes the pending dispatched tasks -+ before trying to consume the specified DSQ. -+ -+4. After ``ops.dispatch()`` returns, if there are tasks in the local DSQ, -+ the CPU runs the first one. If empty, the following steps are taken: -+ -+ * Try to consume the global DSQ. If successful, run the task. -+ -+ * If ``ops.dispatch()`` has dispatched any tasks, retry #3. -+ -+ * If the previous task is an SCX task and still runnable, keep executing -+ it (see ``SCX_OPS_ENQ_LAST``). -+ -+ * Go idle. -+ -+Note that the BPF scheduler can always choose to dispatch tasks immediately -+in ``ops.enqueue()`` as illustrated in the above simple example. If only the -+built-in DSQs are used, there is no need to implement ``ops.dispatch()`` as -+a task is never queued on the BPF scheduler and both the local and global -+DSQs are consumed automatically. -+ -+``scx_bpf_dispatch()`` queues the task on the FIFO of the target DSQ. Use -+``scx_bpf_dispatch_vtime()`` for the priority queue. Internal DSQs such as -+``SCX_DSQ_LOCAL`` and ``SCX_DSQ_GLOBAL`` do not support priority-queue -+dispatching, and must be dispatched to with ``scx_bpf_dispatch()``. See the -+function documentation and usage in ``tools/sched_ext/scx_simple.bpf.c`` for -+more information. -+ -+Where to Look -+============= -+ -+* ``include/linux/sched/ext.h`` defines the core data structures, ops table -+ and constants. -+ -+* ``kernel/sched/ext.c`` contains sched_ext core implementation and helpers. -+ The functions prefixed with ``scx_bpf_`` can be called from the BPF -+ scheduler. -+ -+* ``tools/sched_ext/`` hosts example BPF scheduler implementations. -+ -+ * ``scx_simple[.bpf].c``: Minimal global FIFO scheduler example using a -+ custom DSQ. -+ -+ * ``scx_qmap[.bpf].c``: A multi-level FIFO scheduler supporting five -+ levels of priority implemented with ``BPF_MAP_TYPE_QUEUE``. -+ -+ABI Instability -+=============== -+ -+The APIs provided by sched_ext to BPF schedulers programs have no stability -+guarantees. This includes the ops table callbacks and constants defined in -+``include/linux/sched/ext.h``, as well as the ``scx_bpf_`` kfuncs defined in -+``kernel/sched/ext.c``. 
-+ -+While we will attempt to provide a relatively stable API surface when -+possible, they are subject to change without warning between kernel -+versions. -diff --git a/MAINTAINERS b/MAINTAINERS -index 7bcdcb4b7806..03e2d4690d51 100644 ---- a/MAINTAINERS -+++ b/MAINTAINERS -@@ -19964,6 +19964,19 @@ F: include/linux/wait.h - F: include/uapi/linux/sched.h - F: kernel/sched/ - -+SCHEDULER - SCHED_EXT -+R: Tejun Heo -+R: David Vernet -+L: linux-kernel@vger.kernel.org -+S: Maintained -+W: https://github.com/sched-ext/scx -+T: git://git.kernel.org/pub/scm/linux/kernel/git/tj/sched_ext.git -+F: include/linux/sched/ext.h -+F: kernel/sched/ext.h -+F: kernel/sched/ext.c -+F: tools/sched_ext/ -+F: tools/testing/selftests/sched_ext -+ - SCSI LIBSAS SUBSYSTEM - R: John Garry - R: Jason Yan -diff --git a/Makefile b/Makefile -index 6235b1ebb38b..0e3c1aadad69 100644 ---- a/Makefile -+++ b/Makefile -@@ -1358,6 +1358,12 @@ ifneq ($(wildcard $(resolve_btfids_O)),) - $(Q)$(MAKE) -sC $(srctree)/tools/bpf/resolve_btfids O=$(resolve_btfids_O) clean - endif - -+tools-clean-targets := sched_ext -+PHONY += $(tools-clean-targets) -+$(tools-clean-targets): -+ $(Q)$(MAKE) -sC tools $@_clean -+tools_clean: $(tools-clean-targets) -+ - # Clear a bunch of variables before executing the submake - ifeq ($(quiet),silent_) - tools_silent=s -@@ -1530,7 +1536,7 @@ PHONY += $(mrproper-dirs) mrproper - $(mrproper-dirs): - $(Q)$(MAKE) $(clean)=$(patsubst _mrproper_%,%,$@) - --mrproper: clean $(mrproper-dirs) -+mrproper: clean $(mrproper-dirs) tools_clean - $(call cmd,rmfiles) - @find . $(RCS_FIND_IGNORE) \ - \( -name '*.rmeta' \) \ -diff --git a/arch/riscv/Kconfig b/arch/riscv/Kconfig -index 0525ee2d63c7..9f38a5ecbee3 100644 ---- a/arch/riscv/Kconfig -+++ b/arch/riscv/Kconfig -@@ -610,6 +610,18 @@ config TOOLCHAIN_HAS_VECTOR_CRYPTO - def_bool $(as-instr, .option arch$(comma) +v$(comma) +zvkb) - depends on AS_HAS_OPTION_ARCH - -+config RISCV_ISA_ZBA -+ bool "Zba extension support for bit manipulation instructions" -+ default y -+ help -+ Add support for enabling optimisations in the kernel when the Zba -+ extension is detected at boot. -+ -+ The Zba extension provides instructions to accelerate the generation -+ of addresses that index into arrays of basic data types. -+ -+ If you don't know what to do here, say Y. -+ - config RISCV_ISA_ZBB - bool "Zbb extension support for bit manipulation instructions" - depends on TOOLCHAIN_HAS_ZBB -diff --git a/arch/riscv/net/bpf_jit.h b/arch/riscv/net/bpf_jit.h -index fdbf88ca8b70..1d1c78d4cff1 100644 ---- a/arch/riscv/net/bpf_jit.h -+++ b/arch/riscv/net/bpf_jit.h -@@ -18,6 +18,11 @@ static inline bool rvc_enabled(void) - return IS_ENABLED(CONFIG_RISCV_ISA_C); - } - -+static inline bool rvzba_enabled(void) -+{ -+ return IS_ENABLED(CONFIG_RISCV_ISA_ZBA) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBA); -+} -+ - static inline bool rvzbb_enabled(void) - { - return IS_ENABLED(CONFIG_RISCV_ISA_ZBB) && riscv_has_extension_likely(RISCV_ISA_EXT_ZBB); -@@ -737,6 +742,17 @@ static inline u16 rvc_swsp(u32 imm8, u8 rs2) - return rv_css_insn(0x6, imm, rs2, 0x2); - } - -+/* RVZBA instructions. */ -+static inline u32 rvzba_sh2add(u8 rd, u8 rs1, u8 rs2) -+{ -+ return rv_r_insn(0x10, rs2, rs1, 0x4, rd, 0x33); -+} -+ -+static inline u32 rvzba_sh3add(u8 rd, u8 rs1, u8 rs2) -+{ -+ return rv_r_insn(0x10, rs2, rs1, 0x6, rd, 0x33); -+} -+ - /* RVZBB instructions. 
*/ - static inline u32 rvzbb_sextb(u8 rd, u8 rs1) - { -@@ -939,6 +955,14 @@ static inline u16 rvc_sdsp(u32 imm9, u8 rs2) - return rv_css_insn(0x7, imm, rs2, 0x2); - } - -+/* RV64-only ZBA instructions. */ -+ -+static inline u32 rvzba_zextw(u8 rd, u8 rs1) -+{ -+ /* add.uw rd, rs1, ZERO */ -+ return rv_r_insn(0x04, RV_REG_ZERO, rs1, 0, rd, 0x3b); -+} -+ - #endif /* __riscv_xlen == 64 */ - - /* Helper functions that emit RVC instructions when possible. */ -@@ -1082,6 +1106,28 @@ static inline void emit_sw(u8 rs1, s32 off, u8 rs2, struct rv_jit_context *ctx) - emit(rv_sw(rs1, off, rs2), ctx); - } - -+static inline void emit_sh2add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) -+{ -+ if (rvzba_enabled()) { -+ emit(rvzba_sh2add(rd, rs1, rs2), ctx); -+ return; -+ } -+ -+ emit_slli(rd, rs1, 2, ctx); -+ emit_add(rd, rd, rs2, ctx); -+} -+ -+static inline void emit_sh3add(u8 rd, u8 rs1, u8 rs2, struct rv_jit_context *ctx) -+{ -+ if (rvzba_enabled()) { -+ emit(rvzba_sh3add(rd, rs1, rs2), ctx); -+ return; -+ } -+ -+ emit_slli(rd, rs1, 3, ctx); -+ emit_add(rd, rd, rs2, ctx); -+} -+ - /* RV64-only helper functions. */ - #if __riscv_xlen == 64 - -@@ -1161,6 +1207,11 @@ static inline void emit_zexth(u8 rd, u8 rs, struct rv_jit_context *ctx) - - static inline void emit_zextw(u8 rd, u8 rs, struct rv_jit_context *ctx) - { -+ if (rvzba_enabled()) { -+ emit(rvzba_zextw(rd, rs), ctx); -+ return; -+ } -+ - emit_slli(rd, rs, 32, ctx); - emit_srli(rd, rd, 32, ctx); - } -diff --git a/arch/riscv/net/bpf_jit_comp32.c b/arch/riscv/net/bpf_jit_comp32.c -index f5ba73bb153d..592dd86fbf81 100644 ---- a/arch/riscv/net/bpf_jit_comp32.c -+++ b/arch/riscv/net/bpf_jit_comp32.c -@@ -811,8 +811,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) - * if (!prog) - * goto out; - */ -- emit(rv_slli(RV_REG_T0, lo(idx_reg), 2), ctx); -- emit(rv_add(RV_REG_T0, RV_REG_T0, lo(arr_reg)), ctx); -+ emit_sh2add(RV_REG_T0, lo(idx_reg), lo(arr_reg), ctx); - off = offsetof(struct bpf_array, ptrs); - if (is_12b_check(off, insn)) - return -1; -diff --git a/arch/riscv/net/bpf_jit_comp64.c b/arch/riscv/net/bpf_jit_comp64.c -index 79a001d5533e..d5cebb0b0afe 100644 ---- a/arch/riscv/net/bpf_jit_comp64.c -+++ b/arch/riscv/net/bpf_jit_comp64.c -@@ -380,8 +380,7 @@ static int emit_bpf_tail_call(int insn, struct rv_jit_context *ctx) - * if (!prog) - * goto out; - */ -- emit_slli(RV_REG_T2, RV_REG_A2, 3, ctx); -- emit_add(RV_REG_T2, RV_REG_T2, RV_REG_A1, ctx); -+ emit_sh3add(RV_REG_T2, RV_REG_A2, RV_REG_A1, ctx); - off = offsetof(struct bpf_array, ptrs); - if (is_12b_check(off, insn)) - return -1; -@@ -537,8 +536,10 @@ static void emit_atomic(u8 rd, u8 rs, s16 off, s32 imm, bool is64, - /* r0 = atomic_cmpxchg(dst_reg + off16, r0, src_reg); */ - case BPF_CMPXCHG: - r0 = bpf_to_rv_reg(BPF_REG_0, ctx); -- emit(is64 ? rv_addi(RV_REG_T2, r0, 0) : -- rv_addiw(RV_REG_T2, r0, 0), ctx); -+ if (is64) -+ emit_mv(RV_REG_T2, r0, ctx); -+ else -+ emit_addiw(RV_REG_T2, r0, 0, ctx); - emit(is64 ? 
rv_lr_d(r0, 0, rd, 0, 0) : - rv_lr_w(r0, 0, rd, 0, 0), ctx); - jmp_offset = ninsns_rvoff(8); -@@ -868,7 +869,7 @@ static int __arch_prepare_bpf_trampoline(struct bpf_tramp_image *im, - stack_size += 8; - sreg_off = stack_size; - -- stack_size = round_up(stack_size, 16); -+ stack_size = round_up(stack_size, STACK_ALIGN); - - if (!is_struct_ops) { - /* For the trampoline called from function entry, -@@ -1097,12 +1098,10 @@ int bpf_jit_emit_insn(const struct bpf_insn *insn, struct rv_jit_context *ctx, - /* Load current CPU number in T1 */ - emit_ld(RV_REG_T1, offsetof(struct thread_info, cpu), - RV_REG_TP, ctx); -- /* << 3 because offsets are 8 bytes */ -- emit_slli(RV_REG_T1, RV_REG_T1, 3, ctx); - /* Load address of __per_cpu_offset array in T2 */ - emit_addr(RV_REG_T2, (u64)&__per_cpu_offset, extra_pass, ctx); -- /* Add offset of current CPU to __per_cpu_offset */ -- emit_add(RV_REG_T1, RV_REG_T2, RV_REG_T1, ctx); -+ /* Get address of __per_cpu_offset[cpu] in T1 */ -+ emit_sh3add(RV_REG_T1, RV_REG_T1, RV_REG_T2, ctx); - /* Load __per_cpu_offset[cpu] in T1 */ - emit_ld(RV_REG_T1, 0, RV_REG_T1, ctx); - /* Add the offset to Rd */ -@@ -1960,7 +1959,7 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) - { - int i, stack_adjust = 0, store_offset, bpf_stack_adjust; - -- bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, 16); -+ bpf_stack_adjust = round_up(ctx->prog->aux->stack_depth, STACK_ALIGN); - if (bpf_stack_adjust) - mark_fp(ctx); - -@@ -1982,7 +1981,7 @@ void bpf_jit_build_prologue(struct rv_jit_context *ctx, bool is_subprog) - if (ctx->arena_vm_start) - stack_adjust += 8; - -- stack_adjust = round_up(stack_adjust, 16); -+ stack_adjust = round_up(stack_adjust, STACK_ALIGN); - stack_adjust += bpf_stack_adjust; - - store_offset = stack_adjust - 8; -diff --git a/drivers/isdn/mISDN/dsp_blowfish.c b/drivers/isdn/mISDN/dsp_blowfish.c -index 0aa572f3858d..0e77c282c862 100644 ---- a/drivers/isdn/mISDN/dsp_blowfish.c -+++ b/drivers/isdn/mISDN/dsp_blowfish.c -@@ -73,11 +73,6 @@ - * crypto-api for faster implementation - */ - --struct bf_ctx { -- u32 p[18]; -- u32 s[1024]; --}; -- - static const u32 bf_pbox[16 + 2] = { - 0x243f6a88, 0x85a308d3, 0x13198a2e, 0x03707344, - 0xa4093822, 0x299f31d0, 0x082efa98, 0xec4e6c89, -diff --git a/drivers/net/ethernet/8390/ne2k-pci.c b/drivers/net/ethernet/8390/ne2k-pci.c -index 65f56a98c0a0..1a34da07c0db 100644 ---- a/drivers/net/ethernet/8390/ne2k-pci.c -+++ b/drivers/net/ethernet/8390/ne2k-pci.c -@@ -186,17 +186,6 @@ static void ne2k_pci_block_output(struct net_device *dev, const int count, - static const struct ethtool_ops ne2k_pci_ethtool_ops; - - -- --/* There is no room in the standard 8390 structure for extra info we need, -- * so we build a meta/outer-wrapper structure.. -- */ --struct ne2k_pci_card { -- struct net_device *dev; -- struct pci_dev *pci_dev; --}; -- -- -- - /* NEx000-clone boards have a Station Address (SA) PROM (SAPROM) in the packet - * buffer memory space. By-the-spec NE2000 clones have 0x57,0x57 in bytes - * 0x0e,0x0f of the SAPROM, while other supposed NE2000 clones must be -diff --git a/drivers/net/ethernet/adaptec/starfire.c b/drivers/net/ethernet/adaptec/starfire.c -index 857361c74f5d..e1b8794b14c9 100644 ---- a/drivers/net/ethernet/adaptec/starfire.c -+++ b/drivers/net/ethernet/adaptec/starfire.c -@@ -441,14 +441,6 @@ enum rx_desc_bits { - }; - - /* Completion queue entry. */ --struct short_rx_done_desc { -- __le32 status; /* Low 16 bits is length. 
*/ --}; --struct basic_rx_done_desc { -- __le32 status; /* Low 16 bits is length. */ -- __le16 vlanid; -- __le16 status2; --}; - struct csum_rx_done_desc { - __le32 status; /* Low 16 bits is length. */ - __le16 csum; /* Partial checksum */ -diff --git a/drivers/net/ethernet/cavium/liquidio/lio_main.c b/drivers/net/ethernet/cavium/liquidio/lio_main.c -index 34f02a8ec2ca..1d79f6eaa41f 100644 ---- a/drivers/net/ethernet/cavium/liquidio/lio_main.c -+++ b/drivers/net/ethernet/cavium/liquidio/lio_main.c -@@ -92,12 +92,6 @@ static int octeon_console_debug_enabled(u32 console) - /* time to wait for possible in-flight requests in milliseconds */ - #define WAIT_INFLIGHT_REQUEST msecs_to_jiffies(1000) - --struct oct_link_status_resp { -- u64 rh; -- struct oct_link_info link_info; -- u64 status; --}; -- - struct oct_timestamp_resp { - u64 rh; - u64 timestamp; -diff --git a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c -index 0d6ee30affb9..eef12fdd246d 100644 ---- a/drivers/net/ethernet/cavium/liquidio/octeon_droq.c -+++ b/drivers/net/ethernet/cavium/liquidio/octeon_droq.c -@@ -30,11 +30,6 @@ - #include "cn23xx_pf_device.h" - #include "cn23xx_vf_device.h" - --struct niclist { -- struct list_head list; -- void *ptr; --}; -- - struct __dispatch { - struct list_head list; - struct octeon_recv_info *rinfo; -diff --git a/drivers/net/ethernet/mellanox/mlx4/main.c b/drivers/net/ethernet/mellanox/mlx4/main.c -index 98688e4dbec5..febeadfdd5a5 100644 ---- a/drivers/net/ethernet/mellanox/mlx4/main.c -+++ b/drivers/net/ethernet/mellanox/mlx4/main.c -@@ -169,12 +169,6 @@ module_param_array(port_type_array, int, &arr_argc, 0444); - MODULE_PARM_DESC(port_type_array, "Array of port types: HW_DEFAULT (0) is default " - "1 for IB, 2 for Ethernet"); - --struct mlx4_port_config { -- struct list_head list; -- enum mlx4_port_type port_type[MLX4_MAX_PORTS + 1]; -- struct pci_dev *pdev; --}; -- - static atomic_t pf_loading = ATOMIC_INIT(0); - - static int mlx4_devlink_ierr_reset_get(struct devlink *devlink, u32 id, -diff --git a/drivers/net/usb/lan78xx.c b/drivers/net/usb/lan78xx.c -index 5a2c38b63012..7a5cc49ebec6 100644 ---- a/drivers/net/usb/lan78xx.c -+++ b/drivers/net/usb/lan78xx.c -@@ -380,11 +380,6 @@ struct skb_data { /* skb->cb is one of these */ - int num_of_packet; - }; - --struct usb_context { -- struct usb_ctrlrequest req; -- struct lan78xx_net *dev; --}; -- - #define EVENT_TX_HALT 0 - #define EVENT_RX_HALT 1 - #define EVENT_RX_MEMORY 2 -diff --git a/drivers/net/usb/smsc75xx.c b/drivers/net/usb/smsc75xx.c -index 0726e18bee6f..78c821349f48 100644 ---- a/drivers/net/usb/smsc75xx.c -+++ b/drivers/net/usb/smsc75xx.c -@@ -61,11 +61,6 @@ struct smsc75xx_priv { - u8 suspend_flags; - }; - --struct usb_context { -- struct usb_ctrlrequest req; -- struct usbnet *dev; --}; -- - static bool turbo_mode = true; - module_param(turbo_mode, bool, 0644); - MODULE_PARM_DESC(turbo_mode, "Enable multiple frames per Rx transaction"); -diff --git a/drivers/tty/sysrq.c b/drivers/tty/sysrq.c -index e5974b8239c9..167e877b8bef 100644 ---- a/drivers/tty/sysrq.c -+++ b/drivers/tty/sysrq.c -@@ -531,6 +531,7 @@ static const struct sysrq_key_op *sysrq_key_table[62] = { - NULL, /* P */ - NULL, /* Q */ - &sysrq_replay_logs_op, /* R */ -+ /* S: May be registered by sched_ext for resetting */ - NULL, /* S */ - NULL, /* T */ - NULL, /* U */ -diff --git a/include/asm-generic/vmlinux.lds.h b/include/asm-generic/vmlinux.lds.h -index 5703526d6ebf..2e712183ba09 100644 ---- 
a/include/asm-generic/vmlinux.lds.h -+++ b/include/asm-generic/vmlinux.lds.h -@@ -133,6 +133,7 @@ - *(__dl_sched_class) \ - *(__rt_sched_class) \ - *(__fair_sched_class) \ -+ *(__ext_sched_class) \ - *(__idle_sched_class) \ - __sched_class_lowest = .; - -diff --git a/include/linux/bpf.h b/include/linux/bpf.h -index 5e694a308081..a834f4b761bc 100644 ---- a/include/linux/bpf.h -+++ b/include/linux/bpf.h -@@ -1612,6 +1612,7 @@ struct bpf_link_ops { - struct bpf_link_info *info); - int (*update_map)(struct bpf_link *link, struct bpf_map *new_map, - struct bpf_map *old_map); -+ __poll_t (*poll)(struct file *file, struct poll_table_struct *pts); - }; - - struct bpf_tramp_link { -@@ -1730,9 +1731,9 @@ struct bpf_struct_ops { - int (*init_member)(const struct btf_type *t, - const struct btf_member *member, - void *kdata, const void *udata); -- int (*reg)(void *kdata); -- void (*unreg)(void *kdata); -- int (*update)(void *kdata, void *old_kdata); -+ int (*reg)(void *kdata, struct bpf_link *link); -+ void (*unreg)(void *kdata, struct bpf_link *link); -+ int (*update)(void *kdata, void *old_kdata, struct bpf_link *link); - int (*validate)(void *kdata); - void *cfi_stubs; - struct module *owner; -@@ -2333,6 +2334,7 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer); - int bpf_link_settle(struct bpf_link_primer *primer); - void bpf_link_cleanup(struct bpf_link_primer *primer); - void bpf_link_inc(struct bpf_link *link); -+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link); - void bpf_link_put(struct bpf_link *link); - int bpf_link_new_fd(struct bpf_link *link); - struct bpf_link *bpf_link_get_from_fd(u32 ufd); -@@ -2704,6 +2706,11 @@ static inline void bpf_link_inc(struct bpf_link *link) - { - } - -+static inline struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) -+{ -+ return NULL; -+} -+ - static inline void bpf_link_put(struct bpf_link *link) - { - } -diff --git a/include/linux/cgroup-defs.h b/include/linux/cgroup-defs.h -index ea48c861cd36..bfc027311950 100644 ---- a/include/linux/cgroup-defs.h -+++ b/include/linux/cgroup-defs.h -@@ -132,12 +132,18 @@ enum { - CFTYPE_WORLD_WRITABLE = (1 << 4), /* (DON'T USE FOR NEW FILES) S_IWUGO */ - CFTYPE_DEBUG = (1 << 5), /* create when cgroup_debug */ - -+ CFTYPE_HIDDEN = (1 << 6), /* file type hidden, see cgroup_show_cftypes() */ -+ - /* internal flags, do not use outside cgroup core proper */ - __CFTYPE_ONLY_ON_DFL = (1 << 16), /* only on default hierarchy */ - __CFTYPE_NOT_ON_DFL = (1 << 17), /* not on default hierarchy */ - __CFTYPE_ADDED = (1 << 18), - }; - -+enum cfile_flags { -+ CFILE_HIDDEN = (1 << 0), /* file instance hidden */ -+}; -+ - /* - * cgroup_file is the handle for a file instance created in a cgroup which - * is used, for example, to generate file changed notifications. This can -@@ -145,7 +151,9 @@ enum { - */ - struct cgroup_file { - /* do not access any fields from outside cgroup core */ -+ struct cftype *cft; - struct kernfs_node *kn; -+ unsigned int flags; - unsigned long notified_at; - struct timer_list notify_timer; - }; -diff --git a/include/linux/cgroup.h b/include/linux/cgroup.h -index 2150ca60394b..06bb4ca93414 100644 ---- a/include/linux/cgroup.h -+++ b/include/linux/cgroup.h -@@ -29,8 +29,6 @@ - - struct kernel_clone_args; - --#ifdef CONFIG_CGROUPS -- - /* - * All weight knobs on the default hierarchy should use the following min, - * default and max values. 
The default value is the logarithmic center of -@@ -40,6 +38,8 @@ struct kernel_clone_args; - #define CGROUP_WEIGHT_DFL 100 - #define CGROUP_WEIGHT_MAX 10000 - -+#ifdef CONFIG_CGROUPS -+ - enum { - CSS_TASK_ITER_PROCS = (1U << 0), /* walk only threadgroup leaders */ - CSS_TASK_ITER_THREADED = (1U << 1), /* walk all threaded css_sets in the domain */ -@@ -114,6 +114,7 @@ int cgroup_transfer_tasks(struct cgroup *to, struct cgroup *from); - int cgroup_add_dfl_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_add_legacy_cftypes(struct cgroup_subsys *ss, struct cftype *cfts); - int cgroup_rm_cftypes(struct cftype *cfts); -+void cgroup_show_cftype(struct cftype *cft, bool show); - void cgroup_file_notify(struct cgroup_file *cfile); - void cgroup_file_show(struct cgroup_file *cfile, bool show); - -diff --git a/include/linux/filter.h b/include/linux/filter.h -index 0f12cf01070e..b02aea291b7e 100644 ---- a/include/linux/filter.h -+++ b/include/linux/filter.h -@@ -1406,7 +1406,7 @@ struct bpf_sock_ops_kern { - - struct bpf_sysctl_kern { - struct ctl_table_header *head; -- struct ctl_table *table; -+ const struct ctl_table *table; - void *cur_val; - size_t cur_len; - void *new_val; -diff --git a/include/linux/sched.h b/include/linux/sched.h -index 61591ac6eab6..55912a3830b7 100644 ---- a/include/linux/sched.h -+++ b/include/linux/sched.h -@@ -80,6 +80,8 @@ struct task_group; - struct task_struct; - struct user_event_mm; - -+#include -+ - /* - * Task state bitmask. NOTE! These bits are also - * encoded in fs/proc/array.c: get_task_state(). -@@ -802,6 +804,9 @@ struct task_struct { - struct sched_rt_entity rt; - struct sched_dl_entity dl; - struct sched_dl_entity *dl_server; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct sched_ext_entity scx; -+#endif - const struct sched_class *sched_class; - - #ifdef CONFIG_SCHED_CORE -diff --git a/include/linux/sched/ext.h b/include/linux/sched/ext.h -new file mode 100644 -index 000000000000..6e510c0cb10c ---- /dev/null -+++ b/include/linux/sched/ext.h -@@ -0,0 +1,210 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef _LINUX_SCHED_EXT_H -+#define _LINUX_SCHED_EXT_H -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+#include -+#include -+ -+enum scx_public_consts { -+ SCX_OPS_NAME_LEN = 128, -+ -+ SCX_SLICE_DFL = 20 * 1000000, /* 20ms */ -+ SCX_SLICE_INF = U64_MAX, /* infinite, implies nohz */ -+}; -+ -+/* -+ * DSQ (dispatch queue) IDs are 64bit of the format: -+ * -+ * Bits: [63] [62 .. 0] -+ * [ B] [ ID ] -+ * -+ * B: 1 for IDs for built-in DSQs, 0 for ops-created user DSQs -+ * ID: 63 bit ID -+ * -+ * Built-in IDs: -+ * -+ * Bits: [63] [62] [61..32] [31 .. 0] -+ * [ 1] [ L] [ R ] [ V ] -+ * -+ * 1: 1 for built-in DSQs. -+ * L: 1 for LOCAL_ON DSQ IDs, 0 for others -+ * V: For LOCAL_ON DSQ IDs, a CPU number. For others, a pre-defined value. -+ */ -+enum scx_dsq_id_flags { -+ SCX_DSQ_FLAG_BUILTIN = 1LLU << 63, -+ SCX_DSQ_FLAG_LOCAL_ON = 1LLU << 62, -+ -+ SCX_DSQ_INVALID = SCX_DSQ_FLAG_BUILTIN | 0, -+ SCX_DSQ_GLOBAL = SCX_DSQ_FLAG_BUILTIN | 1, -+ SCX_DSQ_LOCAL = SCX_DSQ_FLAG_BUILTIN | 2, -+ SCX_DSQ_LOCAL_ON = SCX_DSQ_FLAG_BUILTIN | SCX_DSQ_FLAG_LOCAL_ON, -+ SCX_DSQ_LOCAL_CPU_MASK = 0xffffffffLLU, -+}; -+ -+/* -+ * A dispatch queue (DSQ) can be either a FIFO or p->scx.dsq_vtime ordered -+ * queue. A built-in DSQ is always a FIFO. 
The built-in local DSQs are used to -+ * buffer between the scheduler core and the BPF scheduler. See the -+ * documentation for more details. -+ */ -+struct scx_dispatch_q { -+ raw_spinlock_t lock; -+ struct list_head list; /* tasks in dispatch order */ -+ struct rb_root priq; /* used to order by p->scx.dsq_vtime */ -+ u32 nr; -+ u64 seq; /* used by BPF iter */ -+ u64 id; -+ struct rhash_head hash_node; -+ struct llist_node free_node; -+ struct rcu_head rcu; -+}; -+ -+/* scx_entity.flags */ -+enum scx_ent_flags { -+ SCX_TASK_QUEUED = 1 << 0, /* on ext runqueue */ -+ SCX_TASK_BAL_KEEP = 1 << 1, /* balance decided to keep current */ -+ SCX_TASK_RESET_RUNNABLE_AT = 1 << 2, /* runnable_at should be reset */ -+ SCX_TASK_DEQD_FOR_SLEEP = 1 << 3, /* last dequeue was for SLEEP */ -+ -+ SCX_TASK_STATE_SHIFT = 8, /* bit 8 and 9 are used to carry scx_task_state */ -+ SCX_TASK_STATE_BITS = 2, -+ SCX_TASK_STATE_MASK = ((1 << SCX_TASK_STATE_BITS) - 1) << SCX_TASK_STATE_SHIFT, -+ -+ SCX_TASK_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* scx_entity.flags & SCX_TASK_STATE_MASK */ -+enum scx_task_state { -+ SCX_TASK_NONE, /* ops.init_task() not called yet */ -+ SCX_TASK_INIT, /* ops.init_task() succeeded, but task can be cancelled */ -+ SCX_TASK_READY, /* fully initialized, but not in sched_ext */ -+ SCX_TASK_ENABLED, /* fully initialized and in sched_ext */ -+ -+ SCX_TASK_NR_STATES, -+}; -+ -+/* scx_entity.dsq_flags */ -+enum scx_ent_dsq_flags { -+ SCX_TASK_DSQ_ON_PRIQ = 1 << 0, /* task is queued on the priority queue of a dsq */ -+ -+ SCX_TASK_DSQ_CURSOR = 1 << 31, /* iteration cursor, not a task */ -+}; -+ -+/* -+ * Mask bits for scx_entity.kf_mask. Not all kfuncs can be called from -+ * everywhere and the following bits track which kfunc sets are currently -+ * allowed for %current. This simple per-task tracking works because SCX ops -+ * nest in a limited way. BPF will likely implement a way to allow and disallow -+ * kfuncs depending on the calling context which will replace this manual -+ * mechanism. See scx_kf_allow(). -+ */ -+enum scx_kf_mask { -+ SCX_KF_UNLOCKED = 0, /* not sleepable, not rq locked */ -+ /* all non-sleepables may be nested inside SLEEPABLE */ -+ SCX_KF_SLEEPABLE = 1 << 0, /* sleepable init operations */ -+ /* ENQUEUE and DISPATCH may be nested inside CPU_RELEASE */ -+ SCX_KF_CPU_RELEASE = 1 << 1, /* ops.cpu_release() */ -+ /* ops.dequeue (in REST) may be nested inside DISPATCH */ -+ SCX_KF_DISPATCH = 1 << 2, /* ops.dispatch() */ -+ SCX_KF_ENQUEUE = 1 << 3, /* ops.enqueue() and ops.select_cpu() */ -+ SCX_KF_SELECT_CPU = 1 << 4, /* ops.select_cpu() */ -+ SCX_KF_REST = 1 << 5, /* other rq-locked operations */ -+ -+ __SCX_KF_RQ_LOCKED = SCX_KF_CPU_RELEASE | SCX_KF_DISPATCH | -+ SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -+ __SCX_KF_TERMINAL = SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU | SCX_KF_REST, -+}; -+ -+struct scx_dsq_node { -+ struct list_head list; /* dispatch order */ -+ struct rb_node priq; /* p->scx.dsq_vtime order */ -+ u32 flags; /* SCX_TASK_DSQ_* flags */ -+}; -+ -+/* -+ * The following is embedded in task_struct and contains all fields necessary -+ * for a task to be scheduled by SCX. 
-+ */ -+struct sched_ext_entity { -+ struct scx_dispatch_q *dsq; -+ struct scx_dsq_node dsq_node; /* protected by dsq lock */ -+ u64 dsq_seq; -+ u32 flags; /* protected by rq lock */ -+ u32 weight; -+ s32 sticky_cpu; -+ s32 holding_cpu; -+ u32 kf_mask; /* see scx_kf_mask above */ -+ struct task_struct *kf_tasks[2]; /* see SCX_CALL_OP_TASK() */ -+ atomic_long_t ops_state; -+ -+ struct list_head runnable_node; /* rq->scx.runnable_list */ -+ unsigned long runnable_at; -+ -+#ifdef CONFIG_SCHED_CORE -+ u64 core_sched_at; /* see scx_prio_less() */ -+#endif -+ u64 ddsp_dsq_id; -+ u64 ddsp_enq_flags; -+ -+ /* BPF scheduler modifiable fields */ -+ -+ /* -+ * Runtime budget in nsecs. This is usually set through -+ * scx_bpf_dispatch() but can also be modified directly by the BPF -+ * scheduler. Automatically decreased by SCX as the task executes. On -+ * depletion, a scheduling event is triggered. -+ * -+ * This value is cleared to zero if the task is preempted by -+ * %SCX_KICK_PREEMPT and shouldn't be used to determine how long the -+ * task ran. Use p->se.sum_exec_runtime instead. -+ */ -+ u64 slice; -+ -+ /* -+ * Used to order tasks when dispatching to the vtime-ordered priority -+ * queue of a dsq. This is usually set through scx_bpf_dispatch_vtime() -+ * but can also be modified directly by the BPF scheduler. Modifying it -+ * while a task is queued on a dsq may mangle the ordering and is not -+ * recommended. -+ */ -+ u64 dsq_vtime; -+ -+ /* -+ * If set, reject future sched_setscheduler(2) calls updating the policy -+ * to %SCHED_EXT with -%EACCES. -+ * -+ * If set from ops.init_task() and the task's policy is already -+ * %SCHED_EXT, which can happen while the BPF scheduler is being loaded -+ * or by inhering the parent's policy during fork, the task's policy is -+ * rejected and forcefully reverted to %SCHED_NORMAL. The number of -+ * such events are reported through /sys/kernel/debug/sched_ext::nr_rejected. 
-+ */ -+ bool disallow; /* reject switching into SCX */ -+ -+ /* cold fields */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ struct cgroup *cgrp_moving_from; -+#endif -+ /* must be the last field, see init_scx_entity() */ -+ struct list_head tasks_node; -+}; -+ -+void sched_ext_free(struct task_struct *p); -+void print_scx_info(const char *log_lvl, struct task_struct *p); -+ -+#else /* !CONFIG_SCHED_CLASS_EXT */ -+ -+static inline void sched_ext_free(struct task_struct *p) {} -+static inline void print_scx_info(const char *log_lvl, struct task_struct *p) {} -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+#endif /* _LINUX_SCHED_EXT_H */ -diff --git a/include/linux/sched/task.h b/include/linux/sched/task.h -index d362aacf9f89..4df2f9055587 100644 ---- a/include/linux/sched/task.h -+++ b/include/linux/sched/task.h -@@ -63,7 +63,8 @@ extern asmlinkage void schedule_tail(struct task_struct *prev); - extern void init_idle(struct task_struct *idle, int cpu); - - extern int sched_fork(unsigned long clone_flags, struct task_struct *p); --extern void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs); -+extern void sched_cancel_fork(struct task_struct *p); - extern void sched_post_fork(struct task_struct *p); - extern void sched_dead(struct task_struct *p); - -diff --git a/include/linux/skbuff.h b/include/linux/skbuff.h -index 1c2902eaebd3..fe7d8dbef77e 100644 ---- a/include/linux/skbuff.h -+++ b/include/linux/skbuff.h -@@ -706,6 +706,13 @@ typedef unsigned int sk_buff_data_t; - typedef unsigned char *sk_buff_data_t; - #endif - -+enum skb_tstamp_type { -+ SKB_CLOCK_REALTIME, -+ SKB_CLOCK_MONOTONIC, -+ SKB_CLOCK_TAI, -+ __SKB_CLOCK_MAX = SKB_CLOCK_TAI, -+}; -+ - /** - * DOC: Basic sk_buff geometry - * -@@ -823,10 +830,8 @@ typedef unsigned char *sk_buff_data_t; - * @dst_pending_confirm: need to confirm neighbour - * @decrypted: Decrypted SKB - * @slow_gro: state present at GRO time, slower prepare step required -- * @mono_delivery_time: When set, skb->tstamp has the -- * delivery_time in mono clock base (i.e. EDT). Otherwise, the -- * skb->tstamp has the (rcv) timestamp at ingress and -- * delivery_time at egress. -+ * @tstamp_type: When set, skb->tstamp has the -+ * delivery_time clock base of skb->tstamp. - * @napi_id: id of the NAPI struct this skb came from - * @sender_cpu: (aka @napi_id) source CPU in XPS - * @alloc_cpu: CPU which did the skb allocation. -@@ -954,7 +959,7 @@ struct sk_buff { - /* private: */ - __u8 __mono_tc_offset[0]; - /* public: */ -- __u8 mono_delivery_time:1; /* See SKB_MONO_DELIVERY_TIME_MASK */ -+ __u8 tstamp_type:2; /* See skb_tstamp_type */ - #ifdef CONFIG_NET_XGRESS - __u8 tc_at_ingress:1; /* See TC_AT_INGRESS_MASK */ - __u8 tc_skip_classify:1; -@@ -1084,15 +1089,16 @@ struct sk_buff { - #endif - #define PKT_TYPE_OFFSET offsetof(struct sk_buff, __pkt_type_offset) - --/* if you move tc_at_ingress or mono_delivery_time -+/* if you move tc_at_ingress or tstamp_type - * around, you also must adapt these constants. 
- */ - #ifdef __BIG_ENDIAN_BITFIELD --#define SKB_MONO_DELIVERY_TIME_MASK (1 << 7) --#define TC_AT_INGRESS_MASK (1 << 6) -+#define SKB_TSTAMP_TYPE_MASK (3 << 6) -+#define SKB_TSTAMP_TYPE_RSHIFT (6) -+#define TC_AT_INGRESS_MASK (1 << 5) - #else --#define SKB_MONO_DELIVERY_TIME_MASK (1 << 0) --#define TC_AT_INGRESS_MASK (1 << 1) -+#define SKB_TSTAMP_TYPE_MASK (3) -+#define TC_AT_INGRESS_MASK (1 << 2) - #endif - #define SKB_BF_MONO_TC_OFFSET offsetof(struct sk_buff, __mono_tc_offset) - -@@ -4179,7 +4185,7 @@ static inline void skb_get_new_timestampns(const struct sk_buff *skb, - static inline void __net_timestamp(struct sk_buff *skb) - { - skb->tstamp = ktime_get_real(); -- skb->mono_delivery_time = 0; -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - } - - static inline ktime_t net_timedelta(ktime_t t) -@@ -4188,10 +4194,36 @@ static inline ktime_t net_timedelta(ktime_t t) - } - - static inline void skb_set_delivery_time(struct sk_buff *skb, ktime_t kt, -- bool mono) -+ u8 tstamp_type) - { - skb->tstamp = kt; -- skb->mono_delivery_time = kt && mono; -+ -+ if (kt) -+ skb->tstamp_type = tstamp_type; -+ else -+ skb->tstamp_type = SKB_CLOCK_REALTIME; -+} -+ -+static inline void skb_set_delivery_type_by_clockid(struct sk_buff *skb, -+ ktime_t kt, clockid_t clockid) -+{ -+ u8 tstamp_type = SKB_CLOCK_REALTIME; -+ -+ switch (clockid) { -+ case CLOCK_REALTIME: -+ break; -+ case CLOCK_MONOTONIC: -+ tstamp_type = SKB_CLOCK_MONOTONIC; -+ break; -+ case CLOCK_TAI: -+ tstamp_type = SKB_CLOCK_TAI; -+ break; -+ default: -+ WARN_ON_ONCE(1); -+ kt = 0; -+ } -+ -+ skb_set_delivery_time(skb, kt, tstamp_type); - } - - DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); -@@ -4201,8 +4233,8 @@ DECLARE_STATIC_KEY_FALSE(netstamp_needed_key); - */ - static inline void skb_clear_delivery_time(struct sk_buff *skb) - { -- if (skb->mono_delivery_time) { -- skb->mono_delivery_time = 0; -+ if (skb->tstamp_type) { -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - if (static_branch_unlikely(&netstamp_needed_key)) - skb->tstamp = ktime_get_real(); - else -@@ -4212,7 +4244,7 @@ static inline void skb_clear_delivery_time(struct sk_buff *skb) - - static inline void skb_clear_tstamp(struct sk_buff *skb) - { -- if (skb->mono_delivery_time) -+ if (skb->tstamp_type) - return; - - skb->tstamp = 0; -@@ -4220,7 +4252,7 @@ static inline void skb_clear_tstamp(struct sk_buff *skb) - - static inline ktime_t skb_tstamp(const struct sk_buff *skb) - { -- if (skb->mono_delivery_time) -+ if (skb->tstamp_type) - return 0; - - return skb->tstamp; -@@ -4228,7 +4260,7 @@ static inline ktime_t skb_tstamp(const struct sk_buff *skb) - - static inline ktime_t skb_tstamp_cond(const struct sk_buff *skb, bool cond) - { -- if (!skb->mono_delivery_time && skb->tstamp) -+ if (skb->tstamp_type != SKB_CLOCK_MONOTONIC && skb->tstamp) - return skb->tstamp; - - if (static_branch_unlikely(&netstamp_needed_key) || cond) -diff --git a/include/net/inet_frag.h b/include/net/inet_frag.h -index 153960663ce4..5af6eb14c5db 100644 ---- a/include/net/inet_frag.h -+++ b/include/net/inet_frag.h -@@ -76,7 +76,7 @@ struct frag_v6_compare_key { - * @stamp: timestamp of the last received fragment - * @len: total length of the original datagram - * @meat: length of received fragments so far -- * @mono_delivery_time: stamp has a mono delivery time (EDT) -+ * @tstamp_type: stamp has a mono delivery time (EDT) - * @flags: fragment queue flags - * @max_size: maximum received fragment size - * @fqdir: pointer to struct fqdir -@@ -97,7 +97,7 @@ struct inet_frag_queue { - ktime_t stamp; - int len; - int 
meat; -- u8 mono_delivery_time; -+ u8 tstamp_type; - __u8 flags; - u16 max_size; - struct fqdir *fqdir; -diff --git a/include/trace/events/sched_ext.h b/include/trace/events/sched_ext.h -new file mode 100644 -index 000000000000..fe19da7315a9 ---- /dev/null -+++ b/include/trace/events/sched_ext.h -@@ -0,0 +1,32 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+#undef TRACE_SYSTEM -+#define TRACE_SYSTEM sched_ext -+ -+#if !defined(_TRACE_SCHED_EXT_H) || defined(TRACE_HEADER_MULTI_READ) -+#define _TRACE_SCHED_EXT_H -+ -+#include -+ -+TRACE_EVENT(sched_ext_dump, -+ -+ TP_PROTO(const char *line), -+ -+ TP_ARGS(line), -+ -+ TP_STRUCT__entry( -+ __string(line, line) -+ ), -+ -+ TP_fast_assign( -+ __assign_str(line); -+ ), -+ -+ TP_printk("%s", -+ __get_str(line) -+ ) -+); -+ -+#endif /* _TRACE_SCHED_EXT_H */ -+ -+/* This part must be outside protection */ -+#include -diff --git a/include/uapi/linux/bpf.h b/include/uapi/linux/bpf.h -index 90706a47f6ff..25ea393cf084 100644 ---- a/include/uapi/linux/bpf.h -+++ b/include/uapi/linux/bpf.h -@@ -6207,12 +6207,17 @@ union { \ - __u64 :64; \ - } __attribute__((aligned(8))) - -+/* The enum used in skb->tstamp_type. It specifies the clock type -+ * of the time stored in the skb->tstamp. -+ */ - enum { -- BPF_SKB_TSTAMP_UNSPEC, -- BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ -- /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, -- * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC -- * and try to deduce it by ingress, egress or skb->sk->sk_clockid. -+ BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ -+ BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ -+ BPF_SKB_CLOCK_REALTIME = 0, -+ BPF_SKB_CLOCK_MONOTONIC = 1, -+ BPF_SKB_CLOCK_TAI = 2, -+ /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, -+ * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. - */ - }; - -diff --git a/include/uapi/linux/sched.h b/include/uapi/linux/sched.h -index 3bac0a8ceab2..359a14cc76a4 100644 ---- a/include/uapi/linux/sched.h -+++ b/include/uapi/linux/sched.h -@@ -118,6 +118,7 @@ struct clone_args { - /* SCHED_ISO: reserved but not implemented yet */ - #define SCHED_IDLE 5 - #define SCHED_DEADLINE 6 -+#define SCHED_EXT 7 - - /* Can be ORed in to make sure the process is reverted back to SCHED_NORMAL on fork */ - #define SCHED_RESET_ON_FORK 0x40000000 -diff --git a/init/Kconfig b/init/Kconfig -index 44616ffe0af5..c4f6fc369754 100644 ---- a/init/Kconfig -+++ b/init/Kconfig -@@ -1041,6 +1041,11 @@ config RT_GROUP_SCHED - realtime bandwidth for them. - See Documentation/scheduler/sched-rt-group.rst for more information. 
- -+config EXT_GROUP_SCHED -+ bool -+ depends on SCHED_CLASS_EXT && CGROUP_SCHED -+ default y -+ - endif #CGROUP_SCHED - - config SCHED_MM_CID -diff --git a/init/init_task.c b/init/init_task.c -index eeb110c65fe2..5726b3a0eea9 100644 ---- a/init/init_task.c -+++ b/init/init_task.c -@@ -6,6 +6,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -98,6 +99,17 @@ struct task_struct init_task __aligned(L1_CACHE_BYTES) = { - #endif - #ifdef CONFIG_CGROUP_SCHED - .sched_task_group = &root_task_group, -+#endif -+#ifdef CONFIG_SCHED_CLASS_EXT -+ .scx = { -+ .dsq_node.list = LIST_HEAD_INIT(init_task.scx.dsq_node.list), -+ .sticky_cpu = -1, -+ .holding_cpu = -1, -+ .runnable_node = LIST_HEAD_INIT(init_task.scx.runnable_node), -+ .runnable_at = INITIAL_JIFFIES, -+ .ddsp_dsq_id = SCX_DSQ_INVALID, -+ .slice = SCX_SLICE_DFL, -+ }, - #endif - .ptraced = LIST_HEAD_INIT(init_task.ptraced), - .ptrace_entry = LIST_HEAD_INIT(init_task.ptrace_entry), -diff --git a/kernel/Kconfig.preempt b/kernel/Kconfig.preempt -index c2f1fd95a821..bae49b743834 100644 ---- a/kernel/Kconfig.preempt -+++ b/kernel/Kconfig.preempt -@@ -133,4 +133,26 @@ config SCHED_CORE - which is the likely usage by Linux distributions, there should - be no measurable impact on performance. - -- -+config SCHED_CLASS_EXT -+ bool "Extensible Scheduling Class" -+ depends on BPF_SYSCALL && BPF_JIT -+ help -+ This option enables a new scheduler class sched_ext (SCX), which -+ allows scheduling policies to be implemented as BPF programs to -+ achieve the following: -+ -+ - Ease of experimentation and exploration: Enabling rapid -+ iteration of new scheduling policies. -+ - Customization: Building application-specific schedulers which -+ implement policies that are not applicable to general-purpose -+ schedulers. -+ - Rapid scheduler deployments: Non-disruptive swap outs of -+ scheduling policies in production environments. -+ -+ sched_ext leverages BPF’s struct_ops feature to define a structure -+ which exports function callbacks and flags to BPF programs that -+ wish to implement scheduling policies. The struct_ops structure -+ exported by sched_ext is struct sched_ext_ops, and is conceptually -+ similar to struct sched_class. -+ -+ See Documentation/scheduler/sched-ext.rst for more details. 
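[Editor's note, not part of the diff above: the SCHED_CLASS_EXT help text describes sched_ext_ops as a struct_ops structure filled in by a BPF program. As a rough illustration of that interface, a minimal global-FIFO scheduler in the style of the scx examples might look like the sketch below. It assumes the <scx/common.bpf.h> helpers shipped under tools/sched_ext/ and the scx_bpf_dispatch()/SCX_DSQ_GLOBAL/SCX_SLICE_DFL names referenced elsewhere in this patch; treat the header path and BPF_STRUCT_OPS macro as assumptions.]

    #include <scx/common.bpf.h>

    char _license[] SEC("license") = "GPL";

    /* Enqueue every runnable task on the shared global DSQ with the default slice. */
    void BPF_STRUCT_OPS(minimal_enqueue, struct task_struct *p, u64 enq_flags)
    {
            scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags);
    }

    SEC(".struct_ops.link")
    struct sched_ext_ops minimal_ops = {
            .enqueue = (void *)minimal_enqueue,
            .name    = "minimal",
    };

[Attaching such a program through a struct_ops link is what exercises the reg()/unreg() link callbacks added to kernel/bpf/bpf_struct_ops.c further down in this patch.]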
-diff --git a/kernel/bpf/bpf_local_storage.c b/kernel/bpf/bpf_local_storage.c -index 976cb258a0ed..c938dea5ddbf 100644 ---- a/kernel/bpf/bpf_local_storage.c -+++ b/kernel/bpf/bpf_local_storage.c -@@ -782,8 +782,8 @@ bpf_local_storage_map_alloc(union bpf_attr *attr, - nbuckets = max_t(u32, 2, nbuckets); - smap->bucket_log = ilog2(nbuckets); - -- smap->buckets = bpf_map_kvcalloc(&smap->map, sizeof(*smap->buckets), -- nbuckets, GFP_USER | __GFP_NOWARN); -+ smap->buckets = bpf_map_kvcalloc(&smap->map, nbuckets, -+ sizeof(*smap->buckets), GFP_USER | __GFP_NOWARN); - if (!smap->buckets) { - err = -ENOMEM; - goto free_smap; -diff --git a/kernel/bpf/bpf_struct_ops.c b/kernel/bpf/bpf_struct_ops.c -index 86c7884abaf8..a2cf31b14be4 100644 ---- a/kernel/bpf/bpf_struct_ops.c -+++ b/kernel/bpf/bpf_struct_ops.c -@@ -12,6 +12,7 @@ - #include - #include - #include -+#include - - struct bpf_struct_ops_value { - struct bpf_struct_ops_common_value common; -@@ -56,6 +57,7 @@ struct bpf_struct_ops_map { - struct bpf_struct_ops_link { - struct bpf_link link; - struct bpf_map __rcu *map; -+ wait_queue_head_t wait_hup; - }; - - static DEFINE_MUTEX(update_mutex); -@@ -757,7 +759,7 @@ static long bpf_struct_ops_map_update_elem(struct bpf_map *map, void *key, - goto unlock; - } - -- err = st_ops->reg(kdata); -+ err = st_ops->reg(kdata, NULL); - if (likely(!err)) { - /* This refcnt increment on the map here after - * 'st_ops->reg()' is secure since the state of the -@@ -805,7 +807,7 @@ static long bpf_struct_ops_map_delete_elem(struct bpf_map *map, void *key) - BPF_STRUCT_OPS_STATE_TOBEFREE); - switch (prev_state) { - case BPF_STRUCT_OPS_STATE_INUSE: -- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); -+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, NULL); - bpf_map_put(map); - return 0; - case BPF_STRUCT_OPS_STATE_TOBEFREE: -@@ -1057,10 +1059,7 @@ static void bpf_struct_ops_map_link_dealloc(struct bpf_link *link) - st_map = (struct bpf_struct_ops_map *) - rcu_dereference_protected(st_link->map, true); - if (st_map) { -- /* st_link->map can be NULL if -- * bpf_struct_ops_link_create() fails to register. 
-- */ -- st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data); -+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); - bpf_map_put(&st_map->map); - } - kfree(st_link); -@@ -1075,7 +1074,8 @@ static void bpf_struct_ops_map_link_show_fdinfo(const struct bpf_link *link, - st_link = container_of(link, struct bpf_struct_ops_link, link); - rcu_read_lock(); - map = rcu_dereference(st_link->map); -- seq_printf(seq, "map_id:\t%d\n", map->id); -+ if (map) -+ seq_printf(seq, "map_id:\t%d\n", map->id); - rcu_read_unlock(); - } - -@@ -1088,7 +1088,8 @@ static int bpf_struct_ops_map_link_fill_link_info(const struct bpf_link *link, - st_link = container_of(link, struct bpf_struct_ops_link, link); - rcu_read_lock(); - map = rcu_dereference(st_link->map); -- info->struct_ops.map_id = map->id; -+ if (map) -+ info->struct_ops.map_id = map->id; - rcu_read_unlock(); - return 0; - } -@@ -1113,6 +1114,10 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map - mutex_lock(&update_mutex); - - old_map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); -+ if (!old_map) { -+ err = -ENOLINK; -+ goto err_out; -+ } - if (expected_old_map && old_map != expected_old_map) { - err = -EPERM; - goto err_out; -@@ -1125,7 +1130,7 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map - goto err_out; - } - -- err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data); -+ err = st_map->st_ops_desc->st_ops->update(st_map->kvalue.data, old_st_map->kvalue.data, link); - if (err) - goto err_out; - -@@ -1139,11 +1144,53 @@ static int bpf_struct_ops_map_link_update(struct bpf_link *link, struct bpf_map - return err; - } - -+static int bpf_struct_ops_map_link_detach(struct bpf_link *link) -+{ -+ struct bpf_struct_ops_link *st_link = container_of(link, struct bpf_struct_ops_link, link); -+ struct bpf_struct_ops_map *st_map; -+ struct bpf_map *map; -+ -+ mutex_lock(&update_mutex); -+ -+ map = rcu_dereference_protected(st_link->map, lockdep_is_held(&update_mutex)); -+ if (!map) { -+ mutex_unlock(&update_mutex); -+ return 0; -+ } -+ st_map = container_of(map, struct bpf_struct_ops_map, map); -+ -+ st_map->st_ops_desc->st_ops->unreg(&st_map->kvalue.data, link); -+ -+ RCU_INIT_POINTER(st_link->map, NULL); -+ /* Pair with bpf_map_get() in bpf_struct_ops_link_create() or -+ * bpf_map_inc() in bpf_struct_ops_map_link_update(). -+ */ -+ bpf_map_put(&st_map->map); -+ -+ mutex_unlock(&update_mutex); -+ -+ wake_up_interruptible_poll(&st_link->wait_hup, EPOLLHUP); -+ -+ return 0; -+} -+ -+static __poll_t bpf_struct_ops_map_link_poll(struct file *file, -+ struct poll_table_struct *pts) -+{ -+ struct bpf_struct_ops_link *st_link = file->private_data; -+ -+ poll_wait(file, &st_link->wait_hup, pts); -+ -+ return rcu_access_pointer(st_link->map) ? 
0 : EPOLLHUP; -+} -+ - static const struct bpf_link_ops bpf_struct_ops_map_lops = { - .dealloc = bpf_struct_ops_map_link_dealloc, -+ .detach = bpf_struct_ops_map_link_detach, - .show_fdinfo = bpf_struct_ops_map_link_show_fdinfo, - .fill_link_info = bpf_struct_ops_map_link_fill_link_info, - .update_map = bpf_struct_ops_map_link_update, -+ .poll = bpf_struct_ops_map_link_poll, - }; - - int bpf_struct_ops_link_create(union bpf_attr *attr) -@@ -1176,13 +1223,21 @@ int bpf_struct_ops_link_create(union bpf_attr *attr) - if (err) - goto err_out; - -- err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data); -+ init_waitqueue_head(&link->wait_hup); -+ -+ /* Hold the update_mutex such that the subsystem cannot -+ * do link->ops->detach() before the link is fully initialized. -+ */ -+ mutex_lock(&update_mutex); -+ err = st_map->st_ops_desc->st_ops->reg(st_map->kvalue.data, &link->link); - if (err) { -+ mutex_unlock(&update_mutex); - bpf_link_cleanup(&link_primer); - link = NULL; - goto err_out; - } - RCU_INIT_POINTER(link->map, map); -+ mutex_unlock(&update_mutex); - - return bpf_link_settle(&link_primer); - -diff --git a/kernel/bpf/helpers.c b/kernel/bpf/helpers.c -index 2a69a9a36c0f..6f1abcb4b084 100644 ---- a/kernel/bpf/helpers.c -+++ b/kernel/bpf/helpers.c -@@ -2744,6 +2744,122 @@ __bpf_kfunc void bpf_preempt_enable(void) - preempt_enable(); - } - -+struct bpf_iter_bits { -+ __u64 __opaque[2]; -+} __aligned(8); -+ -+struct bpf_iter_bits_kern { -+ union { -+ unsigned long *bits; -+ unsigned long bits_copy; -+ }; -+ u32 nr_bits; -+ int bit; -+} __aligned(8); -+ -+/** -+ * bpf_iter_bits_new() - Initialize a new bits iterator for a given memory area -+ * @it: The new bpf_iter_bits to be created -+ * @unsafe_ptr__ign: A pointer pointing to a memory area to be iterated over -+ * @nr_words: The size of the specified memory area, measured in 8-byte units. -+ * Due to the limitation of memalloc, it can't be greater than 512. -+ * -+ * This function initializes a new bpf_iter_bits structure for iterating over -+ * a memory area which is specified by the @unsafe_ptr__ign and @nr_words. It -+ * copies the data of the memory area to the newly created bpf_iter_bits @it for -+ * subsequent iteration operations. -+ * -+ * On success, 0 is returned. On failure, ERR is returned. 
-+ */ -+__bpf_kfunc int -+bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, u32 nr_words) -+{ -+ struct bpf_iter_bits_kern *kit = (void *)it; -+ u32 nr_bytes = nr_words * sizeof(u64); -+ u32 nr_bits = BYTES_TO_BITS(nr_bytes); -+ int err; -+ -+ BUILD_BUG_ON(sizeof(struct bpf_iter_bits_kern) != sizeof(struct bpf_iter_bits)); -+ BUILD_BUG_ON(__alignof__(struct bpf_iter_bits_kern) != -+ __alignof__(struct bpf_iter_bits)); -+ -+ kit->nr_bits = 0; -+ kit->bits_copy = 0; -+ kit->bit = -1; -+ -+ if (!unsafe_ptr__ign || !nr_words) -+ return -EINVAL; -+ -+ /* Optimization for u64 mask */ -+ if (nr_bits == 64) { -+ err = bpf_probe_read_kernel_common(&kit->bits_copy, nr_bytes, unsafe_ptr__ign); -+ if (err) -+ return -EFAULT; -+ -+ kit->nr_bits = nr_bits; -+ return 0; -+ } -+ -+ /* Fallback to memalloc */ -+ kit->bits = bpf_mem_alloc(&bpf_global_ma, nr_bytes); -+ if (!kit->bits) -+ return -ENOMEM; -+ -+ err = bpf_probe_read_kernel_common(kit->bits, nr_bytes, unsafe_ptr__ign); -+ if (err) { -+ bpf_mem_free(&bpf_global_ma, kit->bits); -+ return err; -+ } -+ -+ kit->nr_bits = nr_bits; -+ return 0; -+} -+ -+/** -+ * bpf_iter_bits_next() - Get the next bit in a bpf_iter_bits -+ * @it: The bpf_iter_bits to be checked -+ * -+ * This function returns a pointer to a number representing the value of the -+ * next bit in the bits. -+ * -+ * If there are no further bits available, it returns NULL. -+ */ -+__bpf_kfunc int *bpf_iter_bits_next(struct bpf_iter_bits *it) -+{ -+ struct bpf_iter_bits_kern *kit = (void *)it; -+ u32 nr_bits = kit->nr_bits; -+ const unsigned long *bits; -+ int bit; -+ -+ if (nr_bits == 0) -+ return NULL; -+ -+ bits = nr_bits == 64 ? &kit->bits_copy : kit->bits; -+ bit = find_next_bit(bits, nr_bits, kit->bit + 1); -+ if (bit >= nr_bits) { -+ kit->nr_bits = 0; -+ return NULL; -+ } -+ -+ kit->bit = bit; -+ return &kit->bit; -+} -+ -+/** -+ * bpf_iter_bits_destroy() - Destroy a bpf_iter_bits -+ * @it: The bpf_iter_bits to be destroyed -+ * -+ * Destroy the resource associated with the bpf_iter_bits. 
-+ */ -+__bpf_kfunc void bpf_iter_bits_destroy(struct bpf_iter_bits *it) -+{ -+ struct bpf_iter_bits_kern *kit = (void *)it; -+ -+ if (kit->nr_bits <= 64) -+ return; -+ bpf_mem_free(&bpf_global_ma, kit->bits); -+} -+ - __bpf_kfunc_end_defs(); - - BTF_KFUNCS_START(generic_btf_ids) -@@ -2826,6 +2942,9 @@ BTF_ID_FLAGS(func, bpf_wq_set_callback_impl) - BTF_ID_FLAGS(func, bpf_wq_start) - BTF_ID_FLAGS(func, bpf_preempt_disable) - BTF_ID_FLAGS(func, bpf_preempt_enable) -+BTF_ID_FLAGS(func, bpf_iter_bits_new, KF_ITER_NEW) -+BTF_ID_FLAGS(func, bpf_iter_bits_next, KF_ITER_NEXT | KF_RET_NULL) -+BTF_ID_FLAGS(func, bpf_iter_bits_destroy, KF_ITER_DESTROY) - BTF_KFUNCS_END(common_btf_ids) - - static const struct btf_kfunc_id_set common_kfunc_set = { -diff --git a/kernel/bpf/syscall.c b/kernel/bpf/syscall.c -index 2222c3ff88e7..5070fa20d05c 100644 ---- a/kernel/bpf/syscall.c -+++ b/kernel/bpf/syscall.c -@@ -3150,6 +3150,13 @@ static void bpf_link_show_fdinfo(struct seq_file *m, struct file *filp) - } - #endif - -+static __poll_t bpf_link_poll(struct file *file, struct poll_table_struct *pts) -+{ -+ struct bpf_link *link = file->private_data; -+ -+ return link->ops->poll(file, pts); -+} -+ - static const struct file_operations bpf_link_fops = { - #ifdef CONFIG_PROC_FS - .show_fdinfo = bpf_link_show_fdinfo, -@@ -3159,6 +3166,16 @@ static const struct file_operations bpf_link_fops = { - .write = bpf_dummy_write, - }; - -+static const struct file_operations bpf_link_fops_poll = { -+#ifdef CONFIG_PROC_FS -+ .show_fdinfo = bpf_link_show_fdinfo, -+#endif -+ .release = bpf_link_release, -+ .read = bpf_dummy_read, -+ .write = bpf_dummy_write, -+ .poll = bpf_link_poll, -+}; -+ - static int bpf_link_alloc_id(struct bpf_link *link) - { - int id; -@@ -3201,7 +3218,9 @@ int bpf_link_prime(struct bpf_link *link, struct bpf_link_primer *primer) - return id; - } - -- file = anon_inode_getfile("bpf_link", &bpf_link_fops, link, O_CLOEXEC); -+ file = anon_inode_getfile("bpf_link", -+ link->ops->poll ? &bpf_link_fops_poll : &bpf_link_fops, -+ link, O_CLOEXEC); - if (IS_ERR(file)) { - bpf_link_free_id(id); - put_unused_fd(fd); -@@ -3229,7 +3248,9 @@ int bpf_link_settle(struct bpf_link_primer *primer) - - int bpf_link_new_fd(struct bpf_link *link) - { -- return anon_inode_getfd("bpf-link", &bpf_link_fops, link, O_CLOEXEC); -+ return anon_inode_getfd("bpf-link", -+ link->ops->poll ? 
&bpf_link_fops_poll : &bpf_link_fops, -+ link, O_CLOEXEC); - } - - struct bpf_link *bpf_link_get_from_fd(u32 ufd) -@@ -3239,7 +3260,7 @@ struct bpf_link *bpf_link_get_from_fd(u32 ufd) - - if (!f.file) - return ERR_PTR(-EBADF); -- if (f.file->f_op != &bpf_link_fops) { -+ if (f.file->f_op != &bpf_link_fops && f.file->f_op != &bpf_link_fops_poll) { - fdput(f); - return ERR_PTR(-EINVAL); - } -@@ -4971,7 +4992,7 @@ static int bpf_obj_get_info_by_fd(const union bpf_attr *attr, - uattr); - else if (f.file->f_op == &btf_fops) - err = bpf_btf_get_info_by_fd(f.file, f.file->private_data, attr, uattr); -- else if (f.file->f_op == &bpf_link_fops) -+ else if (f.file->f_op == &bpf_link_fops || f.file->f_op == &bpf_link_fops_poll) - err = bpf_link_get_info_by_fd(f.file, f.file->private_data, - attr, uattr); - else -@@ -5106,7 +5127,7 @@ static int bpf_task_fd_query(const union bpf_attr *attr, - if (!file) - return -EBADF; - -- if (file->f_op == &bpf_link_fops) { -+ if (file->f_op == &bpf_link_fops || file->f_op == &bpf_link_fops_poll) { - struct bpf_link *link = file->private_data; - - if (link->ops == &bpf_raw_tp_link_lops) { -@@ -5416,10 +5437,11 @@ static int link_detach(union bpf_attr *attr) - return ret; - } - --static struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) -+struct bpf_link *bpf_link_inc_not_zero(struct bpf_link *link) - { - return atomic64_fetch_add_unless(&link->refcnt, 1, 0) ? link : ERR_PTR(-ENOENT); - } -+EXPORT_SYMBOL(bpf_link_inc_not_zero); - - struct bpf_link *bpf_link_by_id(u32 id) - { -diff --git a/kernel/cgroup/cgroup.c b/kernel/cgroup/cgroup.c -index e32b6972c478..47dcf14b33c8 100644 ---- a/kernel/cgroup/cgroup.c -+++ b/kernel/cgroup/cgroup.c -@@ -4206,10 +4206,13 @@ static int cgroup_add_file(struct cgroup_subsys_state *css, struct cgroup *cgrp, - if (IS_ERR(kn)) - return PTR_ERR(kn); - -+ kernfs_show(kn, !(cft->flags & CFTYPE_HIDDEN)); -+ - if (cft->file_offset) { - struct cgroup_file *cfile = (void *)css + cft->file_offset; - - timer_setup(&cfile->notify_timer, cgroup_file_notify_timer, 0); -+ cfile->cft = cft; - - spin_lock_irq(&cgroup_file_kn_lock); - cfile->kn = kn; -@@ -4485,6 +4488,24 @@ void cgroup_file_notify(struct cgroup_file *cfile) - spin_unlock_irqrestore(&cgroup_file_kn_lock, flags); - } - -+static struct kernfs_node *cfile_kn_get(struct cgroup_file *cfile) -+{ -+ struct kernfs_node *kn; -+ -+ spin_lock_irq(&cgroup_file_kn_lock); -+ kn = cfile->kn; -+ kernfs_get(kn); -+ spin_unlock_irq(&cgroup_file_kn_lock); -+ -+ return kn; -+} -+ -+static bool cfile_visible(struct cgroup_file *cfile) -+{ -+ return !(cfile->cft->flags & CFTYPE_HIDDEN) && -+ !(cfile->flags & CFILE_HIDDEN); -+} -+ - /** - * cgroup_file_show - show or hide a hidden cgroup file - * @cfile: target cgroup_file obtained by setting cftype->file_offset -@@ -4494,15 +4515,20 @@ void cgroup_file_show(struct cgroup_file *cfile, bool show) - { - struct kernfs_node *kn; - -- spin_lock_irq(&cgroup_file_kn_lock); -- kn = cfile->kn; -- kernfs_get(kn); -- spin_unlock_irq(&cgroup_file_kn_lock); -+ mutex_lock(&cgroup_mutex); - -- if (kn) -- kernfs_show(kn, show); -+ if (show) -+ cfile->flags &= ~CFILE_HIDDEN; -+ else -+ cfile->flags |= CFILE_HIDDEN; - -- kernfs_put(kn); -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ -+ mutex_unlock(&cgroup_mutex); - } - - /** -@@ -5527,6 +5553,63 @@ static void offline_css(struct cgroup_subsys_state *css) - wake_up_all(&css->cgroup->offline_waitq); - } - -+/** -+ * cgroup_show_cftype - show or hide a 
cgroup file type -+ * @cft: cftype to show or hide -+ * @show: whether to show or hide -+ * -+ * Sets %CFTYPE_HIDDEN and shows/hides the matching files according to @show. -+ * @cft may or may not be added at the time of this call. After hiding, it's -+ * guaranteed that there are no in-flight operations on the hidden files. -+ */ -+void cgroup_show_cftype(struct cftype *cft, bool show) -+{ -+ struct cgroup_subsys *ss = cft->ss; -+ struct cgroup *root = ss ? &ss->root->cgrp : &cgrp_dfl_root.cgrp; -+ struct cgroup_subsys_state *css; -+ -+ mutex_lock(&cgroup_mutex); -+ -+ if (show) -+ cft->flags &= ~CFTYPE_HIDDEN; -+ else -+ cft->flags |= CFTYPE_HIDDEN; -+ -+ if (!(cft->flags & __CFTYPE_ADDED)) -+ goto out_unlock; -+ -+ css_for_each_descendant_pre(css, cgroup_css(root, ss)) { -+ struct cgroup *cgrp = css->cgroup; -+ struct kernfs_node *kn; -+ -+ if (!(css->flags & CSS_VISIBLE)) -+ continue; -+ -+ if (cft->file_offset) { -+ struct cgroup_file *cfile = -+ (void *)css + cft->file_offset; -+ -+ kn = cfile_kn_get(cfile); -+ if (kn) { -+ kernfs_show(kn, cfile_visible(cfile)); -+ kernfs_put(kn); -+ } -+ } else { -+ char buf[CGROUP_FILE_NAME_MAX]; -+ -+ kn = kernfs_find_and_get(cgrp->kn, -+ cgroup_file_name(cgrp, cft, buf)); -+ if (kn) { -+ kernfs_show(kn, show); -+ kernfs_put(kn); -+ } -+ } -+ } -+ -+out_unlock: -+ mutex_unlock(&cgroup_mutex); -+} -+ - /** - * css_create - create a cgroup_subsys_state - * @cgrp: the cgroup new css will be associated with -diff --git a/kernel/fork.c b/kernel/fork.c -index 18750b83c564..d973d23b3768 100644 ---- a/kernel/fork.c -+++ b/kernel/fork.c -@@ -23,6 +23,7 @@ - #include - #include - #include -+#include - #include - #include - #include -@@ -975,6 +976,7 @@ void __put_task_struct(struct task_struct *tsk) - WARN_ON(refcount_read(&tsk->usage)); - WARN_ON(tsk == current); - -+ sched_ext_free(tsk); - io_uring_free(tsk); - cgroup_free(tsk); - task_numa_free(tsk, true); -@@ -2371,7 +2373,7 @@ __latent_entropy struct task_struct *copy_process( - - retval = perf_event_init_task(p, clone_flags); - if (retval) -- goto bad_fork_cleanup_policy; -+ goto bad_fork_sched_cancel_fork; - retval = audit_alloc(p); - if (retval) - goto bad_fork_cleanup_perf; -@@ -2504,7 +2506,9 @@ __latent_entropy struct task_struct *copy_process( - * cgroup specific, it unconditionally needs to place the task on a - * runqueue. - */ -- sched_cgroup_fork(p, args); -+ retval = sched_cgroup_fork(p, args); -+ if (retval) -+ goto bad_fork_cancel_cgroup; - - /* - * From this point on we must avoid any synchronous user-space -@@ -2550,13 +2554,13 @@ __latent_entropy struct task_struct *copy_process( - /* Don't start children in a dying pid namespace */ - if (unlikely(!(ns_of_pid(pid)->pid_allocated & PIDNS_ADDING))) { - retval = -ENOMEM; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* Let kill terminate clone/fork in the middle */ - if (fatal_signal_pending(current)) { - retval = -EINTR; -- goto bad_fork_cancel_cgroup; -+ goto bad_fork_core_free; - } - - /* No more failure paths after this point. 
*/ -@@ -2630,10 +2634,11 @@ __latent_entropy struct task_struct *copy_process( - - return p; - --bad_fork_cancel_cgroup: -+bad_fork_core_free: - sched_core_free(p); - spin_unlock(¤t->sighand->siglock); - write_unlock_irq(&tasklist_lock); -+bad_fork_cancel_cgroup: - cgroup_cancel_fork(p, args); - bad_fork_put_pidfd: - if (clone_flags & CLONE_PIDFD) { -@@ -2672,6 +2677,8 @@ __latent_entropy struct task_struct *copy_process( - audit_free(p); - bad_fork_cleanup_perf: - perf_event_free_task(p); -+bad_fork_sched_cancel_fork: -+ sched_cancel_fork(p); - bad_fork_cleanup_policy: - lockdep_free_task(p); - #ifdef CONFIG_NUMA -diff --git a/kernel/sched/build_policy.c b/kernel/sched/build_policy.c -index d9dc9ab3773f..0de5477f876e 100644 ---- a/kernel/sched/build_policy.c -+++ b/kernel/sched/build_policy.c -@@ -21,13 +21,19 @@ - - #include - #include -+#include - #include -+#include - #include -+#include -+#include - #include - #include - #include - #include - #include -+#include -+#include - - #include - -@@ -52,3 +58,6 @@ - #include "cputime.c" - #include "deadline.c" - -+#ifdef CONFIG_SCHED_CLASS_EXT -+# include "ext.c" -+#endif -diff --git a/kernel/sched/core.c b/kernel/sched/core.c -index bcf2c4cc0522..6161dd1928d4 100644 ---- a/kernel/sched/core.c -+++ b/kernel/sched/core.c -@@ -168,7 +168,10 @@ static inline int __task_prio(const struct task_struct *p) - if (p->sched_class == &idle_sched_class) - return MAX_RT_PRIO + NICE_WIDTH; /* 140 */ - -- return MAX_RT_PRIO + MAX_NICE; /* 120, squash fair */ -+ if (task_on_scx(p)) -+ return MAX_RT_PRIO + MAX_NICE + 1; /* 120, squash ext */ -+ -+ return MAX_RT_PRIO + MAX_NICE; /* 119, squash fair */ - } - - /* -@@ -197,6 +200,11 @@ static inline bool prio_less(const struct task_struct *a, - if (pa == MAX_RT_PRIO + MAX_NICE) /* fair */ - return cfs_prio_less(a, b, in_fi); - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (pa == MAX_RT_PRIO + MAX_NICE + 1) /* ext */ -+ return scx_prio_less(a, b, in_fi); -+#endif -+ - return false; - } - -@@ -1255,11 +1263,14 @@ bool sched_can_stop_tick(struct rq *rq) - return true; - - /* -- * If there are no DL,RR/FIFO tasks, there must only be CFS tasks left; -- * if there's more than one we need the tick for involuntary -- * preemption. -+ * If there are no DL,RR/FIFO tasks, there must only be CFS or SCX tasks -+ * left. For CFS, if there's more than one we need the tick for -+ * involuntary preemption. For SCX, ask. - */ -- if (rq->nr_running > 1) -+ if (!scx_switched_all() && rq->nr_running > 1) -+ return false; -+ -+ if (scx_enabled() && !scx_can_stop_tick(rq)) - return false; - - /* -@@ -1342,8 +1353,8 @@ static void set_load_weight(struct task_struct *p, bool update_load) - * SCHED_OTHER tasks have to update their load when changing their - * weight - */ -- if (update_load && p->sched_class == &fair_sched_class) { -- reweight_task(p, prio); -+ if (update_load && p->sched_class->reweight_task) { -+ p->sched_class->reweight_task(task_rq(p), p, prio); - } else { - load->weight = scale_load(sched_prio_to_weight[prio]); - load->inv_weight = sched_prio_to_wmult[prio]; -@@ -2214,6 +2225,17 @@ inline int task_curr(const struct task_struct *p) - return cpu_curr(task_cpu(p)) == p; - } - -+/* -+ * ->switching_to() is called with the pi_lock and rq_lock held and must not -+ * mess with locking. 
-+ */ -+void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class) -+{ -+ if (prev_class != p->sched_class && p->sched_class->switching_to) -+ p->sched_class->switching_to(rq, p); -+} -+ - /* - * switched_from, switched_to and prio_changed must _NOT_ drop rq->lock, - * use the balance_callback list if you want balancing. -@@ -2221,9 +2243,9 @@ inline int task_curr(const struct task_struct *p) - * this means any call to check_class_changed() must be followed by a call to - * balance_callback(). - */ --static inline void check_class_changed(struct rq *rq, struct task_struct *p, -- const struct sched_class *prev_class, -- int oldprio) -+void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio) - { - if (prev_class != p->sched_class) { - if (prev_class->switched_from) -@@ -3986,6 +4008,15 @@ bool cpus_share_resources(int this_cpu, int that_cpu) - - static inline bool ttwu_queue_cond(struct task_struct *p, int cpu) - { -+ /* -+ * The BPF scheduler may depend on select_task_rq() being invoked during -+ * wakeups. In addition, @p may end up executing on a different CPU -+ * regardless of what happens in the wakeup path making the ttwu_queue -+ * optimization less meaningful. Skip if on SCX. -+ */ -+ if (task_on_scx(p)) -+ return false; -+ - /* - * Do not complicate things with the async wake_list while the CPU is - * in hotplug state. -@@ -4553,6 +4584,10 @@ static void __sched_fork(unsigned long clone_flags, struct task_struct *p) - p->rt.on_rq = 0; - p->rt.on_list = 0; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+ init_scx_entity(&p->scx); -+#endif -+ - #ifdef CONFIG_PREEMPT_NOTIFIERS - INIT_HLIST_HEAD(&p->preempt_notifiers); - #endif -@@ -4755,6 +4790,8 @@ late_initcall(sched_core_sysctl_init); - */ - int sched_fork(unsigned long clone_flags, struct task_struct *p) - { -+ int ret; -+ - __sched_fork(clone_flags, p); - /* - * We mark the process as NEW here. 
This guarantees that -@@ -4791,12 +4828,20 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - p->sched_reset_on_fork = 0; - } - -- if (dl_prio(p->prio)) -- return -EAGAIN; -- else if (rt_prio(p->prio)) -+ scx_pre_fork(p); -+ -+ if (dl_prio(p->prio)) { -+ ret = -EAGAIN; -+ goto out_cancel; -+ } else if (rt_prio(p->prio)) { - p->sched_class = &rt_sched_class; -- else -+#ifdef CONFIG_SCHED_CLASS_EXT -+ } else if (task_should_scx(p)) { -+ p->sched_class = &ext_sched_class; -+#endif -+ } else { - p->sched_class = &fair_sched_class; -+ } - - init_entity_runnable_average(&p->se); - -@@ -4814,9 +4859,13 @@ int sched_fork(unsigned long clone_flags, struct task_struct *p) - RB_CLEAR_NODE(&p->pushable_dl_tasks); - #endif - return 0; -+ -+out_cancel: -+ scx_cancel_fork(p); -+ return ret; - } - --void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) -+int sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - { - unsigned long flags; - -@@ -4843,11 +4892,19 @@ void sched_cgroup_fork(struct task_struct *p, struct kernel_clone_args *kargs) - if (p->sched_class->task_fork) - p->sched_class->task_fork(p); - raw_spin_unlock_irqrestore(&p->pi_lock, flags); -+ -+ return scx_fork(p); -+} -+ -+void sched_cancel_fork(struct task_struct *p) -+{ -+ scx_cancel_fork(p); - } - - void sched_post_fork(struct task_struct *p) - { - uclamp_post_fork(p); -+ scx_post_fork(p); - } - - unsigned long to_ratio(u64 period, u64 runtime) -@@ -5686,6 +5743,7 @@ void sched_tick(void) - calc_global_load_tick(rq); - sched_core_tick(rq); - task_tick_mm_cid(rq, curr); -+ scx_tick(rq); - - rq_unlock(rq, &rf); - -@@ -5698,8 +5756,10 @@ void sched_tick(void) - wq_worker_tick(curr); - - #ifdef CONFIG_SMP -- rq->idle_balance = idle_cpu(cpu); -- sched_balance_trigger(rq); -+ if (!scx_switched_all()) { -+ rq->idle_balance = idle_cpu(cpu); -+ sched_balance_trigger(rq); -+ } - #endif - } - -@@ -5999,7 +6059,7 @@ static void put_prev_task_balance(struct rq *rq, struct task_struct *prev, - * We can terminate the balance pass as soon as we know there is - * a runnable task of @class priority or higher. - */ -- for_class_range(class, prev->sched_class, &idle_sched_class) { -+ for_balance_class_range(class, prev->sched_class, &idle_sched_class) { - if (class->balance(rq, prev, rf)) - break; - } -@@ -6017,6 +6077,9 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - const struct sched_class *class; - struct task_struct *p; - -+ if (scx_enabled()) -+ goto restart; -+ - /* - * Optimization: we know that if all tasks are in the fair class we can - * call that function directly, but only if the @prev task wasn't of a -@@ -6057,10 +6120,12 @@ __pick_next_task(struct rq *rq, struct task_struct *prev, struct rq_flags *rf) - if (prev->dl_server) - prev->dl_server = NULL; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_next_task(rq); -- if (p) -+ if (p) { -+ scx_next_task_picked(rq, p, class); - return p; -+ } - } - - BUG(); /* The idle class should always have a runnable task. 
*/ -@@ -6090,7 +6155,7 @@ static inline struct task_struct *pick_task(struct rq *rq) - const struct sched_class *class; - struct task_struct *p; - -- for_each_class(class) { -+ for_each_active_class(class) { - p = class->pick_task(rq); - if (p) - return p; -@@ -7080,12 +7145,16 @@ int default_wake_function(wait_queue_entry_t *curr, unsigned mode, int wake_flag - } - EXPORT_SYMBOL(default_wake_function); - --static void __setscheduler_prio(struct task_struct *p, int prio) -+void __setscheduler_prio(struct task_struct *p, int prio) - { - if (dl_prio(prio)) - p->sched_class = &dl_sched_class; - else if (rt_prio(prio)) - p->sched_class = &rt_sched_class; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ else if (task_should_scx(p)) -+ p->sched_class = &ext_sched_class; -+#endif - else - p->sched_class = &fair_sched_class; - -@@ -7246,6 +7315,7 @@ void rt_mutex_setprio(struct task_struct *p, struct task_struct *pi_task) - } - - __setscheduler_prio(p, prio); -+ check_class_changing(rq, p, prev_class); - - if (queued) - enqueue_task(rq, p, queue_flag); -@@ -7467,6 +7537,25 @@ int sched_core_idle_cpu(int cpu) - #endif - - #ifdef CONFIG_SMP -+/* -+ * Load avg and utiliztion metrics need to be updated periodically and before -+ * consumption. This function updates the metrics for all subsystems except for -+ * the fair class. @rq must be locked and have its clock updated. -+ */ -+bool update_other_load_avgs(struct rq *rq) -+{ -+ u64 now = rq_clock_pelt(rq); -+ const struct sched_class *curr_class = rq->curr->sched_class; -+ unsigned long hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -+ -+ lockdep_assert_rq_held(rq); -+ -+ return update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -+ update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -+ update_hw_load_avg(now, rq, hw_pressure) | -+ update_irq_load_avg(rq, 0); -+} -+ - /* - * This function computes an effective utilization for the given CPU, to be - * used for frequency selection given the linear relation: f = u * f_max. -@@ -7789,6 +7878,10 @@ static int __sched_setscheduler(struct task_struct *p, - goto unlock; - } - -+ retval = scx_check_setscheduler(p, policy); -+ if (retval) -+ goto unlock; -+ - /* - * If not changing anything there's no need to proceed further, - * but store a possible modification of reset_on_fork. -@@ -7891,6 +7984,7 @@ static int __sched_setscheduler(struct task_struct *p, - __setscheduler_prio(p, newprio); - } - __setscheduler_uclamp(p, attr); -+ check_class_changing(rq, p, prev_class); - - if (queued) { - /* -@@ -9066,6 +9160,7 @@ SYSCALL_DEFINE1(sched_get_priority_max, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - break; - } -@@ -9093,6 +9188,7 @@ SYSCALL_DEFINE1(sched_get_priority_min, int, policy) - case SCHED_NORMAL: - case SCHED_BATCH: - case SCHED_IDLE: -+ case SCHED_EXT: - ret = 0; - } - return ret; -@@ -9188,6 +9284,7 @@ void sched_show_task(struct task_struct *p) - - print_worker_info(KERN_INFO, p); - print_stop_info(KERN_INFO, p); -+ print_scx_info(KERN_INFO, p); - show_stack(p, NULL, KERN_INFO); - put_task_stack(p); - } -@@ -9680,6 +9777,8 @@ int sched_cpu_activate(unsigned int cpu) - cpuset_cpu_active(); - } - -+ scx_rq_activate(rq); -+ - /* - * Put the rq online, if not already. This happens: - * -@@ -9740,6 +9839,8 @@ int sched_cpu_deactivate(unsigned int cpu) - } - rq_unlock_irqrestore(rq, &rf); - -+ scx_rq_deactivate(rq); -+ - #ifdef CONFIG_SCHED_SMT - /* - * When going down, decrement the number of cores with SMT present. 
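The hunks above wire the new SCHED_EXT policy into __setscheduler_prio(), __sched_setscheduler() and the sched_get_priority_{max,min}() syscalls. As a rough illustration of what that plumbing enables, the userspace sketch below opts the calling process into the new policy. The SCHED_EXT value of 7 and the behavior that such tasks simply keep running under the fair class until a BPF scheduler is attached are assumptions taken from the wider sched_ext series, not something these hunks alone guarantee.

#include <errno.h>
#include <sched.h>
#include <stdio.h>
#include <string.h>
#include <unistd.h>

#ifndef SCHED_EXT
#define SCHED_EXT 7	/* assumed UAPI value; verify against the patched linux/sched.h */
#endif

int main(void)
{
	/* SCHED_EXT carries no static priority, same as SCHED_NORMAL. */
	struct sched_param sp = { .sched_priority = 0 };

	/*
	 * __sched_setscheduler() now consults scx_check_setscheduler() (see
	 * the hunk above), so a loaded BPF scheduler may still reject this.
	 */
	if (sched_setscheduler(0, SCHED_EXT, &sp) == -1) {
		fprintf(stderr, "sched_setscheduler(SCHED_EXT): %s\n", strerror(errno));
		return 1;
	}
	printf("pid %d now has policy SCHED_EXT\n", (int)getpid());
	return 0;
}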
-@@ -9923,11 +10024,15 @@ void __init sched_init(void) - int i; - - /* Make sure the linker didn't screw up */ -- BUG_ON(&idle_sched_class != &fair_sched_class + 1 || -- &fair_sched_class != &rt_sched_class + 1 || -- &rt_sched_class != &dl_sched_class + 1); - #ifdef CONFIG_SMP -- BUG_ON(&dl_sched_class != &stop_sched_class + 1); -+ BUG_ON(!sched_class_above(&stop_sched_class, &dl_sched_class)); -+#endif -+ BUG_ON(!sched_class_above(&dl_sched_class, &rt_sched_class)); -+ BUG_ON(!sched_class_above(&rt_sched_class, &fair_sched_class)); -+ BUG_ON(!sched_class_above(&fair_sched_class, &idle_sched_class)); -+#ifdef CONFIG_SCHED_CLASS_EXT -+ BUG_ON(!sched_class_above(&fair_sched_class, &ext_sched_class)); -+ BUG_ON(!sched_class_above(&ext_sched_class, &idle_sched_class)); - #endif - - wait_bit_init(); -@@ -9951,6 +10056,9 @@ void __init sched_init(void) - root_task_group.shares = ROOT_TASK_GROUP_LOAD; - init_cfs_bandwidth(&root_task_group.cfs_bandwidth, NULL); - #endif /* CONFIG_FAIR_GROUP_SCHED */ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ root_task_group.scx_weight = CGROUP_WEIGHT_DFL; -+#endif /* CONFIG_EXT_GROUP_SCHED */ - #ifdef CONFIG_RT_GROUP_SCHED - root_task_group.rt_se = (struct sched_rt_entity **)ptr; - ptr += nr_cpu_ids * sizeof(void **); -@@ -10096,6 +10204,7 @@ void __init sched_init(void) - balance_push_set(smp_processor_id(), false); - #endif - init_sched_fair_class(); -+ init_sched_ext_class(); - - psi_init(); - -@@ -10381,6 +10490,7 @@ struct task_group *sched_create_group(struct task_group *parent) - if (!alloc_rt_sched_group(tg, parent)) - goto err; - -+ scx_group_set_weight(tg, CGROUP_WEIGHT_DFL); - alloc_uclamp_sched_group(tg, parent); - - return tg; -@@ -10508,6 +10618,7 @@ void sched_move_task(struct task_struct *tsk) - put_prev_task(rq, tsk); - - sched_change_group(tsk, group); -+ scx_move_task(tsk); - - if (queued) - enqueue_task(rq, tsk, queue_flags); -@@ -10522,11 +10633,6 @@ void sched_move_task(struct task_struct *tsk) - } - } - --static inline struct task_group *css_tg(struct cgroup_subsys_state *css) --{ -- return css ? 
container_of(css, struct task_group, css) : NULL; --} -- - static struct cgroup_subsys_state * - cpu_cgroup_css_alloc(struct cgroup_subsys_state *parent_css) - { -@@ -10550,6 +10656,11 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); - struct task_group *parent = css_tg(css->parent); -+ int ret; -+ -+ ret = scx_tg_online(tg); -+ if (ret) -+ return ret; - - if (parent) - sched_online_group(tg, parent); -@@ -10564,6 +10675,13 @@ static int cpu_cgroup_css_online(struct cgroup_subsys_state *css) - return 0; - } - -+static void cpu_cgroup_css_offline(struct cgroup_subsys_state *css) -+{ -+ struct task_group *tg = css_tg(css); -+ -+ scx_tg_offline(tg); -+} -+ - static void cpu_cgroup_css_released(struct cgroup_subsys_state *css) - { - struct task_group *tg = css_tg(css); -@@ -10581,9 +10699,10 @@ static void cpu_cgroup_css_free(struct cgroup_subsys_state *css) - sched_unregister_group(tg); - } - --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - { -+#ifdef CONFIG_RT_GROUP_SCHED - struct task_struct *task; - struct cgroup_subsys_state *css; - -@@ -10591,7 +10710,8 @@ static int cpu_cgroup_can_attach(struct cgroup_taskset *tset) - if (!sched_rt_can_attach(css_tg(css), task)) - return -EINVAL; - } -- return 0; -+#endif -+ return scx_cgroup_can_attach(tset); - } - #endif - -@@ -10602,8 +10722,17 @@ static void cpu_cgroup_attach(struct cgroup_taskset *tset) - - cgroup_taskset_for_each(task, css, tset) - sched_move_task(task); -+ -+ scx_cgroup_finish_attach(); - } - -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void cpu_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ scx_cgroup_cancel_attach(tset); -+} -+#endif -+ - #ifdef CONFIG_UCLAMP_TASK_GROUP - static void cpu_util_update_eff(struct cgroup_subsys_state *css) - { -@@ -10782,9 +10911,15 @@ static int cpu_uclamp_max_show(struct seq_file *sf, void *v) - static int cpu_shares_write_u64(struct cgroup_subsys_state *css, - struct cftype *cftype, u64 shareval) - { -+ int ret; -+ - if (shareval > scale_load_down(ULONG_MAX)) - shareval = MAX_SHARES; -- return sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(shareval)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(shareval)); -+ return ret; - } - - static u64 cpu_shares_read_u64(struct cgroup_subsys_state *css, -@@ -11181,7 +11316,7 @@ static int cpu_idle_write_s64(struct cgroup_subsys_state *css, - } - #endif - --static struct cftype cpu_legacy_files[] = { -+static struct cftype cpu_legacy_cftypes[] = { - #ifdef CONFIG_FAIR_GROUP_SCHED - { - .name = "shares", -@@ -11292,38 +11427,44 @@ static int cpu_local_stat_show(struct seq_file *sf, - return 0; - } - -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ -+static unsigned long tg_weight(struct task_group *tg) -+{ - #ifdef CONFIG_FAIR_GROUP_SCHED -+ return scale_load_down(tg->shares); -+#else -+ return sched_weight_from_cgroup(tg->scx_weight); -+#endif -+} -+ - static u64 cpu_weight_read_u64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- struct task_group *tg = css_tg(css); -- u64 weight = scale_load_down(tg->shares); -- -- return DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024); -+ return sched_weight_to_cgroup(tg_weight(css_tg(css))); - } - - static int cpu_weight_write_u64(struct cgroup_subsys_state *css, -- struct cftype *cft, u64 
weight) -+ struct cftype *cft, u64 cgrp_weight) - { -- /* -- * cgroup weight knobs should use the common MIN, DFL and MAX -- * values which are 1, 100 and 10000 respectively. While it loses -- * a bit of range on both ends, it maps pretty well onto the shares -- * value used by scheduler and the round-trip conversions preserve -- * the original value over the entire range. -- */ -- if (weight < CGROUP_WEIGHT_MIN || weight > CGROUP_WEIGHT_MAX) -+ unsigned long weight; -+ int ret; -+ -+ if (cgrp_weight < CGROUP_WEIGHT_MIN || cgrp_weight > CGROUP_WEIGHT_MAX) - return -ERANGE; - -- weight = DIV_ROUND_CLOSEST_ULL(weight * 1024, CGROUP_WEIGHT_DFL); -+ weight = sched_weight_from_cgroup(cgrp_weight); - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), cgrp_weight); -+ return ret; - } - - static s64 cpu_weight_nice_read_s64(struct cgroup_subsys_state *css, - struct cftype *cft) - { -- unsigned long weight = scale_load_down(css_tg(css)->shares); -+ unsigned long weight = tg_weight(css_tg(css)); - int last_delta = INT_MAX; - int prio, delta; - -@@ -11342,7 +11483,7 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - struct cftype *cft, s64 nice) - { - unsigned long weight; -- int idx; -+ int idx, ret; - - if (nice < MIN_NICE || nice > MAX_NICE) - return -ERANGE; -@@ -11351,7 +11492,11 @@ static int cpu_weight_nice_write_s64(struct cgroup_subsys_state *css, - idx = array_index_nospec(idx, 40); - weight = sched_prio_to_weight[idx]; - -- return sched_group_set_shares(css_tg(css), scale_load(weight)); -+ ret = sched_group_set_shares(css_tg(css), scale_load(weight)); -+ if (!ret) -+ scx_group_set_weight(css_tg(css), -+ sched_weight_to_cgroup(weight)); -+ return ret; - } - #endif - -@@ -11412,21 +11557,23 @@ static ssize_t cpu_max_write(struct kernfs_open_file *of, - } - #endif - --static struct cftype cpu_files[] = { --#ifdef CONFIG_FAIR_GROUP_SCHED -- { -+struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1] = { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ [CPU_CFTYPE_WEIGHT] = { - .name = "weight", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_weight_read_u64, - .write_u64 = cpu_weight_write_u64, - }, -- { -+ [CPU_CFTYPE_WEIGHT_NICE] = { - .name = "weight.nice", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_weight_nice_read_s64, - .write_s64 = cpu_weight_nice_write_s64, - }, -- { -+#endif -+#ifdef CONFIG_FAIR_GROUP_SCHED -+ [CPU_CFTYPE_IDLE] = { - .name = "idle", - .flags = CFTYPE_NOT_ON_ROOT, - .read_s64 = cpu_idle_read_s64, -@@ -11434,13 +11581,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_CFS_BANDWIDTH -- { -+ [CPU_CFTYPE_MAX] = { - .name = "max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_max_show, - .write = cpu_max_write, - }, -- { -+ [CPU_CFTYPE_MAX_BURST] = { - .name = "max.burst", - .flags = CFTYPE_NOT_ON_ROOT, - .read_u64 = cpu_cfs_burst_read_u64, -@@ -11448,13 +11595,13 @@ static struct cftype cpu_files[] = { - }, - #endif - #ifdef CONFIG_UCLAMP_TASK_GROUP -- { -+ [CPU_CFTYPE_UCLAMP_MIN] = { - .name = "uclamp.min", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_min_show, - .write = cpu_uclamp_min_write, - }, -- { -+ [CPU_CFTYPE_UCLAMP_MAX] = { - .name = "uclamp.max", - .flags = CFTYPE_NOT_ON_ROOT, - .seq_show = cpu_uclamp_max_show, -@@ -11467,16 +11614,20 @@ static struct cftype cpu_files[] = { - struct cgroup_subsys cpu_cgrp_subsys = { - .css_alloc = 
cpu_cgroup_css_alloc, - .css_online = cpu_cgroup_css_online, -+ .css_offline = cpu_cgroup_css_offline, - .css_released = cpu_cgroup_css_released, - .css_free = cpu_cgroup_css_free, - .css_extra_stat_show = cpu_extra_stat_show, - .css_local_stat_show = cpu_local_stat_show, --#ifdef CONFIG_RT_GROUP_SCHED -+#if defined(CONFIG_RT_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) - .can_attach = cpu_cgroup_can_attach, - #endif - .attach = cpu_cgroup_attach, -- .legacy_cftypes = cpu_legacy_files, -- .dfl_cftypes = cpu_files, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cancel_attach = cpu_cgroup_cancel_attach, -+#endif -+ .legacy_cftypes = cpu_legacy_cftypes, -+ .dfl_cftypes = cpu_cftypes, - .early_init = true, - .threaded = true, - }; -@@ -12064,3 +12215,38 @@ void sched_mm_cid_fork(struct task_struct *t) - t->mm_cid_active = 1; - } - #endif -+ -+#ifdef CONFIG_SCHED_CLASS_EXT -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ *ctx = (struct sched_enq_and_set_ctx){ -+ .p = p, -+ .queue_flags = queue_flags, -+ .queued = task_on_rq_queued(p), -+ .running = task_current(rq, p), -+ }; -+ -+ update_rq_clock(rq); -+ if (ctx->queued) -+ dequeue_task(rq, p, queue_flags | DEQUEUE_NOCLOCK); -+ if (ctx->running) -+ put_prev_task(rq, p); -+} -+ -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx) -+{ -+ struct rq *rq = task_rq(ctx->p); -+ -+ lockdep_assert_rq_held(rq); -+ -+ if (ctx->queued) -+ enqueue_task(rq, ctx->p, ctx->queue_flags | ENQUEUE_NOCLOCK); -+ if (ctx->running) -+ set_next_task(rq, ctx->p); -+} -+#endif /* CONFIG_SCHED_CLASS_EXT */ -diff --git a/kernel/sched/cpufreq_schedutil.c b/kernel/sched/cpufreq_schedutil.c -index eece6244f9d2..12174c0137a5 100644 ---- a/kernel/sched/cpufreq_schedutil.c -+++ b/kernel/sched/cpufreq_schedutil.c -@@ -197,7 +197,9 @@ unsigned long sugov_effective_cpu_perf(int cpu, unsigned long actual, - - static void sugov_get_util(struct sugov_cpu *sg_cpu, unsigned long boost) - { -- unsigned long min, max, util = cpu_util_cfs_boost(sg_cpu->cpu); -+ unsigned long min, max; -+ unsigned long util = cpu_util_cfs_boost(sg_cpu->cpu) + -+ scx_cpuperf_target(sg_cpu->cpu); - - util = effective_cpu_util(sg_cpu->cpu, util, &min, &max); - util = max(util, boost); -@@ -325,16 +327,35 @@ static unsigned long sugov_iowait_apply(struct sugov_cpu *sg_cpu, u64 time, - } - - #ifdef CONFIG_NO_HZ_COMMON --static bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) -+static bool sugov_hold_freq(struct sugov_cpu *sg_cpu) - { -- unsigned long idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); -- bool ret = idle_calls == sg_cpu->saved_idle_calls; -+ unsigned long idle_calls; -+ bool ret; -+ -+ /* -+ * The heuristics in this function is for the fair class. For SCX, the -+ * performance target comes directly from the BPF scheduler. Let's just -+ * follow it. -+ */ -+ if (scx_switched_all()) -+ return false; -+ -+ /* if capped by uclamp_max, always update to be in compliance */ -+ if (uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu))) -+ return false; -+ -+ /* -+ * Maintain the frequency if the CPU has not been idle recently, as -+ * reduction is likely to be premature. 
-+ */ -+ idle_calls = tick_nohz_get_idle_calls_cpu(sg_cpu->cpu); -+ ret = idle_calls == sg_cpu->saved_idle_calls; - - sg_cpu->saved_idle_calls = idle_calls; - return ret; - } - #else --static inline bool sugov_cpu_is_busy(struct sugov_cpu *sg_cpu) { return false; } -+static inline bool sugov_hold_freq(struct sugov_cpu *sg_cpu) { return false; } - #endif /* CONFIG_NO_HZ_COMMON */ - - /* -@@ -382,14 +403,8 @@ static void sugov_update_single_freq(struct update_util_data *hook, u64 time, - return; - - next_f = get_next_freq(sg_policy, sg_cpu->util, max_cap); -- /* -- * Do not reduce the frequency if the CPU has not been idle -- * recently, as the reduction is likely to be premature then. -- * -- * Except when the rq is capped by uclamp_max. -- */ -- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && -- sugov_cpu_is_busy(sg_cpu) && next_f < sg_policy->next_freq && -+ -+ if (sugov_hold_freq(sg_cpu) && next_f < sg_policy->next_freq && - !sg_policy->need_freq_update) { - next_f = sg_policy->next_freq; - -@@ -436,14 +451,7 @@ static void sugov_update_single_perf(struct update_util_data *hook, u64 time, - if (!sugov_update_single_common(sg_cpu, time, max_cap, flags)) - return; - -- /* -- * Do not reduce the target performance level if the CPU has not been -- * idle recently, as the reduction is likely to be premature then. -- * -- * Except when the rq is capped by uclamp_max. -- */ -- if (!uclamp_rq_is_capped(cpu_rq(sg_cpu->cpu)) && -- sugov_cpu_is_busy(sg_cpu) && sg_cpu->util < prev_util) -+ if (sugov_hold_freq(sg_cpu) && sg_cpu->util < prev_util) - sg_cpu->util = prev_util; - - cpufreq_driver_adjust_perf(sg_cpu->cpu, sg_cpu->bw_min, -diff --git a/kernel/sched/debug.c b/kernel/sched/debug.c -index c1eb9a1afd13..c057ef46c5f8 100644 ---- a/kernel/sched/debug.c -+++ b/kernel/sched/debug.c -@@ -1090,6 +1090,9 @@ void proc_sched_show_task(struct task_struct *p, struct pid_namespace *ns, - P(dl.runtime); - P(dl.deadline); - } -+#ifdef CONFIG_SCHED_CLASS_EXT -+ __PS("ext.enabled", task_on_scx(p)); -+#endif - #undef PN_SCHEDSTAT - #undef P_SCHEDSTAT - -diff --git a/kernel/sched/ext.c b/kernel/sched/ext.c -new file mode 100644 -index 000000000000..93e041e2f8d7 ---- /dev/null -+++ b/kernel/sched/ext.c -@@ -0,0 +1,6973 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define SCX_OP_IDX(op) (offsetof(struct sched_ext_ops, op) / sizeof(void (*)(void))) -+ -+enum scx_consts { -+ SCX_DSP_DFL_MAX_BATCH = 32, -+ SCX_DSP_MAX_LOOPS = 32, -+ SCX_WATCHDOG_MAX_TIMEOUT = 30 * HZ, -+ -+ SCX_EXIT_BT_LEN = 64, -+ SCX_EXIT_MSG_LEN = 1024, -+ SCX_EXIT_DUMP_DFL_LEN = 32768, -+ -+ SCX_CPUPERF_ONE = SCHED_CAPACITY_SCALE, -+}; -+ -+enum scx_exit_kind { -+ SCX_EXIT_NONE, -+ SCX_EXIT_DONE, -+ -+ SCX_EXIT_UNREG = 64, /* user-space initiated unregistration */ -+ SCX_EXIT_UNREG_BPF, /* BPF-initiated unregistration */ -+ SCX_EXIT_UNREG_KERN, /* kernel-initiated unregistration */ -+ SCX_EXIT_SYSRQ, /* requested by 'S' sysrq */ -+ -+ SCX_EXIT_ERROR = 1024, /* runtime error, error msg contains details */ -+ SCX_EXIT_ERROR_BPF, /* ERROR but triggered through scx_bpf_error() */ -+ SCX_EXIT_ERROR_STALL, /* watchdog detected stalled runnable tasks */ -+}; -+ -+/* -+ * An exit code can be specified when exiting with scx_bpf_exit() or -+ * scx_ops_exit(), corresponding to exit_kind UNREG_BPF and UNREG_KERN -+ * respectively. 
The codes are 64bit of the format: -+ * -+ * Bits: [63 .. 48 47 .. 32 31 .. 0] -+ * [ SYS ACT ] [ SYS RSN ] [ USR ] -+ * -+ * SYS ACT: System-defined exit actions -+ * SYS RSN: System-defined exit reasons -+ * USR : User-defined exit codes and reasons -+ * -+ * Using the above, users may communicate intention and context by ORing system -+ * actions and/or system reasons with a user-defined exit code. -+ */ -+enum scx_exit_code { -+ /* Reasons */ -+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, -+ -+ /* Actions */ -+ SCX_ECODE_ACT_RESTART = 1LLU << 48, -+}; -+ -+/* -+ * scx_exit_info is passed to ops.exit() to describe why the BPF scheduler is -+ * being disabled. -+ */ -+struct scx_exit_info { -+ /* %SCX_EXIT_* - broad category of the exit reason */ -+ enum scx_exit_kind kind; -+ -+ /* exit code if gracefully exiting */ -+ s64 exit_code; -+ -+ /* textual representation of the above */ -+ const char *reason; -+ -+ /* backtrace if exiting due to an error */ -+ unsigned long *bt; -+ u32 bt_len; -+ -+ /* informational message */ -+ char *msg; -+ -+ /* debug dump */ -+ char *dump; -+}; -+ -+/* sched_ext_ops.flags */ -+enum scx_ops_flags { -+ /* -+ * Keep built-in idle tracking even if ops.update_idle() is implemented. -+ */ -+ SCX_OPS_KEEP_BUILTIN_IDLE = 1LLU << 0, -+ -+ /* -+ * By default, if there are no other task to run on the CPU, ext core -+ * keeps running the current task even after its slice expires. If this -+ * flag is specified, such tasks are passed to ops.enqueue() with -+ * %SCX_ENQ_LAST. See the comment above %SCX_ENQ_LAST for more info. -+ */ -+ SCX_OPS_ENQ_LAST = 1LLU << 1, -+ -+ /* -+ * An exiting task may schedule after PF_EXITING is set. In such cases, -+ * bpf_task_from_pid() may not be able to find the task and if the BPF -+ * scheduler depends on pid lookup for dispatching, the task will be -+ * lost leading to various issues including RCU grace period stalls. -+ * -+ * To mask this problem, by default, unhashed tasks are automatically -+ * dispatched to the local DSQ on enqueue. If the BPF scheduler doesn't -+ * depend on pid lookups and wants to handle these tasks directly, the -+ * following flag can be used. -+ */ -+ SCX_OPS_ENQ_EXITING = 1LLU << 2, -+ -+ /* -+ * If set, only tasks with policy set to SCHED_EXT are attached to -+ * sched_ext. If clear, SCHED_NORMAL tasks are also included. -+ */ -+ SCX_OPS_SWITCH_PARTIAL = 1LLU << 3, -+ -+ /* -+ * CPU cgroup knob enable flags -+ */ -+ SCX_OPS_CGROUP_KNOB_WEIGHT = 1LLU << 16, /* cpu.weight */ -+ -+ SCX_OPS_ALL_FLAGS = SCX_OPS_KEEP_BUILTIN_IDLE | -+ SCX_OPS_ENQ_LAST | -+ SCX_OPS_ENQ_EXITING | -+ SCX_OPS_SWITCH_PARTIAL | -+ SCX_OPS_CGROUP_KNOB_WEIGHT, -+}; -+ -+/* argument container for ops.init_task() */ -+struct scx_init_task_args { -+ /* -+ * Set if ops.init_task() is being invoked on the fork path, as opposed -+ * to the scheduler transition path. -+ */ -+ bool fork; -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /* the cgroup the task is joining */ -+ struct cgroup *cgroup; -+#endif -+}; -+ -+/* argument container for ops.exit_task() */ -+struct scx_exit_task_args { -+ /* Whether the task exited before running on sched_ext. 
*/ -+ bool cancelled; -+}; -+ -+/* argument container for ops->cgroup_init() */ -+struct scx_cgroup_init_args { -+ /* the weight of the cgroup [1..10000] */ -+ u32 weight; -+}; -+ -+enum scx_cpu_preempt_reason { -+ /* next task is being scheduled by &sched_class_rt */ -+ SCX_CPU_PREEMPT_RT, -+ /* next task is being scheduled by &sched_class_dl */ -+ SCX_CPU_PREEMPT_DL, -+ /* next task is being scheduled by &sched_class_stop */ -+ SCX_CPU_PREEMPT_STOP, -+ /* unknown reason for SCX being preempted */ -+ SCX_CPU_PREEMPT_UNKNOWN, -+}; -+ -+/* -+ * Argument container for ops->cpu_acquire(). Currently empty, but may be -+ * expanded in the future. -+ */ -+struct scx_cpu_acquire_args {}; -+ -+/* argument container for ops->cpu_release() */ -+struct scx_cpu_release_args { -+ /* the reason the CPU was preempted */ -+ enum scx_cpu_preempt_reason reason; -+ -+ /* the task that's going to be scheduled on the CPU */ -+ struct task_struct *task; -+}; -+ -+/* -+ * Informational context provided to dump operations. -+ */ -+struct scx_dump_ctx { -+ enum scx_exit_kind kind; -+ s64 exit_code; -+ const char *reason; -+ u64 at_ns; -+ u64 at_jiffies; -+}; -+ -+/** -+ * struct sched_ext_ops - Operation table for BPF scheduler implementation -+ * -+ * Userland can implement an arbitrary scheduling policy by implementing and -+ * loading operations in this table. -+ */ -+struct sched_ext_ops { -+ /** -+ * select_cpu - Pick the target CPU for a task which is being woken up -+ * @p: task being woken up -+ * @prev_cpu: the cpu @p was on before sleeping -+ * @wake_flags: SCX_WAKE_* -+ * -+ * Decision made here isn't final. @p may be moved to any CPU while it -+ * is getting dispatched for execution later. However, as @p is not on -+ * the rq at this point, getting the eventual execution CPU right here -+ * saves a small bit of overhead down the line. -+ * -+ * If an idle CPU is returned, the CPU is kicked and will try to -+ * dispatch. While an explicit custom mechanism can be added, -+ * select_cpu() serves as the default way to wake up idle CPUs. -+ * -+ * @p may be dispatched directly by calling scx_bpf_dispatch(). If @p -+ * is dispatched, the ops.enqueue() callback will be skipped. Finally, -+ * if @p is dispatched to SCX_DSQ_LOCAL, it will be dispatched to the -+ * local DSQ of whatever CPU is returned by this callback. -+ */ -+ s32 (*select_cpu)(struct task_struct *p, s32 prev_cpu, u64 wake_flags); -+ -+ /** -+ * enqueue - Enqueue a task on the BPF scheduler -+ * @p: task being enqueued -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * @p is ready to run. Dispatch directly by calling scx_bpf_dispatch() -+ * or enqueue on the BPF scheduler. If not directly dispatched, the bpf -+ * scheduler owns @p and if it fails to dispatch @p, the task will -+ * stall. -+ * -+ * If @p was dispatched from ops.select_cpu(), this callback is -+ * skipped. -+ */ -+ void (*enqueue)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * dequeue - Remove a task from the BPF scheduler -+ * @p: task being dequeued -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * Remove @p from the BPF scheduler. This is usually called to isolate -+ * the task while updating its scheduling properties (e.g. priority). -+ * -+ * The ext core keeps track of whether the BPF side owns a given task or -+ * not and can gracefully ignore spurious dispatches from BPF side, -+ * which makes it safe to not implement this method. However, depending -+ * on the scheduling logic, this can lead to confusing behaviors - e.g. 
-+ * scheduling position not being updated across a priority change. -+ */ -+ void (*dequeue)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * dispatch - Dispatch tasks from the BPF scheduler and/or consume DSQs -+ * @cpu: CPU to dispatch tasks for -+ * @prev: previous task being switched out -+ * -+ * Called when a CPU's local dsq is empty. The operation should dispatch -+ * one or more tasks from the BPF scheduler into the DSQs using -+ * scx_bpf_dispatch() and/or consume user DSQs into the local DSQ using -+ * scx_bpf_consume(). -+ * -+ * The maximum number of times scx_bpf_dispatch() can be called without -+ * an intervening scx_bpf_consume() is specified by -+ * ops.dispatch_max_batch. See the comments on top of the two functions -+ * for more details. -+ * -+ * When not %NULL, @prev is an SCX task with its slice depleted. If -+ * @prev is still runnable as indicated by set %SCX_TASK_QUEUED in -+ * @prev->scx.flags, it is not enqueued yet and will be enqueued after -+ * ops.dispatch() returns. To keep executing @prev, return without -+ * dispatching or consuming any tasks. Also see %SCX_OPS_ENQ_LAST. -+ */ -+ void (*dispatch)(s32 cpu, struct task_struct *prev); -+ -+ /** -+ * tick - Periodic tick -+ * @p: task running currently -+ * -+ * This operation is called every 1/HZ seconds on CPUs which are -+ * executing an SCX task. Setting @p->scx.slice to 0 will trigger an -+ * immediate dispatch cycle on the CPU. -+ */ -+ void (*tick)(struct task_struct *p); -+ -+ /** -+ * runnable - A task is becoming runnable on its associated CPU -+ * @p: task becoming runnable -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * This and the following three functions can be used to track a task's -+ * execution state transitions. A task becomes ->runnable() on a CPU, -+ * and then goes through one or more ->running() and ->stopping() pairs -+ * as it runs on the CPU, and eventually becomes ->quiescent() when it's -+ * done running on the CPU. -+ * -+ * @p is becoming runnable on the CPU because it's -+ * -+ * - waking up (%SCX_ENQ_WAKEUP) -+ * - being moved from another CPU -+ * - being restored after temporarily taken off the queue for an -+ * attribute change. -+ * -+ * This and ->enqueue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be followed by ->enqueue() -+ * e.g. when @p is being dispatched to a remote CPU, or when @p is -+ * being enqueued on a CPU experiencing a hotplug event. Likewise, a -+ * task may be ->enqueue()'d without being preceded by this operation -+ * e.g. after exhausting its slice. -+ */ -+ void (*runnable)(struct task_struct *p, u64 enq_flags); -+ -+ /** -+ * running - A task is starting to run on its associated CPU -+ * @p: task starting to run -+ * -+ * See ->runnable() for explanation on the task state notifiers. -+ */ -+ void (*running)(struct task_struct *p); -+ -+ /** -+ * stopping - A task is stopping execution -+ * @p: task stopping to run -+ * @runnable: is task @p still runnable? -+ * -+ * See ->runnable() for explanation on the task state notifiers. If -+ * !@runnable, ->quiescent() will be invoked after this operation -+ * returns. -+ */ -+ void (*stopping)(struct task_struct *p, bool runnable); -+ -+ /** -+ * quiescent - A task is becoming not runnable on its associated CPU -+ * @p: task becoming not runnable -+ * @deq_flags: %SCX_DEQ_* -+ * -+ * See ->runnable() for explanation on the task state notifiers. 
-+ * -+ * @p is becoming quiescent on the CPU because it's -+ * -+ * - sleeping (%SCX_DEQ_SLEEP) -+ * - being moved to another CPU -+ * - being temporarily taken off the queue for an attribute change -+ * (%SCX_DEQ_SAVE) -+ * -+ * This and ->dequeue() are related but not coupled. This operation -+ * notifies @p's state transition and may not be preceded by ->dequeue() -+ * e.g. when @p is being dispatched to a remote CPU. -+ */ -+ void (*quiescent)(struct task_struct *p, u64 deq_flags); -+ -+ /** -+ * yield - Yield CPU -+ * @from: yielding task -+ * @to: optional yield target task -+ * -+ * If @to is NULL, @from is yielding the CPU to other runnable tasks. -+ * The BPF scheduler should ensure that other available tasks are -+ * dispatched before the yielding task. Return value is ignored in this -+ * case. -+ * -+ * If @to is not-NULL, @from wants to yield the CPU to @to. If the bpf -+ * scheduler can implement the request, return %true; otherwise, %false. -+ */ -+ bool (*yield)(struct task_struct *from, struct task_struct *to); -+ -+ /** -+ * core_sched_before - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Used by core-sched to determine the ordering between two tasks. See -+ * Documentation/admin-guide/hw-vuln/core-scheduling.rst for details on -+ * core-sched. -+ * -+ * Both @a and @b are runnable and may or may not currently be queued on -+ * the BPF scheduler. Should return %true if @a should run before @b. -+ * %false if there's no required ordering or @b should run before @a. -+ * -+ * If not specified, the default is ordering them according to when they -+ * became runnable. -+ */ -+ bool (*core_sched_before)(struct task_struct *a, struct task_struct *b); -+ -+ /** -+ * set_weight - Set task weight -+ * @p: task to set weight for -+ * @weight: new weight [1..10000] -+ * -+ * Update @p's weight to @weight. -+ */ -+ void (*set_weight)(struct task_struct *p, u32 weight); -+ -+ /** -+ * set_cpumask - Set CPU affinity -+ * @p: task to set CPU affinity for -+ * @cpumask: cpumask of cpus that @p can run on -+ * -+ * Update @p's CPU affinity to @cpumask. -+ */ -+ void (*set_cpumask)(struct task_struct *p, -+ const struct cpumask *cpumask); -+ -+ /** -+ * update_idle - Update the idle state of a CPU -+ * @cpu: CPU to update the idle state for -+ * @idle: whether entering or exiting the idle state -+ * -+ * This operation is called when @rq's CPU goes or leaves the idle -+ * state. By default, implementing this operation disables the built-in -+ * idle CPU tracking and the following helpers become unavailable: -+ * -+ * - scx_bpf_select_cpu_dfl() -+ * - scx_bpf_test_and_clear_cpu_idle() -+ * - scx_bpf_pick_idle_cpu() -+ * -+ * The user also must implement ops.select_cpu() as the default -+ * implementation relies on scx_bpf_select_cpu_dfl(). -+ * -+ * Specify the %SCX_OPS_KEEP_BUILTIN_IDLE flag to keep the built-in idle -+ * tracking. -+ */ -+ void (*update_idle)(s32 cpu, bool idle); -+ -+ /** -+ * cpu_acquire - A CPU is becoming available to the BPF scheduler -+ * @cpu: The CPU being acquired by the BPF scheduler. -+ * @args: Acquire arguments, see the struct definition. -+ * -+ * A CPU that was previously released from the BPF scheduler is now once -+ * again under its control. -+ */ -+ void (*cpu_acquire)(s32 cpu, struct scx_cpu_acquire_args *args); -+ -+ /** -+ * cpu_release - A CPU is taken away from the BPF scheduler -+ * @cpu: The CPU being released by the BPF scheduler. -+ * @args: Release arguments, see the struct definition.
-+ * -+ * The specified CPU is no longer under the control of the BPF -+ * scheduler. This could be because it was preempted by a higher -+ * priority sched_class, though there may be other reasons as well. The -+ * caller should consult @args->reason to determine the cause. -+ */ -+ void (*cpu_release)(s32 cpu, struct scx_cpu_release_args *args); -+ -+ /** -+ * init_task - Initialize a task to run in a BPF scheduler -+ * @p: task to initialize for BPF scheduling -+ * @args: init arguments, see the struct definition -+ * -+ * Either we're loading a BPF scheduler or a new task is being forked. -+ * Initialize @p for BPF scheduling. This operation may block and can -+ * be used for allocations, and is called exactly once for a task. -+ * -+ * Return 0 for success, -errno for failure. An error return while -+ * loading will abort loading of the BPF scheduler. During a fork, it -+ * will abort that specific fork. -+ */ -+ s32 (*init_task)(struct task_struct *p, struct scx_init_task_args *args); -+ -+ /** -+ * exit_task - Exit a previously-running task from the system -+ * @p: task to exit -+ * -+ * @p is exiting or the BPF scheduler is being unloaded. Perform any -+ * necessary cleanup for @p. -+ */ -+ void (*exit_task)(struct task_struct *p, struct scx_exit_task_args *args); -+ -+ /** -+ * enable - Enable BPF scheduling for a task -+ * @p: task to enable BPF scheduling for -+ * -+ * Enable @p for BPF scheduling. enable() is called on @p any time it -+ * enters SCX, and is always paired with a matching disable(). -+ */ -+ void (*enable)(struct task_struct *p); -+ -+ /** -+ * disable - Disable BPF scheduling for a task -+ * @p: task to disable BPF scheduling for -+ * -+ * @p is exiting, leaving SCX or the BPF scheduler is being unloaded. -+ * Disable BPF scheduling for @p. A disable() call is always matched -+ * with a prior enable() call. -+ */ -+ void (*disable)(struct task_struct *p); -+ -+ /** -+ * dump - Dump BPF scheduler state on error -+ * @ctx: debug dump context -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump. -+ */ -+ void (*dump)(struct scx_dump_ctx *ctx); -+ -+ /** -+ * dump_cpu - Dump BPF scheduler state for a CPU on error -+ * @ctx: debug dump context -+ * @cpu: CPU to generate debug dump for -+ * @idle: @cpu is currently idle without any runnable tasks -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for -+ * @cpu. If @idle is %true and this operation doesn't produce any -+ * output, @cpu is skipped for dump. -+ */ -+ void (*dump_cpu)(struct scx_dump_ctx *ctx, s32 cpu, bool idle); -+ -+ /** -+ * dump_task - Dump BPF scheduler state for a runnable task on error -+ * @ctx: debug dump context -+ * @p: runnable task to generate debug dump for -+ * -+ * Use scx_bpf_dump() to generate BPF scheduler specific debug dump for -+ * @p. -+ */ -+ void (*dump_task)(struct scx_dump_ctx *ctx, struct task_struct *p); -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ /** -+ * cgroup_init - Initialize a cgroup -+ * @cgrp: cgroup being initialized -+ * @args: init arguments, see the struct definition -+ * -+ * Either the BPF scheduler is being loaded or @cgrp created, initialize -+ * @cgrp for sched_ext. This operation may block. -+ * -+ * Return 0 for success, -errno for failure. An error return while -+ * loading will abort loading of the BPF scheduler. During cgroup -+ * creation, it will abort the specific cgroup creation. 
-+ */ -+ s32 (*cgroup_init)(struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args); -+ -+ /** -+ * cgroup_exit - Exit a cgroup -+ * @cgrp: cgroup being exited -+ * -+ * Either the BPF scheduler is being unloaded or @cgrp destroyed, exit -+ * @cgrp for sched_ext. This operation may block. -+ */ -+ void (*cgroup_exit)(struct cgroup *cgrp); -+ -+ /** -+ * cgroup_prep_move - Prepare a task to be moved to a different cgroup -+ * @p: task being moved -+ * @from: cgroup @p is being moved from -+ * @to: cgroup @p is being moved to -+ * -+ * Prepare @p for move from cgroup @from to @to. This operation may -+ * block and can be used for allocations. -+ * -+ * Return 0 for success, -errno for failure. An error return aborts the -+ * migration. -+ */ -+ s32 (*cgroup_prep_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_move - Commit cgroup move -+ * @p: task being moved -+ * @from: cgroup @p is being moved from -+ * @to: cgroup @p is being moved to -+ * -+ * Commit the move. @p is dequeued during this operation. -+ */ -+ void (*cgroup_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_cancel_move - Cancel cgroup move -+ * @p: task whose cgroup move is being canceled -+ * @from: cgroup @p was being moved from -+ * @to: cgroup @p was being moved to -+ * -+ * @p was cgroup_prep_move()'d but failed before reaching cgroup_move(). -+ * Undo the preparation. -+ */ -+ void (*cgroup_cancel_move)(struct task_struct *p, -+ struct cgroup *from, struct cgroup *to); -+ -+ /** -+ * cgroup_set_weight - A cgroup's weight is being changed -+ * @cgrp: cgroup whose weight is being updated -+ * @weight: new weight [1..10000] -+ * -+ * Update @cgrp's weight to @weight. -+ */ -+ void (*cgroup_set_weight)(struct cgroup *cgrp, u32 weight); -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+ /* -+ * All online ops must come before ops.cpu_online(). -+ */ -+ -+ /** -+ * cpu_online - A CPU became online -+ * @cpu: CPU which just came up -+ * -+ * @cpu just came online. @cpu will not call ops.enqueue() or -+ * ops.dispatch(), nor run tasks associated with other CPUs beforehand. -+ */ -+ void (*cpu_online)(s32 cpu); -+ -+ /** -+ * cpu_offline - A CPU is going offline -+ * @cpu: CPU which is going offline -+ * -+ * @cpu is going offline. @cpu will not call ops.enqueue() or -+ * ops.dispatch(), nor run tasks associated with other CPUs afterwards. -+ */ -+ void (*cpu_offline)(s32 cpu); -+ -+ /* -+ * All CPU hotplug ops must come before ops.init(). -+ */ -+ -+ /** -+ * init - Initialize the BPF scheduler -+ */ -+ s32 (*init)(void); -+ -+ /** -+ * exit - Clean up after the BPF scheduler -+ * @info: Exit info -+ */ -+ void (*exit)(struct scx_exit_info *info); -+ -+ /** -+ * dispatch_max_batch - Max nr of tasks that dispatch() can dispatch -+ */ -+ u32 dispatch_max_batch; -+ -+ /** -+ * flags - %SCX_OPS_* flags -+ */ -+ u64 flags; -+ -+ /** -+ * timeout_ms - The maximum amount of time, in milliseconds, that a -+ * runnable task should be able to wait before being scheduled. The -+ * maximum timeout may not exceed the default timeout of 30 seconds. -+ * -+ * Defaults to the maximum allowed timeout value of 30 seconds. -+ */ -+ u32 timeout_ms; -+ -+ /** -+ * exit_dump_len - scx_exit_info.dump buffer length. If 0, the default -+ * value of 32768 is used. -+ */ -+ u32 exit_dump_len; -+ -+ /** -+ * hotplug_seq - A sequence number that may be set by the scheduler to -+ * detect when a hotplug event has occurred during the loading process. -+ * If 0, no detection occurs.
Otherwise, the scheduler will fail to -+ * load if the sequence number does not match @scx_hotplug_seq on the -+ * enable path. -+ */ -+ u64 hotplug_seq; -+ -+ /** -+ * name - BPF scheduler's name -+ * -+ * Must be a non-zero valid BPF object name including only isalnum(), -+ * '_' and '.' chars. Shows up in kernel.sched_ext_ops sysctl while the -+ * BPF scheduler is enabled. -+ */ -+ char name[SCX_OPS_NAME_LEN]; -+}; -+ -+enum scx_opi { -+ SCX_OPI_BEGIN = 0, -+ SCX_OPI_NORMAL_BEGIN = 0, -+ SCX_OPI_NORMAL_END = SCX_OP_IDX(cpu_online), -+ SCX_OPI_CPU_HOTPLUG_BEGIN = SCX_OP_IDX(cpu_online), -+ SCX_OPI_CPU_HOTPLUG_END = SCX_OP_IDX(init), -+ SCX_OPI_END = SCX_OP_IDX(init), -+}; -+ -+enum scx_wake_flags { -+ /* expose select WF_* flags as enums */ -+ SCX_WAKE_FORK = WF_FORK, -+ SCX_WAKE_TTWU = WF_TTWU, -+ SCX_WAKE_SYNC = WF_SYNC, -+}; -+ -+enum scx_enq_flags { -+ /* expose select ENQUEUE_* flags as enums */ -+ SCX_ENQ_WAKEUP = ENQUEUE_WAKEUP, -+ SCX_ENQ_HEAD = ENQUEUE_HEAD, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * Set the following to trigger preemption when calling -+ * scx_bpf_dispatch() with a local dsq as the target. The slice of the -+ * current task is cleared to zero and the CPU is kicked into the -+ * scheduling path. Implies %SCX_ENQ_HEAD. -+ */ -+ SCX_ENQ_PREEMPT = 1LLU << 32, -+ -+ /* -+ * The task being enqueued was previously enqueued on the current CPU's -+ * %SCX_DSQ_LOCAL, but was removed from it in a call to the -+ * bpf_scx_reenqueue_local() kfunc. If bpf_scx_reenqueue_local() was -+ * invoked in a ->cpu_release() callback, and the task is again -+ * dispatched back to %SCX_LOCAL_DSQ by this current ->enqueue(), the -+ * task will not be scheduled on the CPU until at least the next invocation -+ * of the ->cpu_acquire() callback. -+ */ -+ SCX_ENQ_REENQ = 1LLU << 40, -+ -+ /* -+ * The task being enqueued is the only task available for the cpu. By -+ * default, ext core keeps executing such tasks but when -+ * %SCX_OPS_ENQ_LAST is specified, they're ops.enqueue()'d with the -+ * %SCX_ENQ_LAST flag set. -+ * -+ * If the BPF scheduler wants to continue executing the task, -+ * ops.enqueue() should dispatch the task to %SCX_DSQ_LOCAL immediately. -+ * If the task gets queued on a different dsq or the BPF side, the BPF -+ * scheduler is responsible for triggering a follow-up scheduling event. -+ * Otherwise, Execution may stall. -+ */ -+ SCX_ENQ_LAST = 1LLU << 41, -+ -+ /* high 8 bits are internal */ -+ __SCX_ENQ_INTERNAL_MASK = 0xffLLU << 56, -+ -+ SCX_ENQ_CLEAR_OPSS = 1LLU << 56, -+ SCX_ENQ_DSQ_PRIQ = 1LLU << 57, -+}; -+ -+enum scx_deq_flags { -+ /* expose select DEQUEUE_* flags as enums */ -+ SCX_DEQ_SLEEP = DEQUEUE_SLEEP, -+ -+ /* high 32bits are SCX specific */ -+ -+ /* -+ * The generic core-sched layer decided to execute the task even though -+ * it hasn't been dispatched yet. Dequeue from the BPF side. -+ */ -+ SCX_DEQ_CORE_SCHED_EXEC = 1LLU << 32, -+}; -+ -+enum scx_pick_idle_cpu_flags { -+ SCX_PICK_IDLE_CORE = 1LLU << 0, /* pick a CPU whose SMT siblings are also idle */ -+}; -+ -+enum scx_kick_flags { -+ /* -+ * Kick the target CPU if idle. Guarantees that the target CPU goes -+ * through at least one full scheduling cycle before going idle. If the -+ * target CPU can be determined to be currently not idle and going to go -+ * through a scheduling cycle before going idle, noop. -+ */ -+ SCX_KICK_IDLE = 1LLU << 0, -+ -+ /* -+ * Preempt the current task and execute the dispatch path. 
If the -+ * current task of the target CPU is an SCX task, its ->scx.slice is -+ * cleared to zero before the scheduling path is invoked so that the -+ * task expires and the dispatch path is invoked. -+ */ -+ SCX_KICK_PREEMPT = 1LLU << 1, -+ -+ /* -+ * Wait for the CPU to be rescheduled. The scx_bpf_kick_cpu() call will -+ * return after the target CPU finishes picking the next task. -+ */ -+ SCX_KICK_WAIT = 1LLU << 2, -+}; -+ -+enum scx_tg_flags { -+ SCX_TG_ONLINE = 1U << 0, -+ SCX_TG_INITED = 1U << 1, -+}; -+ -+enum scx_ops_enable_state { -+ SCX_OPS_PREPPING, -+ SCX_OPS_ENABLING, -+ SCX_OPS_ENABLED, -+ SCX_OPS_DISABLING, -+ SCX_OPS_DISABLED, -+}; -+ -+static const char *scx_ops_enable_state_str[] = { -+ [SCX_OPS_PREPPING] = "prepping", -+ [SCX_OPS_ENABLING] = "enabling", -+ [SCX_OPS_ENABLED] = "enabled", -+ [SCX_OPS_DISABLING] = "disabling", -+ [SCX_OPS_DISABLED] = "disabled", -+}; -+ -+/* -+ * sched_ext_entity->ops_state -+ * -+ * Used to track the task ownership between the SCX core and the BPF scheduler. -+ * State transitions look as follows: -+ * -+ * NONE -> QUEUEING -> QUEUED -> DISPATCHING -+ * ^ | | -+ * | v v -+ * \-------------------------------/ -+ * -+ * QUEUEING and DISPATCHING states can be waited upon. See wait_ops_state() call -+ * sites for explanations on the conditions being waited upon and why they are -+ * safe. Transitions out of them into NONE or QUEUED must store_release and the -+ * waiters should load_acquire. -+ * -+ * Tracking scx_ops_state enables sched_ext core to reliably determine whether -+ * any given task can be dispatched by the BPF scheduler at all times and thus -+ * relaxes the requirements on the BPF scheduler. This allows the BPF scheduler -+ * to try to dispatch any task anytime regardless of its state as the SCX core -+ * can safely reject invalid dispatches. -+ */ -+enum scx_ops_state { -+ SCX_OPSS_NONE, /* owned by the SCX core */ -+ SCX_OPSS_QUEUEING, /* in transit to the BPF scheduler */ -+ SCX_OPSS_QUEUED, /* owned by the BPF scheduler */ -+ SCX_OPSS_DISPATCHING, /* in transit back to the SCX core */ -+ -+ /* -+ * QSEQ brands each QUEUED instance so that, when dispatch races -+ * dequeue/requeue, the dispatcher can tell whether it still has a claim -+ * on the task being dispatched. -+ * -+ * As some 32bit archs can't do 64bit store_release/load_acquire, -+ * p->scx.ops_state is atomic_long_t which leaves 30 bits for QSEQ on -+ * 32bit machines. The dispatch race window QSEQ protects is very narrow -+ * and runs with IRQ disabled. 30 bits should be sufficient. -+ */ -+ SCX_OPSS_QSEQ_SHIFT = 2, -+}; -+ -+/* Use macros to ensure that the type is unsigned long for the masks */ -+#define SCX_OPSS_STATE_MASK ((1LU << SCX_OPSS_QSEQ_SHIFT) - 1) -+#define SCX_OPSS_QSEQ_MASK (~SCX_OPSS_STATE_MASK) -+ -+/* -+ * During exit, a task may schedule after losing its PIDs. When disabling the -+ * BPF scheduler, we need to be able to iterate tasks in every state to -+ * guarantee system safety. Maintain a dedicated task list which contains every -+ * task between its fork and eventual free. 
-+ */ -+static DEFINE_SPINLOCK(scx_tasks_lock); -+static LIST_HEAD(scx_tasks); -+ -+/* ops enable/disable */ -+static struct kthread_worker *scx_ops_helper; -+static DEFINE_MUTEX(scx_ops_enable_mutex); -+DEFINE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DEFINE_STATIC_PERCPU_RWSEM(scx_fork_rwsem); -+static atomic_t scx_ops_enable_state_var = ATOMIC_INIT(SCX_OPS_DISABLED); -+static atomic_t scx_ops_bypass_depth = ATOMIC_INIT(0); -+static bool scx_switching_all; -+DEFINE_STATIC_KEY_FALSE(__scx_switched_all); -+ -+static struct sched_ext_ops scx_ops; -+static bool scx_warned_zero_slice; -+ -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_last); -+static DEFINE_STATIC_KEY_FALSE(scx_ops_enq_exiting); -+DEFINE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+static DEFINE_STATIC_KEY_FALSE(scx_builtin_idle_enabled); -+ -+struct static_key_false scx_has_op[SCX_OPI_END] = -+ { [0 ... SCX_OPI_END-1] = STATIC_KEY_FALSE_INIT }; -+ -+static atomic_t scx_exit_kind = ATOMIC_INIT(SCX_EXIT_DONE); -+static struct scx_exit_info *scx_exit_info; -+ -+static atomic_long_t scx_nr_rejected = ATOMIC_LONG_INIT(0); -+static atomic_long_t scx_hotplug_seq = ATOMIC_LONG_INIT(0); -+ -+/* -+ * The maximum amount of time in jiffies that a task may be runnable without -+ * being scheduled on a CPU. If this timeout is exceeded, it will trigger -+ * scx_ops_error(). -+ */ -+static unsigned long scx_watchdog_timeout; -+ -+/* -+ * The last time the delayed work was run. This delayed work relies on -+ * ksoftirqd being able to run to service timer interrupts, so it's possible -+ * that this work itself could get wedged. To account for this, we check that -+ * it's not stalled in the timer tick, and trigger an error if it is. -+ */ -+static unsigned long scx_watchdog_timestamp = INITIAL_JIFFIES; -+ -+static struct delayed_work scx_watchdog_work; -+ -+/* idle tracking */ -+#ifdef CONFIG_SMP -+#ifdef CONFIG_CPUMASK_OFFSTACK -+#define CL_ALIGNED_IF_ONSTACK -+#else -+#define CL_ALIGNED_IF_ONSTACK __cacheline_aligned_in_smp -+#endif -+ -+static struct { -+ cpumask_var_t cpu; -+ cpumask_var_t smt; -+} idle_masks CL_ALIGNED_IF_ONSTACK; -+ -+#endif /* CONFIG_SMP */ -+ -+/* for %SCX_KICK_WAIT */ -+static unsigned long __percpu *scx_kick_cpus_pnt_seqs; -+ -+/* -+ * Direct dispatch marker. -+ * -+ * Non-NULL values are used for direct dispatch from enqueue path. A valid -+ * pointer points to the task currently being enqueued. An ERR_PTR value is used -+ * to indicate that direct dispatch has already happened. 
-+ */ -+static DEFINE_PER_CPU(struct task_struct *, direct_dispatch_task); -+ -+/* dispatch queues */ -+static struct scx_dispatch_q __cacheline_aligned_in_smp scx_dsq_global; -+ -+static const struct rhashtable_params dsq_hash_params = { -+ .key_len = 8, -+ .key_offset = offsetof(struct scx_dispatch_q, id), -+ .head_offset = offsetof(struct scx_dispatch_q, hash_node), -+}; -+ -+static struct rhashtable dsq_hash; -+static LLIST_HEAD(dsqs_to_free); -+ -+/* dispatch buf */ -+struct scx_dsp_buf_ent { -+ struct task_struct *task; -+ unsigned long qseq; -+ u64 dsq_id; -+ u64 enq_flags; -+}; -+ -+static u32 scx_dsp_max_batch; -+ -+struct scx_dsp_ctx { -+ struct rq *rq; -+ struct rq_flags *rf; -+ u32 cursor; -+ u32 nr_tasks; -+ struct scx_dsp_buf_ent buf[]; -+}; -+ -+static struct scx_dsp_ctx __percpu *scx_dsp_ctx; -+ -+/* string formatting from BPF */ -+struct scx_bstr_buf { -+ u64 data[MAX_BPRINTF_VARARGS]; -+ char line[SCX_EXIT_MSG_LEN]; -+}; -+ -+static DEFINE_RAW_SPINLOCK(scx_exit_bstr_buf_lock); -+static struct scx_bstr_buf scx_exit_bstr_buf; -+ -+/* ops debug dump */ -+struct scx_dump_data { -+ s32 cpu; -+ bool first; -+ s32 cursor; -+ struct seq_buf *s; -+ const char *prefix; -+ struct scx_bstr_buf buf; -+}; -+ -+struct scx_dump_data scx_dump_data = { -+ .cpu = -1, -+}; -+ -+/* /sys/kernel/sched_ext interface */ -+static struct kset *scx_kset; -+static struct kobject *scx_root_kobj; -+ -+#define CREATE_TRACE_POINTS -+#include -+ -+static void scx_bpf_kick_cpu(s32 cpu, u64 flags); -+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, -+ s64 exit_code, -+ const char *fmt, ...); -+ -+#define scx_ops_error_kind(err, fmt, args...) \ -+ scx_ops_exit_kind((err), 0, fmt, ##args) -+ -+#define scx_ops_exit(code, fmt, args...) \ -+ scx_ops_exit_kind(SCX_EXIT_UNREG_KERN, (code), fmt, ##args) -+ -+#define scx_ops_error(fmt, args...) \ -+ scx_ops_error_kind(SCX_EXIT_ERROR, fmt, ##args) -+ -+#define SCX_HAS_OP(op) static_branch_likely(&scx_has_op[SCX_OP_IDX(op)]) -+ -+static long jiffies_delta_msecs(unsigned long at, unsigned long now) -+{ -+ if (time_after(at, now)) -+ return jiffies_to_msecs(at - now); -+ else -+ return -(long)jiffies_to_msecs(now - at); -+} -+ -+/* if the highest set bit is N, return a mask with bits [N+1, 31] set */ -+static u32 higher_bits(u32 flags) -+{ -+ return ~((1 << fls(flags)) - 1); -+} -+ -+/* return the mask with only the highest bit set */ -+static u32 highest_bit(u32 flags) -+{ -+ int bit = fls(flags); -+ return ((u64) 1 << bit) >> 1; -+} -+ -+/* -+ * scx_kf_mask enforcement. Some kfuncs can only be called from specific SCX -+ * ops. When invoking SCX ops, SCX_CALL_OP[_RET]() should be used to indicate -+ * the allowed kfuncs and those kfuncs should use scx_kf_allowed() to check -+ * whether it's running from an allowed context. -+ * -+ * @mask is constant, always inline to cull the mask calculations. -+ */ -+static __always_inline void scx_kf_allow(u32 mask) -+{ -+ /* nesting is allowed only in increasing scx_kf_mask order */ -+ WARN_ONCE((mask | higher_bits(mask)) & current->scx.kf_mask, -+ "invalid nesting current->scx.kf_mask=0x%x mask=0x%x\n", -+ current->scx.kf_mask, mask); -+ current->scx.kf_mask |= mask; -+ barrier(); -+} -+ -+static void scx_kf_disallow(u32 mask) -+{ -+ barrier(); -+ current->scx.kf_mask &= ~mask; -+} -+ -+#define SCX_CALL_OP(mask, op, args...) 
\ -+do { \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ scx_ops.op(args); \ -+ } \ -+} while (0) -+ -+#define SCX_CALL_OP_RET(mask, op, args...) \ -+({ \ -+ __typeof__(scx_ops.op(args)) __ret; \ -+ if (mask) { \ -+ scx_kf_allow(mask); \ -+ __ret = scx_ops.op(args); \ -+ scx_kf_disallow(mask); \ -+ } else { \ -+ __ret = scx_ops.op(args); \ -+ } \ -+ __ret; \ -+}) -+ -+/* -+ * Some kfuncs are allowed only on the tasks that are subjects of the -+ * in-progress scx_ops operation for, e.g., locking guarantees. To enforce such -+ * restrictions, the following SCX_CALL_OP_*() variants should be used when -+ * invoking scx_ops operations that take task arguments. These can only be used -+ * for non-nesting operations due to the way the tasks are tracked. -+ * -+ * kfuncs which can only operate on such tasks can in turn use -+ * scx_kf_allowed_on_arg_tasks() to test whether the invocation is allowed on -+ * the specific task. -+ */ -+#define SCX_CALL_OP_TASK(mask, op, task, args...) \ -+do { \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ SCX_CALL_OP(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+} while (0) -+ -+#define SCX_CALL_OP_TASK_RET(mask, op, task, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task, ##args)) __ret; \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ __ret; \ -+}) -+ -+#define SCX_CALL_OP_2TASKS_RET(mask, op, task0, task1, args...) \ -+({ \ -+ __typeof__(scx_ops.op(task0, task1, ##args)) __ret; \ -+ BUILD_BUG_ON((mask) & ~__SCX_KF_TERMINAL); \ -+ current->scx.kf_tasks[0] = task0; \ -+ current->scx.kf_tasks[1] = task1; \ -+ __ret = SCX_CALL_OP_RET(mask, op, task0, task1, ##args); \ -+ current->scx.kf_tasks[0] = NULL; \ -+ current->scx.kf_tasks[1] = NULL; \ -+ __ret; \ -+}) -+ -+/* @mask is constant, always inline to cull unnecessary branches */ -+static __always_inline bool scx_kf_allowed(u32 mask) -+{ -+ if (unlikely(!(current->scx.kf_mask & mask))) { -+ scx_ops_error("kfunc with mask 0x%x called from an operation only allowing 0x%x", -+ mask, current->scx.kf_mask); -+ return false; -+ } -+ -+ if (unlikely((mask & SCX_KF_SLEEPABLE) && in_interrupt())) { -+ scx_ops_error("sleepable kfunc called from non-sleepable context"); -+ return false; -+ } -+ -+ /* -+ * Enforce nesting boundaries. e.g. A kfunc which can be called from -+ * DISPATCH must not be called if we're running DEQUEUE which is nested -+ * inside ops.dispatch(). We don't need to check the SCX_KF_SLEEPABLE -+ * boundary thanks to the above in_interrupt() check. 
-+ */ -+ if (unlikely(highest_bit(mask) == SCX_KF_CPU_RELEASE && -+ (current->scx.kf_mask & higher_bits(SCX_KF_CPU_RELEASE)))) { -+ scx_ops_error("cpu_release kfunc called from a nested operation"); -+ return false; -+ } -+ -+ if (unlikely(highest_bit(mask) == SCX_KF_DISPATCH && -+ (current->scx.kf_mask & higher_bits(SCX_KF_DISPATCH)))) { -+ scx_ops_error("dispatch kfunc called from a nested operation"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/* see SCX_CALL_OP_TASK() */ -+static __always_inline bool scx_kf_allowed_on_arg_tasks(u32 mask, -+ struct task_struct *p) -+{ -+ if (!scx_kf_allowed(mask)) -+ return false; -+ -+ if (unlikely((p != current->scx.kf_tasks[0] && -+ p != current->scx.kf_tasks[1]))) { -+ scx_ops_error("called on a task not being operated on"); -+ return false; -+ } -+ -+ return true; -+} -+ -+/** -+ * nldsq_next_task - Iterate to the next task in a non-local DSQ -+ * @dsq: user dsq being iterated -+ * @cur: current position, %NULL to start iteration -+ * @rev: walk backwards -+ * -+ * Returns %NULL when iteration is finished. -+ */ -+static struct task_struct *nldsq_next_task(struct scx_dispatch_q *dsq, -+ struct task_struct *cur, bool rev) -+{ -+ struct list_head *list_node; -+ struct scx_dsq_node *dsq_node; -+ -+ lockdep_assert_held(&dsq->lock); -+ -+ if (cur) -+ list_node = &cur->scx.dsq_node.list; -+ else -+ list_node = &dsq->list; -+ -+ /* find the next task, need to skip BPF iteration cursors */ -+ do { -+ if (rev) -+ list_node = list_node->prev; -+ else -+ list_node = list_node->next; -+ -+ if (list_node == &dsq->list) -+ return NULL; -+ -+ dsq_node = container_of(list_node, struct scx_dsq_node, list); -+ } while (dsq_node->flags & SCX_TASK_DSQ_CURSOR); -+ -+ return container_of(dsq_node, struct task_struct, scx.dsq_node); -+} -+ -+#define nldsq_for_each_task(p, dsq) \ -+ for ((p) = nldsq_next_task((dsq), NULL, false); (p); \ -+ (p) = nldsq_next_task((dsq), (p), false)) -+ -+ -+/* -+ * BPF DSQ iterator. Tasks in a non-local DSQ can be iterated in [reverse] -+ * dispatch order. BPF-visible iterator is opaque and larger to allow future -+ * changes without breaking backward compatibility. Can be used with -+ * bpf_for_each(). See bpf_iter_scx_dsq_*(). -+ */ -+enum scx_dsq_iter_flags { -+ /* iterate in the reverse dispatch order */ -+ SCX_DSQ_ITER_REV = 1LLU << 0, -+ -+ __SCX_DSQ_ITER_ALL_FLAGS = SCX_DSQ_ITER_REV, -+}; -+ -+struct bpf_iter_scx_dsq_kern { -+ /* -+ * Must be the first field. Used to work around BPF restriction and pass -+ * in the iterator pointer to scx_bpf_consume_task(). -+ */ -+ struct bpf_iter_scx_dsq_kern *self; -+ -+ struct scx_dsq_node cursor; -+ struct scx_dispatch_q *dsq; -+ u64 dsq_seq; -+ u64 flags; -+} __attribute__((aligned(8))); -+ -+struct bpf_iter_scx_dsq { -+ u64 __opaque[12]; -+} __attribute__((aligned(8))); -+ -+ -+/* -+ * SCX task iterator. -+ */ -+struct scx_task_iter { -+ struct sched_ext_entity cursor; -+ struct task_struct *locked; -+ struct rq *rq; -+ struct rq_flags rf; -+}; -+ -+/** -+ * scx_task_iter_init - Initialize a task iterator -+ * @iter: iterator to init -+ * -+ * Initialize @iter. Must be called with scx_tasks_lock held. Once initialized, -+ * @iter must eventually be exited with scx_task_iter_exit(). -+ * -+ * scx_tasks_lock may be released between this and the first next() call or -+ * between any two next() calls.
If scx_tasks_lock is released between two -+ * next() calls, the caller is responsible for ensuring that the task being -+ * iterated remains accessible either through RCU read lock or obtaining a -+ * reference count. -+ * -+ * All tasks which existed when the iteration started are guaranteed to be -+ * visited as long as they still exist. -+ */ -+static void scx_task_iter_init(struct scx_task_iter *iter) -+{ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ iter->cursor = (struct sched_ext_entity){ .flags = SCX_TASK_CURSOR }; -+ list_add(&iter->cursor.tasks_node, &scx_tasks); -+ iter->locked = NULL; -+} -+ -+/** -+ * scx_task_iter_rq_unlock - Unlock rq locked by a task iterator -+ * @iter: iterator to unlock rq for -+ * -+ * If @iter is in the middle of a locked iteration, it may be locking the rq of -+ * the task currently being visited. Unlock the rq if so. This function can be -+ * safely called anytime during an iteration. -+ * -+ * Returns %true if the rq @iter was locking is unlocked. %false if @iter was -+ * not locking an rq. -+ */ -+static bool scx_task_iter_rq_unlock(struct scx_task_iter *iter) -+{ -+ if (iter->locked) { -+ task_rq_unlock(iter->rq, iter->locked, &iter->rf); -+ iter->locked = NULL; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+/** -+ * scx_task_iter_exit - Exit a task iterator -+ * @iter: iterator to exit -+ * -+ * Exit a previously initialized @iter. Must be called with scx_tasks_lock held. -+ * If the iterator holds a task's rq lock, that rq lock is released. See -+ * scx_task_iter_init() for details. -+ */ -+static void scx_task_iter_exit(struct scx_task_iter *iter) -+{ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ scx_task_iter_rq_unlock(iter); -+ list_del_init(&iter->cursor.tasks_node); -+} -+ -+/** -+ * scx_task_iter_next - Next task -+ * @iter: iterator to walk -+ * -+ * Visit the next task. See scx_task_iter_init() for details. -+ */ -+static struct task_struct *scx_task_iter_next(struct scx_task_iter *iter) -+{ -+ struct list_head *cursor = &iter->cursor.tasks_node; -+ struct sched_ext_entity *pos; -+ -+ lockdep_assert_held(&scx_tasks_lock); -+ -+ list_for_each_entry(pos, cursor, tasks_node) { -+ if (&pos->tasks_node == &scx_tasks) -+ return NULL; -+ if (!(pos->flags & SCX_TASK_CURSOR)) { -+ list_move(cursor, &pos->tasks_node); -+ return container_of(pos, struct task_struct, scx); -+ } -+ } -+ -+ /* can't happen, should always terminate at scx_tasks above */ -+ BUG(); -+} -+ -+/** -+ * scx_task_iter_next_locked - Next non-idle task with its rq locked -+ * @iter: iterator to walk -+ * @include_dead: Whether we should include dead tasks in the iteration -+ * -+ * Visit the non-idle task with its rq lock held. Allows callers to specify -+ * whether they would like to filter out dead tasks. See scx_task_iter_init() -+ * for details. -+ */ -+static struct task_struct * -+scx_task_iter_next_locked(struct scx_task_iter *iter, bool include_dead) -+{ -+ struct task_struct *p; -+retry: -+ scx_task_iter_rq_unlock(iter); -+ -+ while ((p = scx_task_iter_next(iter))) { -+ /* -+ * is_idle_task() tests %PF_IDLE which may not be set for CPUs -+ * which haven't yet been onlined. Test sched_class directly. -+ */ -+ if (p->sched_class != &idle_sched_class) -+ break; -+ } -+ if (!p) -+ return NULL; -+ -+ iter->rq = task_rq_lock(p, &iter->rf); -+ iter->locked = p; -+ -+ /* -+ * If we see %TASK_DEAD, @p already disabled preemption, is about to do -+ * the final __schedule(), won't ever need to be scheduled again and can -+ * thus be safely ignored. 
If we don't see %TASK_DEAD, @p can't enter -+ * the final __schedle() while we're locking its rq and thus will stay -+ * alive until the rq is unlocked. -+ */ -+ if (!include_dead && READ_ONCE(p->__state) == TASK_DEAD) -+ goto retry; -+ -+ return p; -+} -+ -+static enum scx_ops_enable_state scx_ops_enable_state(void) -+{ -+ return atomic_read(&scx_ops_enable_state_var); -+} -+ -+static enum scx_ops_enable_state -+scx_ops_set_enable_state(enum scx_ops_enable_state to) -+{ -+ return atomic_xchg(&scx_ops_enable_state_var, to); -+} -+ -+static bool scx_ops_tryset_enable_state(enum scx_ops_enable_state to, -+ enum scx_ops_enable_state from) -+{ -+ int from_v = from; -+ -+ return atomic_try_cmpxchg(&scx_ops_enable_state_var, &from_v, to); -+} -+ -+static bool scx_ops_bypassing(void) -+{ -+ return unlikely(atomic_read(&scx_ops_bypass_depth)); -+} -+ -+/** -+ * wait_ops_state - Busy-wait the specified ops state to end -+ * @p: target task -+ * @opss: state to wait the end of -+ * -+ * Busy-wait for @p to transition out of @opss. This can only be used when the -+ * state part of @opss is %SCX_QUEUEING or %SCX_DISPATCHING. This function also -+ * has load_acquire semantics to ensure that the caller can see the updates made -+ * in the enqueueing and dispatching paths. -+ */ -+static void wait_ops_state(struct task_struct *p, unsigned long opss) -+{ -+ do { -+ cpu_relax(); -+ } while (atomic_long_read_acquire(&p->scx.ops_state) == opss); -+} -+ -+/** -+ * ops_cpu_valid - Verify a cpu number -+ * @cpu: cpu number which came from a BPF ops -+ * @where: extra information reported on error -+ * -+ * @cpu is a cpu number which came from the BPF scheduler and can be any value. -+ * Verify that it is in range and one of the possible cpus. If invalid, trigger -+ * an ops error. -+ */ -+static bool ops_cpu_valid(s32 cpu, const char *where) -+{ -+ if (likely(cpu >= 0 && cpu < nr_cpu_ids && cpu_possible(cpu))) { -+ return true; -+ } else { -+ scx_ops_error("invalid CPU %d%s%s", cpu, -+ where ? " " : "", where ?: ""); -+ return false; -+ } -+} -+ -+/** -+ * ops_sanitize_err - Sanitize a -errno value -+ * @ops_name: operation to blame on failure -+ * @err: -errno value to sanitize -+ * -+ * Verify @err is a valid -errno. If not, trigger scx_ops_error() and return -+ * -%EPROTO. This is necessary because returning a rogue -errno up the chain can -+ * cause misbehaviors. For an example, a large negative return from -+ * ops.init_task() triggers an oops when passed up the call chain because the -+ * value fails IS_ERR() test after being encoded with ERR_PTR() and then is -+ * handled as a pointer. -+ */ -+static int ops_sanitize_err(const char *ops_name, s32 err) -+{ -+ if (err < 0 && err >= -MAX_ERRNO) -+ return err; -+ -+ scx_ops_error("ops.%s() returned an invalid errno %d", ops_name, err); -+ return -EPROTO; -+} -+ -+/** -+ * touch_core_sched - Update timestamp used for core-sched task ordering -+ * @rq: rq to read clock from, must be locked -+ * @p: task to update the timestamp for -+ * -+ * Update @p->scx.core_sched_at timestamp. This is used by scx_prio_less() to -+ * implement global or local-DSQ FIFO ordering for core-sched. Should be called -+ * when a task becomes runnable and its turn on the CPU ends (e.g. slice -+ * exhaustion). -+ */ -+static void touch_core_sched(struct rq *rq, struct task_struct *p) -+{ -+#ifdef CONFIG_SCHED_CORE -+ /* -+ * It's okay to update the timestamp spuriously. Use -+ * sched_core_disabled() which is cheaper than enabled(). 
-+ */ -+ if (!sched_core_disabled()) -+ p->scx.core_sched_at = rq_clock_task(rq); -+#endif -+} -+ -+/** -+ * touch_core_sched_dispatch - Update core-sched timestamp on dispatch -+ * @rq: rq to read clock from, must be locked -+ * @p: task being dispatched -+ * -+ * If the BPF scheduler implements custom core-sched ordering via -+ * ops.core_sched_before(), @p->scx.core_sched_at is used to implement FIFO -+ * ordering within each local DSQ. This function is called from dispatch paths -+ * and updates @p->scx.core_sched_at if custom core-sched ordering is in effect. -+ */ -+static void touch_core_sched_dispatch(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ assert_clock_updated(rq); -+ -+#ifdef CONFIG_SCHED_CORE -+ if (SCX_HAS_OP(core_sched_before)) -+ touch_core_sched(rq, p); -+#endif -+} -+ -+static void update_curr_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ u64 now = rq_clock_task(rq); -+ u64 delta_exec; -+ -+ if (time_before_eq64(now, curr->se.exec_start)) -+ return; -+ -+ delta_exec = now - curr->se.exec_start; -+ curr->se.exec_start = now; -+ curr->se.sum_exec_runtime += delta_exec; -+ account_group_exec_runtime(curr, delta_exec); -+ cgroup_account_cputime(curr, delta_exec); -+ -+ if (curr->scx.slice != SCX_SLICE_INF) { -+ curr->scx.slice -= min(curr->scx.slice, delta_exec); -+ if (!curr->scx.slice) -+ touch_core_sched(rq, curr); -+ } -+} -+ -+static bool scx_dsq_priq_less(struct rb_node *node_a, -+ const struct rb_node *node_b) -+{ -+ const struct task_struct *a = -+ container_of(node_a, struct task_struct, scx.dsq_node.priq); -+ const struct task_struct *b = -+ container_of(node_b, struct task_struct, scx.dsq_node.priq); -+ -+ return time_before64(a->scx.dsq_vtime, b->scx.dsq_vtime); -+} -+ -+static void dsq_mod_nr(struct scx_dispatch_q *dsq, s32 delta) -+{ -+ /* scx_bpf_dsq_nr_queued() reads ->nr without locking, use WRITE_ONCE() */ -+ WRITE_ONCE(dsq->nr, dsq->nr + delta); -+} -+ -+static void dispatch_enqueue(struct scx_dispatch_q *dsq, struct task_struct *p, -+ u64 enq_flags) -+{ -+ bool is_local = dsq->id == SCX_DSQ_LOCAL; -+ -+ WARN_ON_ONCE(p->scx.dsq || !list_empty(&p->scx.dsq_node.list)); -+ WARN_ON_ONCE((p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) || -+ !RB_EMPTY_NODE(&p->scx.dsq_node.priq)); -+ -+ if (!is_local) { -+ raw_spin_lock(&dsq->lock); -+ if (unlikely(dsq->id == SCX_DSQ_INVALID)) { -+ scx_ops_error("attempting to dispatch to a destroyed dsq"); -+ /* fall back to the global dsq */ -+ raw_spin_unlock(&dsq->lock); -+ dsq = &scx_dsq_global; -+ raw_spin_lock(&dsq->lock); -+ } -+ } -+ -+ if (unlikely((dsq->id & SCX_DSQ_FLAG_BUILTIN) && -+ (enq_flags & SCX_ENQ_DSQ_PRIQ))) { -+ /* -+ * SCX_DSQ_LOCAL and SCX_DSQ_GLOBAL DSQs always consume from -+ * their FIFO queues. To avoid confusion and accidentally -+ * starving vtime-dispatched tasks by FIFO-dispatched tasks, we -+ * disallow any internal DSQ from doing vtime ordering of -+ * tasks. -+ */ -+ scx_ops_error("cannot use vtime ordering for built-in DSQs"); -+ enq_flags &= ~SCX_ENQ_DSQ_PRIQ; -+ } -+ -+ if (enq_flags & SCX_ENQ_DSQ_PRIQ) { -+ struct rb_node *rbp; -+ -+ /* -+ * A PRIQ DSQ shouldn't be using FIFO enqueueing. As tasks are -+ * linked to both the rbtree and list on PRIQs, this can only be -+ * tested easily when adding the first task. 
-+ */ -+ if (unlikely(RB_EMPTY_ROOT(&dsq->priq) && -+ nldsq_next_task(dsq, NULL, false))) -+ scx_ops_error("DSQ ID 0x%016llx already had FIFO-enqueued tasks", -+ dsq->id); -+ -+ p->scx.dsq_node.flags |= SCX_TASK_DSQ_ON_PRIQ; -+ rb_add(&p->scx.dsq_node.priq, &dsq->priq, scx_dsq_priq_less); -+ -+ /* -+ * Find the previous task and insert after it on the list so -+ * that @dsq->list is vtime ordered. -+ */ -+ rbp = rb_prev(&p->scx.dsq_node.priq); -+ if (rbp) { -+ struct task_struct *prev = -+ container_of(rbp, struct task_struct, -+ scx.dsq_node.priq); -+ list_add(&p->scx.dsq_node.list, &prev->scx.dsq_node.list); -+ } else { -+ list_add(&p->scx.dsq_node.list, &dsq->list); -+ } -+ } else { -+ /* a FIFO DSQ shouldn't be using PRIQ enqueuing */ -+ if (unlikely(!RB_EMPTY_ROOT(&dsq->priq))) -+ scx_ops_error("DSQ ID 0x%016llx already had PRIQ-enqueued tasks", -+ dsq->id); -+ -+ if (enq_flags & (SCX_ENQ_HEAD | SCX_ENQ_PREEMPT)) -+ list_add(&p->scx.dsq_node.list, &dsq->list); -+ else -+ list_add_tail(&p->scx.dsq_node.list, &dsq->list); -+ } -+ -+ /* seq records the order tasks are queued, used by BPF DSQ iterator */ -+ dsq->seq++; -+ p->scx.dsq_seq = dsq->seq; -+ -+ dsq_mod_nr(dsq, 1); -+ WRITE_ONCE(p->scx.dsq, dsq); -+ -+ /* -+ * scx.ddsp_dsq_id and scx.ddsp_enq_flags are only relevant on the -+ * direct dispatch path, but we clear them here because the direct -+ * dispatch verdict may be overridden on the enqueue path during e.g. -+ * bypass. -+ */ -+ p->scx.ddsp_dsq_id = SCX_DSQ_INVALID; -+ p->scx.ddsp_enq_flags = 0; -+ -+ /* -+ * We're transitioning out of QUEUEING or DISPATCHING. store_release to -+ * match waiters' load_acquire. -+ */ -+ if (enq_flags & SCX_ENQ_CLEAR_OPSS) -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ if (is_local) { -+ struct rq *rq = container_of(dsq, struct rq, scx.local_dsq); -+ bool preempt = false; -+ -+ if ((enq_flags & SCX_ENQ_PREEMPT) && p != rq->curr && -+ rq->curr->sched_class == &ext_sched_class) { -+ rq->curr->scx.slice = 0; -+ preempt = true; -+ } -+ -+ if (preempt || sched_class_above(&ext_sched_class, -+ rq->curr->sched_class)) -+ resched_curr(rq); -+ } else { -+ raw_spin_unlock(&dsq->lock); -+ } -+} -+ -+static void task_unlink_from_dsq(struct task_struct *p, -+ struct scx_dispatch_q *dsq) -+{ -+ if (p->scx.dsq_node.flags & SCX_TASK_DSQ_ON_PRIQ) { -+ rb_erase(&p->scx.dsq_node.priq, &dsq->priq); -+ RB_CLEAR_NODE(&p->scx.dsq_node.priq); -+ p->scx.dsq_node.flags &= ~SCX_TASK_DSQ_ON_PRIQ; -+ } -+ -+ list_del_init(&p->scx.dsq_node.list); -+} -+ -+static bool task_linked_on_dsq(struct task_struct *p) -+{ -+ return !list_empty(&p->scx.dsq_node.list); -+} -+ -+static void dispatch_dequeue(struct rq *rq, struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq = p->scx.dsq; -+ bool is_local = dsq == &rq->scx.local_dsq; -+ -+ if (!dsq) { -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ /* -+ * When dispatching directly from the BPF scheduler to a local -+ * DSQ, the task isn't associated with any DSQ but -+ * @p->scx.holding_cpu may be set under the protection of -+ * %SCX_OPSS_DISPATCHING. -+ */ -+ if (p->scx.holding_cpu >= 0) -+ p->scx.holding_cpu = -1; -+ return; -+ } -+ -+ if (!is_local) -+ raw_spin_lock(&dsq->lock); -+ -+ /* -+ * Now that we hold @dsq->lock, @p->holding_cpu and @p->scx.dsq_node -+ * can't change underneath us. 
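/*
 * Illustrative aside (not kernel code): a user-space sketch of the invariant
 * dispatch_enqueue() maintains for SCX_ENQ_DSQ_PRIQ above - the task list of a
 * priority DSQ stays sorted by dsq_vtime. The kernel additionally keeps an
 * rbtree for O(log n) insertion; this sketch only shows the ordered-list idea.
 * Names are invented for the example.
 */
#include <stdint.h>
#include <stdio.h>

struct fake_task {
    uint64_t vtime;
    struct fake_task *next;
};

/* insert @p before the first task with a later vtime, keeping the list sorted */
static void priq_insert(struct fake_task **head, struct fake_task *p)
{
    struct fake_task **pos = head;

    while (*pos && (*pos)->vtime <= p->vtime)
        pos = &(*pos)->next;
    p->next = *pos;
    *pos = p;
}

int main(void)
{
    struct fake_task a = { .vtime = 300 }, b = { .vtime = 100 }, c = { .vtime = 200 };
    struct fake_task *head = NULL, *p;

    priq_insert(&head, &a);
    priq_insert(&head, &b);
    priq_insert(&head, &c);
    for (p = head; p; p = p->next)
        printf("vtime %llu\n", (unsigned long long)p->vtime);   /* 100, 200, 300 */
    return 0;
}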
-+ */ -+ if (p->scx.holding_cpu < 0) { -+ /* @p must still be on @dsq, dequeue */ -+ WARN_ON_ONCE(!task_linked_on_dsq(p)); -+ task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); -+ } else { -+ /* -+ * We're racing against dispatch_to_local_dsq() which already -+ * removed @p from @dsq and set @p->scx.holding_cpu. Clear the -+ * holding_cpu which tells dispatch_to_local_dsq() that it lost -+ * the race. -+ */ -+ WARN_ON_ONCE(task_linked_on_dsq(p)); -+ p->scx.holding_cpu = -1; -+ } -+ WRITE_ONCE(p->scx.dsq, NULL); -+ -+ if (!is_local) -+ raw_spin_unlock(&dsq->lock); -+} -+ -+static struct scx_dispatch_q *find_user_dsq(u64 dsq_id) -+{ -+ return rhashtable_lookup_fast(&dsq_hash, &dsq_id, dsq_hash_params); -+} -+ -+static struct scx_dispatch_q *find_non_local_dsq(u64 dsq_id) -+{ -+ lockdep_assert(rcu_read_lock_any_held()); -+ -+ if (dsq_id == SCX_DSQ_GLOBAL) -+ return &scx_dsq_global; -+ else -+ return find_user_dsq(dsq_id); -+} -+ -+static struct scx_dispatch_q *find_dsq_for_dispatch(struct rq *rq, u64 dsq_id, -+ struct task_struct *p) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ if (dsq_id == SCX_DSQ_LOCAL) -+ return &rq->scx.local_dsq; -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("non-existent DSQ 0x%llx for %s[%d]", -+ dsq_id, p->comm, p->pid); -+ return &scx_dsq_global; -+ } -+ -+ return dsq; -+} -+ -+static void mark_direct_dispatch(struct task_struct *ddsp_task, -+ struct task_struct *p, u64 dsq_id, -+ u64 enq_flags) -+{ -+ /* -+ * Mark that dispatch already happened from ops.select_cpu() or -+ * ops.enqueue() by spoiling direct_dispatch_task with a non-NULL value -+ * which can never match a valid task pointer. -+ */ -+ __this_cpu_write(direct_dispatch_task, ERR_PTR(-ESRCH)); -+ -+ /* @p must match the task on the enqueue path */ -+ if (unlikely(p != ddsp_task)) { -+ if (IS_ERR(ddsp_task)) -+ scx_ops_error("%s[%d] already direct-dispatched", -+ p->comm, p->pid); -+ else -+ scx_ops_error("scheduling for %s[%d] but trying to direct-dispatch %s[%d]", -+ ddsp_task->comm, ddsp_task->pid, -+ p->comm, p->pid); -+ return; -+ } -+ -+ /* -+ * %SCX_DSQ_LOCAL_ON is not supported during direct dispatch because -+ * dispatching to the local DSQ of a different CPU requires unlocking -+ * the current rq which isn't allowed in the enqueue path. Use -+ * ops.select_cpu() to be on the target CPU and then %SCX_DSQ_LOCAL. -+ */ -+ if (unlikely((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON)) { -+ scx_ops_error("SCX_DSQ_LOCAL_ON can't be used for direct-dispatch"); -+ return; -+ } -+ -+ WARN_ON_ONCE(p->scx.ddsp_dsq_id != SCX_DSQ_INVALID); -+ WARN_ON_ONCE(p->scx.ddsp_enq_flags); -+ -+ p->scx.ddsp_dsq_id = dsq_id; -+ p->scx.ddsp_enq_flags = enq_flags; -+} -+ -+static void direct_dispatch(struct task_struct *p, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ -+ touch_core_sched_dispatch(task_rq(p), p); -+ -+ enq_flags |= (p->scx.ddsp_enq_flags | SCX_ENQ_CLEAR_OPSS); -+ dsq = find_dsq_for_dispatch(task_rq(p), p->scx.ddsp_dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags); -+} -+ -+static bool scx_rq_online(struct rq *rq) -+{ -+ return likely(rq->scx.flags & SCX_RQ_ONLINE); -+} -+ -+static void do_enqueue_task(struct rq *rq, struct task_struct *p, u64 enq_flags, -+ int sticky_cpu) -+{ -+ struct task_struct **ddsp_taskp; -+ unsigned long qseq; -+ -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ /* rq migration */ -+ if (sticky_cpu == cpu_of(rq)) -+ goto local_norefill; -+ -+ /* -+ * If !rq->online, we already told the BPF scheduler that the CPU is -+ * offline. 
We're just trying to on/offline the CPU. Don't bother the -+ * BPF scheduler. -+ */ -+ if (!scx_rq_online(rq)) -+ goto local; -+ -+ if (scx_ops_bypassing()) { -+ if (enq_flags & SCX_ENQ_LAST) -+ goto local; -+ else -+ goto global; -+ } -+ -+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) -+ goto direct; -+ -+ /* see %SCX_OPS_ENQ_EXITING */ -+ if (!static_branch_unlikely(&scx_ops_enq_exiting) && -+ unlikely(p->flags & PF_EXITING)) -+ goto local; -+ -+ /* see %SCX_OPS_ENQ_LAST */ -+ if (!static_branch_unlikely(&scx_ops_enq_last) && -+ (enq_flags & SCX_ENQ_LAST)) -+ goto local; -+ -+ if (!SCX_HAS_OP(enqueue)) -+ goto global; -+ -+ /* DSQ bypass didn't trigger, enqueue on the BPF scheduler */ -+ qseq = rq->scx.ops_qseq++ << SCX_OPSS_QSEQ_SHIFT; -+ -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ atomic_long_set(&p->scx.ops_state, SCX_OPSS_QUEUEING | qseq); -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ SCX_CALL_OP_TASK(SCX_KF_ENQUEUE, enqueue, p, enq_flags); -+ -+ *ddsp_taskp = NULL; -+ if (p->scx.ddsp_dsq_id != SCX_DSQ_INVALID) -+ goto direct; -+ -+ /* -+ * If not directly dispatched, QUEUEING isn't clear yet and dispatch or -+ * dequeue may be waiting. The store_release matches their load_acquire. -+ */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_QUEUED | qseq); -+ return; -+ -+direct: -+ direct_dispatch(p, enq_flags); -+ return; -+ -+local: -+ /* -+ * For task-ordering, slice refill must be treated as implying the end -+ * of the current slice. Otherwise, the longer @p stays on the CPU, the -+ * higher priority it becomes from scx_prio_less()'s POV. -+ */ -+ touch_core_sched(rq, p); -+ p->scx.slice = SCX_SLICE_DFL; -+local_norefill: -+ dispatch_enqueue(&rq->scx.local_dsq, p, enq_flags); -+ return; -+ -+global: -+ touch_core_sched(rq, p); /* see the comment in local: */ -+ p->scx.slice = SCX_SLICE_DFL; -+ dispatch_enqueue(&scx_dsq_global, p, enq_flags); -+} -+ -+static bool task_runnable(const struct task_struct *p) -+{ -+ return !list_empty(&p->scx.runnable_node); -+} -+ -+static void set_task_runnable(struct rq *rq, struct task_struct *p) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ if (p->scx.flags & SCX_TASK_RESET_RUNNABLE_AT) { -+ p->scx.runnable_at = jiffies; -+ p->scx.flags &= ~SCX_TASK_RESET_RUNNABLE_AT; -+ } -+ -+ /* -+ * list_add_tail() must be used. scx_ops_bypass() depends on tasks being -+ * appened to the runnable_list. -+ */ -+ list_add_tail(&p->scx.runnable_node, &rq->scx.runnable_list); -+} -+ -+static void clr_task_runnable(struct task_struct *p, bool reset_runnable_at) -+{ -+ list_del_init(&p->scx.runnable_node); -+ if (reset_runnable_at) -+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; -+} -+ -+static void enqueue_task_scx(struct rq *rq, struct task_struct *p, int enq_flags) -+{ -+ int sticky_cpu = p->scx.sticky_cpu; -+ -+ enq_flags |= rq->scx.extra_enq_flags; -+ -+ if (sticky_cpu >= 0) -+ p->scx.sticky_cpu = -1; -+ -+ /* -+ * Restoring a running task will be immediately followed by -+ * set_next_task_scx() which expects the task to not be on the BPF -+ * scheduler as tasks can only start running through local DSQs. Force -+ * direct-dispatch into the local DSQ by setting the sticky_cpu. 
-+ */ -+ if (unlikely(enq_flags & ENQUEUE_RESTORE) && task_current(rq, p)) -+ sticky_cpu = cpu_of(rq); -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ WARN_ON_ONCE(!task_runnable(p)); -+ return; -+ } -+ -+ set_task_runnable(rq, p); -+ p->scx.flags |= SCX_TASK_QUEUED; -+ rq->scx.nr_running++; -+ add_nr_running(rq, 1); -+ -+ if (SCX_HAS_OP(runnable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, runnable, p, enq_flags); -+ -+ if (enq_flags & SCX_ENQ_WAKEUP) -+ touch_core_sched(rq, p); -+ -+ do_enqueue_task(rq, p, enq_flags, sticky_cpu); -+} -+ -+static void ops_dequeue(struct task_struct *p, u64 deq_flags) -+{ -+ unsigned long opss; -+ -+ /* dequeue is always temporary, don't reset runnable_at */ -+ clr_task_runnable(p, false); -+ -+ /* acquire ensures that we see the preceding updates on QUEUED */ -+ opss = atomic_long_read_acquire(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_NONE: -+ break; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * QUEUEING is started and finished while holding @p's rq lock. -+ * As we're holding the rq lock now, we shouldn't see QUEUEING. -+ */ -+ BUG(); -+ case SCX_OPSS_QUEUED: -+ if (SCX_HAS_OP(dequeue)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, dequeue, p, deq_flags); -+ -+ if (atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_NONE)) -+ break; -+ fallthrough; -+ case SCX_OPSS_DISPATCHING: -+ /* -+ * If @p is being dispatched from the BPF scheduler to a DSQ, -+ * wait for the transfer to complete so that @p doesn't get -+ * added to its DSQ after dequeueing is complete. -+ * -+ * As we're waiting on DISPATCHING with the rq locked, the -+ * dispatching side shouldn't try to lock the rq while -+ * DISPATCHING is set. See dispatch_to_local_dsq(). -+ * -+ * DISPATCHING shouldn't have qseq set and control can reach -+ * here with NONE @opss from the above QUEUED case block. -+ * Explicitly wait on %SCX_OPSS_DISPATCHING instead of @opss. -+ */ -+ wait_ops_state(p, SCX_OPSS_DISPATCHING); -+ BUG_ON(atomic_long_read(&p->scx.ops_state) != SCX_OPSS_NONE); -+ break; -+ } -+} -+ -+static void dequeue_task_scx(struct rq *rq, struct task_struct *p, int deq_flags) -+{ -+ if (!(p->scx.flags & SCX_TASK_QUEUED)) { -+ WARN_ON_ONCE(task_runnable(p)); -+ return; -+ } -+ -+ ops_dequeue(p, deq_flags); -+ -+ /* -+ * A currently running task which is going off @rq first gets dequeued -+ * and then stops running. As we want running <-> stopping transitions -+ * to be contained within runnable <-> quiescent transitions, trigger -+ * ->stopping() early here instead of in put_prev_task_scx(). -+ * -+ * @p may go through multiple stopping <-> running transitions between -+ * here and put_prev_task_scx() if task attribute changes occur while -+ * balance_scx() leaves @rq unlocked. However, they don't contain any -+ * information meaningful to the BPF scheduler and can be suppressed by -+ * skipping the callbacks if the task is !QUEUED. 
-+ */ -+ if (SCX_HAS_OP(stopping) && task_current(rq, p)) { -+ update_curr_scx(rq); -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, false); -+ } -+ -+ if (SCX_HAS_OP(quiescent)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, quiescent, p, deq_flags); -+ -+ if (deq_flags & SCX_DEQ_SLEEP) -+ p->scx.flags |= SCX_TASK_DEQD_FOR_SLEEP; -+ else -+ p->scx.flags &= ~SCX_TASK_DEQD_FOR_SLEEP; -+ -+ p->scx.flags &= ~SCX_TASK_QUEUED; -+ rq->scx.nr_running--; -+ sub_nr_running(rq, 1); -+ -+ dispatch_dequeue(rq, p); -+} -+ -+static void yield_task_scx(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (SCX_HAS_OP(yield)) -+ SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, p, NULL); -+ else -+ p->scx.slice = 0; -+} -+ -+static bool yield_to_task_scx(struct rq *rq, struct task_struct *to) -+{ -+ struct task_struct *from = rq->curr; -+ -+ if (SCX_HAS_OP(yield)) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, yield, from, to); -+ else -+ return false; -+} -+ -+#ifdef CONFIG_SMP -+/** -+ * move_task_to_local_dsq - Move a task from a different rq to a local DSQ -+ * @rq: rq to move the task into, currently locked -+ * @p: task to move -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Move @p which is currently on a different rq to @rq's local DSQ. The caller -+ * must: -+ * -+ * 1. Start with exclusive access to @p either through its DSQ lock or -+ * %SCX_OPSS_DISPATCHING flag. -+ * -+ * 2. Set @p->scx.holding_cpu to raw_smp_processor_id(). -+ * -+ * 3. Remember task_rq(@p). Release the exclusive access so that we don't -+ * deadlock with dequeue. -+ * -+ * 4. Lock @rq and the task_rq from #3. -+ * -+ * 5. Call this function. -+ * -+ * Returns %true if @p was successfully moved. %false after racing dequeue and -+ * losing. -+ */ -+static bool move_task_to_local_dsq(struct rq *rq, struct task_struct *p, -+ u64 enq_flags) -+{ -+ struct rq *task_rq; -+ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * If dequeue got to @p while we were trying to lock both rq's, it'd -+ * have cleared @p->scx.holding_cpu to -1. While other cpus may have -+ * updated it to different values afterwards, as this operation can't be -+ * preempted or recurse, @p->scx.holding_cpu can never become -+ * raw_smp_processor_id() again before we're done. Thus, we can tell -+ * whether we lost to dequeue by testing whether @p->scx.holding_cpu is -+ * still raw_smp_processor_id(). -+ * -+ * See dispatch_dequeue() for the counterpart. -+ */ -+ if (unlikely(p->scx.holding_cpu != raw_smp_processor_id())) -+ return false; -+ -+ /* @p->rq couldn't have changed if we're still the holding cpu */ -+ task_rq = task_rq(p); -+ lockdep_assert_rq_held(task_rq); -+ -+ WARN_ON_ONCE(!cpumask_test_cpu(cpu_of(rq), p->cpus_ptr)); -+ deactivate_task(task_rq, p, 0); -+ set_task_cpu(p, cpu_of(rq)); -+ p->scx.sticky_cpu = cpu_of(rq); -+ -+ /* -+ * We want to pass scx-specific enq_flags but activate_task() will -+ * truncate the upper 32 bit. As we own @rq, we can pass them through -+ * @rq->scx.extra_enq_flags instead. -+ */ -+ WARN_ON_ONCE(rq->scx.extra_enq_flags); -+ rq->scx.extra_enq_flags = enq_flags; -+ activate_task(rq, p, 0); -+ rq->scx.extra_enq_flags = 0; -+ -+ return true; -+} -+ -+/** -+ * dispatch_to_local_dsq_lock - Ensure source and desitnation rq's are locked -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * We're holding @rq lock and trying to dispatch a task from @src_rq to -+ * @dst_rq's local DSQ and thus need to lock both @src_rq and @dst_rq. 
Whether -+ * @rq stays locked isn't important as long as the state is restored after -+ * dispatch_to_local_dsq_unlock(). -+ */ -+static void dispatch_to_local_dsq_lock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ rq_unpin_lock(rq, rf); -+ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(rq); -+ raw_spin_rq_lock(dst_rq); -+ } else if (rq == src_rq) { -+ double_lock_balance(rq, dst_rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == dst_rq) { -+ double_lock_balance(rq, src_rq); -+ rq_repin_lock(rq, rf); -+ } else { -+ raw_spin_rq_unlock(rq); -+ double_rq_lock(src_rq, dst_rq); -+ } -+} -+ -+/** -+ * dispatch_to_local_dsq_unlock - Undo dispatch_to_local_dsq_lock() -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @src_rq: rq to move task from -+ * @dst_rq: rq to move task to -+ * -+ * Unlock @src_rq and @dst_rq and ensure that @rq is locked on return. -+ */ -+static void dispatch_to_local_dsq_unlock(struct rq *rq, struct rq_flags *rf, -+ struct rq *src_rq, struct rq *dst_rq) -+{ -+ if (src_rq == dst_rq) { -+ raw_spin_rq_unlock(dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } else if (rq == src_rq) { -+ double_unlock_balance(rq, dst_rq); -+ } else if (rq == dst_rq) { -+ double_unlock_balance(rq, src_rq); -+ } else { -+ double_rq_unlock(src_rq, dst_rq); -+ raw_spin_rq_lock(rq); -+ rq_repin_lock(rq, rf); -+ } -+} -+#endif /* CONFIG_SMP */ -+ -+static void consume_local_task(struct rq *rq, struct scx_dispatch_q *dsq, -+ struct task_struct *p) -+{ -+ lockdep_assert_held(&dsq->lock); /* released on return */ -+ -+ /* @dsq is locked and @p is on this rq */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ list_add_tail(&p->scx.dsq_node.list, &rq->scx.local_dsq.list); -+ dsq_mod_nr(dsq, -1); -+ dsq_mod_nr(&rq->scx.local_dsq, 1); -+ WRITE_ONCE(p->scx.dsq, &rq->scx.local_dsq); -+ raw_spin_unlock(&dsq->lock); -+} -+ -+#ifdef CONFIG_SMP -+/* -+ * Similar to kernel/sched/core.c::is_cpu_allowed() but we're testing whether @p -+ * can be pulled to @rq. -+ */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (!cpumask_test_cpu(cpu, p->cpus_ptr)) -+ return false; -+ if (unlikely(is_migration_disabled(p))) -+ return false; -+ if (!(p->flags & PF_KTHREAD) && unlikely(!task_cpu_possible(cpu, p))) -+ return false; -+ if (!scx_rq_online(rq)) -+ return false; -+ return true; -+} -+ -+static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) -+{ -+ bool moved = false; -+ -+ lockdep_assert_held(&dsq->lock); /* released on return */ -+ -+ /* -+ * @dsq is locked and @p is on a remote rq. @p is currently protected by -+ * @dsq->lock. We want to pull @p to @rq but may deadlock if we grab -+ * @task_rq while holding @dsq and @rq locks. As dequeue can't drop the -+ * rq lock or fail, do a little dancing from our side. See -+ * move_task_to_local_dsq(). 
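/*
 * Illustrative aside (not kernel code): the generic idea behind the
 * double_rq_lock()/double_lock_balance() dances used above. When two runqueue
 * locks must be held at once, acquire them in one globally consistent order
 * (here: by address) so two CPUs doing the same thing cannot deadlock. This is
 * a hedged user-space sketch with invented names; the kernel helpers are more
 * involved and also handle re-pinning and lock re-acquisition.
 */
#include <pthread.h>
#include <stdint.h>
#include <stdio.h>

struct fake_rq {
    pthread_mutex_t lock;
    int nr_running;
};

static void fake_double_rq_lock(struct fake_rq *a, struct fake_rq *b)
{
    if (a == b) {
        pthread_mutex_lock(&a->lock);
        return;
    }
    if ((uintptr_t)a > (uintptr_t)b) {  /* consistent order: lower address first */
        struct fake_rq *tmp = a; a = b; b = tmp;
    }
    pthread_mutex_lock(&a->lock);
    pthread_mutex_lock(&b->lock);
}

static void fake_double_rq_unlock(struct fake_rq *a, struct fake_rq *b)
{
    pthread_mutex_unlock(&a->lock);
    if (a != b)
        pthread_mutex_unlock(&b->lock);
}

int main(void)
{
    struct fake_rq src = { PTHREAD_MUTEX_INITIALIZER, 1 };
    struct fake_rq dst = { PTHREAD_MUTEX_INITIALIZER, 0 };

    fake_double_rq_lock(&src, &dst);
    dst.nr_running++;                   /* "migrate" a task under both locks */
    src.nr_running--;
    fake_double_rq_unlock(&src, &dst);
    printf("src=%d dst=%d\n", src.nr_running, dst.nr_running);
    return 0;
}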
-+ */ -+ WARN_ON_ONCE(p->scx.holding_cpu >= 0); -+ task_unlink_from_dsq(p, dsq); -+ dsq_mod_nr(dsq, -1); -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ raw_spin_unlock(&dsq->lock); -+ -+ rq_unpin_lock(rq, rf); -+ double_lock_balance(rq, task_rq); -+ rq_repin_lock(rq, rf); -+ -+ moved = move_task_to_local_dsq(rq, p, 0); -+ -+ double_unlock_balance(rq, task_rq); -+ -+ return moved; -+} -+#else /* CONFIG_SMP */ -+static bool task_can_run_on_remote_rq(struct task_struct *p, struct rq *rq) { return false; } -+static bool consume_remote_task(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq, -+ struct task_struct *p, struct rq *task_rq) { return false; } -+#endif /* CONFIG_SMP */ -+ -+static bool consume_dispatch_q(struct rq *rq, struct rq_flags *rf, -+ struct scx_dispatch_q *dsq) -+{ -+ struct task_struct *p; -+retry: -+ if (list_empty(&dsq->list)) -+ return false; -+ -+ raw_spin_lock(&dsq->lock); -+ -+ nldsq_for_each_task(p, dsq) { -+ struct rq *task_rq = task_rq(p); -+ -+ if (rq == task_rq) { -+ consume_local_task(rq, dsq, p); -+ return true; -+ } -+ -+ if (task_can_run_on_remote_rq(p, rq)) { -+ if (likely(consume_remote_task(rq, rf, dsq, p, task_rq))) -+ return true; -+ goto retry; -+ } -+ } -+ -+ raw_spin_unlock(&dsq->lock); -+ return false; -+} -+ -+enum dispatch_to_local_dsq_ret { -+ DTL_DISPATCHED, /* successfully dispatched */ -+ DTL_LOST, /* lost race to dequeue */ -+ DTL_NOT_LOCAL, /* destination is not a local DSQ */ -+ DTL_INVALID, /* invalid local dsq_id */ -+}; -+ -+/** -+ * dispatch_to_local_dsq - Dispatch a task to a local dsq -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @dsq_id: destination dsq ID -+ * @p: task to dispatch -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * We're holding @rq lock and want to dispatch @p to the local DSQ identified by -+ * @dsq_id. This function performs all the synchronization dancing needed -+ * because local DSQs are protected with rq locks. -+ * -+ * The caller must have exclusive ownership of @p (e.g. through -+ * %SCX_OPSS_DISPATCHING). -+ */ -+static enum dispatch_to_local_dsq_ret -+dispatch_to_local_dsq(struct rq *rq, struct rq_flags *rf, u64 dsq_id, -+ struct task_struct *p, u64 enq_flags) -+{ -+ struct rq *src_rq = task_rq(p); -+ struct rq *dst_rq; -+ -+ /* -+ * We're synchronized against dequeue through DISPATCHING. As @p can't -+ * be dequeued, its task_rq and cpus_allowed are stable too. -+ */ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ dst_rq = rq; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (!ops_cpu_valid(cpu, "in SCX_DSQ_LOCAL_ON dispatch verdict")) -+ return DTL_INVALID; -+ dst_rq = cpu_rq(cpu); -+ } else { -+ return DTL_NOT_LOCAL; -+ } -+ -+ /* if dispatching to @rq that @p is already on, no lock dancing needed */ -+ if (rq == src_rq && rq == dst_rq) { -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags | SCX_ENQ_CLEAR_OPSS); -+ return DTL_DISPATCHED; -+ } -+ -+#ifdef CONFIG_SMP -+ if (cpumask_test_cpu(cpu_of(dst_rq), p->cpus_ptr)) { -+ struct rq *locked_dst_rq = dst_rq; -+ bool dsp; -+ -+ /* -+ * @p is on a possibly remote @src_rq which we need to lock to -+ * move the task. If dequeue is in progress, it'd be locking -+ * @src_rq and waiting on DISPATCHING, so we can't grab @src_rq -+ * lock while holding DISPATCHING. 
-+ * -+ * As DISPATCHING guarantees that @p is wholly ours, we can -+ * pretend that we're moving from a DSQ and use the same -+ * mechanism - mark the task under transfer with holding_cpu, -+ * release DISPATCHING and then follow the same protocol. -+ */ -+ p->scx.holding_cpu = raw_smp_processor_id(); -+ -+ /* store_release ensures that dequeue sees the above */ -+ atomic_long_set_release(&p->scx.ops_state, SCX_OPSS_NONE); -+ -+ dispatch_to_local_dsq_lock(rq, rf, src_rq, locked_dst_rq); -+ -+ /* -+ * We don't require the BPF scheduler to avoid dispatching to -+ * offline CPUs mostly for convenience but also because CPUs can -+ * go offline between scx_bpf_dispatch() calls and here. If @p -+ * is destined to an offline CPU, queue it on its current CPU -+ * instead, which should always be safe. As this is an allowed -+ * behavior, don't trigger an ops error. -+ */ -+ if (!scx_rq_online(dst_rq)) -+ dst_rq = src_rq; -+ -+ if (src_rq == dst_rq) { -+ /* -+ * As @p is staying on the same rq, there's no need to -+ * go through the full deactivate/activate cycle. -+ * Optimize by abbreviating the operations in -+ * move_task_to_local_dsq(). -+ */ -+ dsp = p->scx.holding_cpu == raw_smp_processor_id(); -+ if (likely(dsp)) { -+ p->scx.holding_cpu = -1; -+ dispatch_enqueue(&dst_rq->scx.local_dsq, p, -+ enq_flags); -+ } -+ } else { -+ dsp = move_task_to_local_dsq(dst_rq, p, enq_flags); -+ } -+ -+ /* if the destination CPU is idle, wake it up */ -+ if (dsp && p->sched_class < dst_rq->curr->sched_class) -+ resched_curr(dst_rq); -+ -+ dispatch_to_local_dsq_unlock(rq, rf, src_rq, locked_dst_rq); -+ -+ return dsp ? DTL_DISPATCHED : DTL_LOST; -+ } -+#endif /* CONFIG_SMP */ -+ -+ scx_ops_error("SCX_DSQ_LOCAL[_ON] verdict target cpu %d not allowed for %s[%d]", -+ cpu_of(dst_rq), p->comm, p->pid); -+ return DTL_INVALID; -+} -+ -+/** -+ * finish_dispatch - Asynchronously finish dispatching a task -+ * @rq: current rq which is locked -+ * @rf: rq_flags to use when unlocking @rq -+ * @p: task to finish dispatching -+ * @qseq_at_dispatch: qseq when @p started getting dispatched -+ * @dsq_id: destination DSQ ID -+ * @enq_flags: %SCX_ENQ_* -+ * -+ * Dispatching to local DSQs may need to wait for queueing to complete or -+ * require rq lock dancing. As we don't wanna do either while inside -+ * ops.dispatch() to avoid locking order inversion, we split dispatching into -+ * two parts. scx_bpf_dispatch() which is called by ops.dispatch() records the -+ * task and its qseq. Once ops.dispatch() returns, this function is called to -+ * finish up. -+ * -+ * There is no guarantee that @p is still valid for dispatching or even that it -+ * was valid in the first place. Make sure that the task is still owned by the -+ * BPF scheduler and claim the ownership before dispatching. -+ */ -+static void finish_dispatch(struct rq *rq, struct rq_flags *rf, -+ struct task_struct *p, -+ unsigned long qseq_at_dispatch, -+ u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long opss; -+ -+ touch_core_sched_dispatch(rq, p); -+retry: -+ /* -+ * No need for _acquire here. @p is accessed only after a successful -+ * try_cmpxchg to DISPATCHING. 
-+ */ -+ opss = atomic_long_read(&p->scx.ops_state); -+ -+ switch (opss & SCX_OPSS_STATE_MASK) { -+ case SCX_OPSS_DISPATCHING: -+ case SCX_OPSS_NONE: -+ /* someone else already got to it */ -+ return; -+ case SCX_OPSS_QUEUED: -+ /* -+ * If qseq doesn't match, @p has gone through at least one -+ * dispatch/dequeue and re-enqueue cycle between -+ * scx_bpf_dispatch() and here and we have no claim on it. -+ */ -+ if ((opss & SCX_OPSS_QSEQ_MASK) != qseq_at_dispatch) -+ return; -+ -+ /* -+ * While we know @p is accessible, we don't yet have a claim on -+ * it - the BPF scheduler is allowed to dispatch tasks -+ * spuriously and there can be a racing dequeue attempt. Let's -+ * claim @p by atomically transitioning it from QUEUED to -+ * DISPATCHING. -+ */ -+ if (likely(atomic_long_try_cmpxchg(&p->scx.ops_state, &opss, -+ SCX_OPSS_DISPATCHING))) -+ break; -+ goto retry; -+ case SCX_OPSS_QUEUEING: -+ /* -+ * do_enqueue_task() is in the process of transferring the task -+ * to the BPF scheduler while holding @p's rq lock. As we aren't -+ * holding any kernel or BPF resource that the enqueue path may -+ * depend upon, it's safe to wait. -+ */ -+ wait_ops_state(p, opss); -+ goto retry; -+ } -+ -+ BUG_ON(!(p->scx.flags & SCX_TASK_QUEUED)); -+ -+ switch (dispatch_to_local_dsq(rq, rf, dsq_id, p, enq_flags)) { -+ case DTL_DISPATCHED: -+ break; -+ case DTL_LOST: -+ break; -+ case DTL_INVALID: -+ dsq_id = SCX_DSQ_GLOBAL; -+ fallthrough; -+ case DTL_NOT_LOCAL: -+ dsq = find_dsq_for_dispatch(cpu_rq(raw_smp_processor_id()), -+ dsq_id, p); -+ dispatch_enqueue(dsq, p, enq_flags | SCX_ENQ_CLEAR_OPSS); -+ break; -+ } -+} -+ -+static void flush_dispatch_buf(struct rq *rq, struct rq_flags *rf) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ u32 u; -+ -+ for (u = 0; u < dspc->cursor; u++) { -+ struct scx_dsp_buf_ent *ent = &dspc->buf[u]; -+ -+ finish_dispatch(rq, rf, ent->task, ent->qseq, ent->dsq_id, -+ ent->enq_flags); -+ } -+ -+ dspc->nr_tasks += dspc->cursor; -+ dspc->cursor = 0; -+} -+ -+static int balance_one(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf, bool local) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ bool prev_on_scx = prev->sched_class == &ext_sched_class; -+ int nr_loops = SCX_DSP_MAX_LOOPS; -+ bool has_tasks = false; -+ -+ lockdep_assert_rq_held(rq); -+ rq->scx.flags |= SCX_RQ_BALANCING; -+ -+ if (static_branch_unlikely(&scx_ops_cpu_preempt) && -+ unlikely(rq->scx.cpu_released)) { -+ /* -+ * If the previous sched_class for the current CPU was not SCX, -+ * notify the BPF scheduler that it again has control of the -+ * core. This callback complements ->cpu_release(), which is -+ * emitted in scx_next_task_picked(). -+ */ -+ if (SCX_HAS_OP(cpu_acquire)) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cpu_acquire, cpu_of(rq), -+ NULL); -+ rq->scx.cpu_released = false; -+ } -+ -+ if (prev_on_scx) { -+ WARN_ON_ONCE(local && (prev->scx.flags & SCX_TASK_BAL_KEEP)); -+ update_curr_scx(rq); -+ -+ /* -+ * If @prev is runnable & has slice left, it has priority and -+ * fetching more just increases latency for the fetched tasks. -+ * Tell put_prev_task_scx() to put @prev on local_dsq. If the -+ * BPF scheduler wants to handle this explicitly, it should -+ * implement ->cpu_released(). -+ * -+ * See scx_ops_disable_workfn() for the explanation on the -+ * bypassing test. -+ * -+ * When balancing a remote CPU for core-sched, there won't be a -+ * following put_prev_task_scx() call and we don't own -+ * %SCX_TASK_BAL_KEEP. 
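/*
 * Illustrative aside (not kernel code): the claim step performed by
 * finish_dispatch() above, sketched with C11 atomics in user space. The state
 * word packs a state in the low bits and a queue sequence number above them;
 * a dispatcher only owns the task if it can atomically flip QUEUED|qseq to
 * DISPATCHING. State values, widths, and names here are invented for the
 * sketch and do not match the real SCX_OPSS_* encoding.
 */
#include <stdatomic.h>
#include <stdbool.h>
#include <stdio.h>

#define ST_QUEUED       0x1ul
#define ST_DISPATCHING  0x2ul
#define QSEQ_SHIFT      2

static _Atomic unsigned long ops_state;

static bool try_claim(unsigned long qseq_at_dispatch)
{
    unsigned long old = ST_QUEUED | (qseq_at_dispatch << QSEQ_SHIFT);

    /* fails if the task was re-enqueued (qseq changed) or already claimed */
    return atomic_compare_exchange_strong(&ops_state, &old, ST_DISPATCHING);
}

int main(void)
{
    atomic_store(&ops_state, ST_QUEUED | (7ul << QSEQ_SHIFT));
    printf("claim with stale qseq: %d\n", try_claim(6));   /* 0 */
    printf("claim with right qseq: %d\n", try_claim(7));   /* 1 */
    printf("second claim:          %d\n", try_claim(7));   /* 0, already claimed */
    return 0;
}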
Instead, pick_task_scx() will test the -+ * same conditions later and pick @rq->curr accordingly. -+ */ -+ if ((prev->scx.flags & SCX_TASK_QUEUED) && -+ prev->scx.slice && !scx_ops_bypassing()) { -+ if (local) -+ prev->scx.flags |= SCX_TASK_BAL_KEEP; -+ goto has_tasks; -+ } -+ } -+ -+ /* if there already are tasks to run, nothing to do */ -+ if (rq->scx.local_dsq.nr) -+ goto has_tasks; -+ -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ goto has_tasks; -+ -+ if (!SCX_HAS_OP(dispatch) || scx_ops_bypassing() || !scx_rq_online(rq)) -+ goto out; -+ -+ dspc->rq = rq; -+ dspc->rf = rf; -+ -+ /* -+ * The dispatch loop. Because flush_dispatch_buf() may drop the rq lock, -+ * the local DSQ might still end up empty after a successful -+ * ops.dispatch(). If the local DSQ is empty even after ops.dispatch() -+ * produced some tasks, retry. The BPF scheduler may depend on this -+ * looping behavior to simplify its implementation. -+ */ -+ do { -+ dspc->nr_tasks = 0; -+ -+ SCX_CALL_OP(SCX_KF_DISPATCH, dispatch, cpu_of(rq), -+ prev_on_scx ? prev : NULL); -+ -+ flush_dispatch_buf(rq, rf); -+ -+ if (rq->scx.local_dsq.nr) -+ goto has_tasks; -+ if (consume_dispatch_q(rq, rf, &scx_dsq_global)) -+ goto has_tasks; -+ -+ /* -+ * ops.dispatch() can trap us in this loop by repeatedly -+ * dispatching ineligible tasks. Break out once in a while to -+ * allow the watchdog to run. As IRQ can't be enabled in -+ * balance(), we want to complete this scheduling cycle and then -+ * start a new one. IOW, we want to call resched_curr() on the -+ * next, most likely idle, task, not the current one. Use -+ * scx_bpf_kick_cpu() for deferred kicking. -+ */ -+ if (unlikely(!--nr_loops)) { -+ scx_bpf_kick_cpu(cpu_of(rq), 0); -+ break; -+ } -+ } while (dspc->nr_tasks); -+ -+ goto out; -+ -+has_tasks: -+ has_tasks = true; -+out: -+ rq->scx.flags &= ~SCX_RQ_BALANCING; -+ return has_tasks; -+} -+ -+static int balance_scx(struct rq *rq, struct task_struct *prev, -+ struct rq_flags *rf) -+{ -+ int ret; -+ -+ ret = balance_one(rq, prev, rf, true); -+ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * When core-sched is enabled, this ops.balance() call will be followed -+ * by put_prev_scx() and pick_task_scx() on this CPU and pick_task_scx() -+ * on the SMT siblings. Balance the siblings too. -+ */ -+ if (sched_core_enabled(rq)) { -+ const struct cpumask *smt_mask = cpu_smt_mask(cpu_of(rq)); -+ int scpu; -+ -+ for_each_cpu_andnot(scpu, smt_mask, cpumask_of(cpu_of(rq))) { -+ struct rq *srq = cpu_rq(scpu); -+ struct rq_flags srf; -+ struct task_struct *sprev = srq->curr; -+ -+ /* -+ * While core-scheduling, rq lock is shared among -+ * siblings but the debug annotations and rq clock -+ * aren't. Do pinning dance to transfer the ownership. -+ */ -+ WARN_ON_ONCE(__rq_lockp(rq) != __rq_lockp(srq)); -+ rq_unpin_lock(rq, rf); -+ rq_pin_lock(srq, &srf); -+ -+ update_rq_clock(srq); -+ balance_one(srq, sprev, &srf, false); -+ -+ rq_unpin_lock(srq, &srf); -+ rq_repin_lock(rq, rf); -+ } -+ } -+#endif -+ return ret; -+} -+ -+static void set_next_task_scx(struct rq *rq, struct task_struct *p, bool first) -+{ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ /* -+ * Core-sched might decide to execute @p before it is -+ * dispatched. Call ops_dequeue() to notify the BPF scheduler. 
-+ */ -+ ops_dequeue(p, SCX_DEQ_CORE_SCHED_EXEC); -+ dispatch_dequeue(rq, p); -+ } -+ -+ p->se.exec_start = rq_clock_task(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(running) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, running, p); -+ -+ clr_task_runnable(p, true); -+ -+ /* -+ * @p is getting newly scheduled or got kicked after someone updated its -+ * slice. Refresh whether tick can be stopped. See scx_can_stop_tick(). -+ */ -+ if ((p->scx.slice == SCX_SLICE_INF) != -+ (bool)(rq->scx.flags & SCX_RQ_CAN_STOP_TICK)) { -+ if (p->scx.slice == SCX_SLICE_INF) -+ rq->scx.flags |= SCX_RQ_CAN_STOP_TICK; -+ else -+ rq->scx.flags &= ~SCX_RQ_CAN_STOP_TICK; -+ -+ sched_update_tick_dependency(rq); -+ -+ /* -+ * For now, let's refresh the load_avgs just when transitioning -+ * in and out of nohz. In the future, we might want to add a -+ * mechanism which calls the following periodically on -+ * tick-stopped CPUs. -+ */ -+ update_other_load_avgs(rq); -+ } -+} -+ -+static void put_prev_task_scx(struct rq *rq, struct task_struct *p) -+{ -+#ifndef CONFIG_SMP -+ /* -+ * UP workaround. -+ * -+ * Because SCX may transfer tasks across CPUs during dispatch, dispatch -+ * is performed from its balance operation which isn't called in UP. -+ * Let's work around by calling it from the operations which come right -+ * after. -+ * -+ * 1. If the prev task is on SCX, pick_next_task() calls -+ * .put_prev_task() right after. As .put_prev_task() is also called -+ * from other places, we need to distinguish the calls which can be -+ * done by looking at the previous task's state - if still queued or -+ * dequeued with %SCX_DEQ_SLEEP, the caller must be pick_next_task(). -+ * This case is handled here. -+ * -+ * 2. If the prev task is not on SCX, the first following call into SCX -+ * will be .pick_next_task(), which is covered by calling -+ * balance_scx() from pick_next_task_scx(). -+ * -+ * Note that we can't merge the first case into the second as -+ * balance_scx() must be called before the previous SCX task goes -+ * through put_prev_task_scx(). -+ * -+ * As UP doesn't transfer tasks around, balance_scx() doesn't need @rf. -+ * Pass in %NULL. -+ */ -+ if (p->scx.flags & (SCX_TASK_QUEUED | SCX_TASK_DEQD_FOR_SLEEP)) -+ balance_scx(rq, p, NULL); -+#endif -+ -+ update_curr_scx(rq); -+ -+ /* see dequeue_task_scx() on why we skip when !QUEUED */ -+ if (SCX_HAS_OP(stopping) && (p->scx.flags & SCX_TASK_QUEUED)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, stopping, p, true); -+ -+ /* -+ * If we're being called from put_prev_task_balance(), balance_scx() may -+ * have decided that @p should keep running. -+ */ -+ if (p->scx.flags & SCX_TASK_BAL_KEEP) { -+ p->scx.flags &= ~SCX_TASK_BAL_KEEP; -+ set_task_runnable(rq, p); -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ if (p->scx.flags & SCX_TASK_QUEUED) { -+ set_task_runnable(rq, p); -+ -+ /* -+ * If @p has slice left and balance_scx() didn't tag it for -+ * keeping, @p is getting preempted by a higher priority -+ * scheduler class or core-sched forcing a different task. Leave -+ * it at the head of the local DSQ. -+ */ -+ if (p->scx.slice && !scx_ops_bypassing()) { -+ dispatch_enqueue(&rq->scx.local_dsq, p, SCX_ENQ_HEAD); -+ return; -+ } -+ -+ /* -+ * If we're in the pick_next_task path, balance_scx() should -+ * have already populated the local DSQ if there are any other -+ * available tasks. If empty, tell ops.enqueue() that @p is the -+ * only one available for this cpu. 
ops.enqueue() should put it -+ * on the local DSQ so that the subsequent pick_next_task_scx() -+ * can find the task unless it wants to trigger a separate -+ * follow-up scheduling event. -+ */ -+ if (list_empty(&rq->scx.local_dsq.list)) -+ do_enqueue_task(rq, p, SCX_ENQ_LAST, -1); -+ else -+ do_enqueue_task(rq, p, 0, -1); -+ } -+} -+ -+static struct task_struct *first_local_task(struct rq *rq) -+{ -+ return list_first_entry_or_null(&rq->scx.local_dsq.list, -+ struct task_struct, scx.dsq_node.list); -+} -+ -+static struct task_struct *pick_next_task_scx(struct rq *rq) -+{ -+ struct task_struct *p; -+ -+#ifndef CONFIG_SMP -+ /* UP workaround - see the comment at the head of put_prev_task_scx() */ -+ if (unlikely(rq->curr->sched_class != &ext_sched_class)) -+ balance_scx(rq, rq->curr, NULL); -+#endif -+ -+ p = first_local_task(rq); -+ if (!p) -+ return NULL; -+ -+ set_next_task_scx(rq, p, true); -+ -+ if (unlikely(!p->scx.slice)) { -+ if (!scx_ops_bypassing() && !scx_warned_zero_slice) { -+ printk_deferred(KERN_WARNING "sched_ext: %s[%d] has zero slice in pick_next_task_scx()\n", -+ p->comm, p->pid); -+ scx_warned_zero_slice = true; -+ } -+ p->scx.slice = SCX_SLICE_DFL; -+ } -+ -+ return p; -+} -+ -+#ifdef CONFIG_SCHED_CORE -+/** -+ * scx_prio_less - Task ordering for core-sched -+ * @a: task A -+ * @b: task B -+ * -+ * Core-sched is implemented as an additional scheduling layer on top of the -+ * usual sched_class'es and needs to find out the expected task ordering. For -+ * SCX, core-sched calls this function to interrogate the task ordering. -+ * -+ * Unless overridden by ops.core_sched_before(), @p->scx.core_sched_at is used -+ * to implement the default task ordering. The older the timestamp, the higher -+ * prority the task - the global FIFO ordering matching the default scheduling -+ * behavior. -+ * -+ * When ops.core_sched_before() is enabled, @p->scx.core_sched_at is used to -+ * implement FIFO ordering within each local DSQ. See pick_task_scx(). -+ */ -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi) -+{ -+ /* -+ * The const qualifiers are dropped from task_struct pointers when -+ * calling ops.core_sched_before(). Accesses are controlled by the -+ * verifier. -+ */ -+ if (SCX_HAS_OP(core_sched_before) && !scx_ops_bypassing()) -+ return SCX_CALL_OP_2TASKS_RET(SCX_KF_REST, core_sched_before, -+ (struct task_struct *)a, -+ (struct task_struct *)b); -+ else -+ return time_after64(a->scx.core_sched_at, b->scx.core_sched_at); -+} -+ -+/** -+ * pick_task_scx - Pick a candidate task for core-sched -+ * @rq: rq to pick the candidate task from -+ * -+ * Core-sched calls this function on each SMT sibling to determine the next -+ * tasks to run on the SMT siblings. balance_one() has been called on all -+ * siblings and put_prev_task_scx() has been called only for the current CPU. -+ * -+ * As put_prev_task_scx() hasn't been called on remote CPUs, we can't just look -+ * at the first task in the local dsq. @rq->curr has to be considered explicitly -+ * to mimic %SCX_TASK_BAL_KEEP. -+ */ -+static struct task_struct *pick_task_scx(struct rq *rq) -+{ -+ struct task_struct *curr = rq->curr; -+ struct task_struct *first = first_local_task(rq); -+ -+ if (curr->scx.flags & SCX_TASK_QUEUED) { -+ /* is curr the only runnable task? */ -+ if (!first) -+ return curr; -+ -+ /* -+ * Does curr trump first? 
We can always go by core_sched_at for -+ * this comparison as it represents global FIFO ordering when -+ * the default core-sched ordering is used and local-DSQ FIFO -+ * ordering otherwise. -+ * -+ * We can have a task with an earlier timestamp on the DSQ. For -+ * example, when a current task is preempted by a sibling -+ * picking a different cookie, the task would be requeued at the -+ * head of the local DSQ with an earlier timestamp than the -+ * core-sched picked next task. Besides, the BPF scheduler may -+ * dispatch any tasks to the local DSQ anytime. -+ */ -+ if (curr->scx.slice && time_before64(curr->scx.core_sched_at, -+ first->scx.core_sched_at)) -+ return curr; -+ } -+ -+ return first; /* this may be %NULL */ -+} -+#endif /* CONFIG_SCHED_CORE */ -+ -+static enum scx_cpu_preempt_reason -+preempt_reason_from_class(const struct sched_class *class) -+{ -+#ifdef CONFIG_SMP -+ if (class == &stop_sched_class) -+ return SCX_CPU_PREEMPT_STOP; -+#endif -+ if (class == &dl_sched_class) -+ return SCX_CPU_PREEMPT_DL; -+ if (class == &rt_sched_class) -+ return SCX_CPU_PREEMPT_RT; -+ return SCX_CPU_PREEMPT_UNKNOWN; -+} -+ -+void scx_next_task_picked(struct rq *rq, struct task_struct *p, -+ const struct sched_class *active) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ if (!scx_enabled()) -+ return; -+#ifdef CONFIG_SMP -+ /* -+ * Pairs with the smp_load_acquire() issued by a CPU in -+ * kick_cpus_irq_workfn() who is waiting for this CPU to perform a -+ * resched. -+ */ -+ smp_store_release(&rq->scx.pnt_seq, rq->scx.pnt_seq + 1); -+#endif -+ if (!static_branch_unlikely(&scx_ops_cpu_preempt)) -+ return; -+ -+ /* -+ * The callback is conceptually meant to convey that the CPU is no -+ * longer under the control of SCX. Therefore, don't invoke the -+ * callback if the CPU is is staying on SCX, or going idle (in which -+ * case the SCX scheduler has actively decided not to schedule any -+ * tasks on the CPU). -+ */ -+ if (likely(active >= &ext_sched_class)) -+ return; -+ -+ /* -+ * At this point we know that SCX was preempted by a higher priority -+ * sched_class, so invoke the ->cpu_release() callback if we have not -+ * done so already. We only send the callback once between SCX being -+ * preempted, and it regaining control of the CPU. -+ * -+ * ->cpu_release() complements ->cpu_acquire(), which is emitted the -+ * next time that balance_scx() is invoked. -+ */ -+ if (!rq->scx.cpu_released) { -+ if (SCX_HAS_OP(cpu_release)) { -+ struct scx_cpu_release_args args = { -+ .reason = preempt_reason_from_class(active), -+ .task = p, -+ }; -+ -+ SCX_CALL_OP(SCX_KF_CPU_RELEASE, -+ cpu_release, cpu_of(rq), &args); -+ } -+ rq->scx.cpu_released = true; -+ } -+} -+ -+#ifdef CONFIG_SMP -+ -+static bool test_and_clear_cpu_idle(int cpu) -+{ -+#ifdef CONFIG_SCHED_SMT -+ /* -+ * SMT mask should be cleared whether we can claim @cpu or not. The SMT -+ * cluster is not wholly idle either way. This also prevents -+ * scx_pick_idle_cpu() from getting caught in an infinite loop. -+ */ -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ /* -+ * If offline, @cpu is not its own sibling and -+ * scx_pick_idle_cpu() can get caught in an infinite loop as -+ * @cpu is never cleared from idle_masks.smt. Ensure that @cpu -+ * is eventually cleared. 
-+ */ -+ if (cpumask_intersects(smt, idle_masks.smt)) -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ else if (cpumask_test_cpu(cpu, idle_masks.smt)) -+ __cpumask_clear_cpu(cpu, idle_masks.smt); -+ } -+#endif -+ return cpumask_test_and_clear_cpu(cpu, idle_masks.cpu); -+} -+ -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) -+{ -+ int cpu; -+ -+retry: -+ if (sched_smt_active()) { -+ cpu = cpumask_any_and_distribute(idle_masks.smt, cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ goto found; -+ -+ if (flags & SCX_PICK_IDLE_CORE) -+ return -EBUSY; -+ } -+ -+ cpu = cpumask_any_and_distribute(idle_masks.cpu, cpus_allowed); -+ if (cpu >= nr_cpu_ids) -+ return -EBUSY; -+ -+found: -+ if (test_and_clear_cpu_idle(cpu)) -+ return cpu; -+ else -+ goto retry; -+} -+ -+static s32 scx_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags, bool *found) -+{ -+ s32 cpu; -+ -+ *found = false; -+ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return prev_cpu; -+ } -+ -+ /* -+ * If WAKE_SYNC, the waker's local DSQ is empty, and the system is -+ * under utilized, wake up @p to the local DSQ of the waker. Checking -+ * only for an empty local DSQ is insufficient as it could give the -+ * wakee an unfair advantage when the system is oversaturated. -+ * Checking only for the presence of idle CPUs is also insufficient as -+ * the local DSQ of the waker could have tasks piled up on it even if -+ * there is an idle core elsewhere on the system. -+ */ -+ cpu = smp_processor_id(); -+ if ((wake_flags & SCX_WAKE_SYNC) && p->nr_cpus_allowed > 1 && -+ !cpumask_empty(idle_masks.cpu) && !(current->flags & PF_EXITING) && -+ cpu_rq(cpu)->scx.local_dsq.nr == 0) { -+ if (cpumask_test_cpu(cpu, p->cpus_ptr)) -+ goto cpu_found; -+ } -+ -+ if (p->nr_cpus_allowed == 1) { -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } else { -+ return prev_cpu; -+ } -+ } -+ -+ /* -+ * If CPU has SMT, any wholly idle CPU is likely a better pick than -+ * partially idle @prev_cpu. -+ */ -+ if (sched_smt_active()) { -+ if (cpumask_test_cpu(prev_cpu, idle_masks.smt) && -+ test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, SCX_PICK_IDLE_CORE); -+ if (cpu >= 0) -+ goto cpu_found; -+ } -+ -+ if (test_and_clear_cpu_idle(prev_cpu)) { -+ cpu = prev_cpu; -+ goto cpu_found; -+ } -+ -+ cpu = scx_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto cpu_found; -+ -+ return prev_cpu; -+ -+cpu_found: -+ *found = true; -+ return cpu; -+} -+ -+static int select_task_rq_scx(struct task_struct *p, int prev_cpu, int wake_flags) -+{ -+ /* -+ * sched_exec() calls with %WF_EXEC when @p is about to exec(2) as it -+ * can be a good migration opportunity with low cache and memory -+ * footprint. Returning a CPU different than @prev_cpu triggers -+ * immediate rq migration. However, for SCX, as the current rq -+ * association doesn't dictate where the task is going to run, this -+ * doesn't fit well. If necessary, we can later add a dedicated method -+ * which can decide to preempt self to force it through the regular -+ * scheduling path. 
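/*
 * Illustrative aside (not kernel code): the preference cascade applied by
 * scx_select_cpu_dfl() above when SMT is active, sketched as a user-space
 * decision function. The idle_view fields are hypothetical stand-ins for the
 * idle_masks queries; WAKE_SYNC handling and affinity checks are omitted. Only
 * the ordering is the point: fully idle previous core, then any fully idle
 * core, then the idle previous CPU, then any idle CPU, else stay on prev_cpu.
 */
#include <stdbool.h>
#include <stdio.h>

struct idle_view {
    bool prev_core_idle;    /* prev_cpu's whole SMT core is idle */
    bool any_core_idle;     /* some allowed CPU has a fully idle core */
    bool prev_cpu_idle;     /* prev_cpu itself is idle */
    bool any_cpu_idle;      /* some allowed CPU is idle */
};

static const char *select_cpu_dfl(const struct idle_view *v)
{
    if (v->prev_core_idle)
        return "prev_cpu (whole core idle)";
    if (v->any_core_idle)
        return "some fully idle core";
    if (v->prev_cpu_idle)
        return "prev_cpu (sibling busy)";
    if (v->any_cpu_idle)
        return "some idle CPU";
    return "prev_cpu (nothing idle, no forced migration)";
}

int main(void)
{
    struct idle_view v = { .any_core_idle = true, .prev_cpu_idle = true };

    puts(select_cpu_dfl(&v));   /* prefers a fully idle core over idle prev_cpu */
    return 0;
}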
-+ */ -+ if (unlikely(wake_flags & WF_EXEC)) -+ return prev_cpu; -+ -+ if (SCX_HAS_OP(select_cpu)) { -+ s32 cpu; -+ struct task_struct **ddsp_taskp; -+ -+ ddsp_taskp = this_cpu_ptr(&direct_dispatch_task); -+ WARN_ON_ONCE(*ddsp_taskp); -+ *ddsp_taskp = p; -+ -+ cpu = SCX_CALL_OP_TASK_RET(SCX_KF_ENQUEUE | SCX_KF_SELECT_CPU, -+ select_cpu, p, prev_cpu, wake_flags); -+ *ddsp_taskp = NULL; -+ if (ops_cpu_valid(cpu, "from ops.select_cpu()")) -+ return cpu; -+ else -+ return prev_cpu; -+ } else { -+ bool found; -+ s32 cpu; -+ -+ cpu = scx_select_cpu_dfl(p, prev_cpu, wake_flags, &found); -+ if (found) { -+ p->scx.slice = SCX_SLICE_DFL; -+ p->scx.ddsp_dsq_id = SCX_DSQ_LOCAL; -+ } -+ return cpu; -+ } -+} -+ -+static void set_cpus_allowed_scx(struct task_struct *p, -+ struct affinity_context *ac) -+{ -+ set_cpus_allowed_common(p, ac); -+ -+ /* -+ * The effective cpumask is stored in @p->cpus_ptr which may temporarily -+ * differ from the configured one in @p->cpus_mask. Always tell the bpf -+ * scheduler the effective one. -+ * -+ * Fine-grained memory write control is enforced by BPF making the const -+ * designation pointless. Cast it away when calling the operation. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void reset_idle_masks(void) -+{ -+ /* -+ * Consider all online cpus idle. Should converge to the actual state -+ * quickly. -+ */ -+ cpumask_copy(idle_masks.cpu, cpu_online_mask); -+ cpumask_copy(idle_masks.smt, cpu_online_mask); -+} -+ -+void __scx_update_idle(struct rq *rq, bool idle) -+{ -+ int cpu = cpu_of(rq); -+ -+ if (SCX_HAS_OP(update_idle)) { -+ SCX_CALL_OP(SCX_KF_REST, update_idle, cpu_of(rq), idle); -+ if (!static_branch_unlikely(&scx_builtin_idle_enabled)) -+ return; -+ } -+ -+ if (idle) -+ cpumask_set_cpu(cpu, idle_masks.cpu); -+ else -+ cpumask_clear_cpu(cpu, idle_masks.cpu); -+ -+#ifdef CONFIG_SCHED_SMT -+ if (sched_smt_active()) { -+ const struct cpumask *smt = cpu_smt_mask(cpu); -+ -+ if (idle) { -+ /* -+ * idle_masks.smt handling is racy but that's fine as -+ * it's only for optimization and self-correcting. -+ */ -+ for_each_cpu(cpu, smt) { -+ if (!cpumask_test_cpu(cpu, idle_masks.cpu)) -+ return; -+ } -+ cpumask_or(idle_masks.smt, idle_masks.smt, smt); -+ } else { -+ cpumask_andnot(idle_masks.smt, idle_masks.smt, smt); -+ } -+ } -+#endif -+} -+ -+static void handle_hotplug(struct rq *rq, bool online) -+{ -+ int cpu = cpu_of(rq); -+ -+ atomic_long_inc(&scx_hotplug_seq); -+ -+ if (online && SCX_HAS_OP(cpu_online)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_online, cpu); -+ else if (!online && SCX_HAS_OP(cpu_offline)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cpu_offline, cpu); -+ else -+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, -+ "cpu %d going %s, exiting scheduler", cpu, -+ online ? 
"online" : "offline"); -+} -+ -+static void rq_online_scx(struct rq *rq) -+{ -+ rq->scx.flags |= SCX_RQ_ONLINE; -+} -+ -+static void rq_offline_scx(struct rq *rq) -+{ -+ rq->scx.flags &= ~SCX_RQ_ONLINE; -+} -+ -+void scx_rq_activate(struct rq *rq) -+{ -+ handle_hotplug(rq, true); -+} -+ -+void scx_rq_deactivate(struct rq *rq) -+{ -+ handle_hotplug(rq, false); -+} -+ -+#else /* CONFIG_SMP */ -+ -+static bool test_and_clear_cpu_idle(int cpu) { return false; } -+static s32 scx_pick_idle_cpu(const struct cpumask *cpus_allowed, u64 flags) { return -EBUSY; } -+static void reset_idle_masks(void) {} -+ -+#endif /* CONFIG_SMP */ -+ -+static bool check_rq_for_timeouts(struct rq *rq) -+{ -+ struct task_struct *p; -+ struct rq_flags rf; -+ bool timed_out = false; -+ -+ rq_lock_irqsave(rq, &rf); -+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) { -+ unsigned long last_runnable = p->scx.runnable_at; -+ -+ if (unlikely(time_after(jiffies, -+ last_runnable + scx_watchdog_timeout))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_runnable); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "%s[%d] failed to run for %u.%03us", -+ p->comm, p->pid, -+ dur_ms / 1000, dur_ms % 1000); -+ timed_out = true; -+ break; -+ } -+ } -+ rq_unlock_irqrestore(rq, &rf); -+ -+ return timed_out; -+} -+ -+static void scx_watchdog_workfn(struct work_struct *work) -+{ -+ int cpu; -+ -+ WRITE_ONCE(scx_watchdog_timestamp, jiffies); -+ -+ for_each_online_cpu(cpu) { -+ if (unlikely(check_rq_for_timeouts(cpu_rq(cpu)))) -+ break; -+ -+ cond_resched(); -+ } -+ queue_delayed_work(system_unbound_wq, to_delayed_work(work), -+ scx_watchdog_timeout / 2); -+} -+ -+void scx_tick(struct rq *rq) -+{ -+ unsigned long last_check; -+ -+ if (!scx_enabled()) -+ return; -+ -+ last_check = READ_ONCE(scx_watchdog_timestamp); -+ if (unlikely(time_after(jiffies, -+ last_check + READ_ONCE(scx_watchdog_timeout)))) { -+ u32 dur_ms = jiffies_to_msecs(jiffies - last_check); -+ -+ scx_ops_error_kind(SCX_EXIT_ERROR_STALL, -+ "watchdog failed to check in for %u.%03us", -+ dur_ms / 1000, dur_ms % 1000); -+ } -+ -+ update_other_load_avgs(rq); -+} -+ -+static void task_tick_scx(struct rq *rq, struct task_struct *curr, int queued) -+{ -+ update_curr_scx(rq); -+ -+ /* -+ * While disabling, always resched and refresh core-sched timestamp as -+ * we can't trust the slice management or ops.core_sched_before(). -+ */ -+ if (scx_ops_bypassing()) { -+ curr->scx.slice = 0; -+ touch_core_sched(rq, curr); -+ } else if (SCX_HAS_OP(tick)) { -+ SCX_CALL_OP(SCX_KF_REST, tick, curr); -+ } -+ -+ if (!curr->scx.slice) -+ resched_curr(rq); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static struct cgroup *tg_cgrp(struct task_group *tg) -+{ -+ /* -+ * If CGROUP_SCHED is disabled, @tg is NULL. If @tg is an autogroup, -+ * @tg->css.cgroup is NULL. In both cases, @tg can be treated as the -+ * root cgroup. 
-+ */ -+ if (tg && tg->css.cgroup) -+ return tg->css.cgroup; -+ else -+ return &cgrp_dfl_root.cgrp; -+} -+ -+#define SCX_INIT_TASK_ARGS_CGROUP(tg) .cgroup = tg_cgrp(tg), -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+#define SCX_INIT_TASK_ARGS_CGROUP(tg) -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+static enum scx_task_state scx_get_task_state(const struct task_struct *p) -+{ -+ return (p->scx.flags & SCX_TASK_STATE_MASK) >> SCX_TASK_STATE_SHIFT; -+} -+ -+static void scx_set_task_state(struct task_struct *p, enum scx_task_state state) -+{ -+ enum scx_task_state prev_state = scx_get_task_state(p); -+ bool warn = false; -+ -+ BUILD_BUG_ON(SCX_TASK_NR_STATES > (1 << SCX_TASK_STATE_BITS)); -+ -+ switch (state) { -+ case SCX_TASK_NONE: -+ break; -+ case SCX_TASK_INIT: -+ warn = prev_state != SCX_TASK_NONE; -+ break; -+ case SCX_TASK_READY: -+ warn = prev_state == SCX_TASK_NONE; -+ break; -+ case SCX_TASK_ENABLED: -+ warn = prev_state != SCX_TASK_READY; -+ break; -+ default: -+ warn = true; -+ return; -+ } -+ -+ WARN_ONCE(warn, "sched_ext: Invalid task state transition %d -> %d for %s[%d]", -+ prev_state, state, p->comm, p->pid); -+ -+ p->scx.flags &= ~SCX_TASK_STATE_MASK; -+ p->scx.flags |= state << SCX_TASK_STATE_SHIFT; -+} -+ -+static int scx_ops_init_task(struct task_struct *p, struct task_group *tg, bool fork) -+{ -+ int ret; -+ -+ p->scx.disallow = false; -+ -+ if (SCX_HAS_OP(init_task)) { -+ struct scx_init_task_args args = { -+ SCX_INIT_TASK_ARGS_CGROUP(tg) -+ .fork = fork, -+ }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init_task, p, &args); -+ if (unlikely(ret)) { -+ ret = ops_sanitize_err("init_task", ret); -+ return ret; -+ } -+ } -+ -+ scx_set_task_state(p, SCX_TASK_INIT); -+ -+ if (p->scx.disallow) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ -+ /* -+ * We're either in fork or load path and @p->policy will be -+ * applied right after. Reverting @p->policy here and rejecting -+ * %SCHED_EXT transitions from scx_check_setscheduler() -+ * guarantees that if ops.init_task() sets @p->disallow, @p can -+ * never be in SCX. -+ */ -+ if (p->policy == SCHED_EXT) { -+ p->policy = SCHED_NORMAL; -+ atomic_long_inc(&scx_nr_rejected); -+ } -+ -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ p->scx.flags |= SCX_TASK_RESET_RUNNABLE_AT; -+ return 0; -+} -+ -+static void set_task_scx_weight(struct task_struct *p) -+{ -+ u32 weight = sched_prio_to_weight[p->static_prio - MAX_RT_PRIO]; -+ -+ p->scx.weight = sched_weight_to_cgroup(weight); -+} -+ -+static void scx_ops_enable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* -+ * Set the weight before calling ops.enable() so that the scheduler -+ * doesn't see a stale value if they inspect the task struct. 
-+ */ -+ set_task_scx_weight(p); -+ if (SCX_HAS_OP(enable)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, enable, p); -+ scx_set_task_state(p, SCX_TASK_ENABLED); -+ -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+static void scx_ops_disable_task(struct task_struct *p) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ WARN_ON_ONCE(scx_get_task_state(p) != SCX_TASK_ENABLED); -+ -+ if (SCX_HAS_OP(disable)) -+ SCX_CALL_OP(SCX_KF_REST, disable, p); -+ scx_set_task_state(p, SCX_TASK_READY); -+} -+ -+static void scx_ops_exit_task(struct task_struct *p) -+{ -+ struct scx_exit_task_args args = { -+ .cancelled = false, -+ }; -+ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ switch (scx_get_task_state(p)) { -+ case SCX_TASK_NONE: -+ return; -+ case SCX_TASK_INIT: -+ args.cancelled = true; -+ break; -+ case SCX_TASK_READY: -+ break; -+ case SCX_TASK_ENABLED: -+ scx_ops_disable_task(p); -+ break; -+ default: -+ WARN_ON_ONCE(true); -+ return; -+ } -+ -+ if (SCX_HAS_OP(exit_task)) -+ SCX_CALL_OP(SCX_KF_REST, exit_task, p, &args); -+ scx_set_task_state(p, SCX_TASK_NONE); -+} -+ -+void init_scx_entity(struct sched_ext_entity *scx) -+{ -+ /* -+ * init_idle() calls this function again after fork sequence is -+ * complete. Don't touch ->tasks_node as it's already linked. -+ */ -+ memset(scx, 0, offsetof(struct sched_ext_entity, tasks_node)); -+ -+ INIT_LIST_HEAD(&scx->dsq_node.list); -+ RB_CLEAR_NODE(&scx->dsq_node.priq); -+ scx->sticky_cpu = -1; -+ scx->holding_cpu = -1; -+ INIT_LIST_HEAD(&scx->runnable_node); -+ scx->runnable_at = jiffies; -+ scx->ddsp_dsq_id = SCX_DSQ_INVALID; -+ scx->slice = SCX_SLICE_DFL; -+} -+ -+void scx_pre_fork(struct task_struct *p) -+{ -+ /* -+ * BPF scheduler enable/disable paths want to be able to iterate and -+ * update all tasks which can become complex when racing forks. As -+ * enable/disable are very cold paths, let's use a percpu_rwsem to -+ * exclude forks. -+ */ -+ percpu_down_read(&scx_fork_rwsem); -+} -+ -+int scx_fork(struct task_struct *p) -+{ -+ percpu_rwsem_assert_held(&scx_fork_rwsem); -+ -+ if (scx_enabled()) -+ return scx_ops_init_task(p, task_group(p), true); -+ else -+ return 0; -+} -+ -+void scx_post_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ scx_set_task_state(p, SCX_TASK_READY); -+ -+ /* -+ * Enable the task immediately if it's running on sched_ext. -+ * Otherwise, it'll be enabled in switching_to_scx() if and -+ * when it's ever configured to run with a SCHED_EXT policy. -+ */ -+ if (p->sched_class == &ext_sched_class) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_enable_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ } -+ -+ spin_lock_irq(&scx_tasks_lock); -+ list_add_tail(&p->scx.tasks_node, &scx_tasks); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void scx_cancel_fork(struct task_struct *p) -+{ -+ if (scx_enabled()) { -+ struct rq *rq; -+ struct rq_flags rf; -+ -+ rq = task_rq_lock(p, &rf); -+ WARN_ON_ONCE(scx_get_task_state(p) >= SCX_TASK_READY); -+ scx_ops_exit_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+ -+ percpu_up_read(&scx_fork_rwsem); -+} -+ -+void sched_ext_free(struct task_struct *p) -+{ -+ unsigned long flags; -+ -+ spin_lock_irqsave(&scx_tasks_lock, flags); -+ list_del_init(&p->scx.tasks_node); -+ spin_unlock_irqrestore(&scx_tasks_lock, flags); -+ -+ /* -+ * @p is off scx_tasks and wholly ours. scx_ops_enable()'s READY -> -+ * ENABLED transitions can't race us. Disable ops for @p. 
-+ */ -+ if (scx_get_task_state(p) != SCX_TASK_NONE) { -+ struct rq_flags rf; -+ struct rq *rq; -+ -+ rq = task_rq_lock(p, &rf); -+ scx_ops_exit_task(p); -+ task_rq_unlock(rq, p, &rf); -+ } -+} -+ -+static void reweight_task_scx(struct rq *rq, struct task_struct *p, int newprio) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ set_task_scx_weight(p); -+ if (SCX_HAS_OP(set_weight)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_weight, p, p->scx.weight); -+} -+ -+static void prio_changed_scx(struct rq *rq, struct task_struct *p, int oldprio) -+{ -+} -+ -+static void switching_to_scx(struct rq *rq, struct task_struct *p) -+{ -+ scx_ops_enable_task(p); -+ -+ /* -+ * set_cpus_allowed_scx() is not called while @p is associated with a -+ * different scheduler class. Keep the BPF scheduler up-to-date. -+ */ -+ if (SCX_HAS_OP(set_cpumask)) -+ SCX_CALL_OP_TASK(SCX_KF_REST, set_cpumask, p, -+ (struct cpumask *)p->cpus_ptr); -+} -+ -+static void switched_from_scx(struct rq *rq, struct task_struct *p) -+{ -+ scx_ops_disable_task(p); -+} -+ -+static void wakeup_preempt_scx(struct rq *rq, struct task_struct *p,int wake_flags) {} -+static void switched_to_scx(struct rq *rq, struct task_struct *p) {} -+ -+int scx_check_setscheduler(struct task_struct *p, int policy) -+{ -+ lockdep_assert_rq_held(task_rq(p)); -+ -+ /* if disallow, reject transitioning into SCX */ -+ if (scx_enabled() && READ_ONCE(p->scx.disallow) && -+ p->policy != policy && policy == SCHED_EXT) -+ return -EACCES; -+ -+ return 0; -+} -+ -+#ifdef CONFIG_NO_HZ_FULL -+bool scx_can_stop_tick(struct rq *rq) -+{ -+ struct task_struct *p = rq->curr; -+ -+ if (scx_ops_bypassing()) -+ return false; -+ -+ if (p->sched_class != &ext_sched_class) -+ return true; -+ -+ /* -+ * @rq can dispatch from different DSQs, so we can't tell whether it -+ * needs the tick or not by looking at nr_running. Allow stopping ticks -+ * iff the BPF scheduler indicated so. See set_next_task_scx(). 
-+ */ -+ return rq->scx.flags & SCX_RQ_CAN_STOP_TICK; -+} -+#endif -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+ -+DEFINE_STATIC_PERCPU_RWSEM(scx_cgroup_rwsem); -+ -+int scx_tg_online(struct task_group *tg) -+{ -+ int ret = 0; -+ -+ WARN_ON_ONCE(tg->scx_flags & (SCX_TG_ONLINE | SCX_TG_INITED)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_init)) { -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ tg->css.cgroup, &args); -+ if (!ret) -+ tg->scx_flags |= SCX_TG_ONLINE | SCX_TG_INITED; -+ else -+ ret = ops_sanitize_err("cgroup_init", ret); -+ } else { -+ tg->scx_flags |= SCX_TG_ONLINE; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ret; -+} -+ -+void scx_tg_offline(struct task_group *tg) -+{ -+ WARN_ON_ONCE(!(tg->scx_flags & SCX_TG_ONLINE)); -+ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (SCX_HAS_OP(cgroup_exit) && (tg->scx_flags & SCX_TG_INITED)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_exit, tg->css.cgroup); -+ tg->scx_flags &= ~(SCX_TG_ONLINE | SCX_TG_INITED); -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+int scx_cgroup_can_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ int ret; -+ -+ /* released in scx_finish/cancel_attach() */ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (!scx_enabled()) -+ return 0; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ struct cgroup *from = tg_cgrp(task_group(p)); -+ struct cgroup *to = tg_cgrp(css_tg(css)); -+ -+ WARN_ON_ONCE(p->scx.cgrp_moving_from); -+ -+ /* -+ * sched_move_task() omits identity migrations. Let's match the -+ * behavior so that ops.cgroup_prep_move() and ops.cgroup_move() -+ * always match one-to-one. -+ */ -+ if (from == to) -+ continue; -+ -+ if (SCX_HAS_OP(cgroup_prep_move)) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_prep_move, -+ p, from, css->cgroup); -+ if (ret) -+ goto err; -+ } -+ -+ p->scx.cgrp_moving_from = from; -+ } -+ -+ return 0; -+ -+err: -+ cgroup_taskset_for_each(p, css, tset) { -+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+ return ops_sanitize_err("cgroup_prep_move", ret); -+} -+ -+void scx_move_task(struct task_struct *p) -+{ -+ /* -+ * We're called from sched_move_task() which handles both cgroup and -+ * autogroup moves. Ignore the latter. -+ * -+ * Also ignore exiting tasks, because in the exit path tasks transition -+ * from the autogroup to the root group, so task_group_is_autogroup() -+ * alone isn't able to catch exiting autogroup tasks. This is safe for -+ * cgroup_move(), because cgroup migrations never happen for PF_EXITING -+ * tasks. -+ */ -+ if (p->flags & PF_EXITING || task_group_is_autogroup(task_group(p))) -+ return; -+ -+ if (!scx_enabled()) -+ return; -+ -+ /* -+ * @p must have ops.cgroup_prep_move() called on it and thus -+ * cgrp_moving_from set. 
-+ */ -+ if (SCX_HAS_OP(cgroup_move) && !WARN_ON_ONCE(!p->scx.cgrp_moving_from)) -+ SCX_CALL_OP_TASK(SCX_KF_UNLOCKED, cgroup_move, p, -+ p->scx.cgrp_moving_from, tg_cgrp(task_group(p))); -+ p->scx.cgrp_moving_from = NULL; -+} -+ -+void scx_cgroup_finish_attach(void) -+{ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) -+{ -+ struct cgroup_subsys_state *css; -+ struct task_struct *p; -+ -+ if (!scx_enabled()) -+ goto out_unlock; -+ -+ cgroup_taskset_for_each(p, css, tset) { -+ if (SCX_HAS_OP(cgroup_cancel_move) && p->scx.cgrp_moving_from) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_cancel_move, p, -+ p->scx.cgrp_moving_from, css->cgroup); -+ p->scx.cgrp_moving_from = NULL; -+ } -+out_unlock: -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+void scx_group_set_weight(struct task_group *tg, unsigned long weight) -+{ -+ percpu_down_read(&scx_cgroup_rwsem); -+ -+ if (tg->scx_weight != weight) { -+ if (SCX_HAS_OP(cgroup_set_weight)) -+ SCX_CALL_OP(SCX_KF_SLEEPABLE, cgroup_set_weight, -+ tg_cgrp(tg), weight); -+ tg->scx_weight = weight; -+ } -+ -+ percpu_up_read(&scx_cgroup_rwsem); -+} -+ -+static void scx_cgroup_lock(void) -+{ -+ percpu_down_write(&scx_cgroup_rwsem); -+} -+ -+static void scx_cgroup_unlock(void) -+{ -+ percpu_up_write(&scx_cgroup_rwsem); -+} -+ -+#else /* CONFIG_EXT_GROUP_SCHED */ -+ -+static inline void scx_cgroup_lock(void) {} -+static inline void scx_cgroup_unlock(void) {} -+ -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+ -+/* -+ * Omitted operations: -+ * -+ * - wakeup_preempt: NOOP as it isn't useful in the wakeup path because the task -+ * isn't tied to the CPU at that point. Preemption is implemented by resetting -+ * the victim task's slice to 0 and triggering reschedule on the target CPU. -+ * -+ * - migrate_task_rq: Unnecessary as task to cpu mapping is transient. -+ * -+ * - task_fork/dead: We need fork/dead notifications for all tasks regardless of -+ * their current sched_class. Call them directly from sched core instead. -+ * -+ * - task_woken: Unnecessary. 
-+ */ -+DEFINE_SCHED_CLASS(ext) = { -+ .enqueue_task = enqueue_task_scx, -+ .dequeue_task = dequeue_task_scx, -+ .yield_task = yield_task_scx, -+ .yield_to_task = yield_to_task_scx, -+ -+ .wakeup_preempt = wakeup_preempt_scx, -+ -+ .pick_next_task = pick_next_task_scx, -+ -+ .put_prev_task = put_prev_task_scx, -+ .set_next_task = set_next_task_scx, -+ -+#ifdef CONFIG_SMP -+ .balance = balance_scx, -+ .select_task_rq = select_task_rq_scx, -+ .set_cpus_allowed = set_cpus_allowed_scx, -+ -+ .rq_online = rq_online_scx, -+ .rq_offline = rq_offline_scx, -+#endif -+ -+#ifdef CONFIG_SCHED_CORE -+ .pick_task = pick_task_scx, -+#endif -+ -+ .task_tick = task_tick_scx, -+ -+ .switching_to = switching_to_scx, -+ .switched_from = switched_from_scx, -+ .switched_to = switched_to_scx, -+ .reweight_task = reweight_task_scx, -+ .prio_changed = prio_changed_scx, -+ -+ .update_curr = update_curr_scx, -+ -+#ifdef CONFIG_UCLAMP_TASK -+ .uclamp_enabled = 1, -+#endif -+}; -+ -+static void init_dsq(struct scx_dispatch_q *dsq, u64 dsq_id) -+{ -+ memset(dsq, 0, sizeof(*dsq)); -+ -+ raw_spin_lock_init(&dsq->lock); -+ INIT_LIST_HEAD(&dsq->list); -+ dsq->id = dsq_id; -+} -+ -+static struct scx_dispatch_q *create_dsq(u64 dsq_id, int node) -+{ -+ struct scx_dispatch_q *dsq; -+ int ret; -+ -+ if (dsq_id & SCX_DSQ_FLAG_BUILTIN) -+ return ERR_PTR(-EINVAL); -+ -+ dsq = kmalloc_node(sizeof(*dsq), GFP_KERNEL, node); -+ if (!dsq) -+ return ERR_PTR(-ENOMEM); -+ -+ init_dsq(dsq, dsq_id); -+ -+ ret = rhashtable_insert_fast(&dsq_hash, &dsq->hash_node, -+ dsq_hash_params); -+ if (ret) { -+ kfree(dsq); -+ return ERR_PTR(ret); -+ } -+ return dsq; -+} -+ -+static void free_dsq_irq_workfn(struct irq_work *irq_work) -+{ -+ struct llist_node *to_free = llist_del_all(&dsqs_to_free); -+ struct scx_dispatch_q *dsq, *tmp_dsq; -+ -+ llist_for_each_entry_safe(dsq, tmp_dsq, to_free, free_node) -+ kfree_rcu(dsq, rcu); -+} -+ -+static DEFINE_IRQ_WORK(free_dsq_irq_work, free_dsq_irq_workfn); -+ -+static void destroy_dsq(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ unsigned long flags; -+ -+ rcu_read_lock(); -+ -+ dsq = find_user_dsq(dsq_id); -+ if (!dsq) -+ goto out_unlock_rcu; -+ -+ raw_spin_lock_irqsave(&dsq->lock, flags); -+ -+ if (dsq->nr) { -+ scx_ops_error("attempting to destroy in-use dsq 0x%016llx (nr=%u)", -+ dsq->id, dsq->nr); -+ goto out_unlock_dsq; -+ } -+ -+ if (rhashtable_remove_fast(&dsq_hash, &dsq->hash_node, dsq_hash_params)) -+ goto out_unlock_dsq; -+ -+ /* -+ * Mark dead by invalidating ->id to prevent dispatch_enqueue() from -+ * queueing more tasks. As this function can be called from anywhere, -+ * freeing is bounced through an irq work to avoid nesting RCU -+ * operations inside scheduler locks. -+ */ -+ dsq->id = SCX_DSQ_INVALID; -+ llist_add(&dsq->free_node, &dsqs_to_free); -+ irq_work_queue(&free_dsq_irq_work); -+ -+out_unlock_dsq: -+ raw_spin_unlock_irqrestore(&dsq->lock, flags); -+out_unlock_rcu: -+ rcu_read_unlock(); -+} -+ -+#ifdef CONFIG_EXT_GROUP_SCHED -+static void scx_cgroup_exit(void) -+{ -+ struct cgroup_subsys_state *css; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded through scx_cgroup_rwsem. If we walk -+ * cgroups and exit all the inited ones, all online cgroups are exited. 
-+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_post(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ -+ if (!(tg->scx_flags & SCX_TG_INITED)) -+ continue; -+ tg->scx_flags &= ~SCX_TG_INITED; -+ -+ if (!scx_ops.cgroup_exit) -+ continue; -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ SCX_CALL_OP(SCX_KF_UNLOCKED, cgroup_exit, css->cgroup); -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+} -+ -+static int scx_cgroup_init(void) -+{ -+ struct cgroup_subsys_state *css; -+ int ret; -+ -+ percpu_rwsem_assert_held(&scx_cgroup_rwsem); -+ -+ /* -+ * scx_tg_on/offline() are excluded thorugh scx_cgroup_rwsem. If we walk -+ * cgroups and init, all online cgroups are initialized. -+ */ -+ rcu_read_lock(); -+ css_for_each_descendant_pre(css, &root_task_group.css) { -+ struct task_group *tg = css_tg(css); -+ struct scx_cgroup_init_args args = { .weight = tg->scx_weight }; -+ -+ if ((tg->scx_flags & -+ (SCX_TG_ONLINE | SCX_TG_INITED)) != SCX_TG_ONLINE) -+ continue; -+ -+ if (!scx_ops.cgroup_init) { -+ tg->scx_flags |= SCX_TG_INITED; -+ continue; -+ } -+ -+ if (WARN_ON_ONCE(!css_tryget(css))) -+ continue; -+ rcu_read_unlock(); -+ -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, cgroup_init, -+ css->cgroup, &args); -+ if (ret) { -+ css_put(css); -+ return ret; -+ } -+ tg->scx_flags |= SCX_TG_INITED; -+ -+ rcu_read_lock(); -+ css_put(css); -+ } -+ rcu_read_unlock(); -+ -+ return 0; -+} -+ -+static void scx_cgroup_config_knobs(void) -+{ -+ static DEFINE_MUTEX(cgintf_mutex); -+ DECLARE_BITMAP(mask, CPU_CFTYPE_CNT) = { }; -+ u64 knob_flags; -+ int i; -+ -+ /* -+ * Called from both class switch and ops enable/disable paths, -+ * synchronize internally. -+ */ -+ mutex_lock(&cgintf_mutex); -+ -+ /* if fair is in use, all knobs should be shown */ -+ if (!scx_switched_all()) { -+ bitmap_fill(mask, CPU_CFTYPE_CNT); -+ goto apply; -+ } -+ -+ /* -+ * On ext, only show the supported knobs. Otherwise, show all possible -+ * knobs so that configuration attempts succeed and the states are -+ * remembered while ops is not loaded. -+ */ -+ if (scx_enabled()) -+ knob_flags = scx_ops.flags; -+ else -+ knob_flags = SCX_OPS_ALL_FLAGS; -+ -+ if (knob_flags & SCX_OPS_CGROUP_KNOB_WEIGHT) { -+ __set_bit(CPU_CFTYPE_WEIGHT, mask); -+ __set_bit(CPU_CFTYPE_WEIGHT_NICE, mask); -+ } -+apply: -+ for (i = 0; i < CPU_CFTYPE_CNT; i++) -+ cgroup_show_cftype(&cpu_cftypes[i], test_bit(i, mask)); -+ -+ mutex_unlock(&cgintf_mutex); -+} -+ -+#else -+static void scx_cgroup_exit(void) {} -+static int scx_cgroup_init(void) { return 0; } -+static void scx_cgroup_config_knobs(void) {} -+#endif -+ -+ -+/******************************************************************************** -+ * Sysfs interface and ops enable/disable. 
-+ */ -+ -+#define SCX_ATTR(_name) \ -+ static struct kobj_attribute scx_attr_##_name = { \ -+ .attr = { .name = __stringify(_name), .mode = 0444 }, \ -+ .show = scx_attr_##_name##_show, \ -+ } -+ -+static ssize_t scx_attr_state_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", -+ scx_ops_enable_state_str[scx_ops_enable_state()]); -+} -+SCX_ATTR(state); -+ -+static ssize_t scx_attr_switch_all_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%d\n", READ_ONCE(scx_switching_all)); -+} -+SCX_ATTR(switch_all); -+ -+static ssize_t scx_attr_nr_rejected_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_nr_rejected)); -+} -+SCX_ATTR(nr_rejected); -+ -+static ssize_t scx_attr_hotplug_seq_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%ld\n", atomic_long_read(&scx_hotplug_seq)); -+} -+SCX_ATTR(hotplug_seq); -+ -+static struct attribute *scx_global_attrs[] = { -+ &scx_attr_state.attr, -+ &scx_attr_switch_all.attr, -+ &scx_attr_nr_rejected.attr, -+ &scx_attr_hotplug_seq.attr, -+ NULL, -+}; -+ -+static const struct attribute_group scx_global_attr_group = { -+ .attrs = scx_global_attrs, -+}; -+ -+static void scx_kobj_release(struct kobject *kobj) -+{ -+ kfree(kobj); -+} -+ -+static ssize_t scx_attr_ops_show(struct kobject *kobj, -+ struct kobj_attribute *ka, char *buf) -+{ -+ return sysfs_emit(buf, "%s\n", scx_ops.name); -+} -+SCX_ATTR(ops); -+ -+static struct attribute *scx_sched_attrs[] = { -+ &scx_attr_ops.attr, -+ NULL, -+}; -+ATTRIBUTE_GROUPS(scx_sched); -+ -+static const struct kobj_type scx_ktype = { -+ .release = scx_kobj_release, -+ .sysfs_ops = &kobj_sysfs_ops, -+ .default_groups = scx_sched_groups, -+}; -+ -+static int scx_uevent(const struct kobject *kobj, struct kobj_uevent_env *env) -+{ -+ return add_uevent_var(env, "SCXOPS=%s", scx_ops.name); -+} -+ -+static const struct kset_uevent_ops scx_uevent_ops = { -+ .uevent = scx_uevent, -+}; -+ -+/* -+ * Used by sched_fork() and __setscheduler_prio() to pick the matching -+ * sched_class. dl/rt are already handled. -+ */ -+bool task_should_scx(struct task_struct *p) -+{ -+ if (!scx_enabled() || -+ unlikely(scx_ops_enable_state() == SCX_OPS_DISABLING)) -+ return false; -+ if (READ_ONCE(scx_switching_all)) -+ return true; -+ return p->policy == SCHED_EXT; -+} -+ -+/** -+ * scx_ops_bypass - [Un]bypass scx_ops and guarantee forward progress -+ * -+ * Bypassing guarantees that all runnable tasks make forward progress without -+ * trusting the BPF scheduler. We can't grab any mutexes or rwsems as they might -+ * be held by tasks that the BPF scheduler is forgetting to run, which -+ * unfortunately also excludes toggling the static branches. -+ * -+ * Let's work around by overriding a couple ops and modifying behaviors based on -+ * the DISABLING state and then cycling the queued tasks through dequeue/enqueue -+ * to force global FIFO scheduling. -+ * -+ * a. ops.enqueue() is ignored and tasks are queued in simple global FIFO order. -+ * -+ * b. ops.dispatch() is ignored. -+ * -+ * c. balance_scx() never sets %SCX_TASK_BAL_KEEP as the slice value can't be -+ * trusted. Whenever a tick triggers, the running task is rotated to the tail -+ * of the queue with core_sched_at touched. -+ * -+ * d. pick_next_task() suppresses zero slice warning. -+ * -+ * e. 
scx_bpf_kick_cpu() is disabled to avoid irq_work malfunction during PM -+ * operations. -+ * -+ * f. scx_prio_less() reverts to the default core_sched_at order. -+ */ -+static void scx_ops_bypass(bool bypass) -+{ -+ int depth, cpu; -+ -+ if (bypass) { -+ depth = atomic_inc_return(&scx_ops_bypass_depth); -+ WARN_ON_ONCE(depth <= 0); -+ if (depth != 1) -+ return; -+ } else { -+ depth = atomic_dec_return(&scx_ops_bypass_depth); -+ WARN_ON_ONCE(depth < 0); -+ if (depth != 0) -+ return; -+ } -+ -+ /* -+ * We need to guarantee that no tasks are on the BPF scheduler while -+ * bypassing. Either we see enabled or the enable path sees the -+ * increased bypass_depth before moving tasks to SCX. -+ */ -+ if (!scx_enabled()) -+ return; -+ -+ /* -+ * No task property is changing. We just need to make sure all currently -+ * queued tasks are re-queued according to the new scx_ops_bypassing() -+ * state. As an optimization, walk each rq's runnable_list instead of -+ * the scx_tasks list. -+ * -+ * This function can't trust the scheduler and thus can't use -+ * cpus_read_lock(). Walk all possible CPUs instead of online. -+ */ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ struct task_struct *p, *n; -+ -+ rq_lock_irqsave(rq, &rf); -+ -+ /* -+ * The use of list_for_each_entry_safe_reverse() is required -+ * because each task is going to be removed from and added back -+ * to the runnable_list during iteration. Because they're added -+ * to the tail of the list, safe reverse iteration can still -+ * visit all nodes. -+ */ -+ list_for_each_entry_safe_reverse(p, n, &rq->scx.runnable_list, -+ scx.runnable_node) { -+ struct sched_enq_and_set_ctx ctx; -+ -+ /* cycling deq/enq is enough, see the function comment */ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ sched_enq_and_set_task(&ctx); -+ } -+ -+ rq_unlock_irqrestore(rq, &rf); -+ -+ /* kick to restore ticks */ -+ resched_cpu(cpu); -+ } -+} -+ -+static void free_exit_info(struct scx_exit_info *ei) -+{ -+ kfree(ei->dump); -+ kfree(ei->msg); -+ kfree(ei->bt); -+ kfree(ei); -+} -+ -+static struct scx_exit_info *alloc_exit_info(size_t exit_dump_len) -+{ -+ struct scx_exit_info *ei; -+ -+ ei = kzalloc(sizeof(*ei), GFP_KERNEL); -+ if (!ei) -+ return NULL; -+ -+ ei->bt = kcalloc(sizeof(ei->bt[0]), SCX_EXIT_BT_LEN, GFP_KERNEL); -+ ei->msg = kzalloc(SCX_EXIT_MSG_LEN, GFP_KERNEL); -+ ei->dump = kzalloc(exit_dump_len, GFP_KERNEL); -+ -+ if (!ei->bt || !ei->msg || !ei->dump) { -+ free_exit_info(ei); -+ return NULL; -+ } -+ -+ return ei; -+} -+ -+static const char *scx_exit_reason(enum scx_exit_kind kind) -+{ -+ switch (kind) { -+ case SCX_EXIT_UNREG: -+ return "Scheduler unregistered from user space"; -+ case SCX_EXIT_UNREG_BPF: -+ return "Scheduler unregistered from BPF"; -+ case SCX_EXIT_UNREG_KERN: -+ return "Scheduler unregistered from the main kernel"; -+ case SCX_EXIT_SYSRQ: -+ return "disabled by sysrq-S"; -+ case SCX_EXIT_ERROR: -+ return "runtime error"; -+ case SCX_EXIT_ERROR_BPF: -+ return "scx_bpf_error"; -+ case SCX_EXIT_ERROR_STALL: -+ return "runnable task stall"; -+ default: -+ return ""; -+ } -+} -+ -+static void scx_ops_disable_workfn(struct kthread_work *work) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ struct rhashtable_iter rht_iter; -+ struct scx_dispatch_q *dsq; -+ int i, kind; -+ -+ kind = atomic_read(&scx_exit_kind); -+ while (true) { -+ /* -+ * NONE indicates that a new scx_ops has been registered since -+ * disable was 
scheduled - don't kill the new ops. DONE -+ * indicates that the ops has already been disabled. -+ */ -+ if (kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE) -+ return; -+ if (atomic_try_cmpxchg(&scx_exit_kind, &kind, SCX_EXIT_DONE)) -+ break; -+ } -+ ei->kind = kind; -+ ei->reason = scx_exit_reason(ei->kind); -+ -+ /* guarantee forward progress by bypassing scx_ops */ -+ scx_ops_bypass(true); -+ -+ switch (scx_ops_set_enable_state(SCX_OPS_DISABLING)) { -+ case SCX_OPS_DISABLING: -+ WARN_ONCE(true, "sched_ext: duplicate disabling instance?"); -+ break; -+ case SCX_OPS_DISABLED: -+ pr_warn("sched_ext: ops error detected without ops (%s)\n", -+ scx_exit_info->msg); -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ goto done; -+ default: -+ break; -+ } -+ -+ /* -+ * Here, every runnable task is guaranteed to make forward progress and -+ * we can safely use blocking synchronization constructs. Actually -+ * disable ops. -+ */ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ static_branch_disable(&__scx_switched_all); -+ WRITE_ONCE(scx_switching_all, false); -+ -+ /* -+ * Avoid racing against fork and cgroup changes. See scx_ops_enable() -+ * for explanation on the locking order. -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ cpus_read_lock(); -+ scx_cgroup_lock(); -+ -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_init(&sti); -+ /* -+ * Invoke scx_ops_exit_task() on all non-idle tasks, including -+ * TASK_DEAD tasks. Because dead tasks may have a nonzero refcount, -+ * we may not have invoked sched_ext_free() on them by the time a -+ * scheduler is disabled. We must therefore exit the task here, or we'd -+ * fail to invoke ops.exit_task(), as the scheduler will have been -+ * unloaded by the time the task is subsequently exited on the -+ * sched_ext_free() path. 
-+ */ -+ while ((p = scx_task_iter_next_locked(&sti, true))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ if (READ_ONCE(p->__state) != TASK_DEAD) { -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, -+ &ctx); -+ -+ p->scx.slice = min_t(u64, p->scx.slice, SCX_SLICE_DFL); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } -+ scx_ops_exit_task(p); -+ } -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ /* no task is on scx, turn off all the switches and flush in-progress calls */ -+ static_branch_disable_cpuslocked(&__scx_ops_enabled); -+ for (i = SCX_OPI_BEGIN; i < SCX_OPI_END; i++) -+ static_branch_disable_cpuslocked(&scx_has_op[i]); -+ static_branch_disable_cpuslocked(&scx_ops_enq_last); -+ static_branch_disable_cpuslocked(&scx_ops_enq_exiting); -+ static_branch_disable_cpuslocked(&scx_ops_cpu_preempt); -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ synchronize_rcu(); -+ -+ scx_cgroup_exit(); -+ -+ scx_cgroup_unlock(); -+ cpus_read_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ if (ei->kind >= SCX_EXIT_ERROR) { -+ printk(KERN_ERR "sched_ext: BPF scheduler \"%s\" errored, disabling\n", scx_ops.name); -+ -+ if (ei->msg[0] == '\0') -+ printk(KERN_ERR "sched_ext: %s\n", ei->reason); -+ else -+ printk(KERN_ERR "sched_ext: %s (%s)\n", ei->reason, ei->msg); -+ -+ stack_trace_print(ei->bt, ei->bt_len, 2); -+ } -+ -+ if (scx_ops.exit) -+ SCX_CALL_OP(SCX_KF_UNLOCKED, exit, ei); -+ -+ cancel_delayed_work_sync(&scx_watchdog_work); -+ -+ /* -+ * Delete the kobject from the hierarchy eagerly in addition to just -+ * dropping a reference. Otherwise, if the object is deleted -+ * asynchronously, sysfs could observe an object of the same name still -+ * in the hierarchy when another scheduler is loaded. -+ */ -+ kobject_del(scx_root_kobj); -+ kobject_put(scx_root_kobj); -+ scx_root_kobj = NULL; -+ -+ memset(&scx_ops, 0, sizeof(scx_ops)); -+ -+ rhashtable_walk_enter(&dsq_hash, &rht_iter); -+ do { -+ rhashtable_walk_start(&rht_iter); -+ -+ while ((dsq = rhashtable_walk_next(&rht_iter)) && !IS_ERR(dsq)) -+ destroy_dsq(dsq->id); -+ -+ rhashtable_walk_stop(&rht_iter); -+ } while (dsq == ERR_PTR(-EAGAIN)); -+ rhashtable_walk_exit(&rht_iter); -+ -+ free_percpu(scx_dsp_ctx); -+ scx_dsp_ctx = NULL; -+ scx_dsp_max_batch = 0; -+ -+ free_exit_info(scx_exit_info); -+ scx_exit_info = NULL; -+ -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_DISABLED) != -+ SCX_OPS_DISABLING); -+ -+ scx_cgroup_config_knobs(); -+done: -+ scx_ops_bypass(false); -+} -+ -+static DEFINE_KTHREAD_WORK(scx_ops_disable_work, scx_ops_disable_workfn); -+ -+static void schedule_scx_ops_disable_work(void) -+{ -+ struct kthread_worker *helper = READ_ONCE(scx_ops_helper); -+ -+ /* -+ * We may be called spuriously before the first bpf_sched_ext_reg(). If -+ * scx_ops_helper isn't set up yet, there's nothing to do. 
-+ */ -+ if (helper) -+ kthread_queue_work(helper, &scx_ops_disable_work); -+} -+ -+static void scx_ops_disable(enum scx_exit_kind kind) -+{ -+ int none = SCX_EXIT_NONE; -+ -+ if (WARN_ON_ONCE(kind == SCX_EXIT_NONE || kind == SCX_EXIT_DONE)) -+ kind = SCX_EXIT_ERROR; -+ -+ atomic_try_cmpxchg(&scx_exit_kind, &none, kind); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static void dump_newline(struct seq_buf *s) -+{ -+ trace_sched_ext_dump(""); -+ -+ /* @s may be zero sized and seq_buf triggers WARN if so */ -+ if (s->size) -+ seq_buf_putc(s, '\n'); -+} -+ -+static __printf(2, 3) void dump_line(struct seq_buf *s, const char *fmt, ...) -+{ -+ va_list args; -+ -+#ifdef CONFIG_TRACEPOINTS -+ if (trace_sched_ext_dump_enabled()) { -+ /* protected by scx_dump_state()::dump_lock */ -+ static char line_buf[SCX_EXIT_MSG_LEN]; -+ -+ va_start(args, fmt); -+ vscnprintf(line_buf, sizeof(line_buf), fmt, args); -+ va_end(args); -+ -+ trace_sched_ext_dump(line_buf); -+ } -+#endif -+ /* @s may be zero sized and seq_buf triggers WARN if so */ -+ if (s->size) { -+ va_start(args, fmt); -+ seq_buf_vprintf(s, fmt, args); -+ va_end(args); -+ -+ seq_buf_putc(s, '\n'); -+ } -+} -+ -+static void dump_stack_trace(struct seq_buf *s, const char *prefix, -+ const unsigned long *bt, unsigned int len) -+{ -+ unsigned int i; -+ -+ for (i = 0; i < len; i++) -+ dump_line(s, "%s%pS", prefix, (void *)bt[i]); -+} -+ -+static void ops_dump_init(struct seq_buf *s, const char *prefix) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ dd->cpu = smp_processor_id(); /* allow scx_bpf_dump() */ -+ dd->first = true; -+ dd->cursor = 0; -+ dd->s = s; -+ dd->prefix = prefix; -+} -+ -+static void ops_dump_flush(void) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ char *line = dd->buf.line; -+ -+ if (!dd->cursor) -+ return; -+ -+ /* -+ * There's something to flush and this is the first line. Insert a blank -+ * line to distinguish ops dump. -+ */ -+ if (dd->first) { -+ dump_newline(dd->s); -+ dd->first = false; -+ } -+ -+ /* -+ * There may be multiple lines in $line. Scan and emit each line -+ * separately. -+ */ -+ while (true) { -+ char *end = line; -+ char c; -+ -+ while (*end != '\n' && *end != '\0') -+ end++; -+ -+ /* -+ * If $line overflowed, it may not have newline at the end. -+ * Always emit with a newline. 
-+ */ -+ c = *end; -+ *end = '\0'; -+ dump_line(dd->s, "%s%s", dd->prefix, line); -+ if (c == '\0') -+ break; -+ -+ /* move to the next line */ -+ end++; -+ if (*end == '\0') -+ break; -+ line = end; -+ } -+ -+ dd->cursor = 0; -+} -+ -+static void ops_dump_exit(void) -+{ -+ ops_dump_flush(); -+ scx_dump_data.cpu = -1; -+} -+ -+static void scx_dump_task(struct seq_buf *s, struct scx_dump_ctx *dctx, -+ struct task_struct *p, char marker) -+{ -+ static unsigned long bt[SCX_EXIT_BT_LEN]; -+ char dsq_id_buf[19] = "(n/a)"; -+ unsigned long ops_state = atomic_long_read(&p->scx.ops_state); -+ unsigned int bt_len; -+ -+ if (p->scx.dsq) -+ scnprintf(dsq_id_buf, sizeof(dsq_id_buf), "0x%llx", -+ (unsigned long long)p->scx.dsq->id); -+ -+ dump_newline(s); -+ dump_line(s, " %c%c %s[%d] %+ldms", -+ marker, task_state_to_char(p), p->comm, p->pid, -+ jiffies_delta_msecs(p->scx.runnable_at, dctx->at_jiffies)); -+ dump_line(s, " scx_state/flags=%u/0x%x dsq_flags=0x%x ops_state/qseq=%lu/%lu", -+ scx_get_task_state(p), p->scx.flags & ~SCX_TASK_STATE_MASK, -+ p->scx.dsq_node.flags, ops_state & SCX_OPSS_STATE_MASK, -+ ops_state >> SCX_OPSS_QSEQ_SHIFT); -+ dump_line(s, " sticky/holding_cpu=%d/%d dsq_id=%s dsq_vtime=%llu", -+ p->scx.sticky_cpu, p->scx.holding_cpu, dsq_id_buf, -+ p->scx.dsq_vtime); -+ dump_line(s, " cpus=%*pb", cpumask_pr_args(p->cpus_ptr)); -+ -+ if (SCX_HAS_OP(dump_task)) { -+ ops_dump_init(s, " "); -+ SCX_CALL_OP(SCX_KF_REST, dump_task, dctx, p); -+ ops_dump_exit(); -+ } -+ -+ bt_len = stack_trace_save_tsk(p, bt, SCX_EXIT_BT_LEN, 1); -+ if (bt_len) { -+ dump_newline(s); -+ dump_stack_trace(s, " ", bt, bt_len); -+ } -+} -+ -+static void scx_dump_state(struct scx_exit_info *ei, size_t dump_len) -+{ -+ static DEFINE_SPINLOCK(dump_lock); -+ static const char trunc_marker[] = "\n\n~~~~ TRUNCATED ~~~~\n"; -+ struct scx_dump_ctx dctx = { -+ .kind = ei->kind, -+ .exit_code = ei->exit_code, -+ .reason = ei->reason, -+ .at_ns = ktime_get_ns(), -+ .at_jiffies = jiffies, -+ }; -+ struct seq_buf s; -+ unsigned long flags; -+ char *buf; -+ int cpu; -+ -+ spin_lock_irqsave(&dump_lock, flags); -+ -+ seq_buf_init(&s, ei->dump, dump_len); -+ -+ if (ei->kind == SCX_EXIT_NONE) { -+ dump_line(&s, "Debug dump triggered by %s", ei->reason); -+ } else { -+ dump_line(&s, "%s[%d] triggered exit kind %d:", -+ current->comm, current->pid, ei->kind); -+ dump_line(&s, " %s (%s)", ei->reason, ei->msg); -+ dump_newline(&s); -+ dump_line(&s, "Backtrace:"); -+ dump_stack_trace(&s, " ", ei->bt, ei->bt_len); -+ } -+ -+ if (SCX_HAS_OP(dump)) { -+ ops_dump_init(&s, ""); -+ SCX_CALL_OP(SCX_KF_UNLOCKED, dump, &dctx); -+ ops_dump_exit(); -+ } -+ -+ dump_newline(&s); -+ dump_line(&s, "CPU states"); -+ dump_line(&s, "----------"); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ struct rq_flags rf; -+ struct task_struct *p; -+ struct seq_buf ns; -+ size_t avail, used; -+ bool idle; -+ -+ rq_lock(rq, &rf); -+ -+ idle = list_empty(&rq->scx.runnable_list) && -+ rq->curr->sched_class == &idle_sched_class; -+ -+ if (idle && !SCX_HAS_OP(dump_cpu)) -+ goto next; -+ -+ /* -+ * We don't yet know whether ops.dump_cpu() will produce output -+ * and we may want to skip the default CPU dump if it doesn't. -+ * Use a nested seq_buf to generate the standard dump so that we -+ * can decide whether to commit later. 
-+ */ -+ avail = seq_buf_get_buf(&s, &buf); -+ seq_buf_init(&ns, buf, avail); -+ -+ dump_newline(&ns); -+ dump_line(&ns, "CPU %-4d: nr_run=%u flags=0x%x cpu_rel=%d ops_qseq=%lu pnt_seq=%lu", -+ cpu, rq->scx.nr_running, rq->scx.flags, -+ rq->scx.cpu_released, rq->scx.ops_qseq, -+ rq->scx.pnt_seq); -+ dump_line(&ns, " curr=%s[%d] class=%ps", -+ rq->curr->comm, rq->curr->pid, -+ rq->curr->sched_class); -+ if (!cpumask_empty(rq->scx.cpus_to_kick)) -+ dump_line(&ns, " cpus_to_kick : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_kick)); -+ if (!cpumask_empty(rq->scx.cpus_to_kick_if_idle)) -+ dump_line(&ns, " idle_to_kick : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_kick_if_idle)); -+ if (!cpumask_empty(rq->scx.cpus_to_preempt)) -+ dump_line(&ns, " cpus_to_preempt: %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_preempt)); -+ if (!cpumask_empty(rq->scx.cpus_to_wait)) -+ dump_line(&ns, " cpus_to_wait : %*pb", -+ cpumask_pr_args(rq->scx.cpus_to_wait)); -+ -+ used = seq_buf_used(&ns); -+ if (SCX_HAS_OP(dump_cpu)) { -+ ops_dump_init(&ns, " "); -+ SCX_CALL_OP(SCX_KF_REST, dump_cpu, &dctx, cpu, idle); -+ ops_dump_exit(); -+ } -+ -+ /* -+ * If idle && nothing generated by ops.dump_cpu(), there's -+ * nothing interesting. Skip. -+ */ -+ if (idle && used == seq_buf_used(&ns)) -+ goto next; -+ -+ /* -+ * $s may already have overflowed when $ns was created. If so, -+ * calling commit on it will trigger BUG. -+ */ -+ if (avail) { -+ seq_buf_commit(&s, seq_buf_used(&ns)); -+ if (seq_buf_has_overflowed(&ns)) -+ seq_buf_set_overflow(&s); -+ } -+ -+ if (rq->curr->sched_class == &ext_sched_class) -+ scx_dump_task(&s, &dctx, rq->curr, '*'); -+ -+ list_for_each_entry(p, &rq->scx.runnable_list, scx.runnable_node) -+ scx_dump_task(&s, &dctx, p, ' '); -+ next: -+ rq_unlock(rq, &rf); -+ } -+ -+ if (seq_buf_has_overflowed(&s) && dump_len >= sizeof(trunc_marker)) -+ memcpy(ei->dump + dump_len - sizeof(trunc_marker), -+ trunc_marker, sizeof(trunc_marker)); -+ -+ spin_unlock_irqrestore(&dump_lock, flags); -+} -+ -+static void scx_ops_error_irq_workfn(struct irq_work *irq_work) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ -+ if (ei->kind >= SCX_EXIT_ERROR) -+ scx_dump_state(ei, scx_ops.exit_dump_len); -+ -+ schedule_scx_ops_disable_work(); -+} -+ -+static DEFINE_IRQ_WORK(scx_ops_error_irq_work, scx_ops_error_irq_workfn); -+ -+static __printf(3, 4) void scx_ops_exit_kind(enum scx_exit_kind kind, -+ s64 exit_code, -+ const char *fmt, ...) -+{ -+ struct scx_exit_info *ei = scx_exit_info; -+ int none = SCX_EXIT_NONE; -+ va_list args; -+ -+ if (!atomic_try_cmpxchg(&scx_exit_kind, &none, kind)) -+ return; -+ -+ ei->exit_code = exit_code; -+ -+ if (kind >= SCX_EXIT_ERROR) -+ ei->bt_len = stack_trace_save(ei->bt, SCX_EXIT_BT_LEN, 1); -+ -+ va_start(args, fmt); -+ vscnprintf(ei->msg, SCX_EXIT_MSG_LEN, fmt, args); -+ va_end(args); -+ -+ /* -+ * Set ei->kind and ->reason for scx_dump_state(). They'll be set again -+ * in scx_ops_disable_workfn(). 
-+ */ -+ ei->kind = kind; -+ ei->reason = scx_exit_reason(ei->kind); -+ -+ irq_work_queue(&scx_ops_error_irq_work); -+} -+ -+static struct kthread_worker *scx_create_rt_helper(const char *name) -+{ -+ struct kthread_worker *helper; -+ -+ helper = kthread_create_worker(0, name); -+ if (helper) -+ sched_set_fifo(helper->task); -+ return helper; -+} -+ -+static void check_hotplug_seq(const struct sched_ext_ops *ops) -+{ -+ unsigned long long global_hotplug_seq; -+ -+ /* -+ * If a hotplug event has occurred between when a scheduler was -+ * initialized, and when we were able to attach, exit and notify user -+ * space about it. -+ */ -+ if (ops->hotplug_seq) { -+ global_hotplug_seq = atomic_long_read(&scx_hotplug_seq); -+ if (ops->hotplug_seq != global_hotplug_seq) { -+ scx_ops_exit(SCX_ECODE_ACT_RESTART | SCX_ECODE_RSN_HOTPLUG, -+ "expected hotplug seq %llu did not match actual %llu", -+ ops->hotplug_seq, global_hotplug_seq); -+ } -+ } -+} -+ -+static int validate_ops(const struct sched_ext_ops *ops) -+{ -+ /* -+ * It doesn't make sense to specify the SCX_OPS_ENQ_LAST flag if the -+ * ops.enqueue() callback isn't implemented. -+ */ -+ if ((ops->flags & SCX_OPS_ENQ_LAST) && !ops->enqueue) { -+ scx_ops_error("SCX_OPS_ENQ_LAST requires ops.enqueue() to be implemented"); -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int scx_ops_enable(struct sched_ext_ops *ops, struct bpf_link *link) -+{ -+ struct scx_task_iter sti; -+ struct task_struct *p; -+ unsigned long timeout; -+ int i, cpu, ret; -+ -+ mutex_lock(&scx_ops_enable_mutex); -+ -+ if (!scx_ops_helper) { -+ WRITE_ONCE(scx_ops_helper, -+ scx_create_rt_helper("sched_ext_ops_helper")); -+ if (!scx_ops_helper) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ } -+ -+ if (scx_ops_enable_state() != SCX_OPS_DISABLED) { -+ ret = -EBUSY; -+ goto err_unlock; -+ } -+ -+ scx_root_kobj = kzalloc(sizeof(*scx_root_kobj), GFP_KERNEL); -+ if (!scx_root_kobj) { -+ ret = -ENOMEM; -+ goto err_unlock; -+ } -+ -+ scx_root_kobj->kset = scx_kset; -+ ret = kobject_init_and_add(scx_root_kobj, &scx_ktype, NULL, "root"); -+ if (ret < 0) -+ goto err; -+ -+ scx_exit_info = alloc_exit_info(ops->exit_dump_len); -+ if (!scx_exit_info) { -+ ret = -ENOMEM; -+ goto err_del; -+ } -+ -+ /* -+ * Set scx_ops, transition to PREPPING and clear exit info to arm the -+ * disable path. Failure triggers full disabling from here on. -+ */ -+ scx_ops = *ops; -+ -+ WARN_ON_ONCE(scx_ops_set_enable_state(SCX_OPS_PREPPING) != -+ SCX_OPS_DISABLED); -+ -+ atomic_set(&scx_exit_kind, SCX_EXIT_NONE); -+ scx_warned_zero_slice = false; -+ -+ atomic_long_set(&scx_nr_rejected, 0); -+ -+ for_each_possible_cpu(cpu) -+ cpu_rq(cpu)->scx.cpuperf_target = SCX_CPUPERF_ONE; -+ -+ /* -+ * Keep CPUs stable during enable so that the BPF scheduler can track -+ * online CPUs by watching ->on/offline_cpu() after ->init(). 
-+ */ -+ cpus_read_lock(); -+ -+ if (scx_ops.init) { -+ ret = SCX_CALL_OP_RET(SCX_KF_SLEEPABLE, init); -+ if (ret) { -+ ret = ops_sanitize_err("init", ret); -+ goto err_disable_unlock_cpus; -+ } -+ } -+ -+ for (i = SCX_OPI_CPU_HOTPLUG_BEGIN; i < SCX_OPI_CPU_HOTPLUG_END; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ cpus_read_unlock(); -+ -+ ret = validate_ops(ops); -+ if (ret) -+ goto err_disable; -+ -+ WARN_ON_ONCE(scx_dsp_ctx); -+ scx_dsp_max_batch = ops->dispatch_max_batch ?: SCX_DSP_DFL_MAX_BATCH; -+ scx_dsp_ctx = __alloc_percpu(struct_size_t(struct scx_dsp_ctx, buf, -+ scx_dsp_max_batch), -+ __alignof__(struct scx_dsp_ctx)); -+ if (!scx_dsp_ctx) { -+ ret = -ENOMEM; -+ goto err_disable; -+ } -+ -+ if (ops->timeout_ms) -+ timeout = msecs_to_jiffies(ops->timeout_ms); -+ else -+ timeout = SCX_WATCHDOG_MAX_TIMEOUT; -+ -+ WRITE_ONCE(scx_watchdog_timeout, timeout); -+ WRITE_ONCE(scx_watchdog_timestamp, jiffies); -+ queue_delayed_work(system_unbound_wq, &scx_watchdog_work, -+ scx_watchdog_timeout / 2); -+ -+ /* -+ * Lock out forks, cgroup on/offlining and moves before opening the -+ * floodgate so that they don't wander into the operations prematurely. -+ * -+ * We don't need to keep the CPUs stable but static_branch_*() requires -+ * cpus_read_lock() and scx_cgroup_rwsem must nest inside -+ * cpu_hotplug_lock because of the following dependency chain: -+ * -+ * cpu_hotplug_lock --> cgroup_threadgroup_rwsem --> scx_cgroup_rwsem -+ * -+ * So, we need to do cpus_read_lock() before scx_cgroup_lock() and use -+ * static_branch_*_cpuslocked(). -+ * -+ * Note that cpu_hotplug_lock must nest inside scx_fork_rwsem due to the -+ * following dependency chain: -+ * -+ * scx_fork_rwsem --> pernet_ops_rwsem --> cpu_hotplug_lock -+ */ -+ percpu_down_write(&scx_fork_rwsem); -+ cpus_read_lock(); -+ scx_cgroup_lock(); -+ -+ check_hotplug_seq(ops); -+ -+ for (i = SCX_OPI_NORMAL_BEGIN; i < SCX_OPI_NORMAL_END; i++) -+ if (((void (**)(void))ops)[i]) -+ static_branch_enable_cpuslocked(&scx_has_op[i]); -+ -+ if (ops->flags & SCX_OPS_ENQ_LAST) -+ static_branch_enable_cpuslocked(&scx_ops_enq_last); -+ -+ if (ops->flags & SCX_OPS_ENQ_EXITING) -+ static_branch_enable_cpuslocked(&scx_ops_enq_exiting); -+ if (scx_ops.cpu_acquire || scx_ops.cpu_release) -+ static_branch_enable_cpuslocked(&scx_ops_cpu_preempt); -+ -+ if (!ops->update_idle || (ops->flags & SCX_OPS_KEEP_BUILTIN_IDLE)) { -+ reset_idle_masks(); -+ static_branch_enable_cpuslocked(&scx_builtin_idle_enabled); -+ } else { -+ static_branch_disable_cpuslocked(&scx_builtin_idle_enabled); -+ } -+ -+ /* -+ * All cgroups should be initialized before letting in tasks. cgroup -+ * on/offlining and task migrations are already locked out. -+ */ -+ ret = scx_cgroup_init(); -+ if (ret) -+ goto err_disable_unlock_all; -+ -+ static_branch_enable_cpuslocked(&__scx_ops_enabled); -+ -+ /* -+ * Enable ops for every task. Fork is excluded by scx_fork_rwsem -+ * preventing new tasks from being added. No need to exclude tasks -+ * leaving as sched_ext_free() can handle both prepped and enabled -+ * tasks. Prep all tasks first and then enable them with preemption -+ * disabled. 
-+ */ -+ spin_lock_irq(&scx_tasks_lock); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { -+ get_task_struct(p); -+ scx_task_iter_rq_unlock(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ -+ ret = scx_ops_init_task(p, task_group(p), false); -+ if (ret) { -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ scx_task_iter_exit(&sti); -+ spin_unlock_irq(&scx_tasks_lock); -+ pr_err("sched_ext: ops.init_task() failed (%d) for %s[%d] while loading\n", -+ ret, p->comm, p->pid); -+ goto err_disable_unlock_all; -+ } -+ -+ put_task_struct(p); -+ spin_lock_irq(&scx_tasks_lock); -+ } -+ scx_task_iter_exit(&sti); -+ -+ /* -+ * All tasks are prepped but are still ops-disabled. Ensure that -+ * %current can't be scheduled out and switch everyone. -+ * preempt_disable() is necessary because we can't guarantee that -+ * %current won't be starved if scheduled out while switching. -+ */ -+ preempt_disable(); -+ -+ /* -+ * From here on, the disable path must assume that tasks have ops -+ * enabled and need to be recovered. -+ * -+ * Transition to ENABLING fails iff the BPF scheduler has already -+ * triggered scx_bpf_error(). Returning an error code here would lose -+ * the recorded error information. Exit indicating success so that the -+ * error is notified through ops.exit() with all the details. -+ */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLING, SCX_OPS_PREPPING)) { -+ preempt_enable(); -+ spin_unlock_irq(&scx_tasks_lock); -+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); -+ ret = 0; -+ goto err_disable_unlock_all; -+ } -+ -+ /* -+ * We're fully committed and can't fail. The PREPPED -> ENABLED -+ * transitions here are synchronized against sched_ext_free() through -+ * scx_tasks_lock. -+ */ -+ WRITE_ONCE(scx_switching_all, !(ops->flags & SCX_OPS_SWITCH_PARTIAL)); -+ -+ scx_task_iter_init(&sti); -+ while ((p = scx_task_iter_next_locked(&sti, false))) { -+ const struct sched_class *old_class = p->sched_class; -+ struct sched_enq_and_set_ctx ctx; -+ -+ sched_deq_and_put_task(p, DEQUEUE_SAVE | DEQUEUE_MOVE, &ctx); -+ -+ scx_set_task_state(p, SCX_TASK_READY); -+ __setscheduler_prio(p, p->prio); -+ check_class_changing(task_rq(p), p, old_class); -+ -+ sched_enq_and_set_task(&ctx); -+ -+ check_class_changed(task_rq(p), p, old_class, p->prio); -+ } -+ scx_task_iter_exit(&sti); -+ -+ spin_unlock_irq(&scx_tasks_lock); -+ preempt_enable(); -+ scx_cgroup_unlock(); -+ cpus_read_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+ -+ /* see above ENABLING transition for the explanation on exiting with 0 */ -+ if (!scx_ops_tryset_enable_state(SCX_OPS_ENABLED, SCX_OPS_ENABLING)) { -+ WARN_ON_ONCE(atomic_read(&scx_exit_kind) == SCX_EXIT_NONE); -+ ret = 0; -+ goto err_disable; -+ } -+ -+ if (!(ops->flags & SCX_OPS_SWITCH_PARTIAL)) -+ static_branch_enable(&__scx_switched_all); -+ -+ kobject_uevent(scx_root_kobj, KOBJ_ADD); -+ mutex_unlock(&scx_ops_enable_mutex); -+ -+ scx_cgroup_config_knobs(); -+ -+ return 0; -+ -+err_del: -+ kobject_del(scx_root_kobj); -+err: -+ kobject_put(scx_root_kobj); -+ scx_root_kobj = NULL; -+ if (scx_exit_info) { -+ free_exit_info(scx_exit_info); -+ scx_exit_info = NULL; -+ } -+err_unlock: -+ mutex_unlock(&scx_ops_enable_mutex); -+ return ret; -+ -+err_disable_unlock_all: -+ scx_cgroup_unlock(); -+ percpu_up_write(&scx_fork_rwsem); -+err_disable_unlock_cpus: -+ cpus_read_unlock(); -+err_disable: -+ mutex_unlock(&scx_ops_enable_mutex); -+ /* must be fully disabled before returning */ -+ scx_ops_disable(SCX_EXIT_ERROR); -+ 
kthread_flush_work(&scx_ops_disable_work); -+ return ret; -+} -+ -+ -+/******************************************************************************** -+ * bpf_struct_ops plumbing. -+ */ -+#include -+#include -+#include -+ -+extern struct btf *btf_vmlinux; -+static const struct btf_type *task_struct_type; -+static u32 task_struct_type_id; -+ -+/* Make the 2nd argument of .dispatch a pointer that can be NULL. */ -+static bool promote_dispatch_2nd_arg(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ struct btf *btf = bpf_get_btf_vmlinux(); -+ const struct bpf_struct_ops_desc *st_ops_desc; -+ const struct btf_member *member; -+ const struct btf_type *t; -+ u32 btf_id, member_idx; -+ const char *mname; -+ -+ /* btf_id should be the type id of struct sched_ext_ops */ -+ btf_id = prog->aux->attach_btf_id; -+ st_ops_desc = bpf_struct_ops_find(btf, btf_id); -+ if (!st_ops_desc) -+ return false; -+ -+ /* BTF type of struct sched_ext_ops */ -+ t = st_ops_desc->type; -+ -+ member_idx = prog->expected_attach_type; -+ if (member_idx >= btf_type_vlen(t)) -+ return false; -+ -+ /* -+ * Get the member name of this struct_ops program, which corresponds to -+ * a field in struct sched_ext_ops. For example, the member name of the -+ * dispatch struct_ops program (callback) is "dispatch". -+ */ -+ member = &btf_type_member(t)[member_idx]; -+ mname = btf_name_by_offset(btf_vmlinux, member->name_off); -+ -+ /* -+ * Check if it is the second argument of the function pointer at -+ * "dispatch" in struct sched_ext_ops. The arguments of struct_ops -+ * operators are sequential and 64-bit, so the second argument is at -+ * offset sizeof(__u64). -+ */ -+ if (strcmp(mname, "dispatch") == 0 && -+ off == sizeof(__u64)) { -+ /* -+ * The value is a pointer to a type (struct task_struct) given -+ * by a BTF ID (PTR_TO_BTF_ID). It is trusted (PTR_TRUSTED), -+ * however, can be a NULL (PTR_MAYBE_NULL). The BPF program -+ * should check the pointer to make sure it is not NULL before -+ * using it, or the verifier will reject the program. -+ * -+ * Longer term, this is something that should be addressed by -+ * BTF, and be fully contained within the verifier. 
-+ */ -+ info->reg_type = PTR_MAYBE_NULL | PTR_TO_BTF_ID | PTR_TRUSTED; -+ info->btf = btf_vmlinux; -+ info->btf_id = task_struct_type_id; -+ -+ return true; -+ } -+ -+ return false; -+} -+ -+static bool bpf_scx_is_valid_access(int off, int size, -+ enum bpf_access_type type, -+ const struct bpf_prog *prog, -+ struct bpf_insn_access_aux *info) -+{ -+ if (type != BPF_READ) -+ return false; -+ if (promote_dispatch_2nd_arg(off, size, type, prog, info)) -+ return true; -+ if (off < 0 || off >= sizeof(__u64) * MAX_BPF_FUNC_ARGS) -+ return false; -+ if (off % size != 0) -+ return false; -+ -+ return btf_ctx_access(off, size, type, prog, info); -+} -+ -+static int bpf_scx_btf_struct_access(struct bpf_verifier_log *log, -+ const struct bpf_reg_state *reg, int off, -+ int size) -+{ -+ const struct btf_type *t; -+ -+ t = btf_type_by_id(reg->btf, reg->btf_id); -+ if (t == task_struct_type) { -+ if (off >= offsetof(struct task_struct, scx.slice) && -+ off + size <= offsetofend(struct task_struct, scx.slice)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.dsq_vtime) && -+ off + size <= offsetofend(struct task_struct, scx.dsq_vtime)) -+ return SCALAR_VALUE; -+ if (off >= offsetof(struct task_struct, scx.disallow) && -+ off + size <= offsetofend(struct task_struct, scx.disallow)) -+ return SCALAR_VALUE; -+ } -+ -+ return -EACCES; -+} -+ -+static const struct bpf_func_proto * -+bpf_scx_get_func_proto(enum bpf_func_id func_id, const struct bpf_prog *prog) -+{ -+ switch (func_id) { -+ case BPF_FUNC_task_storage_get: -+ return &bpf_task_storage_get_proto; -+ case BPF_FUNC_task_storage_delete: -+ return &bpf_task_storage_delete_proto; -+ default: -+ return bpf_base_func_proto(func_id, prog); -+ } -+} -+ -+static const struct bpf_verifier_ops bpf_scx_verifier_ops = { -+ .get_func_proto = bpf_scx_get_func_proto, -+ .is_valid_access = bpf_scx_is_valid_access, -+ .btf_struct_access = bpf_scx_btf_struct_access, -+}; -+ -+static int bpf_scx_init_member(const struct btf_type *t, -+ const struct btf_member *member, -+ void *kdata, const void *udata) -+{ -+ const struct sched_ext_ops *uops = udata; -+ struct sched_ext_ops *ops = kdata; -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ int ret; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, dispatch_max_batch): -+ if (*(u32 *)(udata + moff) > INT_MAX) -+ return -E2BIG; -+ ops->dispatch_max_batch = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, flags): -+ if (*(u64 *)(udata + moff) & ~SCX_OPS_ALL_FLAGS) -+ return -EINVAL; -+ ops->flags = *(u64 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, name): -+ ret = bpf_obj_name_cpy(ops->name, uops->name, -+ sizeof(ops->name)); -+ if (ret < 0) -+ return ret; -+ if (ret == 0) -+ return -EINVAL; -+ return 1; -+ case offsetof(struct sched_ext_ops, timeout_ms): -+ if (msecs_to_jiffies(*(u32 *)(udata + moff)) > -+ SCX_WATCHDOG_MAX_TIMEOUT) -+ return -E2BIG; -+ ops->timeout_ms = *(u32 *)(udata + moff); -+ return 1; -+ case offsetof(struct sched_ext_ops, exit_dump_len): -+ ops->exit_dump_len = -+ *(u32 *)(udata + moff) ?: SCX_EXIT_DUMP_DFL_LEN; -+ return 1; -+ case offsetof(struct sched_ext_ops, hotplug_seq): -+ ops->hotplug_seq = *(u64 *)(udata + moff); -+ return 1; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_check_member(const struct btf_type *t, -+ const struct btf_member *member, -+ const struct bpf_prog *prog) -+{ -+ u32 moff = __btf_member_bit_offset(t, member) / 8; -+ -+ switch (moff) { -+ case offsetof(struct sched_ext_ops, 
init_task): -+#ifdef CONFIG_EXT_GROUP_SCHED -+ case offsetof(struct sched_ext_ops, cgroup_init): -+ case offsetof(struct sched_ext_ops, cgroup_exit): -+ case offsetof(struct sched_ext_ops, cgroup_prep_move): -+#endif -+ case offsetof(struct sched_ext_ops, cpu_online): -+ case offsetof(struct sched_ext_ops, cpu_offline): -+ case offsetof(struct sched_ext_ops, init): -+ case offsetof(struct sched_ext_ops, exit): -+ break; -+ default: -+ if (prog->sleepable) -+ return -EINVAL; -+ } -+ -+ return 0; -+} -+ -+static int bpf_scx_reg(void *kdata, struct bpf_link *link) -+{ -+ return scx_ops_enable(kdata, link); -+} -+ -+static void bpf_scx_unreg(void *kdata, struct bpf_link *link) -+{ -+ scx_ops_disable(SCX_EXIT_UNREG); -+ kthread_flush_work(&scx_ops_disable_work); -+} -+ -+static int bpf_scx_init(struct btf *btf) -+{ -+ u32 type_id; -+ -+ type_id = btf_find_by_name_kind(btf, "task_struct", BTF_KIND_STRUCT); -+ if (type_id < 0) -+ return -EINVAL; -+ task_struct_type = btf_type_by_id(btf, type_id); -+ task_struct_type_id = type_id; -+ -+ return 0; -+} -+ -+static int bpf_scx_update(void *kdata, void *old_kdata, struct bpf_link *link) -+{ -+ /* -+ * sched_ext does not support updating the actively-loaded BPF -+ * scheduler, as registering a BPF scheduler can always fail if the -+ * scheduler returns an error code for e.g. ops.init(), ops.init_task(), -+ * etc. Similarly, we can always race with unregistration happening -+ * elsewhere, such as with sysrq. -+ */ -+ return -EOPNOTSUPP; -+} -+ -+static int bpf_scx_validate(void *kdata) -+{ -+ return 0; -+} -+ -+static s32 select_cpu_stub(struct task_struct *p, s32 prev_cpu, u64 wake_flags) { return -EINVAL; } -+static void enqueue_stub(struct task_struct *p, u64 enq_flags) {} -+static void dequeue_stub(struct task_struct *p, u64 enq_flags) {} -+static void dispatch_stub(s32 prev_cpu, struct task_struct *p) {} -+static void runnable_stub(struct task_struct *p, u64 enq_flags) {} -+static void running_stub(struct task_struct *p) {} -+static void stopping_stub(struct task_struct *p, bool runnable) {} -+static void quiescent_stub(struct task_struct *p, u64 deq_flags) {} -+static bool yield_stub(struct task_struct *from, struct task_struct *to) { return false; } -+static bool core_sched_before_stub(struct task_struct *a, struct task_struct *b) { return false; } -+static void set_weight_stub(struct task_struct *p, u32 weight) {} -+static void set_cpumask_stub(struct task_struct *p, const struct cpumask *mask) {} -+static void update_idle_stub(s32 cpu, bool idle) {} -+static void cpu_acquire_stub(s32 cpu, struct scx_cpu_acquire_args *args) {} -+static void cpu_release_stub(s32 cpu, struct scx_cpu_release_args *args) {} -+static s32 init_task_stub(struct task_struct *p, struct scx_init_task_args *args) { return -EINVAL; } -+static void exit_task_stub(struct task_struct *p, struct scx_exit_task_args *args) {} -+static void enable_stub(struct task_struct *p) {} -+static void disable_stub(struct task_struct *p) {} -+#ifdef CONFIG_EXT_GROUP_SCHED -+static s32 cgroup_init_stub(struct cgroup *cgrp, struct scx_cgroup_init_args *args) { return -EINVAL; } -+static void cgroup_exit_stub(struct cgroup *cgrp) {} -+static s32 cgroup_prep_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) { return -EINVAL; } -+static void cgroup_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -+static void cgroup_cancel_move_stub(struct task_struct *p, struct cgroup *from, struct cgroup *to) {} -+static void cgroup_set_weight_stub(struct 
cgroup *cgrp, u32 weight) {} -+#endif -+static void cpu_online_stub(s32 cpu) {} -+static void cpu_offline_stub(s32 cpu) {} -+static s32 init_stub(void) { return -EINVAL; } -+static void exit_stub(struct scx_exit_info *info) {} -+ -+static struct sched_ext_ops __bpf_ops_sched_ext_ops = { -+ .select_cpu = select_cpu_stub, -+ .enqueue = enqueue_stub, -+ .dequeue = dequeue_stub, -+ .dispatch = dispatch_stub, -+ .runnable = runnable_stub, -+ .running = running_stub, -+ .stopping = stopping_stub, -+ .quiescent = quiescent_stub, -+ .yield = yield_stub, -+ .core_sched_before = core_sched_before_stub, -+ .set_weight = set_weight_stub, -+ .set_cpumask = set_cpumask_stub, -+ .update_idle = update_idle_stub, -+ .cpu_acquire = cpu_acquire_stub, -+ .cpu_release = cpu_release_stub, -+ .init_task = init_task_stub, -+ .exit_task = exit_task_stub, -+ .enable = enable_stub, -+ .disable = disable_stub, -+#ifdef CONFIG_EXT_GROUP_SCHED -+ .cgroup_init = cgroup_init_stub, -+ .cgroup_exit = cgroup_exit_stub, -+ .cgroup_prep_move = cgroup_prep_move_stub, -+ .cgroup_move = cgroup_move_stub, -+ .cgroup_cancel_move = cgroup_cancel_move_stub, -+ .cgroup_set_weight = cgroup_set_weight_stub, -+#endif -+ .cpu_online = cpu_online_stub, -+ .cpu_offline = cpu_offline_stub, -+ .init = init_stub, -+ .exit = exit_stub, -+}; -+ -+static struct bpf_struct_ops bpf_sched_ext_ops = { -+ .verifier_ops = &bpf_scx_verifier_ops, -+ .reg = bpf_scx_reg, -+ .unreg = bpf_scx_unreg, -+ .check_member = bpf_scx_check_member, -+ .init_member = bpf_scx_init_member, -+ .init = bpf_scx_init, -+ .update = bpf_scx_update, -+ .validate = bpf_scx_validate, -+ .name = "sched_ext_ops", -+ .owner = THIS_MODULE, -+ .cfi_stubs = &__bpf_ops_sched_ext_ops -+}; -+ -+ -+/******************************************************************************** -+ * System integration and init. -+ */ -+ -+static void sysrq_handle_sched_ext_reset(u8 key) -+{ -+ if (scx_ops_helper) -+ scx_ops_disable(SCX_EXIT_SYSRQ); -+ else -+ pr_info("sched_ext: BPF scheduler not yet used\n"); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_reset_op = { -+ .handler = sysrq_handle_sched_ext_reset, -+ .help_msg = "reset-sched-ext(S)", -+ .action_msg = "Disable sched_ext and revert all tasks to CFS", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static void sysrq_handle_sched_ext_dump(u8 key) -+{ -+ struct scx_exit_info ei = { .kind = SCX_EXIT_NONE, .reason = "SysRq-D" }; -+ -+ if (scx_enabled()) -+ scx_dump_state(&ei, 0); -+} -+ -+static const struct sysrq_key_op sysrq_sched_ext_dump_op = { -+ .handler = sysrq_handle_sched_ext_dump, -+ .help_msg = "dump-sched-ext(D)", -+ .action_msg = "Trigger sched_ext debug dump", -+ .enable_mask = SYSRQ_ENABLE_RTNICE, -+}; -+ -+static bool can_skip_idle_kick(struct rq *rq) -+{ -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * We can skip idle kicking if @rq is going to go through at least one -+ * full SCX scheduling cycle before going idle. Just checking whether -+ * curr is not idle is insufficient because we could be racing -+ * balance_one() trying to pull the next task from a remote rq, which -+ * may fail, and @rq may become idle afterwards. -+ * -+ * The race window is small and we don't and can't guarantee that @rq is -+ * only kicked while idle anyway. Skip only when sure. 
-+ */ -+ return !is_idle_task(rq->curr) && !(rq->scx.flags & SCX_RQ_BALANCING); -+} -+ -+static bool kick_one_cpu(s32 cpu, struct rq *this_rq, unsigned long *pseqs) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ struct scx_rq *this_scx = &this_rq->scx; -+ bool should_wait = false; -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ /* -+ * During CPU hotplug, a CPU may depend on kicking itself to make -+ * forward progress. Allow kicking self regardless of online state. -+ */ -+ if (cpu_online(cpu) || cpu == cpu_of(this_rq)) { -+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_preempt)) { -+ if (rq->curr->sched_class == &ext_sched_class) -+ rq->curr->scx.slice = 0; -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); -+ } -+ -+ if (cpumask_test_cpu(cpu, this_scx->cpus_to_wait)) { -+ pseqs[cpu] = rq->scx.pnt_seq; -+ should_wait = true; -+ } -+ -+ resched_curr(rq); -+ } else { -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_preempt); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); -+ } -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+ -+ return should_wait; -+} -+ -+static void kick_one_cpu_if_idle(s32 cpu, struct rq *this_rq) -+{ -+ struct rq *rq = cpu_rq(cpu); -+ unsigned long flags; -+ -+ raw_spin_rq_lock_irqsave(rq, flags); -+ -+ if (!can_skip_idle_kick(rq) && -+ (cpu_online(cpu) || cpu == cpu_of(this_rq))) -+ resched_curr(rq); -+ -+ raw_spin_rq_unlock_irqrestore(rq, flags); -+} -+ -+static void kick_cpus_irq_workfn(struct irq_work *irq_work) -+{ -+ struct rq *this_rq = this_rq(); -+ struct scx_rq *this_scx = &this_rq->scx; -+ unsigned long *pseqs = this_cpu_ptr(scx_kick_cpus_pnt_seqs); -+ bool should_wait = false; -+ s32 cpu; -+ -+ for_each_cpu(cpu, this_scx->cpus_to_kick) { -+ should_wait |= kick_one_cpu(cpu, this_rq, pseqs); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); -+ } -+ -+ for_each_cpu(cpu, this_scx->cpus_to_kick_if_idle) { -+ kick_one_cpu_if_idle(cpu, this_rq); -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_kick_if_idle); -+ } -+ -+ if (!should_wait) -+ return; -+ -+ for_each_cpu(cpu, this_scx->cpus_to_wait) { -+ unsigned long *wait_pnt_seq = &cpu_rq(cpu)->scx.pnt_seq; -+ -+ if (cpu != cpu_of(this_rq)) { -+ /* -+ * Pairs with smp_store_release() issued by this CPU in -+ * scx_next_task_picked() on the resched path. -+ * -+ * We busy-wait here to guarantee that no other task can -+ * be scheduled on our core before the target CPU has -+ * entered the resched path. -+ */ -+ while (smp_load_acquire(wait_pnt_seq) == pseqs[cpu]) -+ cpu_relax(); -+ } -+ -+ cpumask_clear_cpu(cpu, this_scx->cpus_to_wait); -+ } -+} -+ -+/** -+ * print_scx_info - print out sched_ext scheduler state -+ * @log_lvl: the log level to use when printing -+ * @p: target task -+ * -+ * If a sched_ext scheduler is enabled, print the name and state of the -+ * scheduler. If @p is on sched_ext, print further information about the task. -+ * -+ * This function can be safely called on any task as long as the task_struct -+ * itself is accessible. While safe, this function isn't synchronized and may -+ * print out mixups or garbages of limited length. -+ */ -+void print_scx_info(const char *log_lvl, struct task_struct *p) -+{ -+ enum scx_ops_enable_state state = scx_ops_enable_state(); -+ const char *all = READ_ONCE(scx_switching_all) ? 
"+all" : ""; -+ char runnable_at_buf[22] = "?"; -+ struct sched_class *class; -+ unsigned long runnable_at; -+ -+ if (state == SCX_OPS_DISABLED) -+ return; -+ -+ /* -+ * Carefully check if the task was running on sched_ext, and then -+ * carefully copy the time it's been runnable, and its state. -+ */ -+ if (copy_from_kernel_nofault(&class, &p->sched_class, sizeof(class)) || -+ class != &ext_sched_class) { -+ printk("%sSched_ext: %s (%s%s)", log_lvl, scx_ops.name, -+ scx_ops_enable_state_str[state], all); -+ return; -+ } -+ -+ if (!copy_from_kernel_nofault(&runnable_at, &p->scx.runnable_at, -+ sizeof(runnable_at))) -+ scnprintf(runnable_at_buf, sizeof(runnable_at_buf), "%+ldms", -+ jiffies_delta_msecs(runnable_at, jiffies)); -+ -+ /* print everything onto one line to conserve console space */ -+ printk("%sSched_ext: %s (%s%s), task: runnable_at=%s", -+ log_lvl, scx_ops.name, scx_ops_enable_state_str[state], all, -+ runnable_at_buf); -+} -+ -+static int scx_pm_handler(struct notifier_block *nb, unsigned long event, void *ptr) -+{ -+ /* -+ * SCX schedulers often have userspace components which are sometimes -+ * involved in critial scheduling paths. PM operations involve freezing -+ * userspace which can lead to scheduling misbehaviors including stalls. -+ * Let's bypass while PM operations are in progress. -+ */ -+ switch (event) { -+ case PM_HIBERNATION_PREPARE: -+ case PM_SUSPEND_PREPARE: -+ case PM_RESTORE_PREPARE: -+ scx_ops_bypass(true); -+ break; -+ case PM_POST_HIBERNATION: -+ case PM_POST_SUSPEND: -+ case PM_POST_RESTORE: -+ scx_ops_bypass(false); -+ break; -+ } -+ -+ return NOTIFY_OK; -+} -+ -+static struct notifier_block scx_pm_notifier = { -+ .notifier_call = scx_pm_handler, -+}; -+ -+void __init init_sched_ext_class(void) -+{ -+ s32 cpu, v; -+ -+ /* -+ * The following is to prevent the compiler from optimizing out the enum -+ * definitions so that BPF scheduler implementations can use them -+ * through the generated vmlinux.h. -+ */ -+ WRITE_ONCE(v, SCX_ENQ_WAKEUP | SCX_DEQ_SLEEP | SCX_KICK_PREEMPT | -+ SCX_TG_ONLINE); -+ -+ BUG_ON(rhashtable_init(&dsq_hash, &dsq_hash_params)); -+ init_dsq(&scx_dsq_global, SCX_DSQ_GLOBAL); -+#ifdef CONFIG_SMP -+ BUG_ON(!alloc_cpumask_var(&idle_masks.cpu, GFP_KERNEL)); -+ BUG_ON(!alloc_cpumask_var(&idle_masks.smt, GFP_KERNEL)); -+#endif -+ scx_kick_cpus_pnt_seqs = -+ __alloc_percpu(sizeof(scx_kick_cpus_pnt_seqs[0]) * nr_cpu_ids, -+ __alignof__(scx_kick_cpus_pnt_seqs[0])); -+ BUG_ON(!scx_kick_cpus_pnt_seqs); -+ -+ for_each_possible_cpu(cpu) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ init_dsq(&rq->scx.local_dsq, SCX_DSQ_LOCAL); -+ INIT_LIST_HEAD(&rq->scx.runnable_list); -+ -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_kick_if_idle, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_preempt, GFP_KERNEL)); -+ BUG_ON(!zalloc_cpumask_var(&rq->scx.cpus_to_wait, GFP_KERNEL)); -+ init_irq_work(&rq->scx.kick_cpus_irq_work, kick_cpus_irq_workfn); -+ -+ if (cpu_online(cpu)) -+ cpu_rq(cpu)->scx.flags |= SCX_RQ_ONLINE; -+ } -+ -+ register_sysrq_key('S', &sysrq_sched_ext_reset_op); -+ register_sysrq_key('D', &sysrq_sched_ext_dump_op); -+ INIT_DELAYED_WORK(&scx_watchdog_work, scx_watchdog_workfn); -+ scx_cgroup_config_knobs(); -+} -+ -+ -+/******************************************************************************** -+ * Helpers that can be called from the BPF scheduler. 
-+ */ -+#include -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_create_dsq - Create a custom DSQ -+ * @dsq_id: DSQ to create -+ * @node: NUMA node to allocate from -+ * -+ * Create a custom DSQ identified by @dsq_id. Can be called from ops.init(), -+ * ops.init_task(), ops.cgroup_init() and ops.cgroup_prep_move(). -+ */ -+__bpf_kfunc s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) -+{ -+ if (!scx_kf_allowed(SCX_KF_SLEEPABLE)) -+ return -EINVAL; -+ -+ if (unlikely(node >= (int)nr_node_ids || -+ (node < 0 && node != NUMA_NO_NODE))) -+ return -EINVAL; -+ return PTR_ERR_OR_ZERO(create_dsq(dsq_id, node)); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_sleepable) -+BTF_ID_FLAGS(func, scx_bpf_create_dsq, KF_SLEEPABLE) -+BTF_KFUNCS_END(scx_kfunc_ids_sleepable) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_sleepable = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_sleepable, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_select_cpu_dfl - The default implementation of ops.select_cpu() -+ * @p: task_struct to select a CPU for -+ * @prev_cpu: CPU @p was on previously -+ * @wake_flags: %SCX_WAKE_* flags -+ * @is_idle: out parameter indicating whether the returned CPU is idle -+ * -+ * Can only be called from ops.select_cpu() if the built-in CPU selection is -+ * enabled - ops.update_idle() is missing or %SCX_OPS_KEEP_BUILTIN_IDLE is set. -+ * @p, @prev_cpu and @wake_flags match ops.select_cpu(). -+ * -+ * Returns the picked CPU with *@is_idle indicating whether the picked CPU is -+ * currently idle and thus a good candidate for direct dispatching. -+ */ -+__bpf_kfunc s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags, bool *is_idle) -+{ -+ if (!scx_kf_allowed(SCX_KF_SELECT_CPU)) { -+ *is_idle = false; -+ return prev_cpu; -+ } -+#ifdef CONFIG_SMP -+ return scx_select_cpu_dfl(p, prev_cpu, wake_flags, is_idle); -+#else -+ *is_idle = false; -+ return prev_cpu; -+#endif -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_select_cpu) -+BTF_ID_FLAGS(func, scx_bpf_select_cpu_dfl, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_select_cpu) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_select_cpu = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_select_cpu, -+}; -+ -+static bool scx_dispatch_preamble(struct task_struct *p, u64 enq_flags) -+{ -+ if (!scx_kf_allowed(SCX_KF_ENQUEUE | SCX_KF_DISPATCH)) -+ return false; -+ -+ lockdep_assert_irqs_disabled(); -+ -+ if (unlikely(!p)) { -+ scx_ops_error("called with NULL task"); -+ return false; -+ } -+ -+ if (unlikely(enq_flags & __SCX_ENQ_INTERNAL_MASK)) { -+ scx_ops_error("invalid enq_flags 0x%llx", enq_flags); -+ return false; -+ } -+ -+ return true; -+} -+ -+static void scx_dispatch_commit(struct task_struct *p, u64 dsq_id, u64 enq_flags) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct task_struct *ddsp_task; -+ -+ ddsp_task = __this_cpu_read(direct_dispatch_task); -+ if (ddsp_task) { -+ mark_direct_dispatch(ddsp_task, p, dsq_id, enq_flags); -+ return; -+ } -+ -+ if (unlikely(dspc->cursor >= scx_dsp_max_batch)) { -+ scx_ops_error("dispatch buffer overflow"); -+ return; -+ } -+ -+ dspc->buf[dspc->cursor++] = (struct scx_dsp_buf_ent){ -+ .task = p, -+ .qseq = atomic_long_read(&p->scx.ops_state) & SCX_OPSS_QSEQ_MASK, -+ .dsq_id = dsq_id, -+ .enq_flags = enq_flags, -+ }; -+} -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_dispatch - Dispatch a task into the FIFO queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch 
to -+ * @slice: duration @p can run for in nsecs -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the FIFO queue of the DSQ identified by @dsq_id. It is safe -+ * to call this function spuriously. Can be called from ops.enqueue(), -+ * ops.select_cpu(), and ops.dispatch(). -+ * -+ * When called from ops.select_cpu() or ops.enqueue(), it's for direct dispatch -+ * and @p must match the task being enqueued. Also, %SCX_DSQ_LOCAL_ON can't be -+ * used to target the local DSQ of a CPU other than the enqueueing one. Use -+ * ops.select_cpu() to be on the target CPU in the first place. -+ * -+ * When called from ops.select_cpu(), @enq_flags and @dsp_id are stored, and @p -+ * will be directly dispatched to the corresponding dispatch queue after -+ * ops.select_cpu() returns. If @p is dispatched to SCX_DSQ_LOCAL, it will be -+ * dispatched to the local DSQ of the CPU returned by ops.select_cpu(). -+ * @enq_flags are OR'd with the enqueue flags on the enqueue path before the -+ * task is dispatched. -+ * -+ * When called from ops.dispatch(), there are no restrictions on @p or @dsq_id -+ * and this function can be called upto ops.dispatch_max_batch times to dispatch -+ * multiple tasks. scx_bpf_dispatch_nr_slots() returns the number of the -+ * remaining slots. scx_bpf_consume() flushes the batch and resets the counter. -+ * -+ * This function doesn't have any locking restrictions and may be called under -+ * BPF locks (in the future when BPF introduces more flexible locking). -+ * -+ * @p is allowed to run for @slice. The scheduling path is triggered on slice -+ * exhaustion. If zero, the current residual slice is maintained. If -+ * %SCX_SLICE_INF, @p never expires and the BPF scheduler must kick the CPU with -+ * scx_bpf_kick_cpu() to trigger scheduling. -+ */ -+__bpf_kfunc void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, -+ u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags); -+} -+ -+/** -+ * scx_bpf_dispatch_vtime - Dispatch a task into the vtime priority queue of a DSQ -+ * @p: task_struct to dispatch -+ * @dsq_id: DSQ to dispatch to -+ * @slice: duration @p can run for in nsecs -+ * @vtime: @p's ordering inside the vtime-sorted queue of the target DSQ -+ * @enq_flags: SCX_ENQ_* -+ * -+ * Dispatch @p into the vtime priority queue of the DSQ identified by @dsq_id. -+ * Tasks queued into the priority queue are ordered by @vtime and always -+ * consumed after the tasks in the FIFO queue. All other aspects are identical -+ * to scx_bpf_dispatch(). -+ * -+ * @vtime ordering is according to time_before64() which considers wrapping. A -+ * numerically larger vtime may indicate an earlier position in the ordering and -+ * vice-versa. 
-+ */ -+__bpf_kfunc void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, -+ u64 slice, u64 vtime, u64 enq_flags) -+{ -+ if (!scx_dispatch_preamble(p, enq_flags)) -+ return; -+ -+ if (slice) -+ p->scx.slice = slice; -+ else -+ p->scx.slice = p->scx.slice ?: 1; -+ -+ p->scx.dsq_vtime = vtime; -+ -+ scx_dispatch_commit(p, dsq_id, enq_flags | SCX_ENQ_DSQ_PRIQ); -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_enqueue_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_vtime, KF_RCU) -+BTF_KFUNCS_END(scx_kfunc_ids_enqueue_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_enqueue_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_enqueue_dispatch, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_dispatch_nr_slots - Return the number of remaining dispatch slots -+ * -+ * Can only be called from ops.dispatch(). -+ */ -+__bpf_kfunc u32 scx_bpf_dispatch_nr_slots(void) -+{ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return 0; -+ -+ return scx_dsp_max_batch - __this_cpu_read(scx_dsp_ctx->cursor); -+} -+ -+/** -+ * scx_bpf_dispatch_cancel - Cancel the latest dispatch -+ * -+ * Cancel the latest dispatch. Can be called multiple times to cancel further -+ * dispatches. Can only be called from ops.dispatch(). -+ */ -+__bpf_kfunc void scx_bpf_dispatch_cancel(void) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return; -+ -+ if (dspc->cursor > 0) -+ dspc->cursor--; -+ else -+ scx_ops_error("dispatch buffer underflow"); -+} -+ -+/** -+ * scx_bpf_consume - Transfer a task from a DSQ to the current CPU's local DSQ -+ * @dsq_id: DSQ to consume -+ * -+ * Consume a task from the non-local DSQ identified by @dsq_id and transfer it -+ * to the current CPU's local DSQ for execution. Can only be called from -+ * ops.dispatch(). -+ * -+ * This function flushes the in-flight dispatches from scx_bpf_dispatch() before -+ * trying to consume the specified DSQ. It may also grab rq locks and thus can't -+ * be called under any BPF locks. -+ * -+ * Returns %true if a task has been consumed, %false if there isn't any task to -+ * consume. -+ */ -+__bpf_kfunc bool scx_bpf_consume(u64 dsq_id) -+{ -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct scx_dispatch_q *dsq; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq, dspc->rf); -+ -+ dsq = find_non_local_dsq(dsq_id); -+ if (unlikely(!dsq)) { -+ scx_ops_error("invalid DSQ ID 0x%016llx", dsq_id); -+ return false; -+ } -+ -+ if (consume_dispatch_q(dspc->rq, dspc->rf, dsq)) { -+ /* -+ * A successfully consumed task can be dequeued before it starts -+ * running while the CPU is trying to migrate other dispatched -+ * tasks. Bump nr_tasks to tell balance_scx() to retry on empty -+ * local DSQ. -+ */ -+ dspc->nr_tasks++; -+ return true; -+ } else { -+ return false; -+ } -+} -+ -+/** -+ * __scx_bpf_consume_task - Transfer a task from DSQ iteration to the local DSQ -+ * @it: DSQ iterator in progress -+ * @p: task to consume -+ * -+ * Transfer @p which is on the DSQ currently iterated by @it to the current -+ * CPU's local DSQ. For the transfer to be successful, @p must still be on the -+ * DSQ and have been queued before the DSQ iteration started. This function -+ * doesn't care whether @p was obtained from the DSQ iteration. @p just has to -+ * be on the DSQ and have been queued before the iteration started. 
-+ * -+ * Returns %true if @p has been consumed, %false if @p had already been consumed -+ * or dequeued. -+ */ -+__bpf_kfunc bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ struct scx_dispatch_q *dsq, *kit_dsq; -+ struct scx_dsp_ctx *dspc = this_cpu_ptr(scx_dsp_ctx); -+ struct rq *task_rq; -+ u64 kit_dsq_seq; -+ -+ /* can't trust @kit, carefully fetch the values we need */ -+ if (get_kernel_nofault(kit_dsq, &kit->dsq) || -+ get_kernel_nofault(kit_dsq_seq, &kit->dsq_seq)) { -+ scx_ops_error("invalid @it 0x%lx", it); -+ return false; -+ } -+ -+ /* -+ * @kit can't be trusted and we can only get the DSQ from @p. As we -+ * don't know @p's rq is locked, use READ_ONCE() to access the field. -+ * Derefing is safe as DSQs are RCU protected. -+ */ -+ dsq = READ_ONCE(p->scx.dsq); -+ -+ if (unlikely(dsq->id == SCX_DSQ_LOCAL)) { -+ scx_ops_error("local DSQ not allowed"); -+ return false; -+ } -+ -+ if (unlikely(!dsq || dsq != kit_dsq)) -+ return false; -+ -+ if (!scx_kf_allowed(SCX_KF_DISPATCH)) -+ return false; -+ -+ flush_dispatch_buf(dspc->rq, dspc->rf); -+ -+ raw_spin_lock(&dsq->lock); -+ -+ /* -+ * Did someone else get to it? @p could have already left $dsq, got -+ * re-enqueud, or be in the process of being consumed by someone else. -+ */ -+ if (unlikely(p->scx.dsq != dsq || -+ time_after64(p->scx.dsq_seq, kit_dsq_seq) || -+ p->scx.holding_cpu >= 0)) -+ goto out_unlock; -+ -+ task_rq = task_rq(p); -+ -+ if (dspc->rq == task_rq) { -+ consume_local_task(dspc->rq, dsq, p); -+ return true; -+ } -+ -+ if (task_can_run_on_remote_rq(p, dspc->rq)) -+ return consume_remote_task(dspc->rq, dspc->rf, dsq, p, task_rq); -+ -+out_unlock: -+ raw_spin_unlock(&dsq->lock); -+ return false; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_dispatch) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_nr_slots) -+BTF_ID_FLAGS(func, scx_bpf_dispatch_cancel) -+BTF_ID_FLAGS(func, scx_bpf_consume) -+BTF_ID_FLAGS(func, __scx_bpf_consume_task) -+BTF_KFUNCS_END(scx_kfunc_ids_dispatch) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_dispatch = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_dispatch, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_reenqueue_local - Re-enqueue tasks on a local DSQ -+ * -+ * Iterate over all of the tasks currently enqueued on the local DSQ of the -+ * caller's CPU, and re-enqueue them in the BPF scheduler. Returns the number of -+ * processed tasks. Can only be called from ops.cpu_release(). -+ */ -+__bpf_kfunc u32 scx_bpf_reenqueue_local(void) -+{ -+ u32 nr_enqueued, i; -+ struct rq *rq; -+ -+ if (!scx_kf_allowed(SCX_KF_CPU_RELEASE)) -+ return 0; -+ -+ rq = cpu_rq(smp_processor_id()); -+ lockdep_assert_rq_held(rq); -+ -+ /* -+ * Get the number of tasks on the local DSQ before iterating over it to -+ * pull off tasks. The enqueue callback below can signal that it wants -+ * the task to stay on the local DSQ, and we want to prevent the BPF -+ * scheduler from causing us to loop indefinitely. 
-+ */ -+ nr_enqueued = rq->scx.local_dsq.nr; -+ for (i = 0; i < nr_enqueued; i++) { -+ struct task_struct *p; -+ -+ p = first_local_task(rq); -+ WARN_ON_ONCE(atomic_long_read(&p->scx.ops_state) != -+ SCX_OPSS_NONE); -+ WARN_ON_ONCE(!(p->scx.flags & SCX_TASK_QUEUED)); -+ WARN_ON_ONCE(p->scx.holding_cpu != -1); -+ dispatch_dequeue(rq, p); -+ do_enqueue_task(rq, p, SCX_ENQ_REENQ, -1); -+ } -+ -+ return nr_enqueued; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_cpu_release) -+BTF_ID_FLAGS(func, scx_bpf_reenqueue_local) -+BTF_KFUNCS_END(scx_kfunc_ids_cpu_release) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_cpu_release = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_cpu_release, -+}; -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_kick_cpu - Trigger reschedule on a CPU -+ * @cpu: cpu to kick -+ * @flags: %SCX_KICK_* flags -+ * -+ * Kick @cpu into rescheduling. This can be used to wake up an idle CPU or -+ * trigger rescheduling on a busy CPU. This can be called from any online -+ * scx_ops operation and the actual kicking is performed asynchronously through -+ * an irq work. -+ */ -+__bpf_kfunc void scx_bpf_kick_cpu(s32 cpu, u64 flags) -+{ -+ struct rq *this_rq; -+ unsigned long irq_flags; -+ -+ if (!ops_cpu_valid(cpu, NULL)) -+ return; -+ -+ /* -+ * While bypassing for PM ops, IRQ handling may not be online which can -+ * lead to irq_work_queue() malfunction such as infinite busy wait for -+ * IRQ status update. Suppress kicking. -+ */ -+ if (scx_ops_bypassing()) -+ return; -+ -+ local_irq_save(irq_flags); -+ -+ this_rq = this_rq(); -+ -+ /* -+ * Actual kicking is bounced to kick_cpus_irq_workfn() to avoid nesting -+ * rq locks. We can probably be smarter and avoid bouncing if called -+ * from ops which don't hold a rq lock. -+ */ -+ if (flags & SCX_KICK_IDLE) { -+ struct rq *target_rq = cpu_rq(cpu); -+ -+ if (unlikely(flags & (SCX_KICK_PREEMPT | SCX_KICK_WAIT))) -+ scx_ops_error("PREEMPT/WAIT cannot be used with SCX_KICK_IDLE"); -+ -+ if (raw_spin_rq_trylock(target_rq)) { -+ if (can_skip_idle_kick(target_rq)) { -+ raw_spin_rq_unlock(target_rq); -+ goto out; -+ } -+ raw_spin_rq_unlock(target_rq); -+ } -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick_if_idle); -+ } else { -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_kick); -+ -+ if (flags & SCX_KICK_PREEMPT) -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_preempt); -+ if (flags & SCX_KICK_WAIT) -+ cpumask_set_cpu(cpu, this_rq->scx.cpus_to_wait); -+ } -+ -+ irq_work_queue(&this_rq->scx.kick_cpus_irq_work); -+out: -+ local_irq_restore(irq_flags); -+} -+ -+/** -+ * scx_bpf_dsq_nr_queued - Return the number of queued tasks -+ * @dsq_id: id of the DSQ -+ * -+ * Return the number of tasks in the DSQ matching @dsq_id. If not found, -+ * -%ENOENT is returned. 
-+ */ -+__bpf_kfunc s32 scx_bpf_dsq_nr_queued(u64 dsq_id) -+{ -+ struct scx_dispatch_q *dsq; -+ s32 ret; -+ -+ preempt_disable(); -+ -+ if (dsq_id == SCX_DSQ_LOCAL) { -+ ret = READ_ONCE(this_rq()->scx.local_dsq.nr); -+ goto out; -+ } else if ((dsq_id & SCX_DSQ_LOCAL_ON) == SCX_DSQ_LOCAL_ON) { -+ s32 cpu = dsq_id & SCX_DSQ_LOCAL_CPU_MASK; -+ -+ if (ops_cpu_valid(cpu, NULL)) { -+ ret = READ_ONCE(cpu_rq(cpu)->scx.local_dsq.nr); -+ goto out; -+ } -+ } else { -+ dsq = find_non_local_dsq(dsq_id); -+ if (dsq) { -+ ret = READ_ONCE(dsq->nr); -+ goto out; -+ } -+ } -+ ret = -ENOENT; -+out: -+ preempt_enable(); -+ return ret; -+} -+ -+/** -+ * scx_bpf_destroy_dsq - Destroy a custom DSQ -+ * @dsq_id: DSQ to destroy -+ * -+ * Destroy the custom DSQ identified by @dsq_id. Only DSQs created with -+ * scx_bpf_create_dsq() can be destroyed. The caller must ensure that the DSQ is -+ * empty and no further tasks are dispatched to it. Ignored if called on a DSQ -+ * which doesn't exist. Can be called from any online scx_ops operations. -+ */ -+__bpf_kfunc void scx_bpf_destroy_dsq(u64 dsq_id) -+{ -+ destroy_dsq(dsq_id); -+} -+ -+/** -+ * bpf_iter_scx_dsq_new - Create a DSQ iterator -+ * @it: iterator to initialize -+ * @dsq_id: DSQ to iterate -+ * @flags: %SCX_DSQ_ITER_* -+ * -+ * Initialize BPF iterator @it which can be used with bpf_for_each() to walk -+ * tasks in the DSQ specified by @dsq_id. Iteration using @it only includes -+ * tasks which are already queued when this function is invoked. -+ */ -+__bpf_kfunc int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, -+ u64 flags) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ -+ BUILD_BUG_ON(sizeof(struct bpf_iter_scx_dsq_kern) > -+ sizeof(struct bpf_iter_scx_dsq)); -+ BUILD_BUG_ON(__alignof__(struct bpf_iter_scx_dsq_kern) != -+ __alignof__(struct bpf_iter_scx_dsq)); -+ -+ if (flags & ~__SCX_DSQ_ITER_ALL_FLAGS) -+ return -EINVAL; -+ -+ kit->dsq = find_non_local_dsq(dsq_id); -+ if (!kit->dsq) -+ return -ENOENT; -+ -+ INIT_LIST_HEAD(&kit->cursor.list); -+ RB_CLEAR_NODE(&kit->cursor.priq); -+ kit->cursor.flags = SCX_TASK_DSQ_CURSOR; -+ kit->self = kit; -+ kit->dsq_seq = READ_ONCE(kit->dsq->seq); -+ kit->flags = flags; -+ -+ return 0; -+} -+ -+/** -+ * bpf_iter_scx_dsq_next - Progress a DSQ iterator -+ * @it: iterator to progress -+ * -+ * Return the next task. See bpf_iter_scx_dsq_new(). -+ */ -+__bpf_kfunc struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ bool rev = kit->flags & SCX_DSQ_ITER_REV; -+ struct task_struct *p; -+ unsigned long flags; -+ -+ if (!kit->dsq) -+ return NULL; -+ -+ raw_spin_lock_irqsave(&kit->dsq->lock, flags); -+ -+ if (list_empty(&kit->cursor.list)) -+ p = NULL; -+ else -+ p = container_of(&kit->cursor, struct task_struct, scx.dsq_node); -+ -+ /* -+ * Only tasks which were queued before the iteration started are -+ * visible. This bounds BPF iterations and guarantees that vtime never -+ * jumps in the other direction while iterating. 
-+ */ -+ do { -+ p = nldsq_next_task(kit->dsq, p, rev); -+ } while (p && unlikely(time_after64(p->scx.dsq_seq, kit->dsq_seq))); -+ -+ if (p) { -+ if (rev) -+ list_move_tail(&kit->cursor.list, &p->scx.dsq_node.list); -+ else -+ list_move(&kit->cursor.list, &p->scx.dsq_node.list); -+ } else { -+ list_del_init(&kit->cursor.list); -+ } -+ -+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); -+ -+ return p; -+} -+ -+/** -+ * bpf_iter_scx_dsq_destroy - Destroy a DSQ iterator -+ * @it: iterator to destroy -+ * -+ * Undo scx_iter_scx_dsq_new(). -+ */ -+__bpf_kfunc void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) -+{ -+ struct bpf_iter_scx_dsq_kern *kit = (void *)it; -+ -+ if (!kit->dsq) -+ return; -+ -+ if (!list_empty(&kit->cursor.list)) { -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&kit->dsq->lock, flags); -+ list_del_init(&kit->cursor.list); -+ raw_spin_unlock_irqrestore(&kit->dsq->lock, flags); -+ } -+ kit->dsq = NULL; -+} -+ -+__bpf_kfunc_end_defs(); -+ -+static s32 __bstr_format(u64 *data_buf, char *line_buf, size_t line_size, -+ char *fmt, unsigned long long *data, u32 data__sz) -+{ -+ struct bpf_bprintf_data bprintf_data = { .get_bin_args = true }; -+ s32 ret; -+ -+ if (data__sz % 8 || data__sz > MAX_BPRINTF_VARARGS * 8 || -+ (data__sz && !data)) { -+ scx_ops_error("invalid data=%p and data__sz=%u", -+ (void *)data, data__sz); -+ return -EINVAL; -+ } -+ -+ ret = copy_from_kernel_nofault(data_buf, data, data__sz); -+ if (ret < 0) { -+ scx_ops_error("failed to read data fields (%d)", ret); -+ return ret; -+ } -+ -+ ret = bpf_bprintf_prepare(fmt, UINT_MAX, data_buf, data__sz / 8, -+ &bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("format preparation failed (%d)", ret); -+ return ret; -+ } -+ -+ ret = bstr_printf(line_buf, line_size, fmt, -+ bprintf_data.bin_args); -+ bpf_bprintf_cleanup(&bprintf_data); -+ if (ret < 0) { -+ scx_ops_error("(\"%s\", %p, %u) failed to format", -+ fmt, data, data__sz); -+ return ret; -+ } -+ -+ return ret; -+} -+ -+static s32 bstr_format(struct scx_bstr_buf *buf, -+ char *fmt, unsigned long long *data, u32 data__sz) -+{ -+ return __bstr_format(buf->data, buf->line, sizeof(buf->line), -+ fmt, data, data__sz); -+} -+ -+__bpf_kfunc_start_defs(); -+ -+/** -+ * scx_bpf_exit_bstr - Gracefully exit the BPF scheduler. -+ * @exit_code: Exit value to pass to user space via struct scx_exit_info. -+ * @fmt: error message format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * Indicate that the BPF scheduler wants to exit gracefully, and initiate ops -+ * disabling. -+ */ -+__bpf_kfunc void scx_bpf_exit_bstr(s64 exit_code, char *fmt, -+ unsigned long long *data, u32 data__sz) -+{ -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); -+ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) -+ scx_ops_exit_kind(SCX_EXIT_UNREG_BPF, exit_code, "%s", -+ scx_exit_bstr_buf.line); -+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); -+} -+ -+/** -+ * scx_bpf_error_bstr - Indicate fatal error -+ * @fmt: error message format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * Indicate that the BPF scheduler encountered a fatal error and initiate ops -+ * disabling. 
-+ */ -+__bpf_kfunc void scx_bpf_error_bstr(char *fmt, unsigned long long *data, -+ u32 data__sz) -+{ -+ unsigned long flags; -+ -+ raw_spin_lock_irqsave(&scx_exit_bstr_buf_lock, flags); -+ if (bstr_format(&scx_exit_bstr_buf, fmt, data, data__sz) >= 0) -+ scx_ops_exit_kind(SCX_EXIT_ERROR_BPF, 0, "%s", -+ scx_exit_bstr_buf.line); -+ raw_spin_unlock_irqrestore(&scx_exit_bstr_buf_lock, flags); -+} -+ -+/** -+ * scx_bpf_dump - Generate extra debug dump specific to the BPF scheduler -+ * @fmt: format string -+ * @data: format string parameters packaged using ___bpf_fill() macro -+ * @data__sz: @data len, must end in '__sz' for the verifier -+ * -+ * To be called through scx_bpf_dump() helper from ops.dump(), dump_cpu() and -+ * dump_task() to generate extra debug dump specific to the BPF scheduler. -+ * -+ * The extra dump may be multiple lines. A single line may be split over -+ * multiple calls. The last line is automatically terminated. -+ */ -+__bpf_kfunc void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, -+ u32 data__sz) -+{ -+ struct scx_dump_data *dd = &scx_dump_data; -+ struct scx_bstr_buf *buf = &dd->buf; -+ s32 ret; -+ -+ if (raw_smp_processor_id() != dd->cpu) { -+ scx_ops_error("scx_bpf_dump() must only be called from ops.dump() and friends"); -+ return; -+ } -+ -+ /* append the formatted string to the line buf */ -+ ret = __bstr_format(buf->data, buf->line + dd->cursor, -+ sizeof(buf->line) - dd->cursor, fmt, data, data__sz); -+ if (ret < 0) { -+ dump_line(dd->s, "%s[!] (\"%s\", %p, %u) failed to format (%d)", -+ dd->prefix, fmt, data, data__sz, ret); -+ return; -+ } -+ -+ dd->cursor += ret; -+ dd->cursor = min_t(s32, dd->cursor, sizeof(buf->line)); -+ -+ if (!dd->cursor) -+ return; -+ -+ /* -+ * If the line buf overflowed or ends in a newline, flush it into the -+ * dump. This is to allow the caller to generate a single line over -+ * multiple calls. As ops_dump_flush() can also handle multiple lines in -+ * the line buf, the only case which can lead to an unexpected -+ * truncation is when the caller keeps generating newlines in the middle -+ * instead of the end consecutively. Don't do that. -+ */ -+ if (dd->cursor >= sizeof(buf->line) || buf->line[dd->cursor - 1] == '\n') -+ ops_dump_flush(); -+} -+ -+/** -+ * scx_bpf_cpuperf_cap - Query the maximum relative capacity of a CPU -+ * @cpu: CPU of interest -+ * -+ * Return the maximum relative capacity of @cpu in relation to the most -+ * performant CPU in the system. The return value is in the range [1, -+ * %SCX_CPUPERF_ONE]. See scx_bpf_cpuperf_cur(). -+ */ -+__bpf_kfunc u32 scx_bpf_cpuperf_cap(s32 cpu) -+{ -+ if (ops_cpu_valid(cpu, NULL)) -+ return arch_scale_cpu_capacity(cpu); -+ else -+ return SCX_CPUPERF_ONE; -+} -+ -+/** -+ * scx_bpf_cpuperf_cur - Query the current relative performance of a CPU -+ * @cpu: CPU of interest -+ * -+ * Return the current relative performance of @cpu in relation to its maximum. -+ * The return value is in the range [1, %SCX_CPUPERF_ONE]. -+ * -+ * The current performance level of a CPU in relation to the maximum performance -+ * available in the system can be calculated as follows: -+ * -+ * scx_bpf_cpuperf_cap() * scx_bpf_cpuperf_cur() / %SCX_CPUPERF_ONE -+ * -+ * The result is in the range [1, %SCX_CPUPERF_ONE]. 
-+ */ -+__bpf_kfunc u32 scx_bpf_cpuperf_cur(s32 cpu) -+{ -+ if (ops_cpu_valid(cpu, NULL)) -+ return arch_scale_freq_capacity(cpu); -+ else -+ return SCX_CPUPERF_ONE; -+} -+ -+/** -+ * scx_bpf_cpuperf_set - Set the relative performance target of a CPU -+ * @cpu: CPU of interest -+ * @perf: target performance level [0, %SCX_CPUPERF_ONE] -+ * @flags: %SCX_CPUPERF_* flags -+ * -+ * Set the target performance level of @cpu to @perf. @perf is in linear -+ * relative scale between 0 and %SCX_CPUPERF_ONE. This determines how the -+ * schedutil cpufreq governor chooses the target frequency. -+ * -+ * The actual performance level chosen, CPU grouping, and the overhead and -+ * latency of the operations are dependent on the hardware and cpufreq driver in -+ * use. Consult hardware and cpufreq documentation for more information. The -+ * current performance level can be monitored using scx_bpf_cpuperf_cur(). -+ */ -+__bpf_kfunc void scx_bpf_cpuperf_set(u32 cpu, u32 perf) -+{ -+ if (unlikely(perf > SCX_CPUPERF_ONE)) { -+ scx_ops_error("Invalid cpuperf target %u for CPU %d", perf, cpu); -+ return; -+ } -+ -+ if (ops_cpu_valid(cpu, NULL)) { -+ struct rq *rq = cpu_rq(cpu); -+ -+ rq->scx.cpuperf_target = perf; -+ -+ rcu_read_lock_sched_notrace(); -+ cpufreq_update_util(cpu_rq(cpu), 0); -+ rcu_read_unlock_sched_notrace(); -+ } -+} -+ -+/** -+ * scx_bpf_nr_cpu_ids - Return the number of possible CPU IDs -+ * -+ * All valid CPU IDs in the system are smaller than the returned value. -+ */ -+__bpf_kfunc u32 scx_bpf_nr_cpu_ids(void) -+{ -+ return nr_cpu_ids; -+} -+ -+/** -+ * scx_bpf_get_possible_cpumask - Get a referenced kptr to cpu_possible_mask -+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_possible_cpumask(void) -+{ -+ return cpu_possible_mask; -+} -+ -+/** -+ * scx_bpf_get_online_cpumask - Get a referenced kptr to cpu_online_mask -+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_online_cpumask(void) -+{ -+ return cpu_online_mask; -+} -+ -+/** -+ * scx_bpf_put_cpumask - Release a possible/online cpumask -+ * @cpumask: cpumask to release -+ */ -+__bpf_kfunc void scx_bpf_put_cpumask(const struct cpumask *cpumask) -+{ -+ /* -+ * Empty function body because we aren't actually acquiring or releasing -+ * a reference to a global cpumask, which is read-only in the caller and -+ * is never released. The acquire / release semantics here are just used -+ * to make the cpumask is a trusted pointer in the caller. -+ */ -+} -+ -+/** -+ * scx_bpf_get_idle_cpumask - Get a referenced kptr to the idle-tracking -+ * per-CPU cpumask. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. -+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_cpumask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_get_idle_smtmask - Get a referenced kptr to the idle-tracking, -+ * per-physical-core cpumask. Can be used to determine if an entire physical -+ * core is free. -+ * -+ * Returns NULL if idle tracking is not enabled, or running on a UP kernel. 
-+ */ -+__bpf_kfunc const struct cpumask *scx_bpf_get_idle_smtmask(void) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return cpu_none_mask; -+ } -+ -+#ifdef CONFIG_SMP -+ if (sched_smt_active()) -+ return idle_masks.smt; -+ else -+ return idle_masks.cpu; -+#else -+ return cpu_none_mask; -+#endif -+} -+ -+/** -+ * scx_bpf_put_idle_cpumask - Release a previously acquired referenced kptr to -+ * either the percpu, or SMT idle-tracking cpumask. -+ */ -+__bpf_kfunc void scx_bpf_put_idle_cpumask(const struct cpumask *idle_mask) -+{ -+ /* -+ * Empty function body because we aren't actually acquiring or releasing -+ * a reference to a global idle cpumask, which is read-only in the -+ * caller and is never released. The acquire / release semantics here -+ * are just used to make the cpumask a trusted pointer in the caller. -+ */ -+} -+ -+/** -+ * scx_bpf_test_and_clear_cpu_idle - Test and clear @cpu's idle state -+ * @cpu: cpu to test and clear idle for -+ * -+ * Returns %true if @cpu was idle and its idle state was successfully cleared. -+ * %false otherwise. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+__bpf_kfunc bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return false; -+ } -+ -+ if (ops_cpu_valid(cpu, NULL)) -+ return test_and_clear_cpu_idle(cpu); -+ else -+ return false; -+} -+ -+/** -+ * scx_bpf_pick_idle_cpu - Pick and claim an idle cpu -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. Returns the picked idle cpu -+ * number on success. -%EBUSY if no matching cpu was found. -+ * -+ * Idle CPU tracking may race against CPU scheduling state transitions. For -+ * example, this function may return -%EBUSY as CPUs are transitioning into the -+ * idle state. If the caller then assumes that there will be dispatch events on -+ * the CPUs as they were all busy, the scheduler may end up stalling with CPUs -+ * idling while there are pending tasks. Use scx_bpf_pick_any_cpu() and -+ * scx_bpf_kick_cpu() to guarantee that there will be at least one dispatch -+ * event in the near future. -+ * -+ * Unavailable if ops.update_idle() is implemented and -+ * %SCX_OPS_KEEP_BUILTIN_IDLE is not set. -+ */ -+__bpf_kfunc s32 scx_bpf_pick_idle_cpu(const struct cpumask *cpus_allowed, -+ u64 flags) -+{ -+ if (!static_branch_likely(&scx_builtin_idle_enabled)) { -+ scx_ops_error("built-in idle tracking is disabled"); -+ return -EBUSY; -+ } -+ -+ return scx_pick_idle_cpu(cpus_allowed, flags); -+} -+ -+/** -+ * scx_bpf_pick_any_cpu - Pick and claim an idle cpu if available or pick any CPU -+ * @cpus_allowed: Allowed cpumask -+ * @flags: %SCX_PICK_IDLE_CPU_* flags -+ * -+ * Pick and claim an idle cpu in @cpus_allowed. If none is available, pick any -+ * CPU in @cpus_allowed. Guaranteed to succeed and returns the picked idle cpu -+ * number if @cpus_allowed is not empty. -%EBUSY is returned if @cpus_allowed is -+ * empty. -+ * -+ * If ops.update_idle() is implemented and %SCX_OPS_KEEP_BUILTIN_IDLE is not -+ * set, this function can't tell which CPUs are idle and will always pick any -+ * CPU. 
-+ */ -+__bpf_kfunc s32 scx_bpf_pick_any_cpu(const struct cpumask *cpus_allowed, -+ u64 flags) -+{ -+ s32 cpu; -+ -+ if (static_branch_likely(&scx_builtin_idle_enabled)) { -+ cpu = scx_pick_idle_cpu(cpus_allowed, flags); -+ if (cpu >= 0) -+ return cpu; -+ } -+ -+ cpu = cpumask_any_distribute(cpus_allowed); -+ if (cpu < nr_cpu_ids) -+ return cpu; -+ else -+ return -EBUSY; -+} -+ -+/** -+ * scx_bpf_task_running - Is task currently running? -+ * @p: task of interest -+ */ -+__bpf_kfunc bool scx_bpf_task_running(const struct task_struct *p) -+{ -+ return task_rq(p)->curr == p; -+} -+ -+/** -+ * scx_bpf_task_cpu - CPU a task is currently associated with -+ * @p: task of interest -+ */ -+__bpf_kfunc s32 scx_bpf_task_cpu(const struct task_struct *p) -+{ -+ return task_cpu(p); -+} -+ -+/** -+ * scx_bpf_task_cgroup - Return the sched cgroup of a task -+ * @p: task of interest -+ * -+ * @p->sched_task_group->css.cgroup represents the cgroup @p is associated with -+ * from the scheduler's POV. SCX operations should use this function to -+ * determine @p's current cgroup as, unlike following @p->cgroups, -+ * @p->sched_task_group is protected by @p's rq lock and thus atomic w.r.t. all -+ * rq-locked operations. Can be called on the parameter tasks of rq-locked -+ * operations. The restriction guarantees that @p's rq is locked by the caller. -+ */ -+#ifdef CONFIG_CGROUP_SCHED -+__bpf_kfunc struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) -+{ -+ struct task_group *tg = p->sched_task_group; -+ struct cgroup *cgrp = &cgrp_dfl_root.cgrp; -+ -+ if (!scx_kf_allowed_on_arg_tasks(__SCX_KF_RQ_LOCKED, p)) -+ goto out; -+ -+ /* -+ * A task_group may either be a cgroup or an autogroup. In the latter -+ * case, @tg->css.cgroup is %NULL. A task_group can't become the other -+ * kind once created. 
-+ */ -+ if (tg && tg->css.cgroup) -+ cgrp = tg->css.cgroup; -+ else -+ cgrp = &cgrp_dfl_root.cgrp; -+out: -+ cgroup_get(cgrp); -+ return cgrp; -+} -+#endif -+ -+__bpf_kfunc_end_defs(); -+ -+BTF_KFUNCS_START(scx_kfunc_ids_any) -+BTF_ID_FLAGS(func, scx_bpf_kick_cpu) -+BTF_ID_FLAGS(func, scx_bpf_dsq_nr_queued) -+BTF_ID_FLAGS(func, scx_bpf_destroy_dsq) -+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_new, KF_ITER_NEW | KF_RCU_PROTECTED) -+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_next, KF_ITER_NEXT | KF_RET_NULL) -+BTF_ID_FLAGS(func, bpf_iter_scx_dsq_destroy, KF_ITER_DESTROY) -+BTF_ID_FLAGS(func, scx_bpf_exit_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_error_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_dump_bstr, KF_TRUSTED_ARGS) -+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cap) -+BTF_ID_FLAGS(func, scx_bpf_cpuperf_cur) -+BTF_ID_FLAGS(func, scx_bpf_cpuperf_set) -+BTF_ID_FLAGS(func, scx_bpf_nr_cpu_ids) -+BTF_ID_FLAGS(func, scx_bpf_get_possible_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_get_online_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_put_cpumask, KF_RELEASE) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_cpumask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_get_idle_smtmask, KF_ACQUIRE) -+BTF_ID_FLAGS(func, scx_bpf_put_idle_cpumask, KF_RELEASE) -+BTF_ID_FLAGS(func, scx_bpf_test_and_clear_cpu_idle) -+BTF_ID_FLAGS(func, scx_bpf_pick_idle_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_pick_any_cpu, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_task_running, KF_RCU) -+BTF_ID_FLAGS(func, scx_bpf_task_cpu, KF_RCU) -+#ifdef CONFIG_CGROUP_SCHED -+BTF_ID_FLAGS(func, scx_bpf_task_cgroup, KF_RCU | KF_ACQUIRE) -+#endif -+BTF_KFUNCS_END(scx_kfunc_ids_any) -+ -+static const struct btf_kfunc_id_set scx_kfunc_set_any = { -+ .owner = THIS_MODULE, -+ .set = &scx_kfunc_ids_any, -+}; -+ -+static int __init scx_init(void) -+{ -+ int ret; -+ -+ /* -+ * kfunc registration can't be done from init_sched_ext_class() as -+ * register_btf_kfunc_id_set() needs most of the system to be up. -+ * -+ * Some kfuncs are context-sensitive and can only be called from -+ * specific SCX ops. They are grouped into BTF sets accordingly. -+ * Unfortunately, BPF currently doesn't have a way of enforcing such -+ * restrictions. Eventually, the verifier should be able to enforce -+ * them. For now, register them the same and make each kfunc explicitly -+ * check using scx_kf_allowed(). 
-+ */ -+ if ((ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_sleepable)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_select_cpu)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_enqueue_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_dispatch)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_cpu_release)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_STRUCT_OPS, -+ &scx_kfunc_set_any)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_TRACING, -+ &scx_kfunc_set_any)) || -+ (ret = register_btf_kfunc_id_set(BPF_PROG_TYPE_SYSCALL, -+ &scx_kfunc_set_any))) { -+ pr_err("sched_ext: Failed to register kfunc sets (%d)\n", ret); -+ return ret; -+ } -+ -+ ret = register_bpf_struct_ops(&bpf_sched_ext_ops, sched_ext_ops); -+ if (ret) { -+ pr_err("sched_ext: Failed to register struct_ops (%d)\n", ret); -+ return ret; -+ } -+ -+ ret = register_pm_notifier(&scx_pm_notifier); -+ if (ret) { -+ pr_err("sched_ext: Failed to register PM notifier (%d)\n", ret); -+ return ret; -+ } -+ -+ scx_kset = kset_create_and_add("sched_ext", &scx_uevent_ops, kernel_kobj); -+ if (!scx_kset) { -+ pr_err("sched_ext: Failed to create /sys/kernel/sched_ext\n"); -+ return -ENOMEM; -+ } -+ -+ ret = sysfs_create_group(&scx_kset->kobj, &scx_global_attr_group); -+ if (ret < 0) { -+ pr_err("sched_ext: Failed to add global attributes\n"); -+ return ret; -+ } -+ -+ return 0; -+} -+__initcall(scx_init); -diff --git a/kernel/sched/ext.h b/kernel/sched/ext.h -new file mode 100644 -index 000000000000..52d9b7df2a25 ---- /dev/null -+++ b/kernel/sched/ext.h -@@ -0,0 +1,143 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * BPF extensible scheduler class: Documentation/scheduler/sched-ext.rst -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ -+struct sched_enq_and_set_ctx { -+ struct task_struct *p; -+ int queue_flags; -+ bool queued; -+ bool running; -+}; -+ -+void sched_deq_and_put_task(struct task_struct *p, int queue_flags, -+ struct sched_enq_and_set_ctx *ctx); -+void sched_enq_and_set_task(struct sched_enq_and_set_ctx *ctx); -+ -+extern const struct sched_class ext_sched_class; -+ -+DECLARE_STATIC_KEY_FALSE(__scx_ops_enabled); -+DECLARE_STATIC_KEY_FALSE(__scx_switched_all); -+#define scx_enabled() static_branch_unlikely(&__scx_ops_enabled) -+#define scx_switched_all() static_branch_unlikely(&__scx_switched_all) -+ -+DECLARE_STATIC_KEY_FALSE(scx_ops_cpu_preempt); -+ -+static inline bool task_on_scx(const struct task_struct *p) -+{ -+ return scx_enabled() && p->sched_class == &ext_sched_class; -+} -+ -+void scx_next_task_picked(struct rq *rq, struct task_struct *p, -+ const struct sched_class *active); -+void scx_tick(struct rq *rq); -+void init_scx_entity(struct sched_ext_entity *scx); -+void scx_pre_fork(struct task_struct *p); -+int scx_fork(struct task_struct *p); -+void scx_post_fork(struct task_struct *p); -+void scx_cancel_fork(struct task_struct *p); -+int scx_check_setscheduler(struct task_struct *p, int policy); -+bool scx_can_stop_tick(struct rq *rq); -+bool task_should_scx(struct task_struct *p); -+void init_sched_ext_class(void); -+void scx_rq_activate(struct rq *rq); -+void scx_rq_deactivate(struct rq *rq); -+ -+static inline u32 scx_cpuperf_target(s32 cpu) -+{ -+ if (scx_enabled()) -+ return cpu_rq(cpu)->scx.cpuperf_target; -+ else -+ return 0; -+} -+ -+static inline const struct sched_class *next_active_class(const struct sched_class *class) -+{ -+ class++; -+ if (scx_switched_all() && class == &fair_sched_class) -+ class++; -+ if (!scx_enabled() && class == &ext_sched_class) -+ class++; -+ return class; -+} -+ -+#define for_active_class_range(class, _from, _to) \ -+ for (class = (_from); class != (_to); class = next_active_class(class)) -+ -+#define for_each_active_class(class) \ -+ for_active_class_range(class, __sched_class_highest, __sched_class_lowest) -+ -+/* -+ * SCX requires a balance() call before every pick_next_task() call including -+ * when waking up from idle. -+ */ -+#define for_balance_class_range(class, prev_class, end_class) \ -+ for_active_class_range(class, (prev_class) > &ext_sched_class ? 
\ -+ &ext_sched_class : (prev_class), (end_class)) -+ -+#ifdef CONFIG_SCHED_CORE -+bool scx_prio_less(const struct task_struct *a, const struct task_struct *b, -+ bool in_fi); -+#endif -+ -+#else /* CONFIG_SCHED_CLASS_EXT */ -+ -+#define scx_enabled() false -+#define scx_switched_all() false -+ -+static inline void scx_next_task_picked(struct rq *rq, struct task_struct *p, -+ const struct sched_class *active) {} -+static inline void scx_tick(struct rq *rq) {} -+static inline void scx_pre_fork(struct task_struct *p) {} -+static inline int scx_fork(struct task_struct *p) { return 0; } -+static inline void scx_post_fork(struct task_struct *p) {} -+static inline void scx_cancel_fork(struct task_struct *p) {} -+static inline int scx_check_setscheduler(struct task_struct *p, int policy) { return 0; } -+static inline bool scx_can_stop_tick(struct rq *rq) { return true; } -+static inline bool task_on_scx(const struct task_struct *p) { return false; } -+static inline void init_sched_ext_class(void) {} -+static inline u32 scx_cpuperf_target(s32 cpu) { return 0; } -+static inline void scx_rq_activate(struct rq *rq) {} -+static inline void scx_rq_deactivate(struct rq *rq) {} -+ -+#define for_each_active_class for_each_class -+#define for_balance_class_range for_class_range -+ -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ -+#if defined(CONFIG_SCHED_CLASS_EXT) && defined(CONFIG_SMP) -+void __scx_update_idle(struct rq *rq, bool idle); -+ -+static inline void scx_update_idle(struct rq *rq, bool idle) -+{ -+ if (scx_enabled()) -+ __scx_update_idle(rq, idle); -+} -+#else -+static inline void scx_update_idle(struct rq *rq, bool idle) {} -+#endif -+ -+#ifdef CONFIG_CGROUP_SCHED -+#ifdef CONFIG_EXT_GROUP_SCHED -+int scx_tg_online(struct task_group *tg); -+void scx_tg_offline(struct task_group *tg); -+int scx_cgroup_can_attach(struct cgroup_taskset *tset); -+void scx_move_task(struct task_struct *p); -+void scx_cgroup_finish_attach(void); -+void scx_cgroup_cancel_attach(struct cgroup_taskset *tset); -+void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight); -+#else /* CONFIG_EXT_GROUP_SCHED */ -+static inline int scx_tg_online(struct task_group *tg) { return 0; } -+static inline void scx_tg_offline(struct task_group *tg) {} -+static inline int scx_cgroup_can_attach(struct cgroup_taskset *tset) { return 0; } -+static inline void scx_move_task(struct task_struct *p) {} -+static inline void scx_cgroup_finish_attach(void) {} -+static inline void scx_cgroup_cancel_attach(struct cgroup_taskset *tset) {} -+static inline void scx_group_set_weight(struct task_group *tg, unsigned long cgrp_weight) {} -+#endif /* CONFIG_EXT_GROUP_SCHED */ -+#endif /* CONFIG_CGROUP_SCHED */ -diff --git a/kernel/sched/fair.c b/kernel/sched/fair.c -index a241e0d45922..00fbaec603bf 100644 ---- a/kernel/sched/fair.c -+++ b/kernel/sched/fair.c -@@ -3848,7 +3848,7 @@ static void reweight_entity(struct cfs_rq *cfs_rq, struct sched_entity *se, - } - } - --void reweight_task(struct task_struct *p, int prio) -+static void reweight_task_fair(struct rq *rq, struct task_struct *p, int prio) - { - struct sched_entity *se = &p->se; - struct cfs_rq *cfs_rq = cfs_rq_of(se); -@@ -8404,7 +8404,7 @@ static void check_preempt_wakeup_fair(struct rq *rq, struct task_struct *p, int - * Batch and idle tasks do not preempt non-idle tasks (their preemption - * is driven by the tick): - */ -- if (unlikely(p->policy != SCHED_NORMAL) || !sched_feat(WAKEUP_PREEMPTION)) -+ if (unlikely(!normal_policy(p->policy)) || !sched_feat(WAKEUP_PREEMPTION)) - return; 
- - find_matching_se(&se, &pse); -@@ -9365,28 +9365,18 @@ static inline void update_blocked_load_status(struct rq *rq, bool has_blocked) { - - static bool __update_blocked_others(struct rq *rq, bool *done) - { -- const struct sched_class *curr_class; -- u64 now = rq_clock_pelt(rq); -- unsigned long hw_pressure; -- bool decayed; -+ bool updated; - - /* - * update_load_avg() can call cpufreq_update_util(). Make sure that RT, - * DL and IRQ signals have been updated before updating CFS. - */ -- curr_class = rq->curr->sched_class; -- -- hw_pressure = arch_scale_hw_pressure(cpu_of(rq)); -- -- decayed = update_rt_rq_load_avg(now, rq, curr_class == &rt_sched_class) | -- update_dl_rq_load_avg(now, rq, curr_class == &dl_sched_class) | -- update_hw_load_avg(now, rq, hw_pressure) | -- update_irq_load_avg(rq, 0); -+ updated = update_other_load_avgs(rq); - - if (others_have_blocked(rq)) - *done = false; - -- return decayed; -+ return updated; - } - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -13227,6 +13217,7 @@ DEFINE_SCHED_CLASS(fair) = { - .task_tick = task_tick_fair, - .task_fork = task_fork_fair, - -+ .reweight_task = reweight_task_fair, - .prio_changed = prio_changed_fair, - .switched_from = switched_from_fair, - .switched_to = switched_to_fair, -diff --git a/kernel/sched/idle.c b/kernel/sched/idle.c -index 6135fbe83d68..3b6540cc436a 100644 ---- a/kernel/sched/idle.c -+++ b/kernel/sched/idle.c -@@ -458,11 +458,13 @@ static void wakeup_preempt_idle(struct rq *rq, struct task_struct *p, int flags) - - static void put_prev_task_idle(struct rq *rq, struct task_struct *prev) - { -+ scx_update_idle(rq, false); - } - - static void set_next_task_idle(struct rq *rq, struct task_struct *next, bool first) - { - update_idle_core(rq); -+ scx_update_idle(rq, true); - schedstat_inc(rq->sched_goidle); - } - -diff --git a/kernel/sched/sched.h b/kernel/sched/sched.h -index 6e6a45087015..920540d876a6 100644 ---- a/kernel/sched/sched.h -+++ b/kernel/sched/sched.h -@@ -187,9 +187,19 @@ static inline int idle_policy(int policy) - { - return policy == SCHED_IDLE; - } -+ -+static inline int normal_policy(int policy) -+{ -+#ifdef CONFIG_SCHED_CLASS_EXT -+ if (policy == SCHED_EXT) -+ return true; -+#endif -+ return policy == SCHED_NORMAL; -+} -+ - static inline int fair_policy(int policy) - { -- return policy == SCHED_NORMAL || policy == SCHED_BATCH; -+ return normal_policy(policy) || policy == SCHED_BATCH; - } - - static inline int rt_policy(int policy) -@@ -237,6 +247,24 @@ static inline void update_avg(u64 *avg, u64 sample) - #define shr_bound(val, shift) \ - (val >> min_t(typeof(shift), shift, BITS_PER_TYPE(typeof(val)) - 1)) - -+/* -+ * cgroup weight knobs should use the common MIN, DFL and MAX values which are -+ * 1, 100 and 10000 respectively. While it loses a bit of range on both ends, it -+ * maps pretty well onto the shares value used by scheduler and the round-trip -+ * conversions preserve the original value over the entire range. -+ */ -+static inline unsigned long sched_weight_from_cgroup(unsigned long cgrp_weight) -+{ -+ return DIV_ROUND_CLOSEST_ULL(cgrp_weight * 1024, CGROUP_WEIGHT_DFL); -+} -+ -+static inline unsigned long sched_weight_to_cgroup(unsigned long weight) -+{ -+ return clamp_t(unsigned long, -+ DIV_ROUND_CLOSEST_ULL(weight * CGROUP_WEIGHT_DFL, 1024), -+ CGROUP_WEIGHT_MIN, CGROUP_WEIGHT_MAX); -+} -+ - /* - * !! For sched_setattr_nocheck() (kernel) only !! 
- * -@@ -420,6 +448,11 @@ struct task_group { - struct rt_bandwidth rt_bandwidth; - #endif - -+#ifdef CONFIG_EXT_GROUP_SCHED -+ u32 scx_flags; /* SCX_TG_* */ -+ u32 scx_weight; -+#endif -+ - struct rcu_head rcu; - struct list_head list; - -@@ -475,6 +508,11 @@ static inline int walk_tg_tree(tg_visitor down, tg_visitor up, void *data) - return walk_tg_tree_from(&root_task_group, down, up, data); - } - -+static inline struct task_group *css_tg(struct cgroup_subsys_state *css) -+{ -+ return css ? container_of(css, struct task_group, css) : NULL; -+} -+ - extern int tg_nop(struct task_group *tg, void *data); - - #ifdef CONFIG_FAIR_GROUP_SCHED -@@ -531,6 +569,11 @@ extern void set_task_rq_fair(struct sched_entity *se, - static inline void set_task_rq_fair(struct sched_entity *se, - struct cfs_rq *prev, struct cfs_rq *next) { } - #endif /* CONFIG_SMP */ -+#else /* CONFIG_FAIR_GROUP_SCHED */ -+static inline int sched_group_set_shares(struct task_group *tg, unsigned long shares) -+{ -+ return 0; -+} - #endif /* CONFIG_FAIR_GROUP_SCHED */ - - #else /* CONFIG_CGROUP_SCHED */ -@@ -691,6 +734,37 @@ struct cfs_rq { - #endif /* CONFIG_FAIR_GROUP_SCHED */ - }; - -+#ifdef CONFIG_SCHED_CLASS_EXT -+/* scx_rq->flags, protected by the rq lock */ -+enum scx_rq_flags { -+ /* -+ * A hotplugged CPU starts scheduling before rq_online_scx(). Track -+ * ops.cpu_on/offline() state so that ops.enqueue/dispatch() are called -+ * only while the BPF scheduler considers the CPU to be online. -+ */ -+ SCX_RQ_ONLINE = 1 << 0, -+ SCX_RQ_BALANCING = 1 << 1, -+ SCX_RQ_CAN_STOP_TICK = 1 << 2, -+}; -+ -+struct scx_rq { -+ struct scx_dispatch_q local_dsq; -+ struct list_head runnable_list; /* runnable tasks on this rq */ -+ unsigned long ops_qseq; -+ u64 extra_enq_flags; /* see move_task_to_local_dsq() */ -+ u32 nr_running; -+ u32 flags; -+ u32 cpuperf_target; /* [0, SCHED_CAPACITY_SCALE] */ -+ bool cpu_released; -+ cpumask_var_t cpus_to_kick; -+ cpumask_var_t cpus_to_kick_if_idle; -+ cpumask_var_t cpus_to_preempt; -+ cpumask_var_t cpus_to_wait; -+ unsigned long pnt_seq; -+ struct irq_work kick_cpus_irq_work; -+}; -+#endif /* CONFIG_SCHED_CLASS_EXT */ -+ - static inline int rt_bandwidth_enabled(void) - { - return sysctl_sched_rt_runtime >= 0; -@@ -1036,6 +1110,9 @@ struct rq { - struct cfs_rq cfs; - struct rt_rq rt; - struct dl_rq dl; -+#ifdef CONFIG_SCHED_CLASS_EXT -+ struct scx_rq scx; -+#endif - - #ifdef CONFIG_FAIR_GROUP_SCHED - /* list of leaf cfs_rq on this CPU: */ -@@ -2304,8 +2381,11 @@ struct sched_class { - * cannot assume the switched_from/switched_to pair is serialized by - * rq->lock. They are however serialized by p->pi_lock. 
- */ -+ void (*switching_to) (struct rq *this_rq, struct task_struct *task); - void (*switched_from)(struct rq *this_rq, struct task_struct *task); - void (*switched_to) (struct rq *this_rq, struct task_struct *task); -+ void (*reweight_task)(struct rq *this_rq, struct task_struct *task, -+ int newprio); - void (*prio_changed) (struct rq *this_rq, struct task_struct *task, - int oldprio); - -@@ -2463,7 +2543,7 @@ extern void init_sched_dl_class(void); - extern void init_sched_rt_class(void); - extern void init_sched_fair_class(void); - --extern void reweight_task(struct task_struct *p, int prio); -+extern void __setscheduler_prio(struct task_struct *p, int prio); - - extern void resched_curr(struct rq *rq); - extern void resched_cpu(int cpu); -@@ -2541,6 +2621,12 @@ static inline void sub_nr_running(struct rq *rq, unsigned count) - extern void activate_task(struct rq *rq, struct task_struct *p, int flags); - extern void deactivate_task(struct rq *rq, struct task_struct *p, int flags); - -+extern void check_class_changing(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class); -+extern void check_class_changed(struct rq *rq, struct task_struct *p, -+ const struct sched_class *prev_class, -+ int oldprio); -+ - extern void wakeup_preempt(struct rq *rq, struct task_struct *p, int flags); - - #if defined(CONFIG_PREEMPT_RT) || defined(CONFIG_CACHY) -@@ -3006,6 +3092,7 @@ static inline void cpufreq_update_util(struct rq *rq, unsigned int flags) {} - #endif - - #ifdef CONFIG_SMP -+bool update_other_load_avgs(struct rq *rq); - unsigned long effective_cpu_util(int cpu, unsigned long util_cfs, - unsigned long *min, - unsigned long *max); -@@ -3048,6 +3135,8 @@ static inline unsigned long cpu_util_rt(struct rq *rq) - { - return READ_ONCE(rq->avg_rt.util_avg); - } -+#else -+static inline bool update_other_load_avgs(struct rq *rq) { return false; } - #endif - - #ifdef CONFIG_UCLAMP_TASK -@@ -3480,4 +3569,27 @@ static inline void init_sched_mm_cid(struct task_struct *t) { } - extern u64 avg_vruntime(struct cfs_rq *cfs_rq); - extern int entity_eligible(struct cfs_rq *cfs_rq, struct sched_entity *se); - -+#ifdef CONFIG_CGROUP_SCHED -+enum cpu_cftype_id { -+#if defined(CONFIG_FAIR_GROUP_SCHED) || defined(CONFIG_EXT_GROUP_SCHED) -+ CPU_CFTYPE_WEIGHT, -+ CPU_CFTYPE_WEIGHT_NICE, -+ CPU_CFTYPE_IDLE, -+#endif -+#ifdef CONFIG_CFS_BANDWIDTH -+ CPU_CFTYPE_MAX, -+ CPU_CFTYPE_MAX_BURST, -+#endif -+#ifdef CONFIG_UCLAMP_TASK_GROUP -+ CPU_CFTYPE_UCLAMP_MIN, -+ CPU_CFTYPE_UCLAMP_MAX, -+#endif -+ CPU_CFTYPE_CNT, -+}; -+ -+extern struct cftype cpu_cftypes[CPU_CFTYPE_CNT + 1]; -+#endif /* CONFIG_CGROUP_SCHED */ -+ -+#include "ext.h" -+ - #endif /* _KERNEL_SCHED_SCHED_H */ -diff --git a/lib/dump_stack.c b/lib/dump_stack.c -index 222c6d6c8281..9581ef4efec5 100644 ---- a/lib/dump_stack.c -+++ b/lib/dump_stack.c -@@ -68,6 +68,7 @@ void dump_stack_print_info(const char *log_lvl) - - print_worker_info(log_lvl, current); - print_stop_info(log_lvl, current); -+ print_scx_info(log_lvl, current); - } - - /** -diff --git a/lib/test_bpf.c b/lib/test_bpf.c -index 207ff87194db..ce5716c3999a 100644 ---- a/lib/test_bpf.c -+++ b/lib/test_bpf.c -@@ -15706,4 +15706,5 @@ static void __exit test_bpf_exit(void) - module_init(test_bpf_init); - module_exit(test_bpf_exit); - -+MODULE_DESCRIPTION("Testsuite for BPF interpreter and BPF JIT compiler"); - MODULE_LICENSE("GPL"); -diff --git a/net/bpf/bpf_dummy_struct_ops.c b/net/bpf/bpf_dummy_struct_ops.c -index 891cdf61c65a..3ea52b05adfb 100644 ---- 
a/net/bpf/bpf_dummy_struct_ops.c -+++ b/net/bpf/bpf_dummy_struct_ops.c -@@ -272,12 +272,12 @@ static int bpf_dummy_init_member(const struct btf_type *t, - return -EOPNOTSUPP; - } - --static int bpf_dummy_reg(void *kdata) -+static int bpf_dummy_reg(void *kdata, struct bpf_link *link) - { - return -EOPNOTSUPP; - } - --static void bpf_dummy_unreg(void *kdata) -+static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) - { - } - -diff --git a/net/bridge/netfilter/nf_conntrack_bridge.c b/net/bridge/netfilter/nf_conntrack_bridge.c -index c3c51b9a6826..816bb0fde718 100644 ---- a/net/bridge/netfilter/nf_conntrack_bridge.c -+++ b/net/bridge/netfilter/nf_conntrack_bridge.c -@@ -32,7 +32,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, - struct sk_buff *)) - { - int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - unsigned int hlen, ll_rs, mtu; - ktime_t tstamp = skb->tstamp; - struct ip_frag_state state; -@@ -82,7 +82,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, - if (iter.frag) - ip_fraglist_prepare(skb, &iter); - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, data, skb); - if (err || !iter.frag) - break; -@@ -113,7 +113,7 @@ static int nf_br_ip_fragment(struct net *net, struct sock *sk, - goto blackhole; - } - -- skb_set_delivery_time(skb2, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb2, tstamp, tstamp_type); - err = output(net, sk, data, skb2); - if (err) - goto blackhole; -diff --git a/net/core/dev.c b/net/core/dev.c -index e1bb6d7856d9..85fe8138f3e4 100644 ---- a/net/core/dev.c -+++ b/net/core/dev.c -@@ -2160,7 +2160,7 @@ EXPORT_SYMBOL(net_disable_timestamp); - static inline void net_timestamp_set(struct sk_buff *skb) - { - skb->tstamp = 0; -- skb->mono_delivery_time = 0; -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - if (static_branch_unlikely(&netstamp_needed_key)) - skb->tstamp = ktime_get_real(); - } -diff --git a/net/core/filter.c b/net/core/filter.c -index 2510464692af..7c46ecba3b01 100644 ---- a/net/core/filter.c -+++ b/net/core/filter.c -@@ -2274,12 +2274,12 @@ static int __bpf_redirect_neigh_v6(struct sk_buff *skb, struct net_device *dev, - - err = bpf_out_neigh_v6(net, skb, dev, nh); - if (unlikely(net_xmit_eval(err))) -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - else - ret = NET_XMIT_SUCCESS; - goto out_xmit; - out_drop: -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); - out_xmit: - return ret; -@@ -2380,12 +2380,12 @@ static int __bpf_redirect_neigh_v4(struct sk_buff *skb, struct net_device *dev, - - err = bpf_out_neigh_v4(net, skb, dev, nh); - if (unlikely(net_xmit_eval(err))) -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - else - ret = NET_XMIT_SUCCESS; - goto out_xmit; - out_drop: -- dev->stats.tx_errors++; -+ DEV_STATS_INC(dev, tx_errors); - kfree_skb(skb); - out_xmit: - return ret; -@@ -7726,17 +7726,21 @@ BPF_CALL_3(bpf_skb_set_tstamp, struct sk_buff *, skb, - return -EOPNOTSUPP; - - switch (tstamp_type) { -- case BPF_SKB_TSTAMP_DELIVERY_MONO: -+ case BPF_SKB_CLOCK_REALTIME: -+ skb->tstamp = tstamp; -+ skb->tstamp_type = SKB_CLOCK_REALTIME; -+ break; -+ case BPF_SKB_CLOCK_MONOTONIC: - if (!tstamp) - return -EINVAL; - skb->tstamp = tstamp; -- skb->mono_delivery_time = 1; -+ skb->tstamp_type = SKB_CLOCK_MONOTONIC; - break; -- case BPF_SKB_TSTAMP_UNSPEC: -- if (tstamp) -+ case 
BPF_SKB_CLOCK_TAI: -+ if (!tstamp) - return -EINVAL; -- skb->tstamp = 0; -- skb->mono_delivery_time = 0; -+ skb->tstamp = tstamp; -+ skb->tstamp_type = SKB_CLOCK_TAI; - break; - default: - return -EINVAL; -@@ -9387,16 +9391,17 @@ static struct bpf_insn *bpf_convert_tstamp_type_read(const struct bpf_insn *si, - { - __u8 value_reg = si->dst_reg; - __u8 skb_reg = si->src_reg; -- /* AX is needed because src_reg and dst_reg could be the same */ -- __u8 tmp_reg = BPF_REG_AX; -- -- *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, -- SKB_BF_MONO_TC_OFFSET); -- *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, -- SKB_MONO_DELIVERY_TIME_MASK, 2); -- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_UNSPEC); -- *insn++ = BPF_JMP_A(1); -- *insn++ = BPF_MOV32_IMM(value_reg, BPF_SKB_TSTAMP_DELIVERY_MONO); -+ BUILD_BUG_ON(__SKB_CLOCK_MAX != (int)BPF_SKB_CLOCK_TAI); -+ BUILD_BUG_ON(SKB_CLOCK_REALTIME != (int)BPF_SKB_CLOCK_REALTIME); -+ BUILD_BUG_ON(SKB_CLOCK_MONOTONIC != (int)BPF_SKB_CLOCK_MONOTONIC); -+ BUILD_BUG_ON(SKB_CLOCK_TAI != (int)BPF_SKB_CLOCK_TAI); -+ *insn++ = BPF_LDX_MEM(BPF_B, value_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); -+ *insn++ = BPF_ALU32_IMM(BPF_AND, value_reg, SKB_TSTAMP_TYPE_MASK); -+#ifdef __BIG_ENDIAN_BITFIELD -+ *insn++ = BPF_ALU32_IMM(BPF_RSH, value_reg, SKB_TSTAMP_TYPE_RSHIFT); -+#else -+ BUILD_BUG_ON(!(SKB_TSTAMP_TYPE_MASK & 0x1)); -+#endif - - return insn; - } -@@ -9439,11 +9444,12 @@ static struct bpf_insn *bpf_convert_tstamp_read(const struct bpf_prog *prog, - __u8 tmp_reg = BPF_REG_AX; - - *insn++ = BPF_LDX_MEM(BPF_B, tmp_reg, skb_reg, SKB_BF_MONO_TC_OFFSET); -- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, -- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK); -- *insn++ = BPF_JMP32_IMM(BPF_JNE, tmp_reg, -- TC_AT_INGRESS_MASK | SKB_MONO_DELIVERY_TIME_MASK, 2); -- /* skb->tc_at_ingress && skb->mono_delivery_time, -+ /* check if ingress mask bits is set */ -+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); -+ *insn++ = BPF_JMP_A(4); -+ *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, SKB_TSTAMP_TYPE_MASK, 1); -+ *insn++ = BPF_JMP_A(2); -+ /* skb->tc_at_ingress && skb->tstamp_type, - * read 0 as the (rcv) timestamp. - */ - *insn++ = BPF_MOV64_IMM(value_reg, 0); -@@ -9468,7 +9474,7 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, - * the bpf prog is aware the tstamp could have delivery time. - * Thus, write skb->tstamp as is if tstamp_type_access is true. - * Otherwise, writing at ingress will have to clear the -- * mono_delivery_time bit also. -+ * skb->tstamp_type bit also. 
- */ - if (!prog->tstamp_type_access) { - __u8 tmp_reg = BPF_REG_AX; -@@ -9478,8 +9484,8 @@ static struct bpf_insn *bpf_convert_tstamp_write(const struct bpf_prog *prog, - *insn++ = BPF_JMP32_IMM(BPF_JSET, tmp_reg, TC_AT_INGRESS_MASK, 1); - /* goto */ - *insn++ = BPF_JMP_A(2); -- /* : mono_delivery_time */ -- *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_MONO_DELIVERY_TIME_MASK); -+ /* : skb->tstamp_type */ -+ *insn++ = BPF_ALU32_IMM(BPF_AND, tmp_reg, ~SKB_TSTAMP_TYPE_MASK); - *insn++ = BPF_STX_MEM(BPF_B, skb_reg, tmp_reg, SKB_BF_MONO_TC_OFFSET); - } - #endif -diff --git a/net/core/sock.c b/net/core/sock.c -index 8629f9aecf91..521e6373d4f7 100644 ---- a/net/core/sock.c -+++ b/net/core/sock.c -@@ -2262,7 +2262,12 @@ static void sk_init_common(struct sock *sk) - lockdep_set_class_and_name(&sk->sk_error_queue.lock, - af_elock_keys + sk->sk_family, - af_family_elock_key_strings[sk->sk_family]); -- lockdep_set_class_and_name(&sk->sk_callback_lock, -+ if (sk->sk_kern_sock) -+ lockdep_set_class_and_name(&sk->sk_callback_lock, -+ af_kern_callback_keys + sk->sk_family, -+ af_family_kern_clock_key_strings[sk->sk_family]); -+ else -+ lockdep_set_class_and_name(&sk->sk_callback_lock, - af_callback_keys + sk->sk_family, - af_family_clock_key_strings[sk->sk_family]); - } -@@ -3460,18 +3465,6 @@ void sock_init_data_uid(struct socket *sock, struct sock *sk, kuid_t uid) - } - sk->sk_uid = uid; - -- rwlock_init(&sk->sk_callback_lock); -- if (sk->sk_kern_sock) -- lockdep_set_class_and_name( -- &sk->sk_callback_lock, -- af_kern_callback_keys + sk->sk_family, -- af_family_kern_clock_key_strings[sk->sk_family]); -- else -- lockdep_set_class_and_name( -- &sk->sk_callback_lock, -- af_callback_keys + sk->sk_family, -- af_family_clock_key_strings[sk->sk_family]); -- - sk->sk_state_change = sock_def_wakeup; - sk->sk_data_ready = sock_def_readable; - sk->sk_write_space = sock_def_write_space; -diff --git a/net/ieee802154/6lowpan/reassembly.c b/net/ieee802154/6lowpan/reassembly.c -index 56ef873828f4..867d637d86f0 100644 ---- a/net/ieee802154/6lowpan/reassembly.c -+++ b/net/ieee802154/6lowpan/reassembly.c -@@ -130,7 +130,7 @@ static int lowpan_frag_queue(struct lowpan_frag_queue *fq, - goto err; - - fq->q.stamp = skb->tstamp; -- fq->q.mono_delivery_time = skb->mono_delivery_time; -+ fq->q.tstamp_type = skb->tstamp_type; - if (frag_type == LOWPAN_DISPATCH_FRAG1) - fq->q.flags |= INET_FRAG_FIRST_IN; - -diff --git a/net/ipv4/bpf_tcp_ca.c b/net/ipv4/bpf_tcp_ca.c -index f180befc28bd..4273cac333f6 100644 ---- a/net/ipv4/bpf_tcp_ca.c -+++ b/net/ipv4/bpf_tcp_ca.c -@@ -260,17 +260,17 @@ static int bpf_tcp_ca_check_member(const struct btf_type *t, - return 0; - } - --static int bpf_tcp_ca_reg(void *kdata) -+static int bpf_tcp_ca_reg(void *kdata, struct bpf_link *link) - { - return tcp_register_congestion_control(kdata); - } - --static void bpf_tcp_ca_unreg(void *kdata) -+static void bpf_tcp_ca_unreg(void *kdata, struct bpf_link *link) - { - tcp_unregister_congestion_control(kdata); - } - --static int bpf_tcp_ca_update(void *kdata, void *old_kdata) -+static int bpf_tcp_ca_update(void *kdata, void *old_kdata, struct bpf_link *link) - { - return tcp_update_congestion_control(kdata, old_kdata); - } -diff --git a/net/ipv4/inet_fragment.c b/net/ipv4/inet_fragment.c -index faaec92a46ac..d179a2c84222 100644 ---- a/net/ipv4/inet_fragment.c -+++ b/net/ipv4/inet_fragment.c -@@ -619,7 +619,7 @@ void inet_frag_reasm_finish(struct inet_frag_queue *q, struct sk_buff *head, - skb_mark_not_on_list(head); - head->prev = NULL; - head->tstamp = 
q->stamp; -- head->mono_delivery_time = q->mono_delivery_time; -+ head->tstamp_type = q->tstamp_type; - - if (sk) - refcount_add(sum_truesize - head_truesize, &sk->sk_wmem_alloc); -diff --git a/net/ipv4/ip_fragment.c b/net/ipv4/ip_fragment.c -index 08e2c92e25ab..a92664a5ef2e 100644 ---- a/net/ipv4/ip_fragment.c -+++ b/net/ipv4/ip_fragment.c -@@ -355,7 +355,7 @@ static int ip_frag_queue(struct ipq *qp, struct sk_buff *skb) - qp->iif = dev->ifindex; - - qp->q.stamp = skb->tstamp; -- qp->q.mono_delivery_time = skb->mono_delivery_time; -+ qp->q.tstamp_type = skb->tstamp_type; - qp->q.meat += skb->len; - qp->ecn |= ecn; - add_frag_mem_limit(qp->q.fqdir, skb->truesize); -diff --git a/net/ipv4/ip_output.c b/net/ipv4/ip_output.c -index 9500031a1f55..b90d0f78ac80 100644 ---- a/net/ipv4/ip_output.c -+++ b/net/ipv4/ip_output.c -@@ -764,7 +764,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - { - struct iphdr *iph; - struct sk_buff *skb2; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - struct rtable *rt = skb_rtable(skb); - unsigned int mtu, hlen, ll_rs; - struct ip_fraglist_iter iter; -@@ -856,7 +856,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - } - } - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, skb); - - if (!err) -@@ -912,7 +912,7 @@ int ip_do_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - /* - * Put this fragment into the sending queue. - */ -- skb_set_delivery_time(skb2, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb2, tstamp, tstamp_type); - err = output(net, sk, skb2); - if (err) - goto fail; -@@ -1457,7 +1457,10 @@ struct sk_buff *__ip_make_skb(struct sock *sk, - - skb->priority = (cork->tos != -1) ? 
cork->priority: READ_ONCE(sk->sk_priority); - skb->mark = cork->mark; -- skb->tstamp = cork->transmit_time; -+ if (sk_is_tcp(sk)) -+ skb_set_delivery_time(skb, cork->transmit_time, SKB_CLOCK_MONOTONIC); -+ else -+ skb_set_delivery_type_by_clockid(skb, cork->transmit_time, sk->sk_clockid); - /* - * Steal rt from cork.dst to avoid a pair of atomic_inc/atomic_dec - * on dst refcount -@@ -1649,7 +1652,8 @@ void ip_send_unicast_reply(struct sock *sk, struct sk_buff *skb, - arg->csumoffset) = csum_fold(csum_add(nskb->csum, - arg->csum)); - nskb->ip_summed = CHECKSUM_NONE; -- nskb->mono_delivery_time = !!transmit_time; -+ if (transmit_time) -+ nskb->tstamp_type = SKB_CLOCK_MONOTONIC; - if (txhash) - skb_set_hash(nskb, txhash, PKT_HASH_TYPE_L4); - ip_push_pending_frames(sk, &fl4); -diff --git a/net/ipv4/raw.c b/net/ipv4/raw.c -index 4cb43401e0e0..1a0953650356 100644 ---- a/net/ipv4/raw.c -+++ b/net/ipv4/raw.c -@@ -360,7 +360,7 @@ static int raw_send_hdrinc(struct sock *sk, struct flowi4 *fl4, - skb->protocol = htons(ETH_P_IP); - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = sockc->mark; -- skb->tstamp = sockc->transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); - skb_dst_set(skb, &rt->dst); - *rtp = NULL; - -diff --git a/net/ipv4/tcp_ipv4.c b/net/ipv4/tcp_ipv4.c -index b710958393e6..8e891e56c5e0 100644 ---- a/net/ipv4/tcp_ipv4.c -+++ b/net/ipv4/tcp_ipv4.c -@@ -3620,6 +3620,8 @@ void __init tcp_v4_init(void) - */ - inet_sk(sk)->pmtudisc = IP_PMTUDISC_DO; - -+ sk->sk_clockid = CLOCK_MONOTONIC; -+ - per_cpu(ipv4_tcp_sk, cpu) = sk; - } - if (register_pernet_subsys(&tcp_sk_ops)) -diff --git a/net/ipv4/tcp_output.c b/net/ipv4/tcp_output.c -index 3f4bdd2b6476..f68fd3fd1f9f 100644 ---- a/net/ipv4/tcp_output.c -+++ b/net/ipv4/tcp_output.c -@@ -1304,7 +1304,7 @@ static int __tcp_transmit_skb(struct sock *sk, struct sk_buff *skb, - tp = tcp_sk(sk); - prior_wstamp = tp->tcp_wstamp_ns; - tp->tcp_wstamp_ns = max(tp->tcp_wstamp_ns, tp->tcp_clock_cache); -- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); -+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); - if (clone_it) { - oskb = skb; - -@@ -1658,7 +1658,7 @@ int tcp_fragment(struct sock *sk, enum tcp_queue tcp_queue, - - skb_split(skb, buff, len); - -- skb_set_delivery_time(buff, skb->tstamp, true); -+ skb_set_delivery_time(buff, skb->tstamp, SKB_CLOCK_MONOTONIC); - tcp_fragment_tstamp(skb, buff); - - old_factor = tcp_skb_pcount(skb); -@@ -2790,7 +2790,7 @@ static bool tcp_write_xmit(struct sock *sk, unsigned int mss_now, int nonagle, - if (unlikely(tp->repair) && tp->repair_queue == TCP_SEND_QUEUE) { - /* "skb_mstamp_ns" is used as a start point for the retransmit timer */ - tp->tcp_wstamp_ns = tp->tcp_clock_cache; -- skb_set_delivery_time(skb, tp->tcp_wstamp_ns, true); -+ skb_set_delivery_time(skb, tp->tcp_wstamp_ns, SKB_CLOCK_MONOTONIC); - list_move_tail(&skb->tcp_tsorted_anchor, &tp->tsorted_sent_queue); - tcp_init_tso_segs(skb, mss_now); - tcp_set_tx_in_flight(sk, skb); -@@ -3780,11 +3780,11 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, - #ifdef CONFIG_SYN_COOKIES - if (unlikely(synack_type == TCP_SYNACK_COOKIE && ireq->tstamp_ok)) - skb_set_delivery_time(skb, cookie_init_timestamp(req, now), -- true); -+ SKB_CLOCK_MONOTONIC); - else - #endif - { -- skb_set_delivery_time(skb, now, true); -+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); - if (!tcp_rsk(req)->snt_synack) /* Timestamp first SYNACK */ - tcp_rsk(req)->snt_synack = 
tcp_skb_timestamp_us(skb); - } -@@ -3871,7 +3871,7 @@ struct sk_buff *tcp_make_synack(const struct sock *sk, struct dst_entry *dst, - bpf_skops_write_hdr_opt((struct sock *)sk, skb, req, syn_skb, - synack_type, &opts); - -- skb_set_delivery_time(skb, now, true); -+ skb_set_delivery_time(skb, now, SKB_CLOCK_MONOTONIC); - tcp_add_tx_delay(skb, tp); - - return skb; -@@ -4055,7 +4055,7 @@ static int tcp_send_syn_data(struct sock *sk, struct sk_buff *syn) - - err = tcp_transmit_skb(sk, syn_data, 1, sk->sk_allocation); - -- skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, true); -+ skb_set_delivery_time(syn, syn_data->skb_mstamp_ns, SKB_CLOCK_MONOTONIC); - - /* Now full SYN+DATA was cloned and sent (or not), - * remove the SYN from the original skb (syn_data) -diff --git a/net/ipv6/ip6_output.c b/net/ipv6/ip6_output.c -index 27d8725445e3..e7a19df3125e 100644 ---- a/net/ipv6/ip6_output.c -+++ b/net/ipv6/ip6_output.c -@@ -859,7 +859,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct rt6_info *rt = dst_rt6_info(skb_dst(skb)); - struct ipv6_pinfo *np = skb->sk && !dev_recursion_level() ? - inet6_sk(skb->sk) : NULL; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - struct ip6_frag_state state; - unsigned int mtu, hlen, nexthdr_offset; - ktime_t tstamp = skb->tstamp; -@@ -955,7 +955,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - if (iter.frag) - ip6_fraglist_prepare(skb, &iter); - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, skb); - if (!err) - IP6_INC_STATS(net, ip6_dst_idev(&rt->dst), -@@ -1016,7 +1016,7 @@ int ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - /* - * Put this fragment into the sending queue. 
- */ -- skb_set_delivery_time(frag, tstamp, mono_delivery_time); -+ skb_set_delivery_time(frag, tstamp, tstamp_type); - err = output(net, sk, frag); - if (err) - goto fail; -@@ -1924,7 +1924,10 @@ struct sk_buff *__ip6_make_skb(struct sock *sk, - - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = cork->base.mark; -- skb->tstamp = cork->base.transmit_time; -+ if (sk_is_tcp(sk)) -+ skb_set_delivery_time(skb, cork->base.transmit_time, SKB_CLOCK_MONOTONIC); -+ else -+ skb_set_delivery_type_by_clockid(skb, cork->base.transmit_time, sk->sk_clockid); - - ip6_cork_steal_dst(skb, cork); - IP6_INC_STATS(net, rt->rt6i_idev, IPSTATS_MIB_OUTREQUESTS); -diff --git a/net/ipv6/netfilter.c b/net/ipv6/netfilter.c -index 53d255838e6a..e0c2347b4dc6 100644 ---- a/net/ipv6/netfilter.c -+++ b/net/ipv6/netfilter.c -@@ -126,7 +126,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - struct sk_buff *)) - { - int frag_max_size = BR_INPUT_SKB_CB(skb)->frag_max_size; -- bool mono_delivery_time = skb->mono_delivery_time; -+ u8 tstamp_type = skb->tstamp_type; - ktime_t tstamp = skb->tstamp; - struct ip6_frag_state state; - u8 *prevhdr, nexthdr = 0; -@@ -192,7 +192,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - if (iter.frag) - ip6_fraglist_prepare(skb, &iter); - -- skb_set_delivery_time(skb, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb, tstamp, tstamp_type); - err = output(net, sk, data, skb); - if (err || !iter.frag) - break; -@@ -225,7 +225,7 @@ int br_ip6_fragment(struct net *net, struct sock *sk, struct sk_buff *skb, - goto blackhole; - } - -- skb_set_delivery_time(skb2, tstamp, mono_delivery_time); -+ skb_set_delivery_time(skb2, tstamp, tstamp_type); - err = output(net, sk, data, skb2); - if (err) - goto blackhole; -diff --git a/net/ipv6/netfilter/nf_conntrack_reasm.c b/net/ipv6/netfilter/nf_conntrack_reasm.c -index 5e1b50c6a44d..6f0844c9315d 100644 ---- a/net/ipv6/netfilter/nf_conntrack_reasm.c -+++ b/net/ipv6/netfilter/nf_conntrack_reasm.c -@@ -263,7 +263,7 @@ static int nf_ct_frag6_queue(struct frag_queue *fq, struct sk_buff *skb, - fq->iif = dev->ifindex; - - fq->q.stamp = skb->tstamp; -- fq->q.mono_delivery_time = skb->mono_delivery_time; -+ fq->q.tstamp_type = skb->tstamp_type; - fq->q.meat += skb->len; - fq->ecn |= ecn; - if (payload_len > fq->q.max_size) -diff --git a/net/ipv6/raw.c b/net/ipv6/raw.c -index 2eedf255600b..f838366e8256 100644 ---- a/net/ipv6/raw.c -+++ b/net/ipv6/raw.c -@@ -621,7 +621,7 @@ static int rawv6_send_hdrinc(struct sock *sk, struct msghdr *msg, int length, - skb->protocol = htons(ETH_P_IPV6); - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = sockc->mark; -- skb->tstamp = sockc->transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, sk->sk_clockid); - - skb_put(skb, length); - skb_reset_network_header(skb); -diff --git a/net/ipv6/reassembly.c b/net/ipv6/reassembly.c -index 327caca64257..a48be617a8ab 100644 ---- a/net/ipv6/reassembly.c -+++ b/net/ipv6/reassembly.c -@@ -198,7 +198,7 @@ static int ip6_frag_queue(struct frag_queue *fq, struct sk_buff *skb, - fq->iif = dev->ifindex; - - fq->q.stamp = skb->tstamp; -- fq->q.mono_delivery_time = skb->mono_delivery_time; -+ fq->q.tstamp_type = skb->tstamp_type; - fq->q.meat += skb->len; - fq->ecn |= ecn; - add_frag_mem_limit(fq->q.fqdir, skb->truesize); -diff --git a/net/ipv6/tcp_ipv6.c b/net/ipv6/tcp_ipv6.c -index 8c577b651bfc..a8fd473a61ee 100644 ---- a/net/ipv6/tcp_ipv6.c -+++ b/net/ipv6/tcp_ipv6.c -@@ -975,7 +975,7 @@ 
static void tcp_v6_send_response(const struct sock *sk, struct sk_buff *skb, u32 - mark = inet_twsk(sk)->tw_mark; - else - mark = READ_ONCE(sk->sk_mark); -- skb_set_delivery_time(buff, tcp_transmit_time(sk), true); -+ skb_set_delivery_time(buff, tcp_transmit_time(sk), SKB_CLOCK_MONOTONIC); - } - if (txhash) { - /* autoflowlabel/skb_get_hash_flowi6 rely on buff->hash */ -@@ -2382,8 +2382,14 @@ static struct inet_protosw tcpv6_protosw = { - - static int __net_init tcpv6_net_init(struct net *net) - { -- return inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, -- SOCK_RAW, IPPROTO_TCP, net); -+ int res; -+ -+ res = inet_ctl_sock_create(&net->ipv6.tcp_sk, PF_INET6, -+ SOCK_RAW, IPPROTO_TCP, net); -+ if (!res) -+ net->ipv6.tcp_sk->sk_clockid = CLOCK_MONOTONIC; -+ -+ return res; - } - - static void __net_exit tcpv6_net_exit(struct net *net) -diff --git a/net/netfilter/nf_conntrack_bpf.c b/net/netfilter/nf_conntrack_bpf.c -index d2492d050fe6..4a136fc3a9c0 100644 ---- a/net/netfilter/nf_conntrack_bpf.c -+++ b/net/netfilter/nf_conntrack_bpf.c -@@ -32,7 +32,9 @@ - * -EINVAL - Passed NULL for bpf_tuple pointer - * -EINVAL - opts->reserved is not 0 - * -EINVAL - netns_id is less than -1 -- * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (12) -+ * -EINVAL - opts__sz isn't NF_BPF_CT_OPTS_SZ (16) or 12 -+ * -EINVAL - opts->ct_zone_id set when -+ opts__sz isn't NF_BPF_CT_OPTS_SZ (16) - * -EPROTO - l4proto isn't one of IPPROTO_TCP or IPPROTO_UDP - * -ENONET - No network namespace found for netns_id - * -ENOENT - Conntrack lookup could not find entry for tuple -@@ -42,6 +44,8 @@ - * Values: - * IPPROTO_TCP, IPPROTO_UDP - * @dir: - connection tracking tuple direction. -+ * @ct_zone_id - connection tracking zone id. -+ * @ct_zone_dir - connection tracking zone direction. - * @reserved - Reserved member, will be reused for more options in future - * Values: - * 0 -@@ -51,11 +55,13 @@ struct bpf_ct_opts { - s32 error; - u8 l4proto; - u8 dir; -- u8 reserved[2]; -+ u16 ct_zone_id; -+ u8 ct_zone_dir; -+ u8 reserved[3]; - }; - - enum { -- NF_BPF_CT_OPTS_SZ = 12, -+ NF_BPF_CT_OPTS_SZ = 16, - }; - - static int bpf_nf_ct_tuple_parse(struct bpf_sock_tuple *bpf_tuple, -@@ -104,12 +110,21 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, - u32 timeout) - { - struct nf_conntrack_tuple otuple, rtuple; -+ struct nf_conntrack_zone ct_zone; - struct nf_conn *ct; - int err; - -- if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || -- opts_len != NF_BPF_CT_OPTS_SZ) -+ if (!opts || !bpf_tuple) - return ERR_PTR(-EINVAL); -+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) -+ return ERR_PTR(-EINVAL); -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) -+ return ERR_PTR(-EINVAL); -+ } else { -+ if (opts->ct_zone_id) -+ return ERR_PTR(-EINVAL); -+ } - - if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) - return ERR_PTR(-EINVAL); -@@ -130,7 +145,16 @@ __bpf_nf_ct_alloc_entry(struct net *net, struct bpf_sock_tuple *bpf_tuple, - return ERR_PTR(-ENONET); - } - -- ct = nf_conntrack_alloc(net, &nf_ct_zone_dflt, &otuple, &rtuple, -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->ct_zone_dir == 0) -+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; -+ nf_ct_zone_init(&ct_zone, -+ opts->ct_zone_id, opts->ct_zone_dir, 0); -+ } else { -+ ct_zone = nf_ct_zone_dflt; -+ } -+ -+ ct = nf_conntrack_alloc(net, &ct_zone, &otuple, &rtuple, - GFP_ATOMIC); - if (IS_ERR(ct)) - goto out; -@@ -152,12 +176,21 @@ static struct nf_conn 
*__bpf_nf_ct_lookup(struct net *net, - { - struct nf_conntrack_tuple_hash *hash; - struct nf_conntrack_tuple tuple; -+ struct nf_conntrack_zone ct_zone; - struct nf_conn *ct; - int err; - -- if (!opts || !bpf_tuple || opts->reserved[0] || opts->reserved[1] || -- opts_len != NF_BPF_CT_OPTS_SZ) -+ if (!opts || !bpf_tuple) - return ERR_PTR(-EINVAL); -+ if (!(opts_len == NF_BPF_CT_OPTS_SZ || opts_len == 12)) -+ return ERR_PTR(-EINVAL); -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->reserved[0] || opts->reserved[1] || opts->reserved[2]) -+ return ERR_PTR(-EINVAL); -+ } else { -+ if (opts->ct_zone_id) -+ return ERR_PTR(-EINVAL); -+ } - if (unlikely(opts->l4proto != IPPROTO_TCP && opts->l4proto != IPPROTO_UDP)) - return ERR_PTR(-EPROTO); - if (unlikely(opts->netns_id < BPF_F_CURRENT_NETNS)) -@@ -174,7 +207,16 @@ static struct nf_conn *__bpf_nf_ct_lookup(struct net *net, - return ERR_PTR(-ENONET); - } - -- hash = nf_conntrack_find_get(net, &nf_ct_zone_dflt, &tuple); -+ if (opts_len == NF_BPF_CT_OPTS_SZ) { -+ if (opts->ct_zone_dir == 0) -+ opts->ct_zone_dir = NF_CT_DEFAULT_ZONE_DIR; -+ nf_ct_zone_init(&ct_zone, -+ opts->ct_zone_id, opts->ct_zone_dir, 0); -+ } else { -+ ct_zone = nf_ct_zone_dflt; -+ } -+ -+ hash = nf_conntrack_find_get(net, &ct_zone, &tuple); - if (opts->netns_id >= 0) - put_net(net); - if (!hash) -@@ -245,7 +287,7 @@ __bpf_kfunc_start_defs(); - * @opts - Additional options for allocation (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn___init * - bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, -@@ -279,7 +321,7 @@ bpf_xdp_ct_alloc(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, - * @opts - Additional options for lookup (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn * - bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, -@@ -312,7 +354,7 @@ bpf_xdp_ct_lookup(struct xdp_md *xdp_ctx, struct bpf_sock_tuple *bpf_tuple, - * @opts - Additional options for allocation (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn___init * - bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, -@@ -347,7 +389,7 @@ bpf_skb_ct_alloc(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, - * @opts - Additional options for lookup (documented above) - * Cannot be NULL - * @opts__sz - Length of the bpf_ct_opts structure -- * Must be NF_BPF_CT_OPTS_SZ (12) -+ * Must be NF_BPF_CT_OPTS_SZ (16) or 12 - */ - __bpf_kfunc struct nf_conn * - bpf_skb_ct_lookup(struct __sk_buff *skb_ctx, struct bpf_sock_tuple *bpf_tuple, -diff --git a/net/packet/af_packet.c b/net/packet/af_packet.c -index ea3ebc160e25..fce390887591 100644 ---- a/net/packet/af_packet.c -+++ b/net/packet/af_packet.c -@@ -2056,8 +2056,7 @@ static int packet_sendmsg_spkt(struct socket *sock, struct msghdr *msg, - skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = READ_ONCE(sk->sk_mark); -- skb->tstamp = sockc.transmit_time; -- -+ skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); - skb_setup_tx_timestamp(skb, sockc.tsflags); - - if (unlikely(extra_len == 4)) 
-@@ -2584,7 +2583,7 @@ static int tpacket_fill_skb(struct packet_sock *po, struct sk_buff *skb, - skb->dev = dev; - skb->priority = READ_ONCE(po->sk.sk_priority); - skb->mark = READ_ONCE(po->sk.sk_mark); -- skb->tstamp = sockc->transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc->transmit_time, po->sk.sk_clockid); - skb_setup_tx_timestamp(skb, sockc->tsflags); - skb_zcopy_set_nouarg(skb, ph.raw); - -@@ -3062,7 +3061,7 @@ static int packet_snd(struct socket *sock, struct msghdr *msg, size_t len) - skb->dev = dev; - skb->priority = READ_ONCE(sk->sk_priority); - skb->mark = sockc.mark; -- skb->tstamp = sockc.transmit_time; -+ skb_set_delivery_type_by_clockid(skb, sockc.transmit_time, sk->sk_clockid); - - if (unlikely(extra_len == 4)) - skb->no_fcs = 1; -diff --git a/net/sched/act_bpf.c b/net/sched/act_bpf.c -index 0e3cf11ae5fc..396b576390d0 100644 ---- a/net/sched/act_bpf.c -+++ b/net/sched/act_bpf.c -@@ -54,8 +54,8 @@ TC_INDIRECT_SCOPE int tcf_bpf_act(struct sk_buff *skb, - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(filter, skb); - } -- if (unlikely(!skb->tstamp && skb->mono_delivery_time)) -- skb->mono_delivery_time = 0; -+ if (unlikely(!skb->tstamp && skb->tstamp_type)) -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - if (skb_sk_is_prefetched(skb) && filter_res != TC_ACT_OK) - skb_orphan(skb); - -diff --git a/net/sched/cls_bpf.c b/net/sched/cls_bpf.c -index 5e83e890f6a4..1941ebec23ff 100644 ---- a/net/sched/cls_bpf.c -+++ b/net/sched/cls_bpf.c -@@ -104,8 +104,8 @@ TC_INDIRECT_SCOPE int cls_bpf_classify(struct sk_buff *skb, - bpf_compute_data_pointers(skb); - filter_res = bpf_prog_run(prog->filter, skb); - } -- if (unlikely(!skb->tstamp && skb->mono_delivery_time)) -- skb->mono_delivery_time = 0; -+ if (unlikely(!skb->tstamp && skb->tstamp_type)) -+ skb->tstamp_type = SKB_CLOCK_REALTIME; - - if (prog->exts_integrated) { - res->class = 0; -diff --git a/samples/bpf/cpustat_kern.c b/samples/bpf/cpustat_kern.c -index 944f13fe164a..7ec7143e2757 100644 ---- a/samples/bpf/cpustat_kern.c -+++ b/samples/bpf/cpustat_kern.c -@@ -211,7 +211,7 @@ int bpf_prog1(struct cpu_args *ctx) - SEC("tracepoint/power/cpu_frequency") - int bpf_prog2(struct cpu_args *ctx) - { -- u64 *pts, *cstate, *pstate, prev_state, cur_ts, delta; -+ u64 *pts, *cstate, *pstate, cur_ts, delta; - u32 key, cpu, pstate_idx; - u64 *val; - -@@ -232,7 +232,6 @@ int bpf_prog2(struct cpu_args *ctx) - if (!cstate) - return 0; - -- prev_state = *pstate; - *pstate = ctx->state; - - if (!*pts) { -diff --git a/scripts/Makefile.btf b/scripts/Makefile.btf -index 2d6e5ed9081e..bca8a8f26ea4 100644 ---- a/scripts/Makefile.btf -+++ b/scripts/Makefile.btf -@@ -14,9 +14,7 @@ pahole-flags-$(call test-ge, $(pahole-ver), 121) += --btf_gen_floats - - pahole-flags-$(call test-ge, $(pahole-ver), 122) += -j - --ifeq ($(pahole-ver), 125) --pahole-flags-y += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized --endif -+pahole-flags-$(call test-ge, $(pahole-ver), 125) += --skip_encoding_btf_inconsistent_proto --btf_gen_optimized - - else - -diff --git a/tools/Makefile b/tools/Makefile -index 276f5d0d53a4..278d24723b74 100644 ---- a/tools/Makefile -+++ b/tools/Makefile -@@ -28,6 +28,7 @@ help: - @echo ' pci - PCI tools' - @echo ' perf - Linux performance measurement and analysis tool' - @echo ' selftests - various kernel selftests' -+ @echo ' sched_ext - sched_ext example schedulers' - @echo ' bootconfig - boot config tool' - @echo ' spi - spi tools' - @echo ' tmon - thermal monitoring and tuning tool' -@@ -91,6 +92,9 @@ perf: FORCE - 
$(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= - -+sched_ext: FORCE -+ $(call descend,sched_ext) -+ - selftests: FORCE - $(call descend,testing/$@) - -@@ -184,6 +188,9 @@ perf_clean: - $(Q)mkdir -p $(PERF_O) . - $(Q)$(MAKE) --no-print-directory -C perf O=$(PERF_O) subdir= clean - -+sched_ext_clean: -+ $(call descend,sched_ext,clean) -+ - selftests_clean: - $(call descend,testing/$(@:_clean=),clean) - -@@ -213,6 +220,7 @@ clean: acpi_clean counter_clean cpupower_clean hv_clean firewire_clean \ - mm_clean bpf_clean iio_clean x86_energy_perf_policy_clean tmon_clean \ - freefall_clean build_clean libbpf_clean libsubcmd_clean \ - gpio_clean objtool_clean leds_clean wmi_clean pci_clean firmware_clean debugging_clean \ -- intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean -+ intel-speed-select_clean tracing_clean thermal_clean thermometer_clean thermal-engine_clean \ -+ sched_ext_clean - - .PHONY: FORCE -diff --git a/tools/bpf/bpftool/Documentation/bpftool-btf.rst b/tools/bpf/bpftool/Documentation/bpftool-btf.rst -index eaba24320fb2..3f6bca03ad2e 100644 ---- a/tools/bpf/bpftool/Documentation/bpftool-btf.rst -+++ b/tools/bpf/bpftool/Documentation/bpftool-btf.rst -@@ -28,7 +28,7 @@ BTF COMMANDS - | **bpftool** **btf help** - | - | *BTF_SRC* := { **id** *BTF_ID* | **prog** *PROG* | **map** *MAP* [{**key** | **value** | **kv** | **all**}] | **file** *FILE* } --| *FORMAT* := { **raw** | **c** } -+| *FORMAT* := { **raw** | **c** [**unsorted**] } - | *MAP* := { **id** *MAP_ID* | **pinned** *FILE* } - | *PROG* := { **id** *PROG_ID* | **pinned** *FILE* | **tag** *PROG_TAG* | **name** *PROG_NAME* } - -@@ -63,7 +63,9 @@ bpftool btf dump *BTF_SRC* - pahole. - - **format** option can be used to override default (raw) output format. Raw -- (**raw**) or C-syntax (**c**) output formats are supported. -+ (**raw**) or C-syntax (**c**) output formats are supported. With C-style -+ formatting, the output is sorted by default. Use the **unsorted** option -+ to avoid sorting the output. - - bpftool btf help - Print short help message. 
-diff --git a/tools/bpf/bpftool/Makefile b/tools/bpf/bpftool/Makefile -index dfa4f1bebbb3..ba927379eb20 100644 ---- a/tools/bpf/bpftool/Makefile -+++ b/tools/bpf/bpftool/Makefile -@@ -204,10 +204,11 @@ ifeq ($(feature-clang-bpf-co-re),1) - - BUILD_BPF_SKELS := 1 - --$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) - ifeq ($(VMLINUX_H),) -+$(OUTPUT)vmlinux.h: $(VMLINUX_BTF) $(BPFTOOL_BOOTSTRAP) - $(QUIET_GEN)$(BPFTOOL_BOOTSTRAP) btf dump file $< format c > $@ - else -+$(OUTPUT)vmlinux.h: $(VMLINUX_H) - $(Q)cp "$(VMLINUX_H)" $@ - endif - -diff --git a/tools/bpf/bpftool/bash-completion/bpftool b/tools/bpf/bpftool/bash-completion/bpftool -index 04afe2ac2228..be99d49b8714 100644 ---- a/tools/bpf/bpftool/bash-completion/bpftool -+++ b/tools/bpf/bpftool/bash-completion/bpftool -@@ -930,6 +930,9 @@ _bpftool() - format) - COMPREPLY=( $( compgen -W "c raw" -- "$cur" ) ) - ;; -+ c) -+ COMPREPLY=( $( compgen -W "unsorted" -- "$cur" ) ) -+ ;; - *) - # emit extra options - case ${words[3]} in -diff --git a/tools/bpf/bpftool/btf.c b/tools/bpf/bpftool/btf.c -index 91fcb75babe3..af047dedde38 100644 ---- a/tools/bpf/bpftool/btf.c -+++ b/tools/bpf/bpftool/btf.c -@@ -43,6 +43,13 @@ static const char * const btf_kind_str[NR_BTF_KINDS] = { - [BTF_KIND_ENUM64] = "ENUM64", - }; - -+struct sort_datum { -+ int index; -+ int type_rank; -+ const char *sort_name; -+ const char *own_name; -+}; -+ - static const char *btf_int_enc_str(__u8 encoding) - { - switch (encoding) { -@@ -460,9 +467,122 @@ static void __printf(2, 0) btf_dump_printf(void *ctx, - vfprintf(stdout, fmt, args); - } - -+static int btf_type_rank(const struct btf *btf, __u32 index, bool has_name) -+{ -+ const struct btf_type *t = btf__type_by_id(btf, index); -+ const int kind = btf_kind(t); -+ const int max_rank = 10; -+ -+ if (t->name_off) -+ has_name = true; -+ -+ switch (kind) { -+ case BTF_KIND_ENUM: -+ case BTF_KIND_ENUM64: -+ return has_name ? 1 : 0; -+ case BTF_KIND_INT: -+ case BTF_KIND_FLOAT: -+ return 2; -+ case BTF_KIND_STRUCT: -+ case BTF_KIND_UNION: -+ return has_name ? 3 : max_rank; -+ case BTF_KIND_FUNC_PROTO: -+ return has_name ? 
4 : max_rank; -+ case BTF_KIND_ARRAY: -+ if (has_name) -+ return btf_type_rank(btf, btf_array(t)->type, has_name); -+ return max_rank; -+ case BTF_KIND_TYPE_TAG: -+ case BTF_KIND_CONST: -+ case BTF_KIND_PTR: -+ case BTF_KIND_VOLATILE: -+ case BTF_KIND_RESTRICT: -+ case BTF_KIND_TYPEDEF: -+ case BTF_KIND_DECL_TAG: -+ if (has_name) -+ return btf_type_rank(btf, t->type, has_name); -+ return max_rank; -+ default: -+ return max_rank; -+ } -+} -+ -+static const char *btf_type_sort_name(const struct btf *btf, __u32 index, bool from_ref) -+{ -+ const struct btf_type *t = btf__type_by_id(btf, index); -+ -+ switch (btf_kind(t)) { -+ case BTF_KIND_ENUM: -+ case BTF_KIND_ENUM64: { -+ int name_off = t->name_off; -+ -+ /* Use name of the first element for anonymous enums if allowed */ -+ if (!from_ref && !t->name_off && btf_vlen(t)) -+ name_off = btf_enum(t)->name_off; -+ -+ return btf__name_by_offset(btf, name_off); -+ } -+ case BTF_KIND_ARRAY: -+ return btf_type_sort_name(btf, btf_array(t)->type, true); -+ case BTF_KIND_TYPE_TAG: -+ case BTF_KIND_CONST: -+ case BTF_KIND_PTR: -+ case BTF_KIND_VOLATILE: -+ case BTF_KIND_RESTRICT: -+ case BTF_KIND_TYPEDEF: -+ case BTF_KIND_DECL_TAG: -+ return btf_type_sort_name(btf, t->type, true); -+ default: -+ return btf__name_by_offset(btf, t->name_off); -+ } -+ return NULL; -+} -+ -+static int btf_type_compare(const void *left, const void *right) -+{ -+ const struct sort_datum *d1 = (const struct sort_datum *)left; -+ const struct sort_datum *d2 = (const struct sort_datum *)right; -+ int r; -+ -+ if (d1->type_rank != d2->type_rank) -+ return d1->type_rank < d2->type_rank ? -1 : 1; -+ -+ r = strcmp(d1->sort_name, d2->sort_name); -+ if (r) -+ return r; -+ -+ return strcmp(d1->own_name, d2->own_name); -+} -+ -+static struct sort_datum *sort_btf_c(const struct btf *btf) -+{ -+ struct sort_datum *datums; -+ int n; -+ -+ n = btf__type_cnt(btf); -+ datums = malloc(sizeof(struct sort_datum) * n); -+ if (!datums) -+ return NULL; -+ -+ for (int i = 0; i < n; ++i) { -+ struct sort_datum *d = datums + i; -+ const struct btf_type *t = btf__type_by_id(btf, i); -+ -+ d->index = i; -+ d->type_rank = btf_type_rank(btf, i, false); -+ d->sort_name = btf_type_sort_name(btf, i, false); -+ d->own_name = btf__name_by_offset(btf, t->name_off); -+ } -+ -+ qsort(datums, n, sizeof(struct sort_datum), btf_type_compare); -+ -+ return datums; -+} -+ - static int dump_btf_c(const struct btf *btf, -- __u32 *root_type_ids, int root_type_cnt) -+ __u32 *root_type_ids, int root_type_cnt, bool sort_dump) - { -+ struct sort_datum *datums = NULL; - struct btf_dump *d; - int err = 0, i; - -@@ -486,8 +606,12 @@ static int dump_btf_c(const struct btf *btf, - } else { - int cnt = btf__type_cnt(btf); - -+ if (sort_dump) -+ datums = sort_btf_c(btf); - for (i = 1; i < cnt; i++) { -- err = btf_dump__dump_type(d, i); -+ int idx = datums ? 
datums[i].index : i; -+ -+ err = btf_dump__dump_type(d, idx); - if (err) - goto done; - } -@@ -500,6 +624,7 @@ static int dump_btf_c(const struct btf *btf, - printf("#endif /* __VMLINUX_H__ */\n"); - - done: -+ free(datums); - btf_dump__free(d); - return err; - } -@@ -549,10 +674,10 @@ static bool btf_is_kernel_module(__u32 btf_id) - - static int do_dump(int argc, char **argv) - { -+ bool dump_c = false, sort_dump_c = true; - struct btf *btf = NULL, *base = NULL; - __u32 root_type_ids[2]; - int root_type_cnt = 0; -- bool dump_c = false; - __u32 btf_id = -1; - const char *src; - int fd = -1; -@@ -663,6 +788,9 @@ static int do_dump(int argc, char **argv) - goto done; - } - NEXT_ARG(); -+ } else if (is_prefix(*argv, "unsorted")) { -+ sort_dump_c = false; -+ NEXT_ARG(); - } else { - p_err("unrecognized option: '%s'", *argv); - err = -EINVAL; -@@ -691,7 +819,7 @@ static int do_dump(int argc, char **argv) - err = -ENOTSUP; - goto done; - } -- err = dump_btf_c(btf, root_type_ids, root_type_cnt); -+ err = dump_btf_c(btf, root_type_ids, root_type_cnt, sort_dump_c); - } else { - err = dump_btf_raw(btf, root_type_ids, root_type_cnt); - } -@@ -1063,7 +1191,7 @@ static int do_help(int argc, char **argv) - " %1$s %2$s help\n" - "\n" - " BTF_SRC := { id BTF_ID | prog PROG | map MAP [{key | value | kv | all}] | file FILE }\n" -- " FORMAT := { raw | c }\n" -+ " FORMAT := { raw | c [unsorted] }\n" - " " HELP_SPEC_MAP "\n" - " " HELP_SPEC_PROGRAM "\n" - " " HELP_SPEC_OPTIONS " |\n" -diff --git a/tools/bpf/bpftool/common.c b/tools/bpf/bpftool/common.c -index 958e92acca8e..9b75639434b8 100644 ---- a/tools/bpf/bpftool/common.c -+++ b/tools/bpf/bpftool/common.c -@@ -410,7 +410,7 @@ void get_prog_full_name(const struct bpf_prog_info *prog_info, int prog_fd, - { - const char *prog_name = prog_info->name; - const struct btf_type *func_type; -- const struct bpf_func_info finfo = {}; -+ struct bpf_func_info finfo = {}; - struct bpf_prog_info info = {}; - __u32 info_len = sizeof(info); - struct btf *prog_btf = NULL; -diff --git a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c -index 7bdbcac3cf62..948dde25034e 100644 ---- a/tools/bpf/bpftool/skeleton/pid_iter.bpf.c -+++ b/tools/bpf/bpftool/skeleton/pid_iter.bpf.c -@@ -29,6 +29,7 @@ enum bpf_link_type___local { - }; - - extern const void bpf_link_fops __ksym; -+extern const void bpf_link_fops_poll __ksym __weak; - extern const void bpf_map_fops __ksym; - extern const void bpf_prog_fops __ksym; - extern const void btf_fops __ksym; -@@ -84,7 +85,11 @@ int iter(struct bpf_iter__task_file *ctx) - fops = &btf_fops; - break; - case BPF_OBJ_LINK: -- fops = &bpf_link_fops; -+ if (&bpf_link_fops_poll && -+ file->f_op == &bpf_link_fops_poll) -+ fops = &bpf_link_fops_poll; -+ else -+ fops = &bpf_link_fops; - break; - default: - return 0; -diff --git a/tools/bpf/bpftool/skeleton/profiler.bpf.c b/tools/bpf/bpftool/skeleton/profiler.bpf.c -index 2f80edc682f1..f48c783cb9f7 100644 ---- a/tools/bpf/bpftool/skeleton/profiler.bpf.c -+++ b/tools/bpf/bpftool/skeleton/profiler.bpf.c -@@ -40,17 +40,17 @@ struct { - - const volatile __u32 num_cpu = 1; - const volatile __u32 num_metric = 1; --#define MAX_NUM_MATRICS 4 -+#define MAX_NUM_METRICS 4 - - SEC("fentry/XXX") - int BPF_PROG(fentry_XXX) - { -- struct bpf_perf_event_value___local *ptrs[MAX_NUM_MATRICS]; -+ struct bpf_perf_event_value___local *ptrs[MAX_NUM_METRICS]; - u32 key = bpf_get_smp_processor_id(); - u32 i; - - /* look up before reading, to reduce error */ -- for (i = 0; i < num_metric && i < 
MAX_NUM_MATRICS; i++) { -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) { - u32 flag = i; - - ptrs[i] = bpf_map_lookup_elem(&fentry_readings, &flag); -@@ -58,7 +58,7 @@ int BPF_PROG(fentry_XXX) - return 0; - } - -- for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) { - struct bpf_perf_event_value___local reading; - int err; - -@@ -99,14 +99,14 @@ fexit_update_maps(u32 id, struct bpf_perf_event_value___local *after) - SEC("fexit/XXX") - int BPF_PROG(fexit_XXX) - { -- struct bpf_perf_event_value___local readings[MAX_NUM_MATRICS]; -+ struct bpf_perf_event_value___local readings[MAX_NUM_METRICS]; - u32 cpu = bpf_get_smp_processor_id(); - u32 i, zero = 0; - int err; - u64 *count; - - /* read all events before updating the maps, to reduce error */ -- for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) { -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) { - err = bpf_perf_event_read_value(&events, cpu + i * num_cpu, - (void *)(readings + i), - sizeof(*readings)); -@@ -116,7 +116,7 @@ int BPF_PROG(fexit_XXX) - count = bpf_map_lookup_elem(&counts, &zero); - if (count) { - *count += 1; -- for (i = 0; i < num_metric && i < MAX_NUM_MATRICS; i++) -+ for (i = 0; i < num_metric && i < MAX_NUM_METRICS; i++) - fexit_update_maps(i, &readings[i]); - } - return 0; -diff --git a/tools/include/uapi/linux/bpf.h b/tools/include/uapi/linux/bpf.h -index 90706a47f6ff..25ea393cf084 100644 ---- a/tools/include/uapi/linux/bpf.h -+++ b/tools/include/uapi/linux/bpf.h -@@ -6207,12 +6207,17 @@ union { \ - __u64 :64; \ - } __attribute__((aligned(8))) - -+/* The enum used in skb->tstamp_type. It specifies the clock type -+ * of the time stored in the skb->tstamp. -+ */ - enum { -- BPF_SKB_TSTAMP_UNSPEC, -- BPF_SKB_TSTAMP_DELIVERY_MONO, /* tstamp has mono delivery time */ -- /* For any BPF_SKB_TSTAMP_* that the bpf prog cannot handle, -- * the bpf prog should handle it like BPF_SKB_TSTAMP_UNSPEC -- * and try to deduce it by ingress, egress or skb->sk->sk_clockid. -+ BPF_SKB_TSTAMP_UNSPEC = 0, /* DEPRECATED */ -+ BPF_SKB_TSTAMP_DELIVERY_MONO = 1, /* DEPRECATED */ -+ BPF_SKB_CLOCK_REALTIME = 0, -+ BPF_SKB_CLOCK_MONOTONIC = 1, -+ BPF_SKB_CLOCK_TAI = 2, -+ /* For any future BPF_SKB_CLOCK_* that the bpf prog cannot handle, -+ * the bpf prog can try to deduce it by ingress/egress/skb->sk->sk_clockid. 
- */ - }; - -diff --git a/tools/lib/bpf/libbpf.c b/tools/lib/bpf/libbpf.c -index 5401f2df463d..d1627a2ca30b 100644 ---- a/tools/lib/bpf/libbpf.c -+++ b/tools/lib/bpf/libbpf.c -@@ -229,7 +229,30 @@ static const char * const prog_type_name[] = { - static int __base_pr(enum libbpf_print_level level, const char *format, - va_list args) - { -- if (level == LIBBPF_DEBUG) -+ const char *env_var = "LIBBPF_LOG_LEVEL"; -+ static enum libbpf_print_level min_level = LIBBPF_INFO; -+ static bool initialized; -+ -+ if (!initialized) { -+ char *verbosity; -+ -+ initialized = true; -+ verbosity = getenv(env_var); -+ if (verbosity) { -+ if (strcasecmp(verbosity, "warn") == 0) -+ min_level = LIBBPF_WARN; -+ else if (strcasecmp(verbosity, "debug") == 0) -+ min_level = LIBBPF_DEBUG; -+ else if (strcasecmp(verbosity, "info") == 0) -+ min_level = LIBBPF_INFO; -+ else -+ fprintf(stderr, "libbpf: unrecognized '%s' envvar value: '%s', should be one of 'warn', 'debug', or 'info'.\n", -+ env_var, verbosity); -+ } -+ } -+ -+ /* if too verbose, skip logging */ -+ if (level > min_level) - return 0; - - return vfprintf(stderr, format, args); -diff --git a/tools/lib/bpf/libbpf.h b/tools/lib/bpf/libbpf.h -index c3f77d9260fe..26e4e35528c5 100644 ---- a/tools/lib/bpf/libbpf.h -+++ b/tools/lib/bpf/libbpf.h -@@ -98,7 +98,10 @@ typedef int (*libbpf_print_fn_t)(enum libbpf_print_level level, - - /** - * @brief **libbpf_set_print()** sets user-provided log callback function to -- * be used for libbpf warnings and informational messages. -+ * be used for libbpf warnings and informational messages. If the user callback -+ * is not set, messages are logged to stderr by default. The verbosity of these -+ * messages can be controlled by setting the environment variable -+ * LIBBPF_LOG_LEVEL to either warn, info, or debug. - * @param fn The log print function. If NULL, libbpf won't print anything. - * @return Pointer to old print function. - * -diff --git a/tools/lib/bpf/libbpf_internal.h b/tools/lib/bpf/libbpf_internal.h -index a0dcfb82e455..7e7e686008c6 100644 ---- a/tools/lib/bpf/libbpf_internal.h -+++ b/tools/lib/bpf/libbpf_internal.h -@@ -597,13 +597,9 @@ static inline int ensure_good_fd(int fd) - return fd; - } - --static inline int sys_dup2(int oldfd, int newfd) -+static inline int sys_dup3(int oldfd, int newfd, int flags) - { --#ifdef __NR_dup2 -- return syscall(__NR_dup2, oldfd, newfd); --#else -- return syscall(__NR_dup3, oldfd, newfd, 0); --#endif -+ return syscall(__NR_dup3, oldfd, newfd, flags); - } - - /* Point *fixed_fd* to the same file that *tmp_fd* points to. -@@ -614,7 +610,7 @@ static inline int reuse_fd(int fixed_fd, int tmp_fd) - { - int err; - -- err = sys_dup2(tmp_fd, fixed_fd); -+ err = sys_dup3(tmp_fd, fixed_fd, O_CLOEXEC); - err = err < 0 ? -errno : 0; - close(tmp_fd); /* clean up temporary FD */ - return err; -diff --git a/tools/sched_ext/.gitignore b/tools/sched_ext/.gitignore -new file mode 100644 -index 000000000000..d6264fe1c8cd ---- /dev/null -+++ b/tools/sched_ext/.gitignore -@@ -0,0 +1,2 @@ -+tools/ -+build/ -diff --git a/tools/sched_ext/Makefile b/tools/sched_ext/Makefile -new file mode 100644 -index 000000000000..ca3815e572d8 ---- /dev/null -+++ b/tools/sched_ext/Makefile -@@ -0,0 +1,246 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+include ../build/Build.include -+include ../scripts/Makefile.arch -+include ../scripts/Makefile.include -+ -+all: all_targets -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CLANG_TARGET_FLAGS_arm := arm-linux-gnueabi -+CLANG_TARGET_FLAGS_arm64 := aarch64-linux-gnu -+CLANG_TARGET_FLAGS_hexagon := hexagon-linux-musl -+CLANG_TARGET_FLAGS_m68k := m68k-linux-gnu -+CLANG_TARGET_FLAGS_mips := mipsel-linux-gnu -+CLANG_TARGET_FLAGS_powerpc := powerpc64le-linux-gnu -+CLANG_TARGET_FLAGS_riscv := riscv64-linux-gnu -+CLANG_TARGET_FLAGS_s390 := s390x-linux-gnu -+CLANG_TARGET_FLAGS_x86 := x86_64-linux-gnu -+CLANG_TARGET_FLAGS := $(CLANG_TARGET_FLAGS_$(ARCH)) -+ -+ifeq ($(CROSS_COMPILE),) -+ifeq ($(CLANG_TARGET_FLAGS),) -+$(error Specify CROSS_COMPILE or add '--target=' option to lib.mk) -+else -+CLANG_FLAGS += --target=$(CLANG_TARGET_FLAGS) -+endif # CLANG_TARGET_FLAGS -+else -+CLANG_FLAGS += --target=$(notdir $(CROSS_COMPILE:%-=%)) -+endif # CROSS_COMPILE -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := $(CROSS_COMPILE)gcc -+endif # LLVM -+ -+CURDIR := $(abspath .) -+TOOLSDIR := $(abspath ..) -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(abspath ../../include/generated) -+GENHDR := $(GENDIR)/autoconf.h -+ -+ifeq ($(O),) -+OUTPUT_DIR := $(CURDIR)/build -+else -+OUTPUT_DIR := $(O)/build -+endif # O -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BINDIR := $(OUTPUT_DIR)/bin -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+ifneq ($(CROSS_COMPILE),) -+HOST_BUILD_DIR := $(OBJ_DIR)/host -+HOST_OUTPUT_DIR := host-tools -+HOST_INCLUDE_DIR := $(HOST_OUTPUT_DIR)/include -+else -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+HOST_INCLUDE_DIR := $(INCLUDE_DIR) -+endif -+HOST_BPFOBJ := $(HOST_BUILD_DIR)/libbpf/libbpf.a -+RESOLVE_BTFIDS := $(HOST_BUILD_DIR)/resolve_btfids/resolve_btfids -+DEFAULT_BPFTOOL := $(HOST_OUTPUT_DIR)/sbin/bpftool -+ -+VMLINUX_BTF_PATHS ?= $(if $(O),$(O)/vmlinux) \ -+ $(if $(KBUILD_OUTPUT),$(KBUILD_OUTPUT)/vmlinux) \ -+ ../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h include/scx/*.h \ -+ | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o 
$(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+SCX_COMMON_DEPS := include/scx/common.h include/scx/user_exit_info.h | $(BINDIR) -+ -+c-sched-targets = scx_simple scx_qmap scx_central scx_flatcg -+ -+$(addprefix $(BINDIR)/,$(c-sched-targets)): \ -+ $(BINDIR)/%: \ -+ $(filter-out %.bpf.c,%.c) \ -+ $(INCLUDE_DIR)/%.bpf.skel.h \ -+ $(SCX_COMMON_DEPS) -+ $(eval sched=$(notdir $@)) -+ $(CC) $(CFLAGS) -c $(sched).c -o $(SCXOBJ_DIR)/$(sched).o -+ $(CC) -o $@ $(SCXOBJ_DIR)/$(sched).o $(HOST_BPFOBJ) $(LDFLAGS) -+ -+$(c-sched-targets): %: $(BINDIR)/% -+ -+install: all -+ $(Q)mkdir -p $(DESTDIR)/usr/local/bin/ -+ $(Q)cp $(BINDIR)/* $(DESTDIR)/usr/local/bin/ -+ -+clean: -+ rm -rf $(OUTPUT_DIR) $(HOST_OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h -+ rm -f $(c-sched-targets) -+ -+help: -+ @echo 'Building targets' -+ @echo '================' -+ @echo '' -+ @echo ' all - Compile all schedulers' -+ @echo '' -+ @echo 'Alternatively, you may compile individual schedulers:' -+ @echo '' -+ @printf ' %s\n' $(c-sched-targets) -+ @echo '' -+ @echo 'For any scheduler build target, you may specify an alternative' -+ @echo 'build output path with the O= environment variable. For example:' -+ @echo '' -+ @echo ' O=/tmp/sched_ext make all' -+ @echo '' -+ @echo 'will compile all schedulers, and emit the build artifacts to' -+ @echo '/tmp/sched_ext/build.' -+ @echo '' -+ @echo '' -+ @echo 'Installing targets' -+ @echo '==================' -+ @echo '' -+ @echo ' install - Compile and install all schedulers to /usr/bin.' -+ @echo ' You may specify the DESTDIR= environment variable' -+ @echo ' to indicate a prefix for /usr/bin. For example:' -+ @echo '' -+ @echo ' DESTDIR=/tmp/sched_ext make install' -+ @echo '' -+ @echo ' will build the schedulers in CWD/build, and' -+ @echo ' install the schedulers to /tmp/sched_ext/usr/bin.' -+ @echo '' -+ @echo '' -+ @echo 'Cleaning targets' -+ @echo '================' -+ @echo '' -+ @echo ' clean - Remove all generated files' -+ -+all_targets: $(c-sched-targets) -+ -+.PHONY: all all_targets $(c-sched-targets) clean help -+ -+# delete failed targets -+.DELETE_ON_ERROR: -+ -+# keep intermediate (.bpf.skel.h, .bpf.o, etc) targets -+.SECONDARY: -diff --git a/tools/sched_ext/README.md b/tools/sched_ext/README.md -new file mode 100644 -index 000000000000..16a42e4060f6 ---- /dev/null -+++ b/tools/sched_ext/README.md -@@ -0,0 +1,270 @@ -+SCHED_EXT EXAMPLE SCHEDULERS -+============================ -+ -+# Introduction -+ -+This directory contains a number of example sched_ext schedulers. These -+schedulers are meant to provide examples of different types of schedulers -+that can be built using sched_ext, and illustrate how various features of -+sched_ext can be used. -+ -+Some of the examples are performant, production-ready schedulers. That is, for -+the correct workload and with the correct tuning, they may be deployed in a -+production environment with acceptable or possibly even improved performance. 
-+Others are just examples that, in practice, would not provide acceptable -+performance (though they could be improved to get there). -+ -+This README will describe these example schedulers, including describing the -+types of workloads or scenarios they're designed to accommodate, and whether or -+not they're production ready. For more details on any of these schedulers, -+please see the header comment in their .bpf.c file. -+ -+ -+# Compiling the examples -+ -+There are a few toolchain dependencies for compiling the example schedulers. -+ -+## Toolchain dependencies -+ -+1. clang >= 16.0.0 -+ -+The schedulers are BPF programs, and therefore must be compiled with clang. gcc -+is actively working on adding a BPF backend compiler as well, but it is still -+missing some features such as BTF type tags which are necessary for using -+kptrs. -+ -+2. pahole >= 1.25 -+ -+You may need pahole in order to generate BTF from DWARF. -+ -+3. rust >= 1.70.0 -+ -+Rust schedulers use features present in the rust toolchain >= 1.70.0. You -+should be able to use the stable build from rustup, but if that doesn't -+work, try using the rustup nightly build. -+ -+There are other requirements as well, such as make, but these are the main / -+non-trivial ones. -+ -+## Compiling the kernel -+ -+In order to run a sched_ext scheduler, you'll have to run a kernel compiled -+with the patches in this repository, and with a minimum set of necessary -+Kconfig options: -+ -+``` -+CONFIG_BPF=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_BPF_JIT=y -+CONFIG_DEBUG_INFO_BTF=y -+``` -+ -+It's also recommended that you include the following Kconfig options: -+ -+``` -+CONFIG_BPF_JIT_ALWAYS_ON=y -+CONFIG_BPF_JIT_DEFAULT_ON=y -+CONFIG_PAHOLE_HAS_SPLIT_BTF=y -+CONFIG_PAHOLE_HAS_BTF_TAG=y -+``` -+ -+There is a `Kconfig` file in this directory whose contents you can append to -+your local `.config` file, as long as there are no conflicts with any existing -+options in the file. -+ -+## Getting a vmlinux.h file -+ -+You may notice that most of the example schedulers include a "vmlinux.h" file. -+This is a large, auto-generated header file that contains all of the types -+defined in some vmlinux binary that was compiled with -+[BTF](https://docs.kernel.org/bpf/btf.html) (i.e. with the BTF-related Kconfig -+options specified above). -+ -+The header file is created using `bpftool`, by passing it a vmlinux binary -+compiled with BTF as follows: -+ -+```bash -+$ bpftool btf dump file /path/to/vmlinux format c > vmlinux.h -+``` -+ -+`bpftool` analyzes all of the BTF encodings in the binary, and produces a -+header file that can be included by BPF programs to access those types. For -+example, using vmlinux.h allows a scheduler to access fields defined directly -+in vmlinux as follows: -+ -+```c -+#include "vmlinux.h" -+// vmlinux.h is also implicitly included by scx_common.bpf.h. -+#include "scx_common.bpf.h" -+ -+/* -+ * vmlinux.h provides definitions for struct task_struct and -+ * struct scx_enable_args. -+ */ -+void BPF_STRUCT_OPS(example_enable, struct task_struct *p, -+ struct scx_enable_args *args) -+{ -+ bpf_printk("Task %s enabled in example scheduler", p->comm); -+} -+ -+// vmlinux.h provides the definition for struct sched_ext_ops. -+SEC(".struct_ops.link") -+struct sched_ext_ops example_ops = { -+ .enable = (void *)example_enable, -+ .name = "example", -+}; -+``` -+ -+The scheduler build system will generate this vmlinux.h file as part of the -+scheduler build pipeline.
It looks for a vmlinux file in the following -+dependency order: -+ -+1. If the O= environment variable is defined, at `$O/vmlinux` -+2. If the KBUILD_OUTPUT= environment variable is defined, at -+ `$KBUILD_OUTPUT/vmlinux` -+3. At `../../vmlinux` (i.e. at the root of the kernel tree where you're -+ compiling the schedulers) -+4. `/sys/kernel/btf/vmlinux` -+5. `/boot/vmlinux-$(uname -r)` -+ -+In other words, if you have compiled a kernel in your local repo, its vmlinux -+file will be used to generate vmlinux.h. Otherwise, it will be the vmlinux of -+the kernel you're currently running on. This means that if you're running on a -+kernel with sched_ext support, you may not need to compile a local kernel at -+all. -+ -+### Aside on CO-RE -+ -+One of the cooler features of BPF is that it supports -+[CO-RE](https://nakryiko.com/posts/bpf-core-reference-guide/) (Compile Once Run -+Everywhere). This feature allows you to reference fields inside of structs with -+types defined internal to the kernel, and not have to recompile if you load the -+BPF program on a different kernel with the field at a different offset. In our -+example above, we print out a task name with `p->comm`. CO-RE would perform -+relocations for that access when the program is loaded to ensure that it's -+referencing the correct offset for the currently running kernel. -+ -+## Compiling the schedulers -+ -+Once you have your toolchain set up, and a vmlinux that can be used to generate -+a full vmlinux.h file, you can compile the schedulers using `make`: -+ -+```bash -+$ make -j$(nproc) -+``` -+ -+# Example schedulers -+ -+This directory contains the following example schedulers. These schedulers are -+for testing and demonstrating different aspects of sched_ext. While some may be -+useful in limited scenarios, they are not intended to be practical. -+ -+For more scheduler implementations, tools and documentation, visit -+https://github.com/sched-ext/scx. -+ -+## scx_simple -+ -+A simple scheduler that provides an example of a minimal sched_ext scheduler. -+scx_simple can be run in either global weighted vtime mode, or FIFO mode. -+ -+Though very simple, in limited scenarios, this scheduler can perform reasonably -+well on single-socket systems with a unified L3 cache. -+ -+## scx_qmap -+ -+Another simple, yet slightly more complex scheduler that provides an example of -+a basic weighted FIFO queuing policy. It also provides examples of some common -+useful BPF features, such as sleepable per-task storage allocation in the -+`ops.prep_enable()` callback, and using the `BPF_MAP_TYPE_QUEUE` map type to -+enqueue tasks. It also illustrates how core-sched support could be implemented. -+ -+## scx_central -+ -+A "central" scheduler where scheduling decisions are made from a single CPU. -+This scheduler illustrates how scheduling decisions can be dispatched from a -+single CPU, allowing other cores to run with infinite slices, without timer -+ticks, and without having to incur the overhead of making scheduling decisions. -+ -+The approach demonstrated by this scheduler may be useful for any workload that -+benefits from minimizing scheduling overhead and timer ticks. An example of -+where this could be particularly useful is running VMs, where running with -+infinite slices and no timer ticks allows the VM to avoid unnecessary expensive -+vmexits. -+ -+## scx_flatcg -+ -+A flattened cgroup hierarchy scheduler.
This scheduler implements hierarchical -+weight-based cgroup CPU control by flattening the cgroup hierarchy into a single -+layer, by compounding the active weight share at each level. The effect of this -+is a much more performant CPU controller, which does not need to descend down -+cgroup trees in order to properly compute a cgroup's share. -+ -+Similar to scx_simple, in limited scenarios, this scheduler can perform -+reasonably well on single socket-socket systems with a unified L3 cache and show -+significantly lowered hierarchical scheduling overhead. -+ -+ -+# Troubleshooting -+ -+There are a number of common issues that you may run into when building the -+schedulers. We'll go over some of the common ones here. -+ -+## Build Failures -+ -+### Old version of clang -+ -+``` -+error: static assertion failed due to requirement 'SCX_DSQ_FLAG_BUILTIN': bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ ^~~~~~~~~~~~~~~~~~~~ -+1 error generated. -+``` -+ -+This means you built the kernel or the schedulers with an older version of -+clang than what's supported (i.e. older than 16.0.0). To remediate this: -+ -+1. `which clang` to make sure you're using a sufficiently new version of clang. -+ -+2. `make fullclean` in the root path of the repository, and rebuild the kernel -+ and schedulers. -+ -+3. Rebuild the kernel, and then your example schedulers. -+ -+The schedulers are also cleaned if you invoke `make mrproper` in the root -+directory of the tree. -+ -+### Stale kernel build / incomplete vmlinux.h file -+ -+As described above, you'll need a `vmlinux.h` file that was generated from a -+vmlinux built with BTF, and with sched_ext support enabled. If you don't, -+you'll see errors such as the following which indicate that a type being -+referenced in a scheduler is unknown: -+ -+``` -+/path/to/sched_ext/tools/sched_ext/user_exit_info.h:25:23: note: forward declaration of 'struct scx_exit_info' -+ -+const struct scx_exit_info *ei) -+ -+^ -+``` -+ -+In order to resolve this, please follow the steps above in -+[Getting a vmlinux.h file](#getting-a-vmlinuxh-file) in order to ensure your -+schedulers are using a vmlinux.h file that includes the requisite types. -+ -+## Misc -+ -+### llvm: [OFF] -+ -+You may see the following output when building the schedulers: -+ -+``` -+Auto-detecting system features: -+... clang-bpf-co-re: [ on ] -+... llvm: [ OFF ] -+... libcap: [ on ] -+... libbfd: [ on ] -+``` -+ -+Seeing `llvm: [ OFF ]` here is not an issue. You can safely ignore. -diff --git a/tools/sched_ext/include/bpf-compat/gnu/stubs.h b/tools/sched_ext/include/bpf-compat/gnu/stubs.h -new file mode 100644 -index 000000000000..ad7d139ce907 ---- /dev/null -+++ b/tools/sched_ext/include/bpf-compat/gnu/stubs.h -@@ -0,0 +1,11 @@ -+/* -+ * Dummy gnu/stubs.h. clang can end up including /usr/include/gnu/stubs.h when -+ * compiling BPF files although its content doesn't play any role. The file in -+ * turn includes stubs-64.h or stubs-32.h depending on whether __x86_64__ is -+ * defined. When compiling a BPF source, __x86_64__ isn't set and thus -+ * stubs-32.h is selected. However, the file is not there if the system doesn't -+ * have 32bit glibc devel package installed leading to a build failure. -+ * -+ * The problem is worked around by making this file available in the include -+ * search paths before the system one when building BPF. 
-+ */ -diff --git a/tools/sched_ext/include/scx/common.bpf.h b/tools/sched_ext/include/scx/common.bpf.h -new file mode 100644 -index 000000000000..d0b708e959c1 ---- /dev/null -+++ b/tools/sched_ext/include/scx/common.bpf.h -@@ -0,0 +1,349 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __SCX_COMMON_BPF_H -+#define __SCX_COMMON_BPF_H -+ -+#include "vmlinux.h" -+#include -+#include -+#include -+#include "user_exit_info.h" -+ -+#define PF_WQ_WORKER 0x00000020 /* I'm a workqueue worker */ -+#define PF_KTHREAD 0x00200000 /* I am a kernel thread */ -+#define PF_EXITING 0x00000004 -+#define CLOCK_MONOTONIC 1 -+ -+/* -+ * Earlier versions of clang/pahole lost upper 32bits in 64bit enums which can -+ * lead to really confusing misbehaviors. Let's trigger a build failure. -+ */ -+static inline void ___vmlinux_h_sanity_check___(void) -+{ -+ _Static_assert(SCX_DSQ_FLAG_BUILTIN, -+ "bpftool generated vmlinux.h is missing high bits for 64bit enums, upgrade clang and pahole"); -+} -+ -+s32 scx_bpf_create_dsq(u64 dsq_id, s32 node) __ksym; -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, bool *is_idle) __ksym; -+void scx_bpf_dispatch(struct task_struct *p, u64 dsq_id, u64 slice, u64 enq_flags) __ksym; -+void scx_bpf_dispatch_vtime(struct task_struct *p, u64 dsq_id, u64 slice, u64 vtime, u64 enq_flags) __ksym; -+u32 scx_bpf_dispatch_nr_slots(void) __ksym; -+void scx_bpf_dispatch_cancel(void) __ksym; -+bool scx_bpf_consume(u64 dsq_id) __ksym; -+bool __scx_bpf_consume_task(unsigned long it, struct task_struct *p) __ksym __weak; -+u32 scx_bpf_reenqueue_local(void) __ksym; -+void scx_bpf_kick_cpu(s32 cpu, u64 flags) __ksym; -+s32 scx_bpf_dsq_nr_queued(u64 dsq_id) __ksym; -+void scx_bpf_destroy_dsq(u64 dsq_id) __ksym; -+int bpf_iter_scx_dsq_new(struct bpf_iter_scx_dsq *it, u64 dsq_id, bool rev) __ksym __weak; -+struct task_struct *bpf_iter_scx_dsq_next(struct bpf_iter_scx_dsq *it) __ksym __weak; -+void bpf_iter_scx_dsq_destroy(struct bpf_iter_scx_dsq *it) __ksym __weak; -+void scx_bpf_exit_bstr(s64 exit_code, char *fmt, unsigned long long *data, u32 data__sz) __ksym __weak; -+void scx_bpf_error_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym; -+void scx_bpf_dump_bstr(char *fmt, unsigned long long *data, u32 data_len) __ksym __weak; -+u32 scx_bpf_cpuperf_cap(s32 cpu) __ksym __weak; -+u32 scx_bpf_cpuperf_cur(s32 cpu) __ksym __weak; -+void scx_bpf_cpuperf_set(s32 cpu, u32 perf) __ksym __weak; -+u32 scx_bpf_nr_cpu_ids(void) __ksym __weak; -+const struct cpumask *scx_bpf_get_possible_cpumask(void) __ksym __weak; -+const struct cpumask *scx_bpf_get_online_cpumask(void) __ksym __weak; -+void scx_bpf_put_cpumask(const struct cpumask *cpumask) __ksym __weak; -+const struct cpumask *scx_bpf_get_idle_cpumask(void) __ksym; -+const struct cpumask *scx_bpf_get_idle_smtmask(void) __ksym; -+void scx_bpf_put_idle_cpumask(const struct cpumask *cpumask) __ksym; -+bool scx_bpf_test_and_clear_cpu_idle(s32 cpu) __ksym; -+s32 scx_bpf_pick_idle_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+s32 scx_bpf_pick_any_cpu(const cpumask_t *cpus_allowed, u64 flags) __ksym; -+bool scx_bpf_task_running(const struct task_struct *p) __ksym; -+s32 scx_bpf_task_cpu(const struct task_struct *p) __ksym; -+struct cgroup *scx_bpf_task_cgroup(struct task_struct *p) __ksym; -+ -+/* -+ * Use the following as @it when calling scx_bpf_consume_task() from 
whitin -+ * bpf_for_each() loops. -+ */ -+#define BPF_FOR_EACH_ITER (&___it) -+ -+/* hopefully temporary wrapper to work around BPF restriction */ -+static inline bool scx_bpf_consume_task(struct bpf_iter_scx_dsq *it, -+ struct task_struct *p) -+{ -+ unsigned long ptr; -+ bpf_probe_read_kernel(&ptr, sizeof(ptr), it); -+ return __scx_bpf_consume_task(ptr, p); -+} -+ -+static inline __attribute__((format(printf, 1, 2))) -+void ___scx_bpf_bstr_format_checker(const char *fmt, ...) {} -+ -+/* -+ * Helper macro for initializing the fmt and variadic argument inputs to both -+ * bstr exit kfuncs. Callers to this function should use ___fmt and ___param to -+ * refer to the initialized list of inputs to the bstr kfunc. -+ */ -+#define scx_bpf_bstr_preamble(fmt, args...) \ -+ static char ___fmt[] = fmt; \ -+ /* \ -+ * Note that __param[] must have at least one \ -+ * element to keep the verifier happy. \ -+ */ \ -+ unsigned long long ___param[___bpf_narg(args) ?: 1] = {}; \ -+ \ -+ _Pragma("GCC diagnostic push") \ -+ _Pragma("GCC diagnostic ignored \"-Wint-conversion\"") \ -+ ___bpf_fill(___param, args); \ -+ _Pragma("GCC diagnostic pop") \ -+ -+/* -+ * scx_bpf_exit() wraps the scx_bpf_exit_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Using this macro will cause the scheduler to -+ * exit cleanly with the specified exit code being passed to user space. -+ */ -+#define scx_bpf_exit(code, fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_exit_bstr(code, ___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_error() wraps the scx_bpf_error_bstr() kfunc with variadic arguments -+ * instead of an array of u64. Invoking this macro will cause the scheduler to -+ * exit in an erroneous state, with diagnostic information being passed to the -+ * user. -+ */ -+#define scx_bpf_error(fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_error_bstr(___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_dump() wraps the scx_bpf_dump_bstr() kfunc with variadic arguments -+ * instead of an array of u64. To be used from ops.dump() and friends. -+ */ -+#define scx_bpf_dump(fmt, args...) \ -+({ \ -+ scx_bpf_bstr_preamble(fmt, args) \ -+ scx_bpf_dump_bstr(___fmt, ___param, sizeof(___param)); \ -+ ___scx_bpf_bstr_format_checker(fmt, ##args); \ -+}) -+ -+#define BPF_STRUCT_OPS(name, args...) \ -+SEC("struct_ops/"#name) \ -+BPF_PROG(name, ##args) -+ -+#define BPF_STRUCT_OPS_SLEEPABLE(name, args...) \ -+SEC("struct_ops.s/"#name) \ -+BPF_PROG(name, ##args) -+ -+/** -+ * RESIZABLE_ARRAY - Generates annotations for an array that may be resized -+ * @elfsec: the data section of the BPF program in which to place the array -+ * @arr: the name of the array -+ * -+ * libbpf has an API for setting map value sizes. Since data sections (i.e. -+ * bss, data, rodata) themselves are maps, a data section can be resized. If -+ * a data section has an array as its last element, the BTF info for that -+ * array will be adjusted so that length of the array is extended to meet the -+ * new length of the data section. This macro annotates an array to have an -+ * element count of one with the assumption that this array can be resized -+ * within the userspace program. It also annotates the section specifier so -+ * this array exists in a custom sub data section which can be resized -+ * independently. 
-+ * -+ * See RESIZE_ARRAY() for the userspace convenience macro for resizing an -+ * array declared with RESIZABLE_ARRAY(). -+ */ -+#define RESIZABLE_ARRAY(elfsec, arr) arr[1] SEC("."#elfsec"."#arr) -+ -+/** -+ * MEMBER_VPTR - Obtain the verified pointer to a struct or array member -+ * @base: struct or array to index -+ * @member: dereferenced member (e.g. .field, [idx0][idx1], .field[idx0] ...) -+ * -+ * The verifier often gets confused by the instruction sequence the compiler -+ * generates for indexing struct fields or arrays. This macro forces the -+ * compiler to generate a code sequence which first calculates the byte offset, -+ * checks it against the struct or array size and add that byte offset to -+ * generate the pointer to the member to help the verifier. -+ * -+ * Ideally, we want to abort if the calculated offset is out-of-bounds. However, -+ * BPF currently doesn't support abort, so evaluate to %NULL instead. The caller -+ * must check for %NULL and take appropriate action to appease the verifier. To -+ * avoid confusing the verifier, it's best to check for %NULL and dereference -+ * immediately. -+ * -+ * vptr = MEMBER_VPTR(my_array, [i][j]); -+ * if (!vptr) -+ * return error; -+ * *vptr = new_value; -+ * -+ * sizeof(@base) should encompass the memory area to be accessed and thus can't -+ * be a pointer to the area. Use `MEMBER_VPTR(*ptr, .member)` instead of -+ * `MEMBER_VPTR(ptr, ->member)`. -+ */ -+#define MEMBER_VPTR(base, member) (typeof((base) member) *) \ -+({ \ -+ u64 __base = (u64)&(base); \ -+ u64 __addr = (u64)&((base) member) - __base; \ -+ _Static_assert(sizeof(base) >= sizeof((base) member), \ -+ "@base is smaller than @member, is @base a pointer?"); \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"i"(sizeof(base) - sizeof((base) member))); \ -+ __addr; \ -+}) -+ -+/** -+ * ARRAY_ELEM_PTR - Obtain the verified pointer to an array element -+ * @arr: array to index into -+ * @i: array index -+ * @n: number of elements in array -+ * -+ * Similar to MEMBER_VPTR() but is intended for use with arrays where the -+ * element count needs to be explicit. -+ * It can be used in cases where a global array is defined with an initial -+ * size but is intended to be be resized before loading the BPF program. -+ * Without this version of the macro, MEMBER_VPTR() will use the compile time -+ * size of the array to compute the max, which will result in rejection by -+ * the verifier. -+ */ -+#define ARRAY_ELEM_PTR(arr, i, n) (typeof(arr[i]) *) \ -+({ \ -+ u64 __base = (u64)arr; \ -+ u64 __addr = (u64)&(arr[i]) - __base; \ -+ asm volatile ( \ -+ "if %0 <= %[max] goto +2\n" \ -+ "%0 = 0\n" \ -+ "goto +1\n" \ -+ "%0 += %1\n" \ -+ : "+r"(__addr) \ -+ : "r"(__base), \ -+ [max]"r"(sizeof(arr[0]) * ((n) - 1))); \ -+ __addr; \ -+}) -+ -+/* -+ * BPF core and other generic helpers -+ */ -+ -+/* list and rbtree */ -+#define __contains(name, node) __attribute__((btf_decl_tag("contains:" #name ":" #node))) -+#define private(name) SEC(".data." #name) __hidden __attribute__((aligned(8))) -+ -+/* -+ * bpf_log2 - Compute the base 2 logarithm of a 32-bit exponential value. -+ * @v: The value for which we're computing the base 2 logarithm. 
-+ */ -+static inline u32 bpf_log2(u32 v) -+{ -+ u32 r; -+ u32 shift; -+ -+ r = (v > 0xFFFF) << 4; v >>= r; -+ shift = (v > 0xFF) << 3; v >>= shift; r |= shift; -+ shift = (v > 0xF) << 2; v >>= shift; r |= shift; -+ shift = (v > 0x3) << 1; v >>= shift; r |= shift; -+ r |= (v >> 1); -+ return r; -+} -+ -+/* -+ * bpf_log2l - Compute the base 2 logarithm of a 64-bit exponential value. -+ * @v: The value for which we're computing the base 2 logarithm. -+ */ -+static inline u32 bpf_log2l(u64 v) -+{ -+ u32 hi = v >> 32; -+ if (hi) -+ return bpf_log2(hi) + 32 + 1; -+ else -+ return bpf_log2(v) + 1; -+} -+ -+/* useful compiler attributes */ -+#define likely(x) __builtin_expect(!!(x), 1) -+#define unlikely(x) __builtin_expect(!!(x), 0) -+#define __maybe_unused __attribute__((__unused__)) -+ -+ -+void *bpf_obj_new_impl(__u64 local_type_id, void *meta) __ksym; -+void bpf_obj_drop_impl(void *kptr, void *meta) __ksym; -+ -+#define bpf_obj_new(type) ((type *)bpf_obj_new_impl(bpf_core_type_id_local(type), NULL)) -+#define bpf_obj_drop(kptr) bpf_obj_drop_impl(kptr, NULL) -+ -+void bpf_list_push_front(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+void bpf_list_push_back(struct bpf_list_head *head, struct bpf_list_node *node) __ksym; -+struct bpf_list_node *bpf_list_pop_front(struct bpf_list_head *head) __ksym; -+struct bpf_list_node *bpf_list_pop_back(struct bpf_list_head *head) __ksym; -+struct bpf_rb_node *bpf_rbtree_remove(struct bpf_rb_root *root, -+ struct bpf_rb_node *node) __ksym; -+int bpf_rbtree_add_impl(struct bpf_rb_root *root, struct bpf_rb_node *node, -+ bool (less)(struct bpf_rb_node *a, const struct bpf_rb_node *b), -+ void *meta, __u64 off) __ksym; -+#define bpf_rbtree_add(head, node, less) bpf_rbtree_add_impl(head, node, less, NULL, 0) -+ -+struct bpf_rb_node *bpf_rbtree_first(struct bpf_rb_root *root) __ksym; -+ -+void *bpf_refcount_acquire_impl(void *kptr, void *meta) __ksym; -+#define bpf_refcount_acquire(kptr) bpf_refcount_acquire_impl(kptr, NULL) -+ -+/* task */ -+struct task_struct *bpf_task_from_pid(s32 pid) __ksym; -+struct task_struct *bpf_task_acquire(struct task_struct *p) __ksym; -+void bpf_task_release(struct task_struct *p) __ksym; -+ -+/* cgroup */ -+struct cgroup *bpf_cgroup_ancestor(struct cgroup *cgrp, int level) __ksym; -+void bpf_cgroup_release(struct cgroup *cgrp) __ksym; -+struct cgroup *bpf_cgroup_from_id(u64 cgid) __ksym; -+ -+/* cpumask */ -+struct bpf_cpumask *bpf_cpumask_create(void) __ksym; -+struct bpf_cpumask *bpf_cpumask_acquire(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_release(struct bpf_cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_first_zero(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_cpu(u32 cpu, const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_set_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_test_and_clear_cpu(u32 cpu, struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_setall(struct bpf_cpumask *cpumask) __ksym; -+void bpf_cpumask_clear(struct bpf_cpumask *cpumask) __ksym; -+bool bpf_cpumask_and(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_or(struct bpf_cpumask *dst, const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+void bpf_cpumask_xor(struct bpf_cpumask *dst, const 
struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+bool bpf_cpumask_equal(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_intersects(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_subset(const struct cpumask *src1, const struct cpumask *src2) __ksym; -+bool bpf_cpumask_empty(const struct cpumask *cpumask) __ksym; -+bool bpf_cpumask_full(const struct cpumask *cpumask) __ksym; -+void bpf_cpumask_copy(struct bpf_cpumask *dst, const struct cpumask *src) __ksym; -+u32 bpf_cpumask_any_distribute(const struct cpumask *cpumask) __ksym; -+u32 bpf_cpumask_any_and_distribute(const struct cpumask *src1, -+ const struct cpumask *src2) __ksym; -+ -+/* rcu */ -+void bpf_rcu_read_lock(void) __ksym; -+void bpf_rcu_read_unlock(void) __ksym; -+ -+#include "compat.bpf.h" -+ -+#endif /* __SCX_COMMON_BPF_H */ -diff --git a/tools/sched_ext/include/scx/common.h b/tools/sched_ext/include/scx/common.h -new file mode 100644 -index 000000000000..8d5a6775f64d ---- /dev/null -+++ b/tools/sched_ext/include/scx/common.h -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#ifndef __SCHED_EXT_COMMON_H -+#define __SCHED_EXT_COMMON_H -+ -+#ifdef __KERNEL__ -+#error "Should not be included by BPF programs" -+#endif -+ -+#include -+#include -+#include -+#include -+#include -+ -+typedef uint8_t u8; -+typedef uint16_t u16; -+typedef uint32_t u32; -+typedef uint64_t u64; -+typedef int8_t s8; -+typedef int16_t s16; -+typedef int32_t s32; -+typedef int64_t s64; -+ -+#define SCX_BUG(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "%s:%d [scx panic]: %s\n", __FILE__, __LINE__, \ -+ strerror(errno)); \ -+ fprintf(stderr, __fmt __VA_OPT__(,) __VA_ARGS__); \ -+ fprintf(stderr, "\n"); \ -+ \ -+ exit(EXIT_FAILURE); \ -+ } while (0) -+ -+#define SCX_BUG_ON(__cond, __fmt, ...) \ -+ do { \ -+ if (__cond) \ -+ SCX_BUG((__fmt) __VA_OPT__(,) __VA_ARGS__); \ -+ } while (0) -+ -+/** -+ * RESIZE_ARRAY - Convenience macro for resizing a BPF array -+ * @elfsec: the data section of the BPF program in which to the array exists -+ * @arr: the name of the array -+ * @n: the desired array element count -+ * -+ * For BPF arrays declared with RESIZABLE_ARRAY(), this macro performs two -+ * operations. It resizes the map which corresponds to the custom data -+ * section that contains the target array. As a side effect, the BTF info for -+ * the array is adjusted so that the array length is sized to cover the new -+ * data section size. The second operation is reassigning the skeleton pointer -+ * for that custom data section so that it points to the newly memory mapped -+ * region. -+ */ -+#define RESIZE_ARRAY(elfsec, arr, n) \ -+ do { \ -+ size_t __sz; \ -+ bpf_map__set_value_size(skel->maps.elfsec##_##arr, \ -+ sizeof(skel->elfsec##_##arr->arr[0]) * (n)); \ -+ skel->elfsec##_##arr = \ -+ bpf_map__initial_value(skel->maps.elfsec##_##arr, &__sz); \ -+ } while (0) -+ -+#include "user_exit_info.h" -+#include "compat.h" -+ -+#endif /* __SCHED_EXT_COMMON_H */ -diff --git a/tools/sched_ext/include/scx/compat.bpf.h b/tools/sched_ext/include/scx/compat.bpf.h -new file mode 100644 -index 000000000000..914baac2e965 ---- /dev/null -+++ b/tools/sched_ext/include/scx/compat.bpf.h -@@ -0,0 +1,120 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#ifndef __SCX_COMPAT_BPF_H -+#define __SCX_COMPAT_BPF_H -+ -+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ -+({ \ -+ __type __ret = 0; \ -+ if (bpf_core_enum_value_exists(__type, __ent)) \ -+ __ret = __ent; \ -+ __ret; \ -+}) -+ -+/* -+ * %SCX_KICK_IDLE is a later addition. To support both before and after, use -+ * %__COMPAT_SCX_KICK_IDLE which becomes 0 on kernels which don't support it. -+ * Users can use %SCX_KICK_IDLE directly in the future. -+ */ -+#define __COMPAT_SCX_KICK_IDLE \ -+ __COMPAT_ENUM_OR_ZERO(enum scx_kick_flags, SCX_KICK_IDLE) -+ -+/* -+ * scx_switch_all() was replaced by %SCX_OPS_SWITCH_PARTIAL. See -+ * %__COMPAT_SCX_OPS_SWITCH_PARTIAL in compat.h. This can be dropped in the -+ * future. -+ */ -+void scx_bpf_switch_all(void) __ksym __weak; -+ -+static inline void __COMPAT_scx_bpf_switch_all(void) -+{ -+ if (!bpf_core_enum_value_exists(enum scx_ops_flags, SCX_OPS_SWITCH_PARTIAL)) -+ scx_bpf_switch_all(); -+} -+ -+/* -+ * scx_bpf_exit() is a new addition. Fall back to scx_bpf_error() if -+ * unavailable. Users can use scx_bpf_exit() directly in the future. -+ */ -+#define __COMPAT_scx_bpf_exit(code, fmt, args...) \ -+({ \ -+ if (bpf_ksym_exists(scx_bpf_exit_bstr)) \ -+ scx_bpf_exit((code), fmt, ##args); \ -+ else \ -+ scx_bpf_error(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_dump() is a new addition. Ignore if unavailable. Users can use -+ * scx_bpf_dump() directly in the future. -+ */ -+#define __COMPAT_scx_bpf_dump(fmt, args...) \ -+({ \ -+ if (bpf_ksym_exists(scx_bpf_dump_bstr)) \ -+ scx_bpf_dump(fmt, ##args); \ -+}) -+ -+/* -+ * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. No good -+ * way to noop these kfuncs. Provide a test macro. Users can assume existence in -+ * the future. -+ */ -+#define __COMPAT_HAS_CPUMASKS \ -+ bpf_ksym_exists(scx_bpf_nr_cpu_ids) -+ -+/* -+ * cpuperf is new. The followings become noop on older kernels. Callers can be -+ * updated to call cpuperf kfuncs directly in the future. -+ */ -+static inline u32 __COMPAT_scx_bpf_cpuperf_cap(s32 cpu) -+{ -+ if (bpf_ksym_exists(scx_bpf_cpuperf_cap)) -+ return scx_bpf_cpuperf_cap(cpu); -+ else -+ return 1024; -+} -+ -+static inline u32 __COMPAT_scx_bpf_cpuperf_cur(s32 cpu) -+{ -+ if (bpf_ksym_exists(scx_bpf_cpuperf_cur)) -+ return scx_bpf_cpuperf_cur(cpu); -+ else -+ return 1024; -+} -+ -+static inline void __COMPAT_scx_bpf_cpuperf_set(s32 cpu, u32 perf) -+{ -+ if (bpf_ksym_exists(scx_bpf_cpuperf_set)) -+ return scx_bpf_cpuperf_set(cpu, perf); -+} -+ -+/* -+ * Iteration and scx_bpf_consume_task() are new. The following become noop on -+ * older kernels. The users can switch to bpf_for_each(scx_dsq) and directly -+ * call scx_bpf_consume_task() in the future. -+ */ -+#define __COMPAT_DSQ_FOR_EACH(p, dsq_id, flags) \ -+ if (bpf_ksym_exists(bpf_iter_scx_dsq_new)) \ -+ bpf_for_each(scx_dsq, (p), (dsq_id), (flags)) -+ -+static inline bool __COMPAT_scx_bpf_consume_task(struct bpf_iter_scx_dsq *it, -+ struct task_struct *p) -+{ -+ return false; -+} -+ -+/* -+ * Define sched_ext_ops. This may be expanded to define multiple variants for -+ * backward compatibility. See compat.h::SCX_OPS_LOAD/ATTACH(). -+ */ -+#define SCX_OPS_DEFINE(__name, ...) 
\ -+ SEC(".struct_ops.link") \ -+ struct sched_ext_ops __name = { \ -+ __VA_ARGS__, \ -+ }; -+ -+#endif /* __SCX_COMPAT_BPF_H */ -diff --git a/tools/sched_ext/include/scx/compat.h b/tools/sched_ext/include/scx/compat.h -new file mode 100644 -index 000000000000..47ec920f8776 ---- /dev/null -+++ b/tools/sched_ext/include/scx/compat.h -@@ -0,0 +1,208 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#ifndef __SCX_COMPAT_H -+#define __SCX_COMPAT_H -+ -+#include -+#include -+#include -+#include -+ -+struct btf *__COMPAT_vmlinux_btf __attribute__((weak)); -+ -+static inline void __COMPAT_load_vmlinux_btf(void) -+{ -+ if (!__COMPAT_vmlinux_btf) { -+ __COMPAT_vmlinux_btf = btf__load_vmlinux_btf(); -+ SCX_BUG_ON(!__COMPAT_vmlinux_btf, "btf__load_vmlinux_btf()"); -+ } -+} -+ -+static inline bool __COMPAT_read_enum(const char *type, const char *name, u64 *v) -+{ -+ const struct btf_type *t; -+ const char *n; -+ s32 tid; -+ int i; -+ -+ __COMPAT_load_vmlinux_btf(); -+ -+ tid = btf__find_by_name(__COMPAT_vmlinux_btf, type); -+ if (tid < 0) -+ return false; -+ -+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); -+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); -+ -+ if (btf_is_enum(t)) { -+ struct btf_enum *e = btf_enum(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, name)) { -+ *v = e[i].val; -+ return true; -+ } -+ } -+ } else if (btf_is_enum64(t)) { -+ struct btf_enum64 *e = btf_enum64(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, e[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, name)) { -+ *v = btf_enum64_value(&e[i]); -+ return true; -+ } -+ } -+ } -+ -+ return false; -+} -+ -+#define __COMPAT_ENUM_OR_ZERO(__type, __ent) \ -+({ \ -+ u64 __val = 0; \ -+ __COMPAT_read_enum(__type, __ent, &__val); \ -+ __val; \ -+}) -+ -+static inline bool __COMPAT_has_ksym(const char *ksym) -+{ -+ __COMPAT_load_vmlinux_btf(); -+ return btf__find_by_name(__COMPAT_vmlinux_btf, ksym) >= 0; -+} -+ -+static inline bool __COMPAT_struct_has_field(const char *type, const char *field) -+{ -+ const struct btf_type *t; -+ const struct btf_member *m; -+ const char *n; -+ s32 tid; -+ int i; -+ -+ __COMPAT_load_vmlinux_btf(); -+ tid = btf__find_by_name_kind(__COMPAT_vmlinux_btf, type, BTF_KIND_STRUCT); -+ if (tid < 0) -+ return false; -+ -+ t = btf__type_by_id(__COMPAT_vmlinux_btf, tid); -+ SCX_BUG_ON(!t, "btf__type_by_id(%d)", tid); -+ -+ m = btf_members(t); -+ -+ for (i = 0; i < BTF_INFO_VLEN(t->info); i++) { -+ n = btf__name_by_offset(__COMPAT_vmlinux_btf, m[i].name_off); -+ SCX_BUG_ON(!n, "btf__name_by_offset()"); -+ if (!strcmp(n, field)) -+ return true; -+ } -+ -+ return false; -+} -+ -+/* -+ * An ops flag, %SCX_OPS_SWITCH_PARTIAL, replaced scx_bpf_switch_all() which had -+ * to be called from ops.init(). To support both before and after, use both -+ * %__COMPAT_SCX_OPS_SWITCH_PARTIAL and %__COMPAT_scx_bpf_switch_all() defined -+ * in compat.bpf.h. Users can switch to directly using %SCX_OPS_SWITCH_PARTIAL -+ * in the future. -+ */ -+#define __COMPAT_SCX_OPS_SWITCH_PARTIAL \ -+ __COMPAT_ENUM_OR_ZERO("scx_ops_flags", "SCX_OPS_SWITCH_PARTIAL") -+ -+/* -+ * scx_bpf_nr_cpu_ids(), scx_bpf_get_possible/online_cpumask() are new. 
Users -+ * will be able to assume existence in the future. -+ */ -+#define __COMPAT_HAS_CPUMASKS \ -+ __COMPAT_has_ksym("scx_bpf_nr_cpu_ids") -+ -+/* -+ * DSQ iterator is new. Users will be able to assume existence in the future. -+ */ -+#define __COMPAT_HAS_DSQ_ITER \ -+ __COMPAT_has_ksym("bpf_iter_scx_dsq_new") -+ -+static inline long scx_hotplug_seq(void) -+{ -+ int fd; -+ char buf[32]; -+ ssize_t len; -+ long val; -+ -+ fd = open("/sys/kernel/sched_ext/hotplug_seq", O_RDONLY); -+ if (fd < 0) -+ return -ENOENT; -+ -+ len = read(fd, buf, sizeof(buf) - 1); -+ SCX_BUG_ON(len <= 0, "read failed (%ld)", len); -+ buf[len] = 0; -+ close(fd); -+ -+ val = strtoul(buf, NULL, 10); -+ SCX_BUG_ON(val < 0, "invalid num hotplug events: %lu", val); -+ -+ return val; -+} -+ -+/* -+ * struct sched_ext_ops can change over time. If compat.bpf.h::SCX_OPS_DEFINE() -+ * is used to define ops and compat.h::SCX_OPS_LOAD/ATTACH() are used to load -+ * and attach it, backward compatibility is automatically maintained where -+ * reasonable. -+ * -+ * - ops.tick(): Ignored on older kernels with a warning. -+ * - ops.dump*(): Ignored on older kernels with a warning. -+ * - ops.exit_dump_len: Cleared to zero on older kernels with a warning. -+ * - ops.hotplug_seq: Ignored on older kernels. -+ */ -+#define SCX_OPS_OPEN(__ops_name, __scx_name) ({ \ -+ struct __scx_name *__skel; \ -+ \ -+ __skel = __scx_name##__open(); \ -+ SCX_BUG_ON(!__skel, "Could not open " #__scx_name); \ -+ \ -+ if (__COMPAT_struct_has_field("sched_ext_ops", "hotplug_seq")) \ -+ __skel->struct_ops.__ops_name->hotplug_seq = scx_hotplug_seq(); \ -+ __skel; \ -+}) -+ -+#define SCX_OPS_LOAD(__skel, __ops_name, __scx_name, __uei_name) ({ \ -+ UEI_SET_SIZE(__skel, __ops_name, __uei_name); \ -+ if (!__COMPAT_struct_has_field("sched_ext_ops", "exit_dump_len") && \ -+ (__skel)->struct_ops.__ops_name->exit_dump_len) { \ -+ fprintf(stderr, "WARNING: kernel doesn't support setting exit dump len\n"); \ -+ (__skel)->struct_ops.__ops_name->exit_dump_len = 0; \ -+ } \ -+ if (!__COMPAT_struct_has_field("sched_ext_ops", "tick") && \ -+ (__skel)->struct_ops.__ops_name->tick) { \ -+ fprintf(stderr, "WARNING: kernel doesn't support ops.tick()\n"); \ -+ (__skel)->struct_ops.__ops_name->tick = NULL; \ -+ } \ -+ if (!__COMPAT_struct_has_field("sched_ext_ops", "dump") && \ -+ ((__skel)->struct_ops.__ops_name->dump || \ -+ (__skel)->struct_ops.__ops_name->dump_cpu || \ -+ (__skel)->struct_ops.__ops_name->dump_task)) { \ -+ fprintf(stderr, "WARNING: kernel doesn't support ops.dump*()\n"); \ -+ (__skel)->struct_ops.__ops_name->dump = NULL; \ -+ (__skel)->struct_ops.__ops_name->dump_cpu = NULL; \ -+ (__skel)->struct_ops.__ops_name->dump_task = NULL; \ -+ } \ -+ SCX_BUG_ON(__scx_name##__load((__skel)), "Failed to load skel"); \ -+}) -+ -+#define SCX_OPS_ATTACH(__skel, __ops_name, __scx_name) ({ \ -+ struct bpf_link *__link; \ -+ SCX_BUG_ON(__scx_name##__attach((__skel)), "Failed to attach skel"); \ -+ __link = bpf_map__attach_struct_ops((__skel)->maps.__ops_name); \ -+ SCX_BUG_ON(!__link, "Failed to attach struct_ops"); \ -+ __link; \ -+}) -+ -+#endif /* __SCX_COMPAT_H */ -diff --git a/tools/sched_ext/include/scx/user_exit_info.h b/tools/sched_ext/include/scx/user_exit_info.h -new file mode 100644 -index 000000000000..2d86d01a9575 ---- /dev/null -+++ b/tools/sched_ext/include/scx/user_exit_info.h -@@ -0,0 +1,111 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Define struct user_exit_info which is shared between BPF and userspace parts -+ * to communicate exit status and 
other information. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#ifndef __USER_EXIT_INFO_H -+#define __USER_EXIT_INFO_H -+ -+enum uei_sizes { -+ UEI_REASON_LEN = 128, -+ UEI_MSG_LEN = 1024, -+ UEI_DUMP_DFL_LEN = 32768, -+}; -+ -+struct user_exit_info { -+ int kind; -+ s64 exit_code; -+ char reason[UEI_REASON_LEN]; -+ char msg[UEI_MSG_LEN]; -+}; -+ -+#ifdef __bpf__ -+ -+#include "vmlinux.h" -+#include -+ -+#define UEI_DEFINE(__name) \ -+ char RESIZABLE_ARRAY(data, __name##_dump); \ -+ const volatile u32 __name##_dump_len; \ -+ struct user_exit_info __name SEC(".data") -+ -+#define UEI_RECORD(__uei_name, __ei) ({ \ -+ bpf_probe_read_kernel_str(__uei_name.reason, \ -+ sizeof(__uei_name.reason), (__ei)->reason); \ -+ bpf_probe_read_kernel_str(__uei_name.msg, \ -+ sizeof(__uei_name.msg), (__ei)->msg); \ -+ bpf_probe_read_kernel_str(__uei_name##_dump, \ -+ __uei_name##_dump_len, (__ei)->dump); \ -+ if (bpf_core_field_exists((__ei)->exit_code)) \ -+ __uei_name.exit_code = (__ei)->exit_code; \ -+ /* use __sync to force memory barrier */ \ -+ __sync_val_compare_and_swap(&__uei_name.kind, __uei_name.kind, \ -+ (__ei)->kind); \ -+}) -+ -+#else /* !__bpf__ */ -+ -+#include -+#include -+ -+/* no need to call the following explicitly if SCX_OPS_LOAD() is used */ -+#define UEI_SET_SIZE(__skel, __ops_name, __uei_name) ({ \ -+ u32 __len = (__skel)->struct_ops.__ops_name->exit_dump_len ?: UEI_DUMP_DFL_LEN; \ -+ (__skel)->rodata->__uei_name##_dump_len = __len; \ -+ RESIZE_ARRAY(data, __uei_name##_dump, __len); \ -+}) -+ -+#define UEI_EXITED(__skel, __uei_name) ({ \ -+ /* use __sync to force memory barrier */ \ -+ __sync_val_compare_and_swap(&(__skel)->data->__uei_name.kind, -1, -1); \ -+}) -+ -+#define UEI_REPORT(__skel, __uei_name) ({ \ -+ struct user_exit_info *__uei = &(__skel)->data->__uei_name; \ -+ char *__uei_dump = (__skel)->data_##__uei_name##_dump->__uei_name##_dump; \ -+ if (__uei_dump[0] != '\0') { \ -+ fputs("\nDEBUG DUMP\n", stderr); \ -+ fputs("================================================================================\n\n", stderr); \ -+ fputs(__uei_dump, stderr); \ -+ fputs("\n================================================================================\n\n", stderr); \ -+ } \ -+ fprintf(stderr, "EXIT: %s", __uei->reason); \ -+ if (__uei->msg[0] != '\0') \ -+ fprintf(stderr, " (%s)", __uei->msg); \ -+ fputs("\n", stderr); \ -+ __uei->exit_code; \ -+}) -+ -+/* -+ * We can't import vmlinux.h while compiling user C code. Let's duplicate -+ * scx_exit_code definition. -+ */ -+enum scx_exit_code { -+ /* Reasons */ -+ SCX_ECODE_RSN_HOTPLUG = 1LLU << 32, -+ -+ /* Actions */ -+ SCX_ECODE_ACT_RESTART = 1LLU << 48, -+}; -+ -+enum uei_ecode_mask { -+ UEI_ECODE_USER_MASK = ((1LLU << 32) - 1), -+ UEI_ECODE_SYS_RSN_MASK = ((1LLU << 16) - 1) << 32, -+ UEI_ECODE_SYS_ACT_MASK = ((1LLU << 16) - 1) << 48, -+}; -+ -+/* -+ * These macro interpret the ecode returned from UEI_REPORT(). 
-+ */ -+#define UEI_ECODE_USER(__ecode) ((__ecode) & UEI_ECODE_USER_MASK) -+#define UEI_ECODE_SYS_RSN(__ecode) ((__ecode) & UEI_ECODE_SYS_RSN_MASK) -+#define UEI_ECODE_SYS_ACT(__ecode) ((__ecode) & UEI_ECODE_SYS_ACT_MASK) -+ -+#define UEI_ECODE_RESTART(__ecode) (UEI_ECODE_SYS_ACT((__ecode)) == SCX_ECODE_ACT_RESTART) -+ -+#endif /* __bpf__ */ -+#endif /* __USER_EXIT_INFO_H */ -diff --git a/tools/sched_ext/scx_central.bpf.c b/tools/sched_ext/scx_central.bpf.c -new file mode 100644 -index 000000000000..b297ccbd70b4 ---- /dev/null -+++ b/tools/sched_ext/scx_central.bpf.c -@@ -0,0 +1,362 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A central FIFO sched_ext scheduler which demonstrates the followings: -+ * -+ * a. Making all scheduling decisions from one CPU: -+ * -+ * The central CPU is the only one making scheduling decisions. All other -+ * CPUs kick the central CPU when they run out of tasks to run. -+ * -+ * There is one global BPF queue and the central CPU schedules all CPUs by -+ * dispatching from the global queue to each CPU's local dsq from dispatch(). -+ * This isn't the most straightforward. e.g. It'd be easier to bounce -+ * through per-CPU BPF queues. The current design is chosen to maximally -+ * utilize and verify various SCX mechanisms such as LOCAL_ON dispatching. -+ * -+ * b. Tickless operation -+ * -+ * All tasks are dispatched with the infinite slice which allows stopping the -+ * ticks on CONFIG_NO_HZ_FULL kernels running with the proper nohz_full -+ * parameter. The tickless operation can be observed through -+ * /proc/interrupts. -+ * -+ * Periodic switching is enforced by a periodic timer checking all CPUs and -+ * preempting them as necessary. Unfortunately, BPF timer currently doesn't -+ * have a way to pin to a specific CPU, so the periodic timer isn't pinned to -+ * the central CPU. -+ * -+ * c. Preemption -+ * -+ * Kthreads are unconditionally queued to the head of a matching local dsq -+ * and dispatched with SCX_DSQ_PREEMPT. This ensures that a kthread is always -+ * prioritized over user threads, which is required for ensuring forward -+ * progress as e.g. the periodic timer may run on a ksoftirqd and if the -+ * ksoftirqd gets starved by a user thread, there may not be anything else to -+ * vacate that user thread. -+ * -+ * SCX_KICK_PREEMPT is used to trigger scheduling and CPUs to move to the -+ * next tasks. -+ * -+ * This scheduler is designed to maximize usage of various SCX mechanisms. A -+ * more practical implementation would likely put the scheduling loop outside -+ * the central CPU's dispatch() path and add some form of priority mechanism. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+enum { -+ FALLBACK_DSQ_ID = 0, -+ MS_TO_NS = 1000LLU * 1000, -+ TIMER_INTERVAL_NS = 1 * MS_TO_NS, -+}; -+ -+const volatile s32 central_cpu; -+const volatile u32 nr_cpu_ids = 1; /* !0 for veristat, set during init */ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+ -+bool timer_pinned = true; -+u64 nr_total, nr_locals, nr_queued, nr_lost_pids; -+u64 nr_timers, nr_dispatches, nr_mismatches, nr_retries; -+u64 nr_overflows; -+ -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, s32); -+} central_q SEC(".maps"); -+ -+/* can't use percpu map due to bad lookups */ -+bool RESIZABLE_ARRAY(data, cpu_gimme_task); -+u64 RESIZABLE_ARRAY(data, cpu_started_at); -+ -+struct central_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct central_timer); -+} central_timer SEC(".maps"); -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(central_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* -+ * Steer wakeups to the central CPU as much as possible to avoid -+ * disturbing other CPUs. It's safe to blindly return the central cpu as -+ * select_cpu() is a hint and if @p can't be on it, the kernel will -+ * automatically pick a fallback CPU. -+ */ -+ return central_cpu; -+} -+ -+void BPF_STRUCT_OPS(central_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ __sync_fetch_and_add(&nr_total, 1); -+ -+ /* -+ * Push per-cpu kthreads at the head of local dsq's and preempt the -+ * corresponding CPU. This ensures that e.g. ksoftirqd isn't blocked -+ * behind other threads which is necessary for forward progress -+ * guarantee as we depend on the BPF timer which may run from ksoftirqd. -+ */ -+ if ((p->flags & PF_KTHREAD) && p->nr_cpus_allowed == 1) { -+ __sync_fetch_and_add(&nr_locals, 1); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_INF, -+ enq_flags | SCX_ENQ_PREEMPT); -+ return; -+ } -+ -+ if (bpf_map_push_elem(¢ral_q, &pid, 0)) { -+ __sync_fetch_and_add(&nr_overflows, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_queued, 1); -+ -+ if (!scx_bpf_task_running(p)) -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+} -+ -+static bool dispatch_to_cpu(s32 cpu) -+{ -+ struct task_struct *p; -+ s32 pid; -+ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(¢ral_q, &pid)) -+ break; -+ -+ __sync_fetch_and_sub(&nr_queued, 1); -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) { -+ __sync_fetch_and_add(&nr_lost_pids, 1); -+ continue; -+ } -+ -+ /* -+ * If we can't run the task at the top, do the dumb thing and -+ * bounce it to the fallback dsq. -+ */ -+ if (!bpf_cpumask_test_cpu(cpu, p->cpus_ptr)) { -+ __sync_fetch_and_add(&nr_mismatches, 1); -+ scx_bpf_dispatch(p, FALLBACK_DSQ_ID, SCX_SLICE_INF, 0); -+ bpf_task_release(p); -+ /* -+ * We might run out of dispatch buffer slots if we continue dispatching -+ * to the fallback DSQ, without dispatching to the local DSQ of the -+ * target CPU. In such a case, break the loop now as will fail the -+ * next dispatch operation. 
-+ */ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ continue; -+ } -+ -+ /* dispatch to local and mark that @cpu doesn't need more */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | cpu, SCX_SLICE_INF, 0); -+ -+ if (cpu != central_cpu) -+ scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); -+ -+ bpf_task_release(p); -+ return true; -+ } -+ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(central_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ if (cpu == central_cpu) { -+ /* dispatch for all other CPUs first */ -+ __sync_fetch_and_add(&nr_dispatches, 1); -+ -+ bpf_for(cpu, 0, nr_cpu_ids) { -+ bool *gimme; -+ -+ if (!scx_bpf_dispatch_nr_slots()) -+ break; -+ -+ /* central's gimme is never set */ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme && !*gimme) -+ continue; -+ -+ if (dispatch_to_cpu(cpu)) -+ *gimme = false; -+ } -+ -+ /* -+ * Retry if we ran out of dispatch buffer slots as we might have -+ * skipped some CPUs and also need to dispatch for self. The ext -+ * core automatically retries if the local dsq is empty but we -+ * can't rely on that as we're dispatching for other CPUs too. -+ * Kick self explicitly to retry. -+ */ -+ if (!scx_bpf_dispatch_nr_slots()) { -+ __sync_fetch_and_add(&nr_retries, 1); -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ return; -+ } -+ -+ /* look for a task to run on the central CPU */ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ dispatch_to_cpu(central_cpu); -+ } else { -+ bool *gimme; -+ -+ if (scx_bpf_consume(FALLBACK_DSQ_ID)) -+ return; -+ -+ gimme = ARRAY_ELEM_PTR(cpu_gimme_task, cpu, nr_cpu_ids); -+ if (gimme) -+ *gimme = true; -+ -+ /* -+ * Force dispatch on the scheduling CPU so that it finds a task -+ * to run for us. -+ */ -+ scx_bpf_kick_cpu(central_cpu, SCX_KICK_PREEMPT); -+ } -+} -+ -+void BPF_STRUCT_OPS(central_running, struct task_struct *p) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = bpf_ktime_get_ns() ?: 1; /* 0 indicates idle */ -+} -+ -+void BPF_STRUCT_OPS(central_stopping, struct task_struct *p, bool runnable) -+{ -+ s32 cpu = scx_bpf_task_cpu(p); -+ u64 *started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at) -+ *started_at = 0; -+} -+ -+static int central_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ u64 now = bpf_ktime_get_ns(); -+ u64 nr_to_kick = nr_queued; -+ s32 i, curr_cpu; -+ -+ curr_cpu = bpf_get_smp_processor_id(); -+ if (timer_pinned && (curr_cpu != central_cpu)) { -+ scx_bpf_error("Central timer ran on CPU %d, not central CPU %d", -+ curr_cpu, central_cpu); -+ return 0; -+ } -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ s32 cpu = (nr_timers + i) % nr_cpu_ids; -+ u64 *started_at; -+ -+ if (cpu == central_cpu) -+ continue; -+ -+ /* kick iff the current one exhausted its slice */ -+ started_at = ARRAY_ELEM_PTR(cpu_started_at, cpu, nr_cpu_ids); -+ if (started_at && *started_at && -+ vtime_before(now, *started_at + slice_ns)) -+ continue; -+ -+ /* and there's something pending */ -+ if (scx_bpf_dsq_nr_queued(FALLBACK_DSQ_ID) || -+ scx_bpf_dsq_nr_queued(SCX_DSQ_LOCAL_ON | cpu)) -+ ; -+ else if (nr_to_kick) -+ nr_to_kick--; -+ else -+ continue; -+ -+ scx_bpf_kick_cpu(cpu, SCX_KICK_PREEMPT); -+ } -+ -+ bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); -+ __sync_fetch_and_add(&nr_timers, 1); -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(central_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ int ret; -+ -+ __COMPAT_scx_bpf_switch_all(); -+ 
ret = scx_bpf_create_dsq(FALLBACK_DSQ_ID, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ if (bpf_get_smp_processor_id() != central_cpu) { -+ scx_bpf_error("init from non-central CPU"); -+ return -EINVAL; -+ } -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, central_timerfn); -+ -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, BPF_F_TIMER_CPU_PIN); -+ /* -+ * BPF_F_TIMER_CPU_PIN is pretty new (>=6.7). If we're running in a -+ * kernel which doesn't have it, bpf_timer_start() will return -EINVAL. -+ * Retry without the PIN. This would be the perfect use case for -+ * bpf_core_enum_value_exists() but the enum type doesn't have a name -+ * and can't be used with bpf_core_enum_value_exists(). Oh well... -+ */ -+ if (ret == -EINVAL) { -+ timer_pinned = false; -+ ret = bpf_timer_start(timer, TIMER_INTERVAL_NS, 0); -+ } -+ if (ret) -+ scx_bpf_error("bpf_timer_start failed (%d)", ret); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(central_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(central_ops, -+ /* -+ * We are offloading all scheduling decisions to the central CPU -+ * and thus being the last task on a given CPU doesn't mean -+ * anything special. Enqueue the last tasks like any other tasks. -+ */ -+ .flags = SCX_OPS_ENQ_LAST, -+ -+ .select_cpu = (void *)central_select_cpu, -+ .enqueue = (void *)central_enqueue, -+ .dispatch = (void *)central_dispatch, -+ .running = (void *)central_running, -+ .stopping = (void *)central_stopping, -+ .init = (void *)central_init, -+ .exit = (void *)central_exit, -+ .name = "central"); -diff --git a/tools/sched_ext/scx_central.c b/tools/sched_ext/scx_central.c -new file mode 100644 -index 000000000000..1e0568624ccc ---- /dev/null -+++ b/tools/sched_ext/scx_central.c -@@ -0,0 +1,135 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#define _GNU_SOURCE -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_central.bpf.skel.h" -+ -+const char help_fmt[] = -+"A central FIFO sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-c CPU]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -c CPU Override the central CPU (default: 0)\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_central *skel; -+ struct bpf_link *link; -+ __u64 seq = 0, ecode; -+ __s32 opt; -+ cpu_set_t *cpuset; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(central_ops, scx_central); -+ -+ skel->rodata->central_cpu = 0; -+ skel->rodata->nr_cpu_ids = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:c:pvh")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'c': -+ skel->rodata->central_cpu = strtoul(optarg, NULL, 0); -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ /* Resize arrays so their element count is equal to cpu count. */ -+ RESIZE_ARRAY(data, cpu_gimme_task, skel->rodata->nr_cpu_ids); -+ RESIZE_ARRAY(data, cpu_started_at, skel->rodata->nr_cpu_ids); -+ -+ SCX_OPS_LOAD(skel, central_ops, scx_central, uei); -+ -+ /* -+ * Affinitize the loading thread to the central CPU, as: -+ * - That's where the BPF timer is first invoked in the BPF program. -+ * - We probably don't want this user space component to take up a core -+ * from a task that would benefit from avoiding preemption on one of -+ * the tickless cores. -+ * -+ * Until BPF supports pinning the timer, it's not guaranteed that it -+ * will always be invoked on the central CPU. In practice, this -+ * suffices the majority of the time. 
-+ */ -+ cpuset = CPU_ALLOC(skel->rodata->nr_cpu_ids); -+ SCX_BUG_ON(!cpuset, "Failed to allocate cpuset"); -+ CPU_ZERO(cpuset); -+ CPU_SET(skel->rodata->central_cpu, cpuset); -+ SCX_BUG_ON(sched_setaffinity(0, sizeof(cpuset), cpuset), -+ "Failed to affinitize to central CPU %d (max %d)", -+ skel->rodata->central_cpu, skel->rodata->nr_cpu_ids - 1); -+ CPU_FREE(cpuset); -+ -+ link = SCX_OPS_ATTACH(skel, central_ops, scx_central); -+ -+ if (!skel->data->timer_pinned) -+ printf("WARNING : BPF_F_TIMER_CPU_PIN not available, timer not pinned to central\n"); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ printf("[SEQ %llu]\n", seq++); -+ printf("total :%10" PRIu64 " local:%10" PRIu64 " queued:%10" PRIu64 " lost:%10" PRIu64 "\n", -+ skel->bss->nr_total, -+ skel->bss->nr_locals, -+ skel->bss->nr_queued, -+ skel->bss->nr_lost_pids); -+ printf("timer :%10" PRIu64 " dispatch:%10" PRIu64 " mismatch:%10" PRIu64 " retry:%10" PRIu64 "\n", -+ skel->bss->nr_timers, -+ skel->bss->nr_dispatches, -+ skel->bss->nr_mismatches, -+ skel->bss->nr_retries); -+ printf("overflow:%10" PRIu64 "\n", -+ skel->bss->nr_overflows); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_central__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.bpf.c b/tools/sched_ext/scx_flatcg.bpf.c -new file mode 100644 -index 000000000000..389bea204150 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.bpf.c -@@ -0,0 +1,939 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A demo sched_ext flattened cgroup hierarchy scheduler. It implements -+ * hierarchical weight-based cgroup CPU control by flattening the cgroup -+ * hierarchy into a single layer by compounding the active weight share at each -+ * level. Consider the following hierarchy with weights in parentheses: -+ * -+ * R + A (100) + B (100) -+ * | \ C (100) -+ * \ D (200) -+ * -+ * Ignoring the root and threaded cgroups, only B, C and D can contain tasks. -+ * Let's say all three have runnable tasks. The total share that each of these -+ * three cgroups is entitled to can be calculated by compounding its share at -+ * each level. -+ * -+ * For example, B is competing against C and in that competition its share is -+ * 100/(100+100) == 1/2. At its parent level, A is competing against D and A's -+ * share in that competition is 100/(200+100) == 1/3. B's eventual share in the -+ * system can be calculated by multiplying the two shares, 1/2 * 1/3 == 1/6. C's -+ * eventual shaer is the same at 1/6. D is only competing at the top level and -+ * its share is 200/(100+200) == 2/3. -+ * -+ * So, instead of hierarchically scheduling level-by-level, we can consider it -+ * as B, C and D competing each other with respective share of 1/6, 1/6 and 2/3 -+ * and keep updating the eventual shares as the cgroups' runnable states change. -+ * -+ * This flattening of hierarchy can bring a substantial performance gain when -+ * the cgroup hierarchy is nested multiple levels. in a simple benchmark using -+ * wrk[8] on apache serving a CGI script calculating sha1sum of a small file, it -+ * outperforms CFS by ~3% with CPU controller disabled and by ~10% with two -+ * apache instances competing with 2:1 weight ratio nested four level deep. -+ * -+ * However, the gain comes at the cost of not being able to properly handle -+ * thundering herd of cgroups. 
For example, if many cgroups which are nested -+ * behind a low priority parent cgroup wake up around the same time, they may be -+ * able to consume more CPU cycles than they are entitled to. In many use cases, -+ * this isn't a real concern especially given the performance gain. Also, there -+ * are ways to mitigate the problem further by e.g. introducing an extra -+ * scheduling layer on cgroup delegation boundaries. -+ * -+ * The scheduler first picks the cgroup to run and then schedule the tasks -+ * within by using nested weighted vtime scheduling by default. The -+ * cgroup-internal scheduling can be switched to FIFO with the -f option. -+ */ -+#include -+#include "scx_flatcg.h" -+ -+/* -+ * Maximum amount of retries to find a valid cgroup. -+ */ -+#define CGROUP_MAX_RETRIES 1024 -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u32 nr_cpus = 32; /* !0 for veristat, set during init */ -+const volatile u64 cgrp_slice_ns = SCX_SLICE_DFL; -+const volatile bool fifo_sched; -+ -+u64 cvtime_now; -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, u64); -+ __uint(max_entries, FCG_NR_STATS); -+} stats SEC(".maps"); -+ -+static void stat_inc(enum fcg_stat_idx idx) -+{ -+ u32 idx_v = idx; -+ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx_v); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+struct fcg_cpu_ctx { -+ u64 cur_cgid; -+ u64 cur_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __type(key, u32); -+ __type(value, struct fcg_cpu_ctx); -+ __uint(max_entries, 1); -+} cpu_ctx SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_CGRP_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_cgrp_ctx); -+} cgrp_ctx SEC(".maps"); -+ -+struct cgv_node { -+ struct bpf_rb_node rb_node; -+ __u64 cvtime; -+ __u64 cgid; -+ struct bpf_refcount refcount; -+}; -+ -+private(CGV_TREE) struct bpf_spin_lock cgv_tree_lock; -+private(CGV_TREE) struct bpf_rb_root cgv_tree __contains(cgv_node, rb_node); -+ -+struct cgv_node_stash { -+ struct cgv_node __kptr *node; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_HASH); -+ __uint(max_entries, 16384); -+ __type(key, __u64); -+ __type(value, struct cgv_node_stash); -+} cgv_node_stash SEC(".maps"); -+ -+struct fcg_task_ctx { -+ u64 bypassed_at; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct fcg_task_ctx); -+} task_ctx SEC(".maps"); -+ -+/* gets inc'd on weight tree changes to expire the cached hweights */ -+u64 hweight_gen = 1; -+ -+static u64 div_round_up(u64 dividend, u64 divisor) -+{ -+ return (dividend + divisor - 1) / divisor; -+} -+ -+static bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static bool cgv_node_less(struct bpf_rb_node *a, const struct bpf_rb_node *b) -+{ -+ struct cgv_node *cgc_a, *cgc_b; -+ -+ cgc_a = container_of(a, struct cgv_node, rb_node); -+ cgc_b = container_of(b, struct cgv_node, rb_node); -+ -+ return cgc_a->cvtime < cgc_b->cvtime; -+} -+ -+static struct fcg_cpu_ctx *find_cpu_ctx(void) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ u32 idx = 0; -+ -+ cpuc = bpf_map_lookup_elem(&cpu_ctx, &idx); -+ if (!cpuc) { -+ scx_bpf_error("cpu_ctx lookup failed"); -+ return NULL; -+ } -+ return cpuc; -+} -+ -+static struct fcg_cgrp_ctx *find_cgrp_ctx(struct cgroup *cgrp) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ scx_bpf_error("cgrp_ctx 
lookup failed for cgid %llu", cgrp->kn->id); -+ return NULL; -+ } -+ return cgc; -+} -+ -+static struct fcg_cgrp_ctx *find_ancestor_cgrp_ctx(struct cgroup *cgrp, int level) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ -+ cgrp = bpf_cgroup_ancestor(cgrp, level); -+ if (!cgrp) { -+ scx_bpf_error("ancestor cgroup lookup failed"); -+ return NULL; -+ } -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ scx_bpf_error("ancestor cgrp_ctx lookup failed"); -+ bpf_cgroup_release(cgrp); -+ return cgc; -+} -+ -+static void cgrp_refresh_hweight(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ int level; -+ -+ if (!cgc->nr_active) { -+ stat_inc(FCG_STAT_HWT_SKIP); -+ return; -+ } -+ -+ if (cgc->hweight_gen == hweight_gen) { -+ stat_inc(FCG_STAT_HWT_CACHE); -+ return; -+ } -+ -+ stat_inc(FCG_STAT_HWT_UPDATES); -+ bpf_for(level, 0, cgrp->level + 1) { -+ struct fcg_cgrp_ctx *cgc; -+ bool is_active; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ -+ if (!level) { -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ cgc->hweight_gen = hweight_gen; -+ } else { -+ struct fcg_cgrp_ctx *pcgc; -+ -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ -+ /* -+ * We can be oppotunistic here and not grab the -+ * cgv_tree_lock and deal with the occasional races. -+ * However, hweight updates are already cached and -+ * relatively low-frequency. Let's just do the -+ * straightforward thing. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ is_active = cgc->nr_active; -+ if (is_active) { -+ cgc->hweight_gen = pcgc->hweight_gen; -+ cgc->hweight = -+ div_round_up(pcgc->hweight * cgc->weight, -+ pcgc->child_weight_sum); -+ } -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!is_active) { -+ stat_inc(FCG_STAT_HWT_RACE); -+ break; -+ } -+ } -+ } -+} -+ -+static void cgrp_cap_budget(struct cgv_node *cgv_node, struct fcg_cgrp_ctx *cgc) -+{ -+ u64 delta, cvtime, max_budget; -+ -+ /* -+ * A node which is on the rbtree can't be pointed to from elsewhere yet -+ * and thus can't be updated and repositioned. Instead, we collect the -+ * vtime deltas separately and apply it asynchronously here. -+ */ -+ delta = cgc->cvtime_delta; -+ __sync_fetch_and_sub(&cgc->cvtime_delta, delta); -+ cvtime = cgv_node->cvtime + delta; -+ -+ /* -+ * Allow a cgroup to carry the maximum budget proportional to its -+ * hweight such that a full-hweight cgroup can immediately take up half -+ * of the CPUs at the most while staying at the front of the rbtree. 
-+ */ -+ max_budget = (cgrp_slice_ns * nr_cpus * cgc->hweight) / -+ (2 * FCG_HWEIGHT_ONE); -+ if (vtime_before(cvtime, cvtime_now - max_budget)) -+ cvtime = cvtime_now - max_budget; -+ -+ cgv_node->cvtime = cvtime; -+} -+ -+static void cgrp_enqueued(struct cgroup *cgrp, struct fcg_cgrp_ctx *cgc) -+{ -+ struct cgv_node_stash *stash; -+ struct cgv_node *cgv_node; -+ u64 cgid = cgrp->kn->id; -+ -+ /* paired with cmpxchg in try_pick_next_cgroup() */ -+ if (__sync_val_compare_and_swap(&cgc->queued, 0, 1)) { -+ stat_inc(FCG_STAT_ENQ_SKIP); -+ return; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash || !stash->node) { -+ scx_bpf_error("cgv_node lookup failed for cgid %llu", cgid); -+ return; -+ } -+ -+ cgv_node = bpf_refcount_acquire(stash->node); -+ if (!cgv_node) { -+ /* -+ * Node never leaves cgv_node_stash, this should only happen if -+ * fcg_cgroup_exit deletes the stashed node -+ */ -+ stat_inc(FCG_STAT_ENQ_RACE); -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static void set_bypassed_at(struct task_struct *p, struct fcg_task_ctx *taskc) -+{ -+ /* -+ * Tell fcg_stopping() that this bypassed the regular scheduling path -+ * and should be force charged to the cgroup. 0 is used to indicate that -+ * the task isn't bypassing, so if the current runtime is 0, go back by -+ * one nanosecond. -+ */ -+ taskc->bypassed_at = p->se.sum_exec_runtime ?: (u64)-1; -+} -+ -+s32 BPF_STRUCT_OPS(fcg_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ bool is_idle = false; -+ s32 cpu; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return cpu; -+ } -+ -+ /* -+ * If select_cpu_dfl() is recommending local enqueue, the target CPU is -+ * idle. Follow it and charge the cgroup later in fcg_stopping() after -+ * the fact. -+ */ -+ if (is_idle) { -+ set_bypassed_at(p, taskc); -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ } -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(fcg_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * Use the direct dispatching and force charging to deal with tasks with -+ * custom affinities so that we don't have to worry about per-cgroup -+ * dq's containing tasks that can't be executed from some CPUs. -+ */ -+ if (p->nr_cpus_allowed != nr_cpus) { -+ set_bypassed_at(p, taskc); -+ -+ /* -+ * The global dq is deprioritized as we don't want to let tasks -+ * to boost themselves by constraining its cpumask. The -+ * deprioritization is rather severe, so let's not apply that to -+ * per-cpu kernel threads. This is ham-fisted. We probably wanna -+ * implement per-cgroup fallback dq's instead so that we have -+ * more control over when tasks with custom cpumask get issued. 
-+ */ -+ if (p->nr_cpus_allowed == 1 && (p->flags & PF_KTHREAD)) { -+ stat_inc(FCG_STAT_LOCAL); -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, enq_flags); -+ } else { -+ stat_inc(FCG_STAT_GLOBAL); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+ } -+ return; -+ } -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ goto out_release; -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, cgrp->kn->id, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 tvtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(tvtime, cgc->tvtime_now - SCX_SLICE_DFL)) -+ tvtime = cgc->tvtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, cgrp->kn->id, SCX_SLICE_DFL, -+ tvtime, enq_flags); -+ } -+ -+ cgrp_enqueued(cgrp, cgc); -+out_release: -+ bpf_cgroup_release(cgrp); -+} -+ -+/* -+ * Walk the cgroup tree to update the active weight sums as tasks wake up and -+ * sleep. The weight sums are used as the base when calculating the proportion a -+ * given cgroup or task is entitled to at each level. -+ */ -+static void update_active_weight_sums(struct cgroup *cgrp, bool runnable) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ bool updated = false; -+ int idx; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ /* -+ * In most cases, a hot cgroup would have multiple threads going to -+ * sleep and waking up while the whole cgroup stays active. In leaf -+ * cgroups, ->nr_runnable which is updated with __sync operations gates -+ * ->nr_active updates, so that we don't have to grab the cgv_tree_lock -+ * repeatedly for a busy cgroup which is staying active. -+ */ -+ if (runnable) { -+ if (__sync_fetch_and_add(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_ACT); -+ } else { -+ if (__sync_sub_and_fetch(&cgc->nr_runnable, 1)) -+ return; -+ stat_inc(FCG_STAT_DEACT); -+ } -+ -+ /* -+ * If @cgrp is becoming runnable, its hweight should be refreshed after -+ * it's added to the weight tree so that enqueue has the up-to-date -+ * value. If @cgrp is becoming quiescent, the hweight should be -+ * refreshed before it's removed from the weight tree so that the usage -+ * charging which happens afterwards has access to the latest value. -+ */ -+ if (!runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ /* propagate upwards */ -+ bpf_for(idx, 0, cgrp->level) { -+ int level = cgrp->level - idx; -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ bool propagate = false; -+ -+ cgc = find_ancestor_cgrp_ctx(cgrp, level); -+ if (!cgc) -+ break; -+ if (level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, level - 1); -+ if (!pcgc) -+ break; -+ } -+ -+ /* -+ * We need the propagation protected by a lock to synchronize -+ * against weight changes. There's no reason to drop the lock at -+ * each level but bpf_spin_lock() doesn't want any function -+ * calls while locked. 
-+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ if (runnable) { -+ if (!cgc->nr_active++) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum += cgc->weight; -+ } -+ } -+ } else { -+ if (!--cgc->nr_active) { -+ updated = true; -+ if (pcgc) { -+ propagate = true; -+ pcgc->child_weight_sum -= cgc->weight; -+ } -+ } -+ } -+ -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!propagate) -+ break; -+ } -+ -+ if (updated) -+ __sync_fetch_and_add(&hweight_gen, 1); -+ -+ if (runnable) -+ cgrp_refresh_hweight(cgrp, cgc); -+} -+ -+void BPF_STRUCT_OPS(fcg_runnable, struct task_struct *p, u64 enq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, true); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_running, struct task_struct *p) -+{ -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ if (fifo_sched) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ /* -+ * @cgc->tvtime_now always progresses forward as tasks start -+ * executing. The test and update can be performed concurrently -+ * from multiple CPUs and thus racy. Any error should be -+ * contained and temporary. Let's just live with it. -+ */ -+ if (vtime_before(cgc->tvtime_now, p->scx.dsq_vtime)) -+ cgc->tvtime_now = p->scx.dsq_vtime; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_stopping, struct task_struct *p, bool runnable) -+{ -+ struct fcg_task_ctx *taskc; -+ struct cgroup *cgrp; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. 
-+ */ -+ if (!fifo_sched) -+ p->scx.dsq_vtime += -+ (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, 0); -+ if (!taskc) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (!taskc->bypassed_at) -+ return; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ cgc = find_cgrp_ctx(cgrp); -+ if (cgc) { -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ p->se.sum_exec_runtime - taskc->bypassed_at); -+ taskc->bypassed_at = 0; -+ } -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_quiescent, struct task_struct *p, u64 deq_flags) -+{ -+ struct cgroup *cgrp; -+ -+ cgrp = scx_bpf_task_cgroup(p); -+ update_active_weight_sums(cgrp, false); -+ bpf_cgroup_release(cgrp); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{ -+ struct fcg_cgrp_ctx *cgc, *pcgc = NULL; -+ -+ cgc = find_cgrp_ctx(cgrp); -+ if (!cgc) -+ return; -+ -+ if (cgrp->level) { -+ pcgc = find_ancestor_cgrp_ctx(cgrp, cgrp->level - 1); -+ if (!pcgc) -+ return; -+ } -+ -+ bpf_spin_lock(&cgv_tree_lock); -+ if (pcgc && cgc->nr_active) -+ pcgc->child_weight_sum += (s64)weight - cgc->weight; -+ cgc->weight = weight; -+ bpf_spin_unlock(&cgv_tree_lock); -+} -+ -+static bool try_pick_next_cgroup(u64 *cgidp) -+{ -+ struct bpf_rb_node *rb_node; -+ struct cgv_node *cgv_node; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 cgid; -+ -+ /* pop the front cgroup and wind cvtime_now accordingly */ -+ bpf_spin_lock(&cgv_tree_lock); -+ -+ rb_node = bpf_rbtree_first(&cgv_tree); -+ if (!rb_node) { -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_NO_CGRP); -+ *cgidp = 0; -+ return true; -+ } -+ -+ rb_node = bpf_rbtree_remove(&cgv_tree, rb_node); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ if (!rb_node) { -+ /* -+ * This should never happen. bpf_rbtree_first() was called -+ * above while the tree lock was held, so the node should -+ * always be present. -+ */ -+ scx_bpf_error("node could not be removed"); -+ return true; -+ } -+ -+ cgv_node = container_of(rb_node, struct cgv_node, rb_node); -+ cgid = cgv_node->cgid; -+ -+ if (vtime_before(cvtime_now, cgv_node->cvtime)) -+ cvtime_now = cgv_node->cvtime; -+ -+ /* -+ * If lookup fails, the cgroup's gone. Free and move on. See -+ * fcg_cgroup_exit(). -+ */ -+ cgrp = bpf_cgroup_from_id(cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (!cgc) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_GONE); -+ goto out_free; -+ } -+ -+ if (!scx_bpf_consume(cgid)) { -+ bpf_cgroup_release(cgrp); -+ stat_inc(FCG_STAT_PNC_EMPTY); -+ goto out_stash; -+ } -+ -+ /* -+ * Successfully consumed from the cgroup. This will be our current -+ * cgroup for the new slice. Refresh its hweight. -+ */ -+ cgrp_refresh_hweight(cgrp, cgc); -+ -+ bpf_cgroup_release(cgrp); -+ -+ /* -+ * As the cgroup may have more tasks, add it back to the rbtree. Note -+ * that here we charge the full slice upfront and then exact later -+ * according to the actual consumption. This prevents lowpri thundering -+ * herd from saturating the machine. -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ cgv_node->cvtime += cgrp_slice_ns * FCG_HWEIGHT_ONE / (cgc->hweight ?: 1); -+ cgrp_cap_budget(cgv_node, cgc); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ -+ *cgidp = cgid; -+ stat_inc(FCG_STAT_PNC_NEXT); -+ return true; -+ -+out_stash: -+ /* -+ * Paired with cmpxchg in cgrp_enqueued(). 
If they see the following -+ * transition, they'll enqueue the cgroup. If they are earlier, we'll -+ * see their task in the dq below and requeue the cgroup. -+ */ -+ __sync_val_compare_and_swap(&cgc->queued, 1, 0); -+ -+ if (scx_bpf_dsq_nr_queued(cgid)) { -+ bpf_spin_lock(&cgv_tree_lock); -+ bpf_rbtree_add(&cgv_tree, &cgv_node->rb_node, cgv_node_less); -+ bpf_spin_unlock(&cgv_tree_lock); -+ stat_inc(FCG_STAT_PNC_RACE); -+ return false; -+ } -+ -+out_free: -+ bpf_obj_drop(cgv_node); -+ return false; -+} -+ -+void BPF_STRUCT_OPS(fcg_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct fcg_cpu_ctx *cpuc; -+ struct fcg_cgrp_ctx *cgc; -+ struct cgroup *cgrp; -+ u64 now = bpf_ktime_get_ns(); -+ bool picked_next = false; -+ -+ cpuc = find_cpu_ctx(); -+ if (!cpuc) -+ return; -+ -+ if (!cpuc->cur_cgid) -+ goto pick_next_cgroup; -+ -+ if (vtime_before(now, cpuc->cur_at + cgrp_slice_ns)) { -+ if (scx_bpf_consume(cpuc->cur_cgid)) { -+ stat_inc(FCG_STAT_CNS_KEEP); -+ return; -+ } -+ stat_inc(FCG_STAT_CNS_EMPTY); -+ } else { -+ stat_inc(FCG_STAT_CNS_EXPIRE); -+ } -+ -+ /* -+ * The current cgroup is expiring. It was already charged a full slice. -+ * Calculate the actual usage and accumulate the delta. -+ */ -+ cgrp = bpf_cgroup_from_id(cpuc->cur_cgid); -+ if (!cgrp) { -+ stat_inc(FCG_STAT_CNS_GONE); -+ goto pick_next_cgroup; -+ } -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, 0); -+ if (cgc) { -+ /* -+ * We want to update the vtime delta and then look for the next -+ * cgroup to execute but the latter needs to be done in a loop -+ * and we can't keep the lock held. Oh well... -+ */ -+ bpf_spin_lock(&cgv_tree_lock); -+ __sync_fetch_and_add(&cgc->cvtime_delta, -+ (cpuc->cur_at + cgrp_slice_ns - now) * -+ FCG_HWEIGHT_ONE / (cgc->hweight ?: 1)); -+ bpf_spin_unlock(&cgv_tree_lock); -+ } else { -+ stat_inc(FCG_STAT_CNS_GONE); -+ } -+ -+ bpf_cgroup_release(cgrp); -+ -+pick_next_cgroup: -+ cpuc->cur_at = now; -+ -+ if (scx_bpf_consume(SCX_DSQ_GLOBAL)) { -+ cpuc->cur_cgid = 0; -+ return; -+ } -+ -+ bpf_repeat(CGROUP_MAX_RETRIES) { -+ if (try_pick_next_cgroup(&cpuc->cur_cgid)) { -+ picked_next = true; -+ break; -+ } -+ } -+ -+ /* -+ * This only happens if try_pick_next_cgroup() races against enqueue -+ * path for more than CGROUP_MAX_RETRIES times, which is extremely -+ * unlikely and likely indicates an underlying bug. There shouldn't be -+ * any stall risk as the race is against enqueue. -+ */ -+ if (!picked_next) -+ stat_inc(FCG_STAT_PNC_FAIL); -+} -+ -+s32 BPF_STRUCT_OPS(fcg_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ struct fcg_task_ctx *taskc; -+ struct fcg_cgrp_ctx *cgc; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ taskc = bpf_task_storage_get(&task_ctx, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!taskc) -+ return -ENOMEM; -+ -+ taskc->bypassed_at = 0; -+ -+ if (!(cgc = find_cgrp_ctx(args->cgroup))) -+ return -ENOENT; -+ -+ p->scx.dsq_vtime = cgc->tvtime_now; -+ -+ return 0; -+} -+ -+int BPF_STRUCT_OPS_SLEEPABLE(fcg_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ struct fcg_cgrp_ctx *cgc; -+ struct cgv_node *cgv_node; -+ struct cgv_node_stash empty_stash = {}, *stash; -+ u64 cgid = cgrp->kn->id; -+ int ret; -+ -+ /* -+ * Technically incorrect as cgroup ID is full 64bit while dq ID is -+ * 63bit. Should not be a problem in practice and easy to spot in the -+ * unlikely case that it breaks. 
-+ */ -+ ret = scx_bpf_create_dsq(cgid, -1); -+ if (ret) -+ return ret; -+ -+ cgc = bpf_cgrp_storage_get(&cgrp_ctx, cgrp, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE); -+ if (!cgc) { -+ ret = -ENOMEM; -+ goto err_destroy_dsq; -+ } -+ -+ cgc->weight = args->weight; -+ cgc->hweight = FCG_HWEIGHT_ONE; -+ -+ ret = bpf_map_update_elem(&cgv_node_stash, &cgid, &empty_stash, -+ BPF_NOEXIST); -+ if (ret) { -+ if (ret != -ENOMEM) -+ scx_bpf_error("unexpected stash creation error (%d)", -+ ret); -+ goto err_destroy_dsq; -+ } -+ -+ stash = bpf_map_lookup_elem(&cgv_node_stash, &cgid); -+ if (!stash) { -+ scx_bpf_error("unexpected cgv_node stash lookup failure"); -+ ret = -ENOENT; -+ goto err_destroy_dsq; -+ } -+ -+ cgv_node = bpf_obj_new(struct cgv_node); -+ if (!cgv_node) { -+ ret = -ENOMEM; -+ goto err_del_cgv_node; -+ } -+ -+ cgv_node->cgid = cgid; -+ cgv_node->cvtime = cvtime_now; -+ -+ cgv_node = bpf_kptr_xchg(&stash->node, cgv_node); -+ if (cgv_node) { -+ scx_bpf_error("unexpected !NULL cgv_node stash"); -+ ret = -EBUSY; -+ goto err_drop; -+ } -+ -+ return 0; -+ -+err_drop: -+ bpf_obj_drop(cgv_node); -+err_del_cgv_node: -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+err_destroy_dsq: -+ scx_bpf_destroy_dsq(cgid); -+ return ret; -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_exit, struct cgroup *cgrp) -+{ -+ u64 cgid = cgrp->kn->id; -+ -+ /* -+ * For now, there's no way find and remove the cgv_node if it's on the -+ * cgv_tree. Let's drain them in the dispatch path as they get popped -+ * off the front of the tree. -+ */ -+ bpf_map_delete_elem(&cgv_node_stash, &cgid); -+ scx_bpf_destroy_dsq(cgid); -+} -+ -+void BPF_STRUCT_OPS(fcg_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ struct fcg_cgrp_ctx *from_cgc, *to_cgc; -+ s64 vtime_delta; -+ -+ /* find_cgrp_ctx() triggers scx_ops_error() on lookup failures */ -+ if (!(from_cgc = find_cgrp_ctx(from)) || !(to_cgc = find_cgrp_ctx(to))) -+ return; -+ -+ vtime_delta = p->scx.dsq_vtime - from_cgc->tvtime_now; -+ p->scx.dsq_vtime = to_cgc->tvtime_now + vtime_delta; -+} -+ -+void BPF_STRUCT_OPS(fcg_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(flatcg_ops, -+ .select_cpu = (void *)fcg_select_cpu, -+ .enqueue = (void *)fcg_enqueue, -+ .dispatch = (void *)fcg_dispatch, -+ .runnable = (void *)fcg_runnable, -+ .running = (void *)fcg_running, -+ .stopping = (void *)fcg_stopping, -+ .quiescent = (void *)fcg_quiescent, -+ .init_task = (void *)fcg_init_task, -+ .cgroup_set_weight = (void *)fcg_cgroup_set_weight, -+ .cgroup_init = (void *)fcg_cgroup_init, -+ .cgroup_exit = (void *)fcg_cgroup_exit, -+ .cgroup_move = (void *)fcg_cgroup_move, -+ .exit = (void *)fcg_exit, -+ .flags = SCX_OPS_CGROUP_KNOB_WEIGHT | SCX_OPS_ENQ_EXITING, -+ .name = "flatcg"); -diff --git a/tools/sched_ext/scx_flatcg.c b/tools/sched_ext/scx_flatcg.c -new file mode 100644 -index 000000000000..5d24ca9c29d9 ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.c -@@ -0,0 +1,233 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_flatcg.h" -+#include "scx_flatcg.bpf.skel.h" -+ -+#ifndef FILEID_KERNFS -+#define FILEID_KERNFS 0xfe -+#endif -+ -+const char help_fmt[] = -+"A flattened cgroup hierarchy sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-i INTERVAL] [-f] [-v]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -i INTERVAL Report interval\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+static float read_cpu_util(__u64 *last_sum, __u64 *last_idle) -+{ -+ FILE *fp; -+ char buf[4096]; -+ char *line, *cur = NULL, *tok; -+ __u64 sum = 0, idle = 0; -+ __u64 delta_sum, delta_idle; -+ int idx; -+ -+ fp = fopen("/proc/stat", "r"); -+ if (!fp) { -+ perror("fopen(\"/proc/stat\")"); -+ return 0.0; -+ } -+ -+ if (!fgets(buf, sizeof(buf), fp)) { -+ perror("fgets(\"/proc/stat\")"); -+ fclose(fp); -+ return 0.0; -+ } -+ fclose(fp); -+ -+ line = buf; -+ for (idx = 0; (tok = strtok_r(line, " \n", &cur)); idx++) { -+ char *endp = NULL; -+ __u64 v; -+ -+ if (idx == 0) { -+ line = NULL; -+ continue; -+ } -+ v = strtoull(tok, &endp, 0); -+ if (!endp || *endp != '\0') { -+ fprintf(stderr, "failed to parse %dth field of /proc/stat (\"%s\")\n", -+ idx, tok); -+ continue; -+ } -+ sum += v; -+ if (idx == 4) -+ idle = v; -+ } -+ -+ delta_sum = sum - *last_sum; -+ delta_idle = idle - *last_idle; -+ *last_sum = sum; -+ *last_idle = idle; -+ -+ return delta_sum ? 
(float)(delta_sum - delta_idle) / delta_sum : 0.0; -+} -+ -+static void fcg_read_stats(struct scx_flatcg *skel, __u64 *stats) -+{ -+ __u64 cnts[FCG_NR_STATS][skel->rodata->nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * FCG_NR_STATS); -+ -+ for (idx = 0; idx < FCG_NR_STATS; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < skel->rodata->nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_flatcg *skel; -+ struct bpf_link *link; -+ struct timespec intv_ts = { .tv_sec = 2, .tv_nsec = 0 }; -+ bool dump_cgrps = false; -+ __u64 last_cpu_sum = 0, last_cpu_idle = 0; -+ __u64 last_stats[FCG_NR_STATS] = {}; -+ unsigned long seq = 0; -+ __s32 opt; -+ __u64 ecode; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(flatcg_ops, scx_flatcg); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ -+ while ((opt = getopt(argc, argv, "s:i:dfvh")) != -1) { -+ double v; -+ -+ switch (opt) { -+ case 's': -+ v = strtod(optarg, NULL); -+ skel->rodata->cgrp_slice_ns = v * 1000; -+ break; -+ case 'i': -+ v = strtod(optarg, NULL); -+ intv_ts.tv_sec = v; -+ intv_ts.tv_nsec = (v - (float)intv_ts.tv_sec) * 1000000000; -+ break; -+ case 'd': -+ dump_cgrps = true; -+ break; -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ case 'h': -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ printf("slice=%.1lfms intv=%.1lfs dump_cgrps=%d", -+ (double)skel->rodata->cgrp_slice_ns / 1000000.0, -+ (double)intv_ts.tv_sec + (double)intv_ts.tv_nsec / 1000000000.0, -+ dump_cgrps); -+ -+ SCX_OPS_LOAD(skel, flatcg_ops, scx_flatcg, uei); -+ link = SCX_OPS_ATTACH(skel, flatcg_ops, scx_flatcg); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ __u64 acc_stats[FCG_NR_STATS]; -+ __u64 stats[FCG_NR_STATS]; -+ float cpu_util; -+ int i; -+ -+ cpu_util = read_cpu_util(&last_cpu_sum, &last_cpu_idle); -+ -+ fcg_read_stats(skel, acc_stats); -+ for (i = 0; i < FCG_NR_STATS; i++) -+ stats[i] = acc_stats[i] - last_stats[i]; -+ -+ memcpy(last_stats, acc_stats, sizeof(acc_stats)); -+ -+ printf("\n[SEQ %6lu cpu=%5.1lf hweight_gen=%" PRIu64 "]\n", -+ seq++, cpu_util * 100.0, skel->data->hweight_gen); -+ printf(" act:%6llu deact:%6llu global:%6llu local:%6llu\n", -+ stats[FCG_STAT_ACT], -+ stats[FCG_STAT_DEACT], -+ stats[FCG_STAT_GLOBAL], -+ stats[FCG_STAT_LOCAL]); -+ printf("HWT cache:%6llu update:%6llu skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_HWT_CACHE], -+ stats[FCG_STAT_HWT_UPDATES], -+ stats[FCG_STAT_HWT_SKIP], -+ stats[FCG_STAT_HWT_RACE]); -+ printf("ENQ skip:%6llu race:%6llu\n", -+ stats[FCG_STAT_ENQ_SKIP], -+ stats[FCG_STAT_ENQ_RACE]); -+ printf("CNS keep:%6llu expire:%6llu empty:%6llu gone:%6llu\n", -+ stats[FCG_STAT_CNS_KEEP], -+ stats[FCG_STAT_CNS_EXPIRE], -+ stats[FCG_STAT_CNS_EMPTY], -+ stats[FCG_STAT_CNS_GONE]); -+ printf("PNC next:%6llu empty:%6llu nocgrp:%6llu gone:%6llu race:%6llu fail:%6llu\n", -+ stats[FCG_STAT_PNC_NEXT], -+ stats[FCG_STAT_PNC_EMPTY], -+ stats[FCG_STAT_PNC_NO_CGRP], -+ stats[FCG_STAT_PNC_GONE], -+ stats[FCG_STAT_PNC_RACE], -+ stats[FCG_STAT_PNC_FAIL]); -+ printf("BAD remove:%6llu\n", -+ acc_stats[FCG_STAT_BAD_REMOVAL]); -+ fflush(stdout); -+ -+ nanosleep(&intv_ts, NULL); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = 
UEI_REPORT(skel, uei); -+ scx_flatcg__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/sched_ext/scx_flatcg.h b/tools/sched_ext/scx_flatcg.h -new file mode 100644 -index 000000000000..6f2ea50acb1c ---- /dev/null -+++ b/tools/sched_ext/scx_flatcg.h -@@ -0,0 +1,51 @@ -+#ifndef __SCX_EXAMPLE_FLATCG_H -+#define __SCX_EXAMPLE_FLATCG_H -+ -+enum { -+ FCG_HWEIGHT_ONE = 1LLU << 16, -+}; -+ -+enum fcg_stat_idx { -+ FCG_STAT_ACT, -+ FCG_STAT_DEACT, -+ FCG_STAT_LOCAL, -+ FCG_STAT_GLOBAL, -+ -+ FCG_STAT_HWT_UPDATES, -+ FCG_STAT_HWT_CACHE, -+ FCG_STAT_HWT_SKIP, -+ FCG_STAT_HWT_RACE, -+ -+ FCG_STAT_ENQ_SKIP, -+ FCG_STAT_ENQ_RACE, -+ -+ FCG_STAT_CNS_KEEP, -+ FCG_STAT_CNS_EXPIRE, -+ FCG_STAT_CNS_EMPTY, -+ FCG_STAT_CNS_GONE, -+ -+ FCG_STAT_PNC_NO_CGRP, -+ FCG_STAT_PNC_NEXT, -+ FCG_STAT_PNC_EMPTY, -+ FCG_STAT_PNC_GONE, -+ FCG_STAT_PNC_RACE, -+ FCG_STAT_PNC_FAIL, -+ -+ FCG_STAT_BAD_REMOVAL, -+ -+ FCG_NR_STATS, -+}; -+ -+struct fcg_cgrp_ctx { -+ u32 nr_active; -+ u32 nr_runnable; -+ u32 queued; -+ u32 weight; -+ u32 hweight; -+ u64 child_weight_sum; -+ u64 hweight_gen; -+ s64 cvtime_delta; -+ u64 tvtime_now; -+}; -+ -+#endif /* __SCX_EXAMPLE_FLATCG_H */ -diff --git a/tools/sched_ext/scx_qmap.bpf.c b/tools/sched_ext/scx_qmap.bpf.c -new file mode 100644 -index 000000000000..d74c5cf2a251 ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.bpf.c -@@ -0,0 +1,728 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple five-level FIFO queue scheduler. -+ * -+ * There are five FIFOs implemented using BPF_MAP_TYPE_QUEUE. A task gets -+ * assigned to one depending on its compound weight. Each CPU round robins -+ * through the FIFOs and dispatches more from FIFOs with higher indices - 1 from -+ * queue0, 2 from queue1, 4 from queue2 and so on. -+ * -+ * This scheduler demonstrates: -+ * -+ * - BPF-side queueing using PIDs. -+ * - Sleepable per-task storage allocation using ops.prep_enable(). -+ * - Using ops.cpu_release() to handle a higher priority scheduling class taking -+ * the CPU away. -+ * - Core-sched support. -+ * -+ * This scheduler is primarily for demonstration and testing of sched_ext -+ * features and unlikely to be useful for actual workloads. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+ -+enum consts { -+ ONE_SEC_IN_NS = 1000000000, -+ SHARED_DSQ = 0, -+}; -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile u64 slice_ns = SCX_SLICE_DFL; -+const volatile u32 stall_user_nth; -+const volatile u32 stall_kernel_nth; -+const volatile u32 dsp_inf_loop_after; -+const volatile u32 dsp_batch; -+const volatile bool print_shared_dsq; -+const volatile char exp_prefix[17]; -+const volatile s32 disallow_tgid; -+const volatile bool suppress_dump; -+const volatile bool switch_partial; -+ -+u32 test_error_cnt; -+ -+UEI_DEFINE(uei); -+ -+struct qmap { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 4096); -+ __type(value, u32); -+} queue0 SEC(".maps"), -+ queue1 SEC(".maps"), -+ queue2 SEC(".maps"), -+ queue3 SEC(".maps"), -+ queue4 SEC(".maps"); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY_OF_MAPS); -+ __uint(max_entries, 5); -+ __type(key, int); -+ __array(values, struct qmap); -+} queue_arr SEC(".maps") = { -+ .values = { -+ [0] = &queue0, -+ [1] = &queue1, -+ [2] = &queue2, -+ [3] = &queue3, -+ [4] = &queue4, -+ }, -+}; -+ -+/* -+ * If enabled, CPU performance target is set according to the queue index -+ * according to the following table. -+ */ -+static const u32 qidx_to_cpuperf_target[] = { -+ [0] = SCX_CPUPERF_ONE * 0 / 4, -+ [1] = SCX_CPUPERF_ONE * 1 / 4, -+ [2] = SCX_CPUPERF_ONE * 2 / 4, -+ [3] = SCX_CPUPERF_ONE * 3 / 4, -+ [4] = SCX_CPUPERF_ONE * 4 / 4, -+}; -+ -+/* -+ * Per-queue sequence numbers to implement core-sched ordering. -+ * -+ * Tail seq is assigned to each queued task and incremented. Head seq tracks the -+ * sequence number of the latest dispatched task. The distance between the a -+ * task's seq and the associated queue's head seq is called the queue distance -+ * and used when comparing two tasks for ordering. See qmap_core_sched_before(). -+ */ -+static u64 core_sched_head_seqs[5]; -+static u64 core_sched_tail_seqs[5]; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* Dispatch directly to local_dsq */ -+ u64 core_sched_seq; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+struct cpu_ctx { -+ u64 dsp_idx; /* dispatch index */ -+ u64 dsp_cnt; /* remaining count */ -+ u32 avg_weight; -+ u32 cpuperf_target; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct cpu_ctx); -+} cpu_ctx_stor SEC(".maps"); -+ -+/* Statistics */ -+u64 nr_enqueued, nr_dispatched, nr_reenqueued, nr_dequeued; -+u64 nr_core_sched_execed, nr_expedited; -+u32 cpuperf_min, cpuperf_avg, cpuperf_max; -+u32 cpuperf_target_min, cpuperf_target_avg, cpuperf_target_max; -+ -+s32 BPF_STRUCT_OPS(qmap_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ if (p->nr_cpus_allowed == 1 || -+ scx_bpf_test_and_clear_cpu_idle(prev_cpu)) { -+ tctx->force_local = true; -+ return prev_cpu; -+ } -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ return cpu; -+ -+ return prev_cpu; -+} -+ -+static int weight_to_idx(u32 weight) -+{ -+ /* Coarsely map the compound weight to a FIFO. 
*/ -+ if (weight <= 25) -+ return 0; -+ else if (weight <= 50) -+ return 1; -+ else if (weight < 200) -+ return 2; -+ else if (weight < 400) -+ return 3; -+ else -+ return 4; -+} -+ -+void BPF_STRUCT_OPS(qmap_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ static u32 user_cnt, kernel_cnt; -+ struct task_ctx *tctx; -+ u32 pid = p->pid; -+ int idx = weight_to_idx(p->scx.weight); -+ void *ring; -+ -+ if (p->flags & PF_KTHREAD) { -+ if (stall_kernel_nth && !(++kernel_cnt % stall_kernel_nth)) -+ return; -+ } else { -+ if (stall_user_nth && !(++user_cnt % stall_user_nth)) -+ return; -+ } -+ -+ if (test_error_cnt && !--test_error_cnt) -+ scx_bpf_error("test triggering error"); -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ /* -+ * All enqueued tasks must have their core_sched_seq updated for correct -+ * core-sched ordering, which is why %SCX_OPS_ENQ_LAST is specified in -+ * qmap_ops.flags. -+ */ -+ tctx->core_sched_seq = core_sched_tail_seqs[idx]++; -+ -+ /* -+ * If qmap_select_cpu() is telling us to or this is the last runnable -+ * task on the CPU, enqueue locally. -+ */ -+ if (tctx->force_local || (enq_flags & SCX_ENQ_LAST)) { -+ tctx->force_local = false; -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, enq_flags); -+ return; -+ } -+ -+ /* -+ * If the task was re-enqueued due to the CPU being preempted by a -+ * higher priority scheduling class, just re-enqueue the task directly -+ * on the global DSQ. As we want another CPU to pick it up, find and -+ * kick an idle CPU. -+ */ -+ if (enq_flags & SCX_ENQ_REENQ) { -+ s32 cpu; -+ -+ scx_bpf_dispatch(p, SHARED_DSQ, 0, enq_flags); -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ scx_bpf_kick_cpu(cpu, __COMPAT_SCX_KICK_IDLE); -+ return; -+ } -+ -+ ring = bpf_map_lookup_elem(&queue_arr, &idx); -+ if (!ring) { -+ scx_bpf_error("failed to find ring %d", idx); -+ return; -+ } -+ -+ /* Queue on the selected FIFO. If the FIFO overflows, punt to global. */ -+ if (bpf_map_push_elem(ring, &pid, 0)) { -+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, enq_flags); -+ return; -+ } -+ -+ __sync_fetch_and_add(&nr_enqueued, 1); -+} -+ -+/* -+ * The BPF queue map doesn't support removal and sched_ext can handle spurious -+ * dispatches. qmap_dequeue() is only used to collect statistics. -+ */ -+void BPF_STRUCT_OPS(qmap_dequeue, struct task_struct *p, u64 deq_flags) -+{ -+ __sync_fetch_and_add(&nr_dequeued, 1); -+ if (deq_flags & SCX_DEQ_CORE_SCHED_EXEC) -+ __sync_fetch_and_add(&nr_core_sched_execed, 1); -+} -+ -+static void update_core_sched_head_seq(struct task_struct *p) -+{ -+ struct task_ctx *tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ int idx = weight_to_idx(p->scx.weight); -+ -+ if (tctx) -+ core_sched_head_seqs[idx] = tctx->core_sched_seq; -+ else -+ scx_bpf_error("task_ctx lookup failed"); -+} -+ -+static bool consume_shared_dsq(void) -+{ -+ struct task_struct *p; -+ bool consumed; -+ -+ if (exp_prefix[0] == '\0') -+ return scx_bpf_consume(SHARED_DSQ); -+ -+ /* -+ * To demonstrate the use of scx_bpf_consume_task(), implement silly -+ * selective priority boosting mechanism by scanning SHARED_DSQ looking -+ * for matching comms and consume them first. This makes difference only -+ * when dsp_batch is larger than 1. 
-+ */ -+ consumed = false; -+ __COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, 0) { -+ char comm[sizeof(exp_prefix)]; -+ -+ memcpy(comm, p->comm, sizeof(exp_prefix) - 1); -+ -+ if (!bpf_strncmp(comm, sizeof(exp_prefix), -+ (const char *)exp_prefix) && -+ __COMPAT_scx_bpf_consume_task(BPF_FOR_EACH_ITER, p)) { -+ consumed = true; -+ __sync_fetch_and_add(&nr_expedited, 1); -+ } -+ } -+ -+ return consumed || scx_bpf_consume(SHARED_DSQ); -+} -+ -+void BPF_STRUCT_OPS(qmap_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ struct task_struct *p; -+ struct cpu_ctx *cpuc; -+ u32 zero = 0, batch = dsp_batch ?: 1; -+ void *fifo; -+ s32 i, pid; -+ -+ if (consume_shared_dsq()) -+ return; -+ -+ if (dsp_inf_loop_after && nr_dispatched > dsp_inf_loop_after) { -+ /* -+ * PID 2 should be kthreadd which should mostly be idle and off -+ * the scheduler. Let's keep dispatching it to force the kernel -+ * to call this function over and over again. -+ */ -+ p = bpf_task_from_pid(2); -+ if (p) { -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, slice_ns, 0); -+ bpf_task_release(p); -+ return; -+ } -+ } -+ -+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ return; -+ } -+ -+ for (i = 0; i < 5; i++) { -+ /* Advance the dispatch cursor and pick the fifo. */ -+ if (!cpuc->dsp_cnt) { -+ cpuc->dsp_idx = (cpuc->dsp_idx + 1) % 5; -+ cpuc->dsp_cnt = 1 << cpuc->dsp_idx; -+ } -+ -+ fifo = bpf_map_lookup_elem(&queue_arr, &cpuc->dsp_idx); -+ if (!fifo) { -+ scx_bpf_error("failed to find ring %llu", cpuc->dsp_idx); -+ return; -+ } -+ -+ /* Dispatch or advance. */ -+ bpf_repeat(BPF_MAX_LOOPS) { -+ if (bpf_map_pop_elem(fifo, &pid)) -+ break; -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ continue; -+ -+ update_core_sched_head_seq(p); -+ __sync_fetch_and_add(&nr_dispatched, 1); -+ scx_bpf_dispatch(p, SHARED_DSQ, slice_ns, 0); -+ bpf_task_release(p); -+ batch--; -+ cpuc->dsp_cnt--; -+ if (!batch || !scx_bpf_dispatch_nr_slots()) { -+ consume_shared_dsq(); -+ return; -+ } -+ if (!cpuc->dsp_cnt) -+ break; -+ } -+ -+ cpuc->dsp_cnt = 0; -+ } -+} -+ -+void BPF_STRUCT_OPS(qmap_tick, struct task_struct *p) -+{ -+ struct cpu_ctx *cpuc; -+ u32 zero = 0; -+ int idx; -+ -+ if (!(cpuc = bpf_map_lookup_elem(&cpu_ctx_stor, &zero))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ return; -+ } -+ -+ /* -+ * Use the running avg of weights to select the target cpuperf level. -+ * This is a demonstration of the cpuperf feature rather than a -+ * practical strategy to regulate CPU frequency. -+ */ -+ cpuc->avg_weight = cpuc->avg_weight * 3 / 4 + p->scx.weight / 4; -+ idx = weight_to_idx(cpuc->avg_weight); -+ cpuc->cpuperf_target = qidx_to_cpuperf_target[idx]; -+ -+ scx_bpf_cpuperf_set(scx_bpf_task_cpu(p), cpuc->cpuperf_target); -+} -+ -+/* -+ * The distance from the head of the queue scaled by the weight of the queue. -+ * The lower the number, the older the task and the higher the priority. -+ */ -+static s64 task_qdist(struct task_struct *p) -+{ -+ int idx = weight_to_idx(p->scx.weight); -+ struct task_ctx *tctx; -+ s64 qdist; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return 0; -+ } -+ -+ qdist = tctx->core_sched_seq - core_sched_head_seqs[idx]; -+ -+ /* -+ * As queue index increments, the priority doubles. The queue w/ index 3 -+ * is dispatched twice more frequently than 2. Reflect the difference by -+ * scaling qdists accordingly. 
Note that the shift amount needs to be -+ * flipped depending on the sign to avoid flipping priority direction. -+ */ -+ if (qdist >= 0) -+ return qdist << (4 - idx); -+ else -+ return qdist << idx; -+} -+ -+/* -+ * This is called to determine the task ordering when core-sched is picking -+ * tasks to execute on SMT siblings and should encode about the same ordering as -+ * the regular scheduling path. Use the priority-scaled distances from the head -+ * of the queues to compare the two tasks which should be consistent with the -+ * dispatch path behavior. -+ */ -+bool BPF_STRUCT_OPS(qmap_core_sched_before, -+ struct task_struct *a, struct task_struct *b) -+{ -+ return task_qdist(a) > task_qdist(b); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_release, s32 cpu, struct scx_cpu_release_args *args) -+{ -+ u32 cnt; -+ -+ /* -+ * Called when @cpu is taken by a higher priority scheduling class. This -+ * makes @cpu no longer available for executing sched_ext tasks. As we -+ * don't want the tasks in @cpu's local dsq to sit there until @cpu -+ * becomes available again, re-enqueue them into the global dsq. See -+ * %SCX_ENQ_REENQ handling in qmap_enqueue(). -+ */ -+ cnt = scx_bpf_reenqueue_local(); -+ if (cnt) -+ __sync_fetch_and_add(&nr_reenqueued, cnt); -+} -+ -+s32 BPF_STRUCT_OPS(qmap_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ if (p->tgid == disallow_tgid) -+ p->scx.disallow = true; -+ -+ /* -+ * @p is new. Let's ensure that its task_ctx is available. We can sleep -+ * in this function and the following will automatically use GFP_KERNEL. -+ */ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+void BPF_STRUCT_OPS(qmap_dump, struct scx_dump_ctx *dctx) -+{ -+ s32 i, pid; -+ -+ if (suppress_dump) -+ return; -+ -+ bpf_for(i, 0, 5) { -+ void *fifo; -+ -+ if (!(fifo = bpf_map_lookup_elem(&queue_arr, &i))) -+ return; -+ -+ __COMPAT_scx_bpf_dump("QMAP FIFO[%d]:", i); -+ bpf_repeat(4096) { -+ if (bpf_map_pop_elem(fifo, &pid)) -+ break; -+ __COMPAT_scx_bpf_dump(" %d", pid); -+ } -+ __COMPAT_scx_bpf_dump("\n"); -+ } -+} -+ -+void BPF_STRUCT_OPS(qmap_dump_cpu, struct scx_dump_ctx *dctx, s32 cpu, bool idle) -+{ -+ u32 zero = 0; -+ struct cpu_ctx *cpuc; -+ -+ if (suppress_dump || idle) -+ return; -+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, cpu))) -+ return; -+ -+ __COMPAT_scx_bpf_dump("QMAP: dsp_idx=%llu dsp_cnt=%llu avg_weight=%u cpuperf_target=%u", -+ cpuc->dsp_idx, cpuc->dsp_cnt, cpuc->avg_weight, -+ cpuc->cpuperf_target); -+} -+ -+void BPF_STRUCT_OPS(qmap_dump_task, struct scx_dump_ctx *dctx, struct task_struct *p) -+{ -+ struct task_ctx *taskc; -+ -+ if (suppress_dump) -+ return; -+ if (!(taskc = bpf_task_storage_get(&task_ctx_stor, p, 0, 0))) -+ return; -+ -+ __COMPAT_scx_bpf_dump("QMAP: force_local=%d core_sched_seq=%llu", -+ taskc->force_local, taskc->core_sched_seq); -+} -+ -+/* -+ * Print out the online and possible CPU map using bpf_printk() as a -+ * demonstration of using the cpumask kfuncs and ops.cpu_on/offline(). 
-+ */ -+static void print_cpus(void) -+{ -+ const struct cpumask *possible, *online; -+ s32 cpu; -+ char buf[128] = "", *p; -+ int idx; -+ -+ if (!__COMPAT_HAS_CPUMASKS) -+ return; -+ -+ possible = scx_bpf_get_possible_cpumask(); -+ online = scx_bpf_get_online_cpumask(); -+ -+ idx = 0; -+ bpf_for(cpu, 0, scx_bpf_nr_cpu_ids()) { -+ if (!(p = MEMBER_VPTR(buf, [idx++]))) -+ break; -+ if (bpf_cpumask_test_cpu(cpu, online)) -+ *p++ = 'O'; -+ else if (bpf_cpumask_test_cpu(cpu, possible)) -+ *p++ = 'X'; -+ else -+ *p++ = ' '; -+ -+ if ((cpu & 7) == 7) { -+ if (!(p = MEMBER_VPTR(buf, [idx++]))) -+ break; -+ *p++ = '|'; -+ } -+ } -+ buf[sizeof(buf) - 1] = '\0'; -+ -+ scx_bpf_put_cpumask(online); -+ scx_bpf_put_cpumask(possible); -+ -+ bpf_printk("CPUS: |%s", buf); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_online, s32 cpu) -+{ -+ bpf_printk("CPU %d coming online", cpu); -+ /* @cpu is already online at this point */ -+ print_cpus(); -+} -+ -+void BPF_STRUCT_OPS(qmap_cpu_offline, s32 cpu) -+{ -+ bpf_printk("CPU %d going offline", cpu); -+ /* @cpu is still online at this point */ -+ print_cpus(); -+} -+ -+struct monitor_timer { -+ struct bpf_timer timer; -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_ARRAY); -+ __uint(max_entries, 1); -+ __type(key, u32); -+ __type(value, struct monitor_timer); -+} central_timer SEC(".maps"); -+ -+/* -+ * Print out the min, avg and max performance levels of CPUs every second to -+ * demonstrate the cpuperf interface. -+ */ -+static void monitor_cpuperf(void) -+{ -+ u32 zero = 0, nr_cpu_ids; -+ u64 cap_sum = 0, cur_sum = 0, cur_min = SCX_CPUPERF_ONE, cur_max = 0; -+ u64 target_sum = 0, target_min = SCX_CPUPERF_ONE, target_max = 0; -+ const struct cpumask *online; -+ int i, nr_online_cpus = 0; -+ -+ if (!__COMPAT_HAS_CPUMASKS) -+ return; -+ -+ nr_cpu_ids = scx_bpf_nr_cpu_ids(); -+ online = scx_bpf_get_online_cpumask(); -+ -+ bpf_for(i, 0, nr_cpu_ids) { -+ struct cpu_ctx *cpuc; -+ u32 cap, cur; -+ -+ if (!bpf_cpumask_test_cpu(i, online)) -+ continue; -+ nr_online_cpus++; -+ -+ /* collect the capacity and current cpuperf */ -+ cap = scx_bpf_cpuperf_cap(i); -+ cur = scx_bpf_cpuperf_cur(i); -+ -+ cur_min = cur < cur_min ? cur : cur_min; -+ cur_max = cur > cur_max ? cur : cur_max; -+ -+ /* -+ * $cur is relative to $cap. Scale it down accordingly so that -+ * it's in the same scale as other CPUs and $cur_sum/$cap_sum -+ * makes sense. -+ */ -+ cur_sum += cur * cap / SCX_CPUPERF_ONE; -+ cap_sum += cap; -+ -+ if (!(cpuc = bpf_map_lookup_percpu_elem(&cpu_ctx_stor, &zero, i))) { -+ scx_bpf_error("failed to look up cpu_ctx"); -+ goto out; -+ } -+ -+ /* collect target */ -+ cur = cpuc->cpuperf_target; -+ target_sum += cur; -+ target_min = cur < target_min ? cur : target_min; -+ target_max = cur > target_max ? cur : target_max; -+ } -+ -+ cpuperf_min = cur_min; -+ cpuperf_avg = cur_sum * SCX_CPUPERF_ONE / cap_sum; -+ cpuperf_max = cur_max; -+ -+ cpuperf_target_min = target_min; -+ cpuperf_target_avg = target_sum / nr_online_cpus; -+ cpuperf_target_max = target_max; -+out: -+ scx_bpf_put_cpumask(online); -+} -+ -+/* -+ * Dump the currently queued tasks in the shared DSQ to demonstrate the usage of -+ * scx_bpf_dsq_nr_queued() and DSQ iterator. Raise the dispatch batch count to -+ * see meaningful dumps in the trace pipe. 
-+ */ -+static void dump_shared_dsq(void) -+{ -+ struct task_struct *p; -+ s32 nr; -+ -+ if (!(nr = scx_bpf_dsq_nr_queued(SHARED_DSQ))) -+ return; -+ -+ bpf_printk("Dumping %d tasks in SHARED_DSQ in reverse order", nr); -+ -+ bpf_rcu_read_lock(); -+ __COMPAT_DSQ_FOR_EACH(p, SHARED_DSQ, SCX_DSQ_ITER_REV) -+ bpf_printk("%s[%d]", p->comm, p->pid); -+ bpf_rcu_read_unlock(); -+} -+ -+static int monitor_timerfn(void *map, int *key, struct bpf_timer *timer) -+{ -+ monitor_cpuperf(); -+ -+ if (print_shared_dsq) -+ dump_shared_dsq(); -+ -+ bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -+ return 0; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(qmap_init) -+{ -+ u32 key = 0; -+ struct bpf_timer *timer; -+ s32 ret; -+ -+ if (!switch_partial) -+ __COMPAT_scx_bpf_switch_all(); -+ -+ print_cpus(); -+ -+ ret = scx_bpf_create_dsq(SHARED_DSQ, -1); -+ if (ret) -+ return ret; -+ -+ timer = bpf_map_lookup_elem(¢ral_timer, &key); -+ if (!timer) -+ return -ESRCH; -+ -+ bpf_timer_init(timer, ¢ral_timer, CLOCK_MONOTONIC); -+ bpf_timer_set_callback(timer, monitor_timerfn); -+ -+ return bpf_timer_start(timer, ONE_SEC_IN_NS, 0); -+} -+ -+void BPF_STRUCT_OPS(qmap_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(qmap_ops, -+ .select_cpu = (void *)qmap_select_cpu, -+ .enqueue = (void *)qmap_enqueue, -+ .dequeue = (void *)qmap_dequeue, -+ .dispatch = (void *)qmap_dispatch, -+ .tick = (void *)qmap_tick, -+ .core_sched_before = (void *)qmap_core_sched_before, -+ .cpu_release = (void *)qmap_cpu_release, -+ .init_task = (void *)qmap_init_task, -+ .dump = (void *)qmap_dump, -+ .dump_cpu = (void *)qmap_dump_cpu, -+ .dump_task = (void *)qmap_dump_task, -+ .cpu_online = (void *)qmap_cpu_online, -+ .cpu_offline = (void *)qmap_cpu_offline, -+ .init = (void *)qmap_init, -+ .exit = (void *)qmap_exit, -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 5000U, -+ .name = "qmap"); -diff --git a/tools/sched_ext/scx_qmap.c b/tools/sched_ext/scx_qmap.c -new file mode 100644 -index 000000000000..e10ceb170793 ---- /dev/null -+++ b/tools/sched_ext/scx_qmap.c -@@ -0,0 +1,154 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_qmap.bpf.skel.h" -+ -+const char help_fmt[] = -+"A simple five-level FIFO queue sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-s SLICE_US] [-e COUNT] [-t COUNT] [-T COUNT] [-l COUNT] [-b COUNT]\n" -+" [-P] [-E PREFIX] [-d PID] [-D LEN] [-p] [-v]\n" -+"\n" -+" -s SLICE_US Override slice duration\n" -+" -e COUNT Trigger scx_bpf_error() after COUNT enqueues\n" -+" -t COUNT Stall every COUNT'th user thread\n" -+" -T COUNT Stall every COUNT'th kernel thread\n" -+" -l COUNT Trigger dispatch infinite looping after COUNT dispatches\n" -+" -b COUNT Dispatch upto COUNT tasks together\n" -+" -P Print out DSQ content to trace_pipe every second, use with -b\n" -+" -E PREFIX Expedite consumption of threads w/ matching comm, use with -b\n" -+" (e.g. 
match shell on a loaded system)\n" -+" -d PID Disallow a process from switching into SCHED_EXT (-1 for self)\n" -+" -D LEN Set scx_exit_info.dump buffer length\n" -+" -S Suppress qmap-specific debug dump\n" -+" -p Switch only tasks on SCHED_EXT policy intead of all\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int dummy) -+{ -+ exit_req = 1; -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_qmap *skel; -+ struct bpf_link *link; -+ int opt; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ skel = SCX_OPS_OPEN(qmap_ops, scx_qmap); -+ -+ while ((opt = getopt(argc, argv, "s:e:t:T:l:b:PE:d:D:Spvh")) != -1) { -+ switch (opt) { -+ case 's': -+ skel->rodata->slice_ns = strtoull(optarg, NULL, 0) * 1000; -+ break; -+ case 'e': -+ skel->bss->test_error_cnt = strtoul(optarg, NULL, 0); -+ break; -+ case 't': -+ skel->rodata->stall_user_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'T': -+ skel->rodata->stall_kernel_nth = strtoul(optarg, NULL, 0); -+ break; -+ case 'l': -+ skel->rodata->dsp_inf_loop_after = strtoul(optarg, NULL, 0); -+ break; -+ case 'b': -+ skel->rodata->dsp_batch = strtoul(optarg, NULL, 0); -+ break; -+ case 'P': -+ skel->rodata->print_shared_dsq = true; -+ break; -+ case 'E': -+ strncpy(skel->rodata->exp_prefix, optarg, -+ sizeof(skel->rodata->exp_prefix) - 1); -+ break; -+ case 'd': -+ skel->rodata->disallow_tgid = strtol(optarg, NULL, 0); -+ if (skel->rodata->disallow_tgid < 0) -+ skel->rodata->disallow_tgid = getpid(); -+ break; -+ case 'D': -+ skel->struct_ops.qmap_ops->exit_dump_len = strtoul(optarg, NULL, 0); -+ break; -+ case 'S': -+ skel->rodata->suppress_dump = true; -+ break; -+ case 'p': -+ skel->rodata->switch_partial = true; -+ skel->struct_ops.qmap_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ if (!__COMPAT_HAS_DSQ_ITER && -+ (skel->rodata->print_shared_dsq || strlen(skel->rodata->exp_prefix))) -+ fprintf(stderr, "kernel doesn't support DSQ iteration\n"); -+ -+ SCX_OPS_LOAD(skel, qmap_ops, scx_qmap, uei); -+ link = SCX_OPS_ATTACH(skel, qmap_ops, scx_qmap); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ long nr_enqueued = skel->bss->nr_enqueued; -+ long nr_dispatched = skel->bss->nr_dispatched; -+ -+ printf("stats : enq=%lu dsp=%lu delta=%ld reenq=%"PRIu64" deq=%"PRIu64" core=%"PRIu64" exp=%"PRIu64"\n", -+ nr_enqueued, nr_dispatched, nr_enqueued - nr_dispatched, -+ skel->bss->nr_reenqueued, skel->bss->nr_dequeued, -+ skel->bss->nr_core_sched_execed, skel->bss->nr_expedited); -+ if (__COMPAT_has_ksym("scx_bpf_cpuperf_cur")) -+ printf("cpuperf: cur min/avg/max=%u/%u/%u target min/avg/max=%u/%u/%u\n", -+ skel->bss->cpuperf_min, -+ skel->bss->cpuperf_avg, -+ skel->bss->cpuperf_max, -+ skel->bss->cpuperf_target_min, -+ skel->bss->cpuperf_target_avg, -+ skel->bss->cpuperf_target_max); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ UEI_REPORT(skel, uei); -+ scx_qmap__destroy(skel); -+ /* -+ * scx_qmap implements ops.cpu_on/offline() and doesn't need to restart -+ * on CPU hotplug events. 
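Contrast this with scx_simple.c further down: simple does not implement cpu_online()/cpu_offline(), so its main() checks UEI_ECODE_RESTART(ecode) after UEI_REPORT() and jumps back to its restart label to re-open, re-load and re-attach the skeleton whenever the kernel requests a restart, for example after a CPU hotplug event.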
-+ */ -+ return 0; -+} -diff --git a/tools/sched_ext/scx_show_state.py b/tools/sched_ext/scx_show_state.py -new file mode 100644 -index 000000000000..d457d2a74e1e ---- /dev/null -+++ b/tools/sched_ext/scx_show_state.py -@@ -0,0 +1,39 @@ -+#!/usr/bin/env drgn -+# -+# Copyright (C) 2024 Tejun Heo -+# Copyright (C) 2024 Meta Platforms, Inc. and affiliates. -+ -+desc = """ -+This is a drgn script to show the current sched_ext state. -+For more info on drgn, visit https://github.com/osandov/drgn. -+""" -+ -+import drgn -+import sys -+ -+def err(s): -+ print(s, file=sys.stderr, flush=True) -+ sys.exit(1) -+ -+def read_int(name): -+ return int(prog[name].value_()) -+ -+def read_atomic(name): -+ return prog[name].counter.value_() -+ -+def read_static_key(name): -+ return prog[name].key.enabled.counter.value_() -+ -+def ops_state_str(state): -+ return prog['scx_ops_enable_state_str'][state].string_().decode() -+ -+ops = prog['scx_ops'] -+enable_state = read_atomic("scx_ops_enable_state_var") -+ -+print(f'ops : {ops.name.string_().decode()}') -+print(f'enabled : {read_static_key("__scx_ops_enabled")}') -+print(f'switching_all : {read_int("scx_switching_all")}') -+print(f'switched_all : {read_static_key("__scx_switched_all")}') -+print(f'enable_state : {ops_state_str(enable_state)} ({enable_state})') -+print(f'bypass_depth : {read_atomic("scx_ops_bypass_depth")}') -+print(f'nr_rejected : {read_atomic("scx_nr_rejected")}') -diff --git a/tools/sched_ext/scx_simple.bpf.c b/tools/sched_ext/scx_simple.bpf.c -new file mode 100644 -index 000000000000..6fc66ab9877a ---- /dev/null -+++ b/tools/sched_ext/scx_simple.bpf.c -@@ -0,0 +1,157 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A simple scheduler. -+ * -+ * By default, it operates as a simple global weighted vtime scheduler and can -+ * be switched to FIFO scheduling. It also demonstrates the following niceties. -+ * -+ * - Statistics tracking how many tasks are queued to local and global dsq's. -+ * - Termination notification for userspace. -+ * -+ * While very simple, this scheduler should work reasonably well on CPUs with a -+ * uniform L3 cache topology. While preemption is not implemented, the fact that -+ * the scheduling queue is shared across all CPUs means that whatever is at the -+ * front of the queue is likely to be executed fairly quickly given enough -+ * number of CPUs. The FIFO scheduling mode may be beneficial to some workloads -+ * but comes with the usual problems with FIFO scheduling where saturating -+ * threads can easily drown out interactive ones. -+ * -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+const volatile bool fifo_sched; -+ -+static u64 vtime_now; -+UEI_DEFINE(uei); -+ -+/* -+ * Built-in DSQs such as SCX_DSQ_GLOBAL cannot be used as priority queues -+ * (meaning, cannot be dispatched to with scx_bpf_dispatch_vtime()). We -+ * therefore create a separate DSQ with ID 0 that we dispatch to and consume -+ * from. If scx_simple only supported global FIFO scheduling, then we could -+ * just use SCX_DSQ_GLOBAL. 
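The weighting itself happens in simple_stopping() below, which charges a task vtime for the CPU time it consumed, scaled by the inverse of its weight: dsq_vtime += (SCX_SLICE_DFL - slice) * 100 / weight. Assuming the default slice SCX_SLICE_DFL is 20ms and a weight of 100 corresponds to nice 0, a weight-100 task that burns its whole slice is charged 20ms of vtime while a weight-200 task doing the same is charged only 10ms, so the heavier task comes back to the front of the vtime-ordered shared DSQ about twice as often.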
-+ */ -+#define SHARED_DSQ 0 -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); -+ __uint(key_size, sizeof(u32)); -+ __uint(value_size, sizeof(u64)); -+ __uint(max_entries, 2); /* [local, global] */ -+} stats SEC(".maps"); -+ -+static void stat_inc(u32 idx) -+{ -+ u64 *cnt_p = bpf_map_lookup_elem(&stats, &idx); -+ if (cnt_p) -+ (*cnt_p)++; -+} -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+s32 BPF_STRUCT_OPS(simple_select_cpu, struct task_struct *p, s32 prev_cpu, u64 wake_flags) -+{ -+ bool is_idle = false; -+ s32 cpu; -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &is_idle); -+ if (is_idle) { -+ stat_inc(0); /* count local queueing */ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, 0); -+ } -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(simple_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ stat_inc(1); /* count global queueing */ -+ -+ if (fifo_sched) { -+ scx_bpf_dispatch(p, SHARED_DSQ, SCX_SLICE_DFL, enq_flags); -+ } else { -+ u64 vtime = p->scx.dsq_vtime; -+ -+ /* -+ * Limit the amount of budget that an idling task can accumulate -+ * to one slice. -+ */ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ vtime = vtime_now - SCX_SLICE_DFL; -+ -+ scx_bpf_dispatch_vtime(p, SHARED_DSQ, SCX_SLICE_DFL, vtime, -+ enq_flags); -+ } -+} -+ -+void BPF_STRUCT_OPS(simple_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ scx_bpf_consume(SHARED_DSQ); -+} -+ -+void BPF_STRUCT_OPS(simple_running, struct task_struct *p) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Global vtime always progresses forward as tasks start executing. The -+ * test and update can be performed concurrently from multiple CPUs and -+ * thus racy. Any error should be contained and temporary. Let's just -+ * live with it. -+ */ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(simple_stopping, struct task_struct *p, bool runnable) -+{ -+ if (fifo_sched) -+ return; -+ -+ /* -+ * Scale the execution time by the inverse of the weight and charge. -+ * -+ * Note that the default yield implementation yields by setting -+ * @p->scx.slice to zero and the following would treat the yielding task -+ * as if it has consumed all its slice. If this penalizes yielding tasks -+ * too much, determine the execution time by taking explicit timestamps -+ * instead of depending on @p->scx.slice. -+ */ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(simple_enable, struct task_struct *p) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(simple_init) -+{ -+ __COMPAT_scx_bpf_switch_all(); -+ return scx_bpf_create_dsq(SHARED_DSQ, -1); -+} -+ -+void BPF_STRUCT_OPS(simple_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SCX_OPS_DEFINE(simple_ops, -+ .select_cpu = (void *)simple_select_cpu, -+ .enqueue = (void *)simple_enqueue, -+ .dispatch = (void *)simple_dispatch, -+ .running = (void *)simple_running, -+ .stopping = (void *)simple_stopping, -+ .enable = (void *)simple_enable, -+ .init = (void *)simple_init, -+ .exit = (void *)simple_exit, -+ .name = "simple"); -diff --git a/tools/sched_ext/scx_simple.c b/tools/sched_ext/scx_simple.c -new file mode 100644 -index 000000000000..76d83199545c ---- /dev/null -+++ b/tools/sched_ext/scx_simple.c -@@ -0,0 +1,107 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2022 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2022 Tejun Heo -+ * Copyright (c) 2022 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_simple.bpf.skel.h" -+ -+const char help_fmt[] = -+"A simple sched_ext scheduler.\n" -+"\n" -+"See the top-level comment in .bpf.c for more details.\n" -+"\n" -+"Usage: %s [-f] [-v]\n" -+"\n" -+" -f Use FIFO scheduling instead of weighted vtime scheduling\n" -+" -v Print libbpf debug messages\n" -+" -h Display this help and exit\n"; -+ -+static bool verbose; -+static volatile int exit_req; -+ -+static int libbpf_print_fn(enum libbpf_print_level level, const char *format, va_list args) -+{ -+ if (level == LIBBPF_DEBUG && !verbose) -+ return 0; -+ return vfprintf(stderr, format, args); -+} -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void read_stats(struct scx_simple *skel, __u64 *stats) -+{ -+ int nr_cpus = libbpf_num_possible_cpus(); -+ __u64 cnts[2][nr_cpus]; -+ __u32 idx; -+ -+ memset(stats, 0, sizeof(stats[0]) * 2); -+ -+ for (idx = 0; idx < 2; idx++) { -+ int ret, cpu; -+ -+ ret = bpf_map_lookup_elem(bpf_map__fd(skel->maps.stats), -+ &idx, cnts[idx]); -+ if (ret < 0) -+ continue; -+ for (cpu = 0; cpu < nr_cpus; cpu++) -+ stats[idx] += cnts[idx][cpu]; -+ } -+} -+ -+int main(int argc, char **argv) -+{ -+ struct scx_simple *skel; -+ struct bpf_link *link; -+ __u32 opt; -+ __u64 ecode; -+ -+ libbpf_set_print(libbpf_print_fn); -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+restart: -+ skel = SCX_OPS_OPEN(simple_ops, scx_simple); -+ -+ while ((opt = getopt(argc, argv, "fvh")) != -1) { -+ switch (opt) { -+ case 'f': -+ skel->rodata->fifo_sched = true; -+ break; -+ case 'v': -+ verbose = true; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ SCX_OPS_LOAD(skel, simple_ops, scx_simple, uei); -+ link = SCX_OPS_ATTACH(skel, simple_ops, scx_simple); -+ -+ while (!exit_req && !UEI_EXITED(skel, uei)) { -+ __u64 stats[2]; -+ -+ read_stats(skel, stats); -+ printf("local=%llu global=%llu\n", stats[0], stats[1]); -+ fflush(stdout); -+ sleep(1); -+ } -+ -+ bpf_link__destroy(link); -+ ecode = UEI_REPORT(skel, uei); -+ scx_simple__destroy(skel); -+ -+ if (UEI_ECODE_RESTART(ecode)) -+ goto restart; -+ return 0; -+} -diff --git a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c -index b1dd889d5d7d..948eb3962732 100644 ---- a/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c -+++ b/tools/testing/selftests/bpf/bpf_test_no_cfi/bpf_test_no_cfi.c -@@ -22,12 +22,12 @@ static int dummy_init_member(const struct btf_type *t, - return 0; - } - --static int dummy_reg(void *kdata) -+static int dummy_reg(void *kdata, struct bpf_link *link) - { - return 0; - } - --static void dummy_unreg(void *kdata) -+static void dummy_unreg(void *kdata, struct bpf_link *link) - { - } - -diff --git a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c -index 2a18bd320e92..0a09732cde4b 100644 ---- a/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c -+++ b/tools/testing/selftests/bpf/bpf_testmod/bpf_testmod.c -@@ -820,7 +820,7 @@ static const struct bpf_verifier_ops bpf_testmod_verifier_ops = { - .is_valid_access = bpf_testmod_ops_is_valid_access, - }; - --static int bpf_dummy_reg(void *kdata) -+static int bpf_dummy_reg(void *kdata, struct bpf_link *link) - { - struct bpf_testmod_ops *ops = kdata; - -@@ -835,7 +835,7 
@@ static int bpf_dummy_reg(void *kdata) - return 0; - } - --static void bpf_dummy_unreg(void *kdata) -+static void bpf_dummy_unreg(void *kdata, struct bpf_link *link) - { - } - -@@ -871,7 +871,7 @@ struct bpf_struct_ops bpf_bpf_testmod_ops = { - .owner = THIS_MODULE, - }; - --static int bpf_dummy_reg2(void *kdata) -+static int bpf_dummy_reg2(void *kdata, struct bpf_link *link) - { - struct bpf_testmod_ops2 *ops = kdata; - -diff --git a/tools/testing/selftests/bpf/config b/tools/testing/selftests/bpf/config -index eeabd798bc3a..2fb16da78dce 100644 ---- a/tools/testing/selftests/bpf/config -+++ b/tools/testing/selftests/bpf/config -@@ -80,6 +80,7 @@ CONFIG_NETFILTER_XT_TARGET_CT=y - CONFIG_NETKIT=y - CONFIG_NF_CONNTRACK=y - CONFIG_NF_CONNTRACK_MARK=y -+CONFIG_NF_CONNTRACK_ZONES=y - CONFIG_NF_DEFRAG_IPV4=y - CONFIG_NF_DEFRAG_IPV6=y - CONFIG_NF_NAT=y -diff --git a/tools/testing/selftests/bpf/network_helpers.c b/tools/testing/selftests/bpf/network_helpers.c -index 35250e6cde7f..e20caef06aae 100644 ---- a/tools/testing/selftests/bpf/network_helpers.c -+++ b/tools/testing/selftests/bpf/network_helpers.c -@@ -94,7 +94,8 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl - if (settimeo(fd, opts->timeout_ms)) - goto error_close; - -- if (opts->post_socket_cb && opts->post_socket_cb(fd, NULL)) { -+ if (opts->post_socket_cb && -+ opts->post_socket_cb(fd, opts->cb_opts)) { - log_err("Failed to call post_socket_cb"); - goto error_close; - } -@@ -118,22 +119,32 @@ static int __start_server(int type, const struct sockaddr *addr, socklen_t addrl - return -1; - } - --int start_server(int family, int type, const char *addr_str, __u16 port, -- int timeout_ms) -+int start_server_str(int family, int type, const char *addr_str, __u16 port, -+ const struct network_helper_opts *opts) - { -- struct network_helper_opts opts = { -- .timeout_ms = timeout_ms, -- }; - struct sockaddr_storage addr; - socklen_t addrlen; - -+ if (!opts) -+ opts = &default_opts; -+ - if (make_sockaddr(family, addr_str, port, &addr, &addrlen)) - return -1; - -- return __start_server(type, (struct sockaddr *)&addr, addrlen, &opts); -+ return __start_server(type, (struct sockaddr *)&addr, addrlen, opts); -+} -+ -+int start_server(int family, int type, const char *addr_str, __u16 port, -+ int timeout_ms) -+{ -+ struct network_helper_opts opts = { -+ .timeout_ms = timeout_ms, -+ }; -+ -+ return start_server_str(family, type, addr_str, port, &opts); - } - --static int reuseport_cb(int fd, const struct post_socket_opts *opts) -+static int reuseport_cb(int fd, void *opts) - { - int on = 1; - -@@ -338,9 +349,8 @@ int connect_to_fd_opts(int server_fd, const struct network_helper_opts *opts) - if (settimeo(fd, opts->timeout_ms)) - goto error_close; - -- if (opts->cc && opts->cc[0] && -- setsockopt(fd, SOL_TCP, TCP_CONGESTION, opts->cc, -- strlen(opts->cc) + 1)) -+ if (opts->post_socket_cb && -+ opts->post_socket_cb(fd, opts->cb_opts)) - goto error_close; - - if (!opts->noconnect) -diff --git a/tools/testing/selftests/bpf/network_helpers.h b/tools/testing/selftests/bpf/network_helpers.h -index 883c7ea9d8d5..11eea8e2e4f1 100644 ---- a/tools/testing/selftests/bpf/network_helpers.h -+++ b/tools/testing/selftests/bpf/network_helpers.h -@@ -21,16 +21,14 @@ typedef __u16 __sum16; - #define VIP_NUM 5 - #define MAGIC_BYTES 123 - --struct post_socket_opts {}; -- - struct network_helper_opts { -- const char *cc; - int timeout_ms; - bool must_fail; - bool noconnect; - int type; - int proto; -- int (*post_socket_cb)(int fd, const 
struct post_socket_opts *opts); -+ int (*post_socket_cb)(int fd, void *opts); -+ void *cb_opts; - }; - - /* ipv4 test vector */ -@@ -50,6 +48,8 @@ struct ipv6_packet { - extern struct ipv6_packet pkt_v6; - - int settimeo(int fd, int timeout_ms); -+int start_server_str(int family, int type, const char *addr_str, __u16 port, -+ const struct network_helper_opts *opts); - int start_server(int family, int type, const char *addr, __u16 port, - int timeout_ms); - int *start_reuseport_server(int family, int type, const char *addr_str, -diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c -index b30ff6b3b81a..a4a1f93878d4 100644 ---- a/tools/testing/selftests/bpf/prog_tests/bpf_nf.c -+++ b/tools/testing/selftests/bpf/prog_tests/bpf_nf.c -@@ -104,6 +104,7 @@ static void test_bpf_nf_ct(int mode) - - ASSERT_EQ(skel->bss->test_einval_bpf_tuple, -EINVAL, "Test EINVAL for NULL bpf_tuple"); - ASSERT_EQ(skel->bss->test_einval_reserved, -EINVAL, "Test EINVAL for reserved not set to 0"); -+ ASSERT_EQ(skel->bss->test_einval_reserved_new, -EINVAL, "Test EINVAL for reserved in new struct not set to 0"); - ASSERT_EQ(skel->bss->test_einval_netns_id, -EINVAL, "Test EINVAL for netns_id < -1"); - ASSERT_EQ(skel->bss->test_einval_len_opts, -EINVAL, "Test EINVAL for len__opts != NF_BPF_CT_OPTS_SZ"); - ASSERT_EQ(skel->bss->test_eproto_l4proto, -EPROTO, "Test EPROTO for l4proto != TCP or UDP"); -@@ -122,6 +123,12 @@ static void test_bpf_nf_ct(int mode) - ASSERT_EQ(skel->bss->test_exist_lookup_mark, 43, "Test existing connection lookup ctmark"); - ASSERT_EQ(skel->data->test_snat_addr, 0, "Test for source natting"); - ASSERT_EQ(skel->data->test_dnat_addr, 0, "Test for destination natting"); -+ ASSERT_EQ(skel->data->test_ct_zone_id_alloc_entry, 0, "Test for alloc new entry in specified ct zone"); -+ ASSERT_EQ(skel->data->test_ct_zone_id_insert_entry, 0, "Test for insert new entry in specified ct zone"); -+ ASSERT_EQ(skel->data->test_ct_zone_id_succ_lookup, 0, "Test for successful lookup in specified ct_zone"); -+ ASSERT_EQ(skel->bss->test_ct_zone_dir_enoent_lookup, -ENOENT, "Test ENOENT for lookup with wrong ct zone dir"); -+ ASSERT_EQ(skel->bss->test_ct_zone_id_enoent_lookup, -ENOENT, "Test ENOENT for lookup in wrong ct zone"); -+ - end: - if (client_fd != -1) - close(client_fd); -diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c -index 0aca02532794..ebc7d4616880 100644 ---- a/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c -+++ b/tools/testing/selftests/bpf/prog_tests/bpf_tcp_ca.c -@@ -23,6 +23,10 @@ - static const unsigned int total_bytes = 10 * 1024 * 1024; - static int expected_stg = 0xeB9F; - -+struct cb_opts { -+ const char *cc; -+}; -+ - static int settcpca(int fd, const char *tcp_ca) - { - int err; -@@ -34,12 +38,14 @@ static int settcpca(int fd, const char *tcp_ca) - return 0; - } - --static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) -+static void do_test(const struct network_helper_opts *opts, -+ const struct bpf_map *sk_stg_map) - { -+ struct cb_opts *cb_opts = (struct cb_opts *)opts->cb_opts; - int lfd = -1, fd = -1; - int err; - -- lfd = start_server(AF_INET6, SOCK_STREAM, NULL, 0, 0); -+ lfd = start_server_str(AF_INET6, SOCK_STREAM, NULL, 0, opts); - if (!ASSERT_NEQ(lfd, -1, "socket")) - return; - -@@ -49,7 +55,7 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) - return; - } - -- if (settcpca(lfd, tcp_ca) || settcpca(fd, 
tcp_ca)) -+ if (settcpca(fd, cb_opts->cc)) - goto done; - - if (sk_stg_map) { -@@ -81,8 +87,22 @@ static void do_test(const char *tcp_ca, const struct bpf_map *sk_stg_map) - close(fd); - } - -+static int cc_cb(int fd, void *opts) -+{ -+ struct cb_opts *cb_opts = (struct cb_opts *)opts; -+ -+ return settcpca(fd, cb_opts->cc); -+} -+ - static void test_cubic(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "bpf_cubic", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct bpf_cubic *cubic_skel; - struct bpf_link *link; - -@@ -96,7 +116,7 @@ static void test_cubic(void) - return; - } - -- do_test("bpf_cubic", NULL); -+ do_test(&opts, NULL); - - ASSERT_EQ(cubic_skel->bss->bpf_cubic_acked_called, 1, "pkts_acked called"); - -@@ -106,6 +126,13 @@ static void test_cubic(void) - - static void test_dctcp(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "bpf_dctcp", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct bpf_dctcp *dctcp_skel; - struct bpf_link *link; - -@@ -119,7 +146,7 @@ static void test_dctcp(void) - return; - } - -- do_test("bpf_dctcp", dctcp_skel->maps.sk_stg_map); -+ do_test(&opts, dctcp_skel->maps.sk_stg_map); - ASSERT_EQ(dctcp_skel->bss->stg_result, expected_stg, "stg_result"); - - bpf_link__destroy(link); -@@ -172,10 +199,16 @@ static void test_dctcp_fallback(void) - { - int err, lfd = -1, cli_fd = -1, srv_fd = -1; - struct network_helper_opts opts = { -- .cc = "cubic", -+ .post_socket_cb = cc_cb, - }; - struct bpf_dctcp *dctcp_skel; - struct bpf_link *link = NULL; -+ struct cb_opts dctcp = { -+ .cc = "bpf_dctcp", -+ }; -+ struct cb_opts cubic = { -+ .cc = "cubic", -+ }; - char srv_cc[16]; - socklen_t cc_len = sizeof(srv_cc); - -@@ -190,11 +223,12 @@ static void test_dctcp_fallback(void) - if (!ASSERT_OK_PTR(link, "dctcp link")) - goto done; - -- lfd = start_server(AF_INET6, SOCK_STREAM, "::1", 0, 0); -- if (!ASSERT_GE(lfd, 0, "lfd") || -- !ASSERT_OK(settcpca(lfd, "bpf_dctcp"), "lfd=>bpf_dctcp")) -+ opts.cb_opts = &dctcp; -+ lfd = start_server_str(AF_INET6, SOCK_STREAM, "::1", 0, &opts); -+ if (!ASSERT_GE(lfd, 0, "lfd")) - goto done; - -+ opts.cb_opts = &cubic; - cli_fd = connect_to_fd_opts(lfd, &opts); - if (!ASSERT_GE(cli_fd, 0, "cli_fd")) - goto done; -@@ -297,6 +331,13 @@ static void test_unsupp_cong_op(void) - - static void test_update_ca(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "tcp_ca_update", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct tcp_ca_update *skel; - struct bpf_link *link; - int saved_ca1_cnt; -@@ -309,14 +350,14 @@ static void test_update_ca(void) - link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - saved_ca1_cnt = skel->bss->ca1_cnt; - ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt"); - - err = bpf_link__update_map(link, skel->maps.ca_update_2); - ASSERT_OK(err, "update_map"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - ASSERT_EQ(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt"); - ASSERT_GT(skel->bss->ca2_cnt, 0, "ca2_ca2_cnt"); - -@@ -326,6 +367,13 @@ static void test_update_ca(void) - - static void test_update_wrong(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "tcp_ca_update", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct tcp_ca_update *skel; - struct bpf_link *link; - 
int saved_ca1_cnt; -@@ -338,14 +386,14 @@ static void test_update_wrong(void) - link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - saved_ca1_cnt = skel->bss->ca1_cnt; - ASSERT_GT(saved_ca1_cnt, 0, "ca1_ca1_cnt"); - - err = bpf_link__update_map(link, skel->maps.ca_wrong); - ASSERT_ERR(err, "update_map"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - ASSERT_GT(skel->bss->ca1_cnt, saved_ca1_cnt, "ca2_ca1_cnt"); - - bpf_link__destroy(link); -@@ -354,6 +402,13 @@ static void test_update_wrong(void) - - static void test_mixed_links(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "tcp_ca_update", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct tcp_ca_update *skel; - struct bpf_link *link, *link_nl; - int err; -@@ -368,7 +423,7 @@ static void test_mixed_links(void) - link = bpf_map__attach_struct_ops(skel->maps.ca_update_1); - ASSERT_OK_PTR(link, "attach_struct_ops"); - -- do_test("tcp_ca_update", NULL); -+ do_test(&opts, NULL); - ASSERT_GT(skel->bss->ca1_cnt, 0, "ca1_ca1_cnt"); - - err = bpf_link__update_map(link, skel->maps.ca_no_link); -@@ -455,6 +510,13 @@ static void test_tcp_ca_kfunc(void) - - static void test_cc_cubic(void) - { -+ struct cb_opts cb_opts = { -+ .cc = "bpf_cc_cubic", -+ }; -+ struct network_helper_opts opts = { -+ .post_socket_cb = cc_cb, -+ .cb_opts = &cb_opts, -+ }; - struct bpf_cc_cubic *cc_cubic_skel; - struct bpf_link *link; - -@@ -468,7 +530,7 @@ static void test_cc_cubic(void) - return; - } - -- do_test("bpf_cc_cubic", NULL); -+ do_test(&opts, NULL); - - bpf_link__destroy(link); - bpf_cc_cubic__destroy(cc_cubic_skel); -diff --git a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c -index 4c6ada5b270b..73f669014b69 100644 ---- a/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c -+++ b/tools/testing/selftests/bpf/prog_tests/bpf_verif_scale.c -@@ -45,12 +45,6 @@ static int check_load(const char *file, enum bpf_prog_type type) - return err; - } - --struct scale_test_def { -- const char *file; -- enum bpf_prog_type attach_type; -- bool fails; --}; -- - static void scale_test(const char *file, - enum bpf_prog_type attach_type, - bool should_fail) -diff --git a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c -index 3b7c57fe55a5..08b6391f2f56 100644 ---- a/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c -+++ b/tools/testing/selftests/bpf/prog_tests/ctx_rewrite.c -@@ -69,15 +69,17 @@ static struct test_case test_cases[] = { - { - N(SCHED_CLS, struct __sk_buff, tstamp), - .read = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" -- "w11 &= 3;" -- "if w11 != 0x3 goto pc+2;" -+ "if w11 & 0x4 goto pc+1;" -+ "goto pc+4;" -+ "if w11 & 0x3 goto pc+1;" -+ "goto pc+2;" - "$dst = 0;" - "goto pc+1;" - "$dst = *(u64 *)($ctx + sk_buff::tstamp);", - .write = "r11 = *(u8 *)($ctx + sk_buff::__mono_tc_offset);" -- "if w11 & 0x2 goto pc+1;" -+ "if w11 & 0x4 goto pc+1;" - "goto pc+2;" -- "w11 &= -2;" -+ "w11 &= -4;" - "*(u8 *)($ctx + sk_buff::__mono_tc_offset) = r11;" - "*(u64 *)($ctx + sk_buff::tstamp) = $src;", - }, -diff --git a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c -index 1d3a20f01b60..7cd8be2780ca 100644 ---- a/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c -+++ 
b/tools/testing/selftests/bpf/prog_tests/sockopt_inherit.c -@@ -70,7 +70,7 @@ static void *server_thread(void *arg) - return (void *)(long)err; - } - --static int custom_cb(int fd, const struct post_socket_opts *opts) -+static int custom_cb(int fd, void *opts) - { - char buf; - int err; -diff --git a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c -index b1073d36d77a..327d51f59142 100644 ---- a/tools/testing/selftests/bpf/prog_tests/tc_redirect.c -+++ b/tools/testing/selftests/bpf/prog_tests/tc_redirect.c -@@ -890,9 +890,6 @@ static void test_udp_dtime(struct test_tc_dtime *skel, int family, bool bpf_fwd) - - ASSERT_EQ(dtimes[INGRESS_FWDNS_P100], 0, - dtime_cnt_str(t, INGRESS_FWDNS_P100)); -- /* non mono delivery time is not forwarded */ -- ASSERT_EQ(dtimes[INGRESS_FWDNS_P101], 0, -- dtime_cnt_str(t, INGRESS_FWDNS_P101)); - for (i = EGRESS_FWDNS_P100; i < SET_DTIME; i++) - ASSERT_GT(dtimes[i], 0, dtime_cnt_str(t, i)); - -diff --git a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c -index 29e183a80f49..bbcf12696a6b 100644 ---- a/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c -+++ b/tools/testing/selftests/bpf/prog_tests/test_struct_ops_module.c -@@ -3,9 +3,12 @@ - #include - #include - -+#include -+ - #include "struct_ops_module.skel.h" - #include "struct_ops_nulled_out_cb.skel.h" - #include "struct_ops_forgotten_cb.skel.h" -+#include "struct_ops_detach.skel.h" - - static void check_map_info(struct bpf_map_info *info) - { -@@ -242,6 +245,58 @@ static void test_struct_ops_forgotten_cb(void) - struct_ops_forgotten_cb__destroy(skel); - } - -+/* Detach a link from a user space program */ -+static void test_detach_link(void) -+{ -+ struct epoll_event ev, events[2]; -+ struct struct_ops_detach *skel; -+ struct bpf_link *link = NULL; -+ int fd, epollfd = -1, nfds; -+ int err; -+ -+ skel = struct_ops_detach__open_and_load(); -+ if (!ASSERT_OK_PTR(skel, "struct_ops_detach__open_and_load")) -+ return; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.testmod_do_detach); -+ if (!ASSERT_OK_PTR(link, "attach_struct_ops")) -+ goto cleanup; -+ -+ fd = bpf_link__fd(link); -+ if (!ASSERT_GE(fd, 0, "link_fd")) -+ goto cleanup; -+ -+ epollfd = epoll_create1(0); -+ if (!ASSERT_GE(epollfd, 0, "epoll_create1")) -+ goto cleanup; -+ -+ ev.events = EPOLLHUP; -+ ev.data.fd = fd; -+ err = epoll_ctl(epollfd, EPOLL_CTL_ADD, fd, &ev); -+ if (!ASSERT_OK(err, "epoll_ctl")) -+ goto cleanup; -+ -+ err = bpf_link__detach(link); -+ if (!ASSERT_OK(err, "detach_link")) -+ goto cleanup; -+ -+ /* Wait for EPOLLHUP */ -+ nfds = epoll_wait(epollfd, events, 2, 500); -+ if (!ASSERT_EQ(nfds, 1, "epoll_wait")) -+ goto cleanup; -+ -+ if (!ASSERT_EQ(events[0].data.fd, fd, "epoll_wait_fd")) -+ goto cleanup; -+ if (!ASSERT_TRUE(events[0].events & EPOLLHUP, "events[0].events")) -+ goto cleanup; -+ -+cleanup: -+ if (epollfd >= 0) -+ close(epollfd); -+ bpf_link__destroy(link); -+ struct_ops_detach__destroy(skel); -+} -+ - void serial_test_struct_ops_module(void) - { - if (test__start_subtest("struct_ops_load")) -@@ -254,5 +309,7 @@ void serial_test_struct_ops_module(void) - test_struct_ops_nulled_out_cb(); - if (test__start_subtest("struct_ops_forgotten_cb")) - test_struct_ops_forgotten_cb(); -+ if (test__start_subtest("test_detach_link")) -+ test_detach_link(); - } - -diff --git a/tools/testing/selftests/bpf/prog_tests/verifier.c b/tools/testing/selftests/bpf/prog_tests/verifier.c 
-index 1c9c4ec1be11..6816ff064516 100644 ---- a/tools/testing/selftests/bpf/prog_tests/verifier.c -+++ b/tools/testing/selftests/bpf/prog_tests/verifier.c -@@ -86,6 +86,7 @@ - #include "verifier_xadd.skel.h" - #include "verifier_xdp.skel.h" - #include "verifier_xdp_direct_packet_access.skel.h" -+#include "verifier_bits_iter.skel.h" - - #define MAX_ENTRIES 11 - -@@ -202,6 +203,7 @@ void test_verifier_var_off(void) { RUN(verifier_var_off); } - void test_verifier_xadd(void) { RUN(verifier_xadd); } - void test_verifier_xdp(void) { RUN(verifier_xdp); } - void test_verifier_xdp_direct_packet_access(void) { RUN(verifier_xdp_direct_packet_access); } -+void test_verifier_bits_iter(void) { RUN(verifier_bits_iter); } - - static int init_test_val_map(struct bpf_object *obj, char *map_name) - { -diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c -index c5969ca6f26b..564835ba7d51 100644 ---- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c -+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_array_map.c -@@ -6,12 +6,6 @@ - - char _license[] SEC("license") = "GPL"; - --struct key_t { -- int a; -- int b; -- int c; --}; -- - struct { - __uint(type, BPF_MAP_TYPE_ARRAY); - __uint(max_entries, 3); -diff --git a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c -index 85fa710fad90..9f0e0705b2bf 100644 ---- a/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c -+++ b/tools/testing/selftests/bpf/progs/bpf_iter_bpf_percpu_array_map.c -@@ -6,12 +6,6 @@ - - char _license[] SEC("license") = "GPL"; - --struct key_t { -- int a; -- int b; -- int c; --}; -- - struct { - __uint(type, BPF_MAP_TYPE_PERCPU_ARRAY); - __uint(max_entries, 3); -diff --git a/tools/testing/selftests/bpf/progs/struct_ops_detach.c b/tools/testing/selftests/bpf/progs/struct_ops_detach.c -new file mode 100644 -index 000000000000..56b787a89876 ---- /dev/null -+++ b/tools/testing/selftests/bpf/progs/struct_ops_detach.c -@@ -0,0 +1,10 @@ -+// SPDX-License-Identifier: GPL-2.0 -+/* Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
*/ -+#include -+#include -+#include "../bpf_testmod/bpf_testmod.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct bpf_testmod_ops testmod_do_detach; -diff --git a/tools/testing/selftests/bpf/progs/test_bpf_nf.c b/tools/testing/selftests/bpf/progs/test_bpf_nf.c -index 77ad8adf68da..0289d8ce2b80 100644 ---- a/tools/testing/selftests/bpf/progs/test_bpf_nf.c -+++ b/tools/testing/selftests/bpf/progs/test_bpf_nf.c -@@ -9,10 +9,14 @@ - #define EINVAL 22 - #define ENOENT 2 - -+#define NF_CT_ZONE_DIR_ORIG (1 << IP_CT_DIR_ORIGINAL) -+#define NF_CT_ZONE_DIR_REPL (1 << IP_CT_DIR_REPLY) -+ - extern unsigned long CONFIG_HZ __kconfig; - - int test_einval_bpf_tuple = 0; - int test_einval_reserved = 0; -+int test_einval_reserved_new = 0; - int test_einval_netns_id = 0; - int test_einval_len_opts = 0; - int test_eproto_l4proto = 0; -@@ -22,6 +26,11 @@ int test_eafnosupport = 0; - int test_alloc_entry = -EINVAL; - int test_insert_entry = -EAFNOSUPPORT; - int test_succ_lookup = -ENOENT; -+int test_ct_zone_id_alloc_entry = -EINVAL; -+int test_ct_zone_id_insert_entry = -EAFNOSUPPORT; -+int test_ct_zone_id_succ_lookup = -ENOENT; -+int test_ct_zone_dir_enoent_lookup = 0; -+int test_ct_zone_id_enoent_lookup = 0; - u32 test_delta_timeout = 0; - u32 test_status = 0; - u32 test_insert_lookup_mark = 0; -@@ -45,6 +54,17 @@ struct bpf_ct_opts___local { - s32 netns_id; - s32 error; - u8 l4proto; -+ u8 dir; -+ u8 reserved[2]; -+}; -+ -+struct bpf_ct_opts___new { -+ s32 netns_id; -+ s32 error; -+ u8 l4proto; -+ u8 dir; -+ u16 ct_zone_id; -+ u8 ct_zone_dir; - u8 reserved[3]; - } __attribute__((preserve_access_index)); - -@@ -220,10 +240,97 @@ nf_ct_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, - } - } - -+static __always_inline void -+nf_ct_opts_new_test(struct nf_conn *(*lookup_fn)(void *, struct bpf_sock_tuple *, u32, -+ struct bpf_ct_opts___new *, u32), -+ struct nf_conn *(*alloc_fn)(void *, struct bpf_sock_tuple *, u32, -+ struct bpf_ct_opts___new *, u32), -+ void *ctx) -+{ -+ struct bpf_ct_opts___new opts_def = { .l4proto = IPPROTO_TCP, .netns_id = -1 }; -+ struct bpf_sock_tuple bpf_tuple; -+ struct nf_conn *ct; -+ -+ __builtin_memset(&bpf_tuple, 0, sizeof(bpf_tuple.ipv4)); -+ -+ opts_def.reserved[0] = 1; -+ ct = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, -+ sizeof(opts_def)); -+ opts_def.reserved[0] = 0; -+ if (ct) -+ bpf_ct_release(ct); -+ else -+ test_einval_reserved_new = opts_def.error; -+ -+ bpf_tuple.ipv4.saddr = bpf_get_prandom_u32(); /* src IP */ -+ bpf_tuple.ipv4.daddr = bpf_get_prandom_u32(); /* dst IP */ -+ bpf_tuple.ipv4.sport = bpf_get_prandom_u32(); /* src port */ -+ bpf_tuple.ipv4.dport = bpf_get_prandom_u32(); /* dst port */ -+ -+ /* use non-default ct zone */ -+ opts_def.ct_zone_id = 10; -+ opts_def.ct_zone_dir = NF_CT_ZONE_DIR_ORIG; -+ ct = alloc_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), &opts_def, -+ sizeof(opts_def)); -+ if (ct) { -+ __u16 sport = bpf_get_prandom_u32(); -+ __u16 dport = bpf_get_prandom_u32(); -+ union nf_inet_addr saddr = {}; -+ union nf_inet_addr daddr = {}; -+ struct nf_conn *ct_ins; -+ -+ bpf_ct_set_timeout(ct, 10000); -+ -+ /* snat */ -+ saddr.ip = bpf_get_prandom_u32(); -+ bpf_ct_set_nat_info(ct, &saddr, sport, NF_NAT_MANIP_SRC___local); -+ /* dnat */ -+ daddr.ip = bpf_get_prandom_u32(); -+ bpf_ct_set_nat_info(ct, &daddr, dport, NF_NAT_MANIP_DST___local); -+ -+ ct_ins = bpf_ct_insert_entry(ct); -+ if (ct_ins) { -+ struct nf_conn *ct_lk; -+ -+ /* entry should exist in same ct zone we inserted it 
*/ -+ ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), -+ &opts_def, sizeof(opts_def)); -+ if (ct_lk) { -+ bpf_ct_release(ct_lk); -+ test_ct_zone_id_succ_lookup = 0; -+ } -+ -+ /* entry should not exist with wrong direction */ -+ opts_def.ct_zone_dir = NF_CT_ZONE_DIR_REPL; -+ ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), -+ &opts_def, sizeof(opts_def)); -+ opts_def.ct_zone_dir = NF_CT_ZONE_DIR_ORIG; -+ if (ct_lk) -+ bpf_ct_release(ct_lk); -+ else -+ test_ct_zone_dir_enoent_lookup = opts_def.error; -+ -+ /* entry should not exist in default ct zone */ -+ opts_def.ct_zone_id = 0; -+ ct_lk = lookup_fn(ctx, &bpf_tuple, sizeof(bpf_tuple.ipv4), -+ &opts_def, sizeof(opts_def)); -+ if (ct_lk) -+ bpf_ct_release(ct_lk); -+ else -+ test_ct_zone_id_enoent_lookup = opts_def.error; -+ -+ bpf_ct_release(ct_ins); -+ test_ct_zone_id_insert_entry = 0; -+ } -+ test_ct_zone_id_alloc_entry = 0; -+ } -+} -+ - SEC("xdp") - int nf_xdp_ct_test(struct xdp_md *ctx) - { - nf_ct_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_alloc, ctx); -+ nf_ct_opts_new_test((void *)bpf_xdp_ct_lookup, (void *)bpf_xdp_ct_alloc, ctx); - return 0; - } - -@@ -231,6 +338,7 @@ SEC("tc") - int nf_skb_ct_test(struct __sk_buff *ctx) - { - nf_ct_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_alloc, ctx); -+ nf_ct_opts_new_test((void *)bpf_skb_ct_lookup, (void *)bpf_skb_ct_alloc, ctx); - return 0; - } - -diff --git a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h -index 99d2ea9fb658..f48f85f1bd70 100644 ---- a/tools/testing/selftests/bpf/progs/test_sockmap_kern.h -+++ b/tools/testing/selftests/bpf/progs/test_sockmap_kern.h -@@ -92,7 +92,7 @@ struct { - __uint(value_size, sizeof(int)); - } tls_sock_map SEC(".maps"); - --SEC("sk_skb1") -+SEC("sk_skb/stream_parser") - int bpf_prog1(struct __sk_buff *skb) - { - int *f, two = 2; -@@ -104,7 +104,7 @@ int bpf_prog1(struct __sk_buff *skb) - return skb->len; - } - --SEC("sk_skb2") -+SEC("sk_skb/stream_verdict") - int bpf_prog2(struct __sk_buff *skb) - { - __u32 lport = skb->local_port; -@@ -151,7 +151,7 @@ static inline void bpf_write_pass(struct __sk_buff *skb, int offset) - memcpy(c + offset, "PASS", 4); - } - --SEC("sk_skb3") -+SEC("sk_skb/stream_verdict") - int bpf_prog3(struct __sk_buff *skb) - { - int err, *f, ret = SK_PASS; -@@ -177,9 +177,6 @@ int bpf_prog3(struct __sk_buff *skb) - return bpf_sk_redirect_hash(skb, &tls_sock_map, &ret, flags); - #endif - } -- f = bpf_map_lookup_elem(&sock_skb_opts, &one); -- if (f && *f) -- ret = SK_DROP; - err = bpf_skb_adjust_room(skb, 4, 0, 0); - if (err) - return SK_DROP; -@@ -233,7 +230,7 @@ int bpf_sockmap(struct bpf_sock_ops *skops) - return 0; - } - --SEC("sk_msg1") -+SEC("sk_msg") - int bpf_prog4(struct sk_msg_md *msg) - { - int *bytes, zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5; -@@ -263,7 +260,7 @@ int bpf_prog4(struct sk_msg_md *msg) - return SK_PASS; - } - --SEC("sk_msg2") -+SEC("sk_msg") - int bpf_prog6(struct sk_msg_md *msg) - { - int zero = 0, one = 1, two = 2, three = 3, four = 4, five = 5, key = 0; -@@ -308,7 +305,7 @@ int bpf_prog6(struct sk_msg_md *msg) - #endif - } - --SEC("sk_msg3") -+SEC("sk_msg") - int bpf_prog8(struct sk_msg_md *msg) - { - void *data_end = (void *)(long) msg->data_end; -@@ -329,7 +326,8 @@ int bpf_prog8(struct sk_msg_md *msg) - - return SK_PASS; - } --SEC("sk_msg4") -+ -+SEC("sk_msg") - int bpf_prog9(struct sk_msg_md *msg) - { - void *data_end = (void *)(long) msg->data_end; -@@ -347,7 +345,7 @@ int 
bpf_prog9(struct sk_msg_md *msg) - return SK_PASS; - } - --SEC("sk_msg5") -+SEC("sk_msg") - int bpf_prog10(struct sk_msg_md *msg) - { - int *bytes, *start, *end, *start_push, *end_push, *start_pop, *pop; -diff --git a/tools/testing/selftests/bpf/progs/test_tc_dtime.c b/tools/testing/selftests/bpf/progs/test_tc_dtime.c -index 74ec09f040b7..ca8e8734d901 100644 ---- a/tools/testing/selftests/bpf/progs/test_tc_dtime.c -+++ b/tools/testing/selftests/bpf/progs/test_tc_dtime.c -@@ -222,17 +222,21 @@ int egress_host(struct __sk_buff *skb) - return TC_ACT_OK; - - if (skb_proto(skb_type) == IPPROTO_TCP) { -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC && - skb->tstamp) - inc_dtimes(EGRESS_ENDHOST); - else - inc_errs(EGRESS_ENDHOST); -- } else { -- if (skb->tstamp_type == BPF_SKB_TSTAMP_UNSPEC && -+ } else if (skb_proto(skb_type) == IPPROTO_UDP) { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_TAI && - skb->tstamp) - inc_dtimes(EGRESS_ENDHOST); - else - inc_errs(EGRESS_ENDHOST); -+ } else { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_REALTIME && -+ skb->tstamp) -+ inc_errs(EGRESS_ENDHOST); - } - - skb->tstamp = EGRESS_ENDHOST_MAGIC; -@@ -252,7 +256,7 @@ int ingress_host(struct __sk_buff *skb) - if (!skb_type) - return TC_ACT_OK; - -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO && -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC && - skb->tstamp == EGRESS_FWDNS_MAGIC) - inc_dtimes(INGRESS_ENDHOST); - else -@@ -315,7 +319,6 @@ int egress_fwdns_prio100(struct __sk_buff *skb) - SEC("tc") - int ingress_fwdns_prio101(struct __sk_buff *skb) - { -- __u64 expected_dtime = EGRESS_ENDHOST_MAGIC; - int skb_type; - - skb_type = skb_get_type(skb); -@@ -323,29 +326,24 @@ int ingress_fwdns_prio101(struct __sk_buff *skb) - /* Should have handled in prio100 */ - return TC_ACT_SHOT; - -- if (skb_proto(skb_type) == IPPROTO_UDP) -- expected_dtime = 0; -- - if (skb->tstamp_type) { - if (fwdns_clear_dtime() || -- skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || -- skb->tstamp != expected_dtime) -+ (skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC && -+ skb->tstamp_type != BPF_SKB_CLOCK_TAI) || -+ skb->tstamp != EGRESS_ENDHOST_MAGIC) - inc_errs(INGRESS_FWDNS_P101); - else - inc_dtimes(INGRESS_FWDNS_P101); - } else { -- if (!fwdns_clear_dtime() && expected_dtime) -+ if (!fwdns_clear_dtime()) - inc_errs(INGRESS_FWDNS_P101); - } - -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) { - skb->tstamp = INGRESS_FWDNS_MAGIC; - } else { - if (bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, -- BPF_SKB_TSTAMP_DELIVERY_MONO)) -- inc_errs(SET_DTIME); -- if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, -- BPF_SKB_TSTAMP_UNSPEC)) -+ BPF_SKB_CLOCK_MONOTONIC)) - inc_errs(SET_DTIME); - } - -@@ -370,7 +368,7 @@ int egress_fwdns_prio101(struct __sk_buff *skb) - - if (skb->tstamp_type) { - if (fwdns_clear_dtime() || -- skb->tstamp_type != BPF_SKB_TSTAMP_DELIVERY_MONO || -+ skb->tstamp_type != BPF_SKB_CLOCK_MONOTONIC || - skb->tstamp != INGRESS_FWDNS_MAGIC) - inc_errs(EGRESS_FWDNS_P101); - else -@@ -380,14 +378,11 @@ int egress_fwdns_prio101(struct __sk_buff *skb) - inc_errs(EGRESS_FWDNS_P101); - } - -- if (skb->tstamp_type == BPF_SKB_TSTAMP_DELIVERY_MONO) { -+ if (skb->tstamp_type == BPF_SKB_CLOCK_MONOTONIC) { - skb->tstamp = EGRESS_FWDNS_MAGIC; - } else { - if (bpf_skb_set_tstamp(skb, EGRESS_FWDNS_MAGIC, -- BPF_SKB_TSTAMP_DELIVERY_MONO)) -- inc_errs(SET_DTIME); -- if (!bpf_skb_set_tstamp(skb, INGRESS_FWDNS_MAGIC, -- 
BPF_SKB_TSTAMP_UNSPEC)) -+ BPF_SKB_CLOCK_MONOTONIC)) - inc_errs(SET_DTIME); - } - -diff --git a/tools/testing/selftests/bpf/progs/verifier_bits_iter.c b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c -new file mode 100644 -index 000000000000..716113c2bce2 ---- /dev/null -+++ b/tools/testing/selftests/bpf/progs/verifier_bits_iter.c -@@ -0,0 +1,153 @@ -+// SPDX-License-Identifier: GPL-2.0-only -+/* Copyright (c) 2024 Yafang Shao */ -+ -+#include "vmlinux.h" -+#include -+#include -+ -+#include "bpf_misc.h" -+#include "task_kfunc_common.h" -+ -+char _license[] SEC("license") = "GPL"; -+ -+int bpf_iter_bits_new(struct bpf_iter_bits *it, const u64 *unsafe_ptr__ign, -+ u32 nr_bits) __ksym __weak; -+int *bpf_iter_bits_next(struct bpf_iter_bits *it) __ksym __weak; -+void bpf_iter_bits_destroy(struct bpf_iter_bits *it) __ksym __weak; -+ -+SEC("iter.s/cgroup") -+__description("bits iter without destroy") -+__failure __msg("Unreleased reference") -+int BPF_PROG(no_destroy, struct bpf_iter_meta *meta, struct cgroup *cgrp) -+{ -+ struct bpf_iter_bits it; -+ u64 data = 1; -+ -+ bpf_iter_bits_new(&it, &data, 1); -+ bpf_iter_bits_next(&it); -+ return 0; -+} -+ -+SEC("iter/cgroup") -+__description("uninitialized iter in ->next()") -+__failure __msg("expected an initialized iter_bits as arg #1") -+int BPF_PROG(next_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) -+{ -+ struct bpf_iter_bits *it = NULL; -+ -+ bpf_iter_bits_next(it); -+ return 0; -+} -+ -+SEC("iter/cgroup") -+__description("uninitialized iter in ->destroy()") -+__failure __msg("expected an initialized iter_bits as arg #1") -+int BPF_PROG(destroy_uninit, struct bpf_iter_meta *meta, struct cgroup *cgrp) -+{ -+ struct bpf_iter_bits it = {}; -+ -+ bpf_iter_bits_destroy(&it); -+ return 0; -+} -+ -+SEC("syscall") -+__description("null pointer") -+__success __retval(0) -+int null_pointer(void) -+{ -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, NULL, 1) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("bits copy") -+__success __retval(10) -+int bits_copy(void) -+{ -+ u64 data = 0xf7310UL; /* 4 + 3 + 2 + 1 + 0*/ -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data, 1) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("bits memalloc") -+__success __retval(64) -+int bits_memalloc(void) -+{ -+ u64 data[2]; -+ int nr = 0; -+ int *bit; -+ -+ __builtin_memset(&data, 0xf0, sizeof(data)); /* 4 * 16 */ -+ bpf_for_each(bits, bit, &data[0], sizeof(data) / sizeof(u64)) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("bit index") -+__success __retval(8) -+int bit_index(void) -+{ -+ u64 data = 0x100; -+ int bit_idx = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data, 1) { -+ if (*bit == 0) -+ continue; -+ bit_idx = *bit; -+ } -+ return bit_idx; -+} -+ -+SEC("syscall") -+__description("bits nomem") -+__success __retval(0) -+int bits_nomem(void) -+{ -+ u64 data[4]; -+ int nr = 0; -+ int *bit; -+ -+ __builtin_memset(&data, 0xff, sizeof(data)); -+ bpf_for_each(bits, bit, &data[0], 513) /* Be greater than 512 */ -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("fewer words") -+__success __retval(1) -+int fewer_words(void) -+{ -+ u64 data[2] = {0x1, 0xff}; -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data[0], 1) -+ nr++; -+ return nr; -+} -+ -+SEC("syscall") -+__description("zero words") -+__success __retval(0) -+int zero_words(void) -+{ -+ u64 data[2] = {0x1, 0xff}; -+ int nr = 0; -+ int *bit; -+ -+ bpf_for_each(bits, bit, &data[0], 0) -+ nr++; -+ return nr; -+} 
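For reference, the usage pattern these bits-iterator tests exercise, pulled out into one place: the third argument to bpf_iter_bits_new() is a count of u64 words (the bits_nomem case above shows that asking for more than 512 words simply yields no iterations), bpf_for_each(bits, ...) wraps the new/next/destroy kfuncs, and each iteration yields the index of the next set bit. A minimal sketch, assuming the same includes and helper declarations as the tests above; the program name and expected return value are illustrative:

SEC("syscall")
__description("highest set bit")
__success __retval(127)
int highest_set_bit(void)
{
	u64 words[2] = { 0, 0x8000000000000000ULL };	/* only bit 127 is set */
	int last = -1;
	int *bit;

	/* *bit is the bit index; with a single set bit the loop body runs once */
	bpf_for_each(bits, bit, &words[0], 2)
		last = *bit;

	return last;
}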
-diff --git a/tools/testing/selftests/bpf/test_sockmap.c b/tools/testing/selftests/bpf/test_sockmap.c -index 92752f5eeded..9cba4ec844a5 100644 ---- a/tools/testing/selftests/bpf/test_sockmap.c -+++ b/tools/testing/selftests/bpf/test_sockmap.c -@@ -63,7 +63,8 @@ int passed; - int failed; - int map_fd[9]; - struct bpf_map *maps[9]; --int prog_fd[11]; -+struct bpf_program *progs[9]; -+struct bpf_link *links[9]; - - int txmsg_pass; - int txmsg_redir; -@@ -680,7 +681,8 @@ static int msg_loop(int fd, int iov_count, int iov_length, int cnt, - } - } - -- s->bytes_recvd += recv; -+ if (recv > 0) -+ s->bytes_recvd += recv; - - if (opt->check_recved_len && s->bytes_recvd > total_bytes) { - errno = EMSGSIZE; -@@ -952,7 +954,8 @@ enum { - - static int run_options(struct sockmap_options *options, int cg_fd, int test) - { -- int i, key, next_key, err, tx_prog_fd = -1, zero = 0; -+ int i, key, next_key, err, zero = 0; -+ struct bpf_program *tx_prog; - - /* If base test skip BPF setup */ - if (test == BASE || test == BASE_SENDPAGE) -@@ -960,48 +963,44 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) - - /* Attach programs to sockmap */ - if (!txmsg_omit_skb_parser) { -- err = bpf_prog_attach(prog_fd[0], map_fd[0], -- BPF_SK_SKB_STREAM_PARSER, 0); -- if (err) { -+ links[0] = bpf_program__attach_sockmap(progs[0], map_fd[0]); -+ if (!links[0]) { - fprintf(stderr, -- "ERROR: bpf_prog_attach (sockmap %i->%i): %d (%s)\n", -- prog_fd[0], map_fd[0], err, strerror(errno)); -- return err; -+ "ERROR: bpf_program__attach_sockmap (sockmap %i->%i): (%s)\n", -+ bpf_program__fd(progs[0]), map_fd[0], strerror(errno)); -+ return -1; - } - } - -- err = bpf_prog_attach(prog_fd[1], map_fd[0], -- BPF_SK_SKB_STREAM_VERDICT, 0); -- if (err) { -- fprintf(stderr, "ERROR: bpf_prog_attach (sockmap): %d (%s)\n", -- err, strerror(errno)); -- return err; -+ links[1] = bpf_program__attach_sockmap(progs[1], map_fd[0]); -+ if (!links[1]) { -+ fprintf(stderr, "ERROR: bpf_program__attach_sockmap (sockmap): (%s)\n", -+ strerror(errno)); -+ return -1; - } - - /* Attach programs to TLS sockmap */ - if (txmsg_ktls_skb) { - if (!txmsg_omit_skb_parser) { -- err = bpf_prog_attach(prog_fd[0], map_fd[8], -- BPF_SK_SKB_STREAM_PARSER, 0); -- if (err) { -+ links[2] = bpf_program__attach_sockmap(progs[0], map_fd[8]); -+ if (!links[2]) { - fprintf(stderr, -- "ERROR: bpf_prog_attach (TLS sockmap %i->%i): %d (%s)\n", -- prog_fd[0], map_fd[8], err, strerror(errno)); -- return err; -+ "ERROR: bpf_program__attach_sockmap (TLS sockmap %i->%i): (%s)\n", -+ bpf_program__fd(progs[0]), map_fd[8], strerror(errno)); -+ return -1; - } - } - -- err = bpf_prog_attach(prog_fd[2], map_fd[8], -- BPF_SK_SKB_STREAM_VERDICT, 0); -- if (err) { -- fprintf(stderr, "ERROR: bpf_prog_attach (TLS sockmap): %d (%s)\n", -- err, strerror(errno)); -- return err; -+ links[3] = bpf_program__attach_sockmap(progs[2], map_fd[8]); -+ if (!links[3]) { -+ fprintf(stderr, "ERROR: bpf_program__attach_sockmap (TLS sockmap): (%s)\n", -+ strerror(errno)); -+ return -1; - } - } - - /* Attach to cgroups */ -- err = bpf_prog_attach(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS, 0); -+ err = bpf_prog_attach(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS, 0); - if (err) { - fprintf(stderr, "ERROR: bpf_prog_attach (groups): %d (%s)\n", - err, strerror(errno)); -@@ -1017,30 +1016,31 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) - - /* Attach txmsg program to sockmap */ - if (txmsg_pass) -- tx_prog_fd = prog_fd[4]; -+ tx_prog = progs[4]; - 
else if (txmsg_redir) -- tx_prog_fd = prog_fd[5]; -+ tx_prog = progs[5]; - else if (txmsg_apply) -- tx_prog_fd = prog_fd[6]; -+ tx_prog = progs[6]; - else if (txmsg_cork) -- tx_prog_fd = prog_fd[7]; -+ tx_prog = progs[7]; - else if (txmsg_drop) -- tx_prog_fd = prog_fd[8]; -+ tx_prog = progs[8]; - else -- tx_prog_fd = 0; -+ tx_prog = NULL; - -- if (tx_prog_fd) { -- int redir_fd, i = 0; -+ if (tx_prog) { -+ int redir_fd; - -- err = bpf_prog_attach(tx_prog_fd, -- map_fd[1], BPF_SK_MSG_VERDICT, 0); -- if (err) { -+ links[4] = bpf_program__attach_sockmap(tx_prog, map_fd[1]); -+ if (!links[4]) { - fprintf(stderr, -- "ERROR: bpf_prog_attach (txmsg): %d (%s)\n", -- err, strerror(errno)); -+ "ERROR: bpf_program__attach_sockmap (txmsg): (%s)\n", -+ strerror(errno)); -+ err = -1; - goto out; - } - -+ i = 0; - err = bpf_map_update_elem(map_fd[1], &i, &c1, BPF_ANY); - if (err) { - fprintf(stderr, -@@ -1279,16 +1279,14 @@ static int run_options(struct sockmap_options *options, int cg_fd, int test) - fprintf(stderr, "unknown test\n"); - out: - /* Detatch and zero all the maps */ -- bpf_prog_detach2(prog_fd[3], cg_fd, BPF_CGROUP_SOCK_OPS); -- bpf_prog_detach2(prog_fd[0], map_fd[0], BPF_SK_SKB_STREAM_PARSER); -- bpf_prog_detach2(prog_fd[1], map_fd[0], BPF_SK_SKB_STREAM_VERDICT); -- bpf_prog_detach2(prog_fd[0], map_fd[8], BPF_SK_SKB_STREAM_PARSER); -- bpf_prog_detach2(prog_fd[2], map_fd[8], BPF_SK_SKB_STREAM_VERDICT); -+ bpf_prog_detach2(bpf_program__fd(progs[3]), cg_fd, BPF_CGROUP_SOCK_OPS); - -- if (tx_prog_fd >= 0) -- bpf_prog_detach2(tx_prog_fd, map_fd[1], BPF_SK_MSG_VERDICT); -+ for (i = 0; i < ARRAY_SIZE(links); i++) { -+ if (links[i]) -+ bpf_link__detach(links[i]); -+ } - -- for (i = 0; i < 8; i++) { -+ for (i = 0; i < ARRAY_SIZE(map_fd); i++) { - key = next_key = 0; - bpf_map_update_elem(map_fd[i], &key, &zero, BPF_ANY); - while (bpf_map_get_next_key(map_fd[i], &key, &next_key) == 0) { -@@ -1783,34 +1781,6 @@ char *map_names[] = { - "tls_sock_map", - }; - --int prog_attach_type[] = { -- BPF_SK_SKB_STREAM_PARSER, -- BPF_SK_SKB_STREAM_VERDICT, -- BPF_SK_SKB_STREAM_VERDICT, -- BPF_CGROUP_SOCK_OPS, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, -- BPF_SK_MSG_VERDICT, --}; -- --int prog_type[] = { -- BPF_PROG_TYPE_SK_SKB, -- BPF_PROG_TYPE_SK_SKB, -- BPF_PROG_TYPE_SK_SKB, -- BPF_PROG_TYPE_SOCK_OPS, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, -- BPF_PROG_TYPE_SK_MSG, --}; -- - static int populate_progs(char *bpf_file) - { - struct bpf_program *prog; -@@ -1829,17 +1799,10 @@ static int populate_progs(char *bpf_file) - return -1; - } - -- bpf_object__for_each_program(prog, obj) { -- bpf_program__set_type(prog, prog_type[i]); -- bpf_program__set_expected_attach_type(prog, -- prog_attach_type[i]); -- i++; -- } -- - i = bpf_object__load(obj); - i = 0; - bpf_object__for_each_program(prog, obj) { -- prog_fd[i] = bpf_program__fd(prog); -+ progs[i] = prog; - i++; - } - -@@ -1853,6 +1816,9 @@ static int populate_progs(char *bpf_file) - } - } - -+ for (i = 0; i < ARRAY_SIZE(links); i++) -+ links[i] = NULL; -+ - return 0; - } - -diff --git a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c -index 7b5fc98838cd..aebc58c24dc5 100644 ---- a/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c -+++ 
b/tools/testing/selftests/bpf/test_tcp_check_syncookie_user.c -@@ -139,14 +139,14 @@ static int run_test(int server_fd, int results_fd, bool xdp) - return ret; - } - --static int v6only_true(int fd, const struct post_socket_opts *opts) -+static int v6only_true(int fd, void *opts) - { - int mode = true; - - return setsockopt(fd, IPPROTO_IPV6, IPV6_V6ONLY, &mode, sizeof(mode)); - } - --static int v6only_false(int fd, const struct post_socket_opts *opts) -+static int v6only_false(int fd, void *opts) - { - int mode = false; - -diff --git a/tools/testing/selftests/bpf/test_verifier.c b/tools/testing/selftests/bpf/test_verifier.c -index df04bda1c927..610392dfc4fb 100644 ---- a/tools/testing/selftests/bpf/test_verifier.c -+++ b/tools/testing/selftests/bpf/test_verifier.c -@@ -1237,11 +1237,6 @@ static void do_test_fixup(struct bpf_test *test, enum bpf_prog_type prog_type, - fixup_prog_kfuncs(prog, fd_array, test->fixup_kfunc_btf_id); - } - --struct libcap { -- struct __user_cap_header_struct hdr; -- struct __user_cap_data_struct data[2]; --}; -- - static int set_admin(bool admin) - { - int err; -diff --git a/tools/testing/selftests/sched_ext/.gitignore b/tools/testing/selftests/sched_ext/.gitignore -new file mode 100644 -index 000000000000..ae5491a114c0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/.gitignore -@@ -0,0 +1,6 @@ -+* -+!*.c -+!*.h -+!Makefile -+!.gitignore -+!config -diff --git a/tools/testing/selftests/sched_ext/Makefile b/tools/testing/selftests/sched_ext/Makefile -new file mode 100644 -index 000000000000..0754a2c110a1 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/Makefile -@@ -0,0 +1,218 @@ -+# SPDX-License-Identifier: GPL-2.0 -+# Copyright (c) 2022 Meta Platforms, Inc. and affiliates. -+include ../../../build/Build.include -+include ../../../scripts/Makefile.arch -+include ../../../scripts/Makefile.include -+include ../lib.mk -+ -+ifneq ($(LLVM),) -+ifneq ($(filter %/,$(LLVM)),) -+LLVM_PREFIX := $(LLVM) -+else ifneq ($(filter -%,$(LLVM)),) -+LLVM_SUFFIX := $(LLVM) -+endif -+ -+CC := $(LLVM_PREFIX)clang$(LLVM_SUFFIX) $(CLANG_FLAGS) -fintegrated-as -+else -+CC := gcc -+endif # LLVM -+ -+ifneq ($(CROSS_COMPILE),) -+$(error CROSS_COMPILE not supported for scx selftests) -+endif # CROSS_COMPILE -+ -+CURDIR := $(abspath .) -+REPOROOT := $(abspath ../../../..) 
-+TOOLSDIR := $(REPOROOT)/tools -+LIBDIR := $(TOOLSDIR)/lib -+BPFDIR := $(LIBDIR)/bpf -+TOOLSINCDIR := $(TOOLSDIR)/include -+BPFTOOLDIR := $(TOOLSDIR)/bpf/bpftool -+APIDIR := $(TOOLSINCDIR)/uapi -+GENDIR := $(REPOROOT)/include/generated -+GENHDR := $(GENDIR)/autoconf.h -+SCXTOOLSDIR := $(TOOLSDIR)/sched_ext -+SCXTOOLSINCDIR := $(TOOLSDIR)/sched_ext/include -+ -+OUTPUT_DIR := $(CURDIR)/build -+OBJ_DIR := $(OUTPUT_DIR)/obj -+INCLUDE_DIR := $(OUTPUT_DIR)/include -+BPFOBJ_DIR := $(OBJ_DIR)/libbpf -+SCXOBJ_DIR := $(OBJ_DIR)/sched_ext -+BPFOBJ := $(BPFOBJ_DIR)/libbpf.a -+LIBBPF_OUTPUT := $(OBJ_DIR)/libbpf/libbpf.a -+DEFAULT_BPFTOOL := $(OUTPUT_DIR)/sbin/bpftool -+HOST_BUILD_DIR := $(OBJ_DIR) -+HOST_OUTPUT_DIR := $(OUTPUT_DIR) -+ -+VMLINUX_BTF_PATHS ?= ../../../../vmlinux \ -+ /sys/kernel/btf/vmlinux \ -+ /boot/vmlinux-$(shell uname -r) -+VMLINUX_BTF ?= $(abspath $(firstword $(wildcard $(VMLINUX_BTF_PATHS)))) -+ifeq ($(VMLINUX_BTF),) -+$(error Cannot find a vmlinux for VMLINUX_BTF at any of "$(VMLINUX_BTF_PATHS)") -+endif -+ -+BPFTOOL ?= $(DEFAULT_BPFTOOL) -+ -+ifneq ($(wildcard $(GENHDR)),) -+ GENFLAGS := -DHAVE_GENHDR -+endif -+ -+CFLAGS += -g -O2 -rdynamic -pthread -Wall -Werror $(GENFLAGS) \ -+ -I$(INCLUDE_DIR) -I$(GENDIR) -I$(LIBDIR) \ -+ -I$(TOOLSINCDIR) -I$(APIDIR) -I$(CURDIR)/include -I$(SCXTOOLSINCDIR) -+ -+# Silence some warnings when compiled with clang -+ifneq ($(LLVM),) -+CFLAGS += -Wno-unused-command-line-argument -+endif -+ -+LDFLAGS = -lelf -lz -lpthread -lzstd -+ -+IS_LITTLE_ENDIAN = $(shell $(CC) -dM -E - &1 \ -+ | sed -n '/<...> search starts here:/,/End of search list./{ s| \(/.*\)|-idirafter \1|p }') \ -+$(shell $(1) -dM -E - $@ -+else -+ $(call msg,CP,,$@) -+ $(Q)cp "$(VMLINUX_H)" $@ -+endif -+ -+$(SCXOBJ_DIR)/%.bpf.o: %.bpf.c $(INCLUDE_DIR)/vmlinux.h | $(BPFOBJ) $(SCXOBJ_DIR) -+ $(call msg,CLNG-BPF,,$(notdir $@)) -+ $(Q)$(CLANG) $(BPF_CFLAGS) -target bpf -c $< -o $@ -+ -+$(INCLUDE_DIR)/%.bpf.skel.h: $(SCXOBJ_DIR)/%.bpf.o $(INCLUDE_DIR)/vmlinux.h $(BPFTOOL) | $(INCLUDE_DIR) -+ $(eval sched=$(notdir $@)) -+ $(call msg,GEN-SKEL,,$(sched)) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked1.o) $< -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked2.o) $(<:.o=.linked1.o) -+ $(Q)$(BPFTOOL) gen object $(<:.o=.linked3.o) $(<:.o=.linked2.o) -+ $(Q)diff $(<:.o=.linked2.o) $(<:.o=.linked3.o) -+ $(Q)$(BPFTOOL) gen skeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $@ -+ $(Q)$(BPFTOOL) gen subskeleton $(<:.o=.linked3.o) name $(subst .bpf.skel.h,,$(sched)) > $(@:.skel.h=.subskel.h) -+ -+################ -+# C schedulers # -+################ -+ -+override define CLEAN -+ rm -rf $(OUTPUT_DIR) -+ rm -f *.o *.bpf.o *.bpf.skel.h *.bpf.subskel.h -+ rm -f $(TEST_GEN_PROGS) -+ rm -f runner -+endef -+ -+# Every testcase takes all of the BPF progs are dependencies by default. This -+# allows testcases to load any BPF scheduler, which is useful for testcases -+# that don't need their own prog to run their test. 
-+all_test_bpfprogs := $(foreach prog,$(wildcard *.bpf.c),$(INCLUDE_DIR)/$(patsubst %.c,%.skel.h,$(prog))) -+ -+auto-test-targets := \ -+ create_dsq \ -+ enq_last_no_enq_fails \ -+ enq_select_cpu_fails \ -+ ddsp_bogus_dsq_fail \ -+ ddsp_vtimelocal_fail \ -+ dsp_local_on \ -+ exit \ -+ hotplug \ -+ init_enable_count \ -+ maximal \ -+ maybe_null \ -+ minimal \ -+ prog_run \ -+ reload_loop \ -+ select_cpu_dfl \ -+ select_cpu_dfl_nodispatch \ -+ select_cpu_dispatch \ -+ select_cpu_dispatch_bad_dsq \ -+ select_cpu_dispatch_dbl_dsp \ -+ select_cpu_vtime \ -+ test_example \ -+ -+testcase-targets := $(addsuffix .o,$(addprefix $(SCXOBJ_DIR)/,$(auto-test-targets))) -+ -+$(SCXOBJ_DIR)/runner.o: runner.c | $(SCXOBJ_DIR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+# Create all of the test targets object files, whose testcase objects will be -+# registered into the runner in ELF constructors. -+# -+# Note that we must do double expansion here in order to support conditionally -+# compiling BPF object files only if one is present, as the wildcard Make -+# function doesn't support using implicit rules otherwise. -+$(testcase-targets): $(SCXOBJ_DIR)/%.o: %.c $(SCXOBJ_DIR)/runner.o $(all_test_bpfprogs) | $(SCXOBJ_DIR) -+ $(eval test=$(patsubst %.o,%.c,$(notdir $@))) -+ $(CC) $(CFLAGS) -c $< -o $@ $(SCXOBJ_DIR)/runner.o -+ -+$(SCXOBJ_DIR)/util.o: util.c | $(SCXOBJ_DIR) -+ $(CC) $(CFLAGS) -c $< -o $@ -+ -+runner: $(SCXOBJ_DIR)/runner.o $(SCXOBJ_DIR)/util.o $(BPFOBJ) $(testcase-targets) -+ @echo "$(testcase-targets)" -+ $(CC) $(CFLAGS) -o $@ $^ $(LDFLAGS) -+ -+TEST_GEN_PROGS := runner -+ -+all: runner -+ -+.PHONY: all clean help -+ -+.DEFAULT_GOAL := all -+ -+.DELETE_ON_ERROR: -+ -+.SECONDARY: -diff --git a/tools/testing/selftests/sched_ext/config b/tools/testing/selftests/sched_ext/config -new file mode 100644 -index 000000000000..0de9b4ee249d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/config -@@ -0,0 +1,9 @@ -+CONFIG_SCHED_DEBUG=y -+CONFIG_SCHED_CLASS_EXT=y -+CONFIG_CGROUPS=y -+CONFIG_CGROUP_SCHED=y -+CONFIG_EXT_GROUP_SCHED=y -+CONFIG_BPF=y -+CONFIG_BPF_SYSCALL=y -+CONFIG_DEBUG_INFO=y -+CONFIG_DEBUG_INFO_BTF=y -diff --git a/tools/testing/selftests/sched_ext/create_dsq.bpf.c b/tools/testing/selftests/sched_ext/create_dsq.bpf.c -new file mode 100644 -index 000000000000..23f79ed343f0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/create_dsq.bpf.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Create and destroy DSQs in a loop. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+void BPF_STRUCT_OPS(create_dsq_exit_task, struct task_struct *p, -+ struct scx_exit_task_args *args) -+{ -+ scx_bpf_destroy_dsq(p->pid); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ s32 err; -+ -+ err = scx_bpf_create_dsq(p->pid, -1); -+ if (err) -+ scx_bpf_error("Failed to create DSQ for %s[%d]", -+ p->comm, p->pid); -+ -+ return err; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(create_dsq_init) -+{ -+ u32 i; -+ s32 err; -+ -+ bpf_for(i, 0, 1024) { -+ err = scx_bpf_create_dsq(i, -1); -+ if (err) { -+ scx_bpf_error("Failed to create DSQ %d", i); -+ return 0; -+ } -+ } -+ -+ bpf_for(i, 0, 1024) { -+ scx_bpf_destroy_dsq(i); -+ } -+ -+ return 0; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops create_dsq_ops = { -+ .init_task = create_dsq_init_task, -+ .exit_task = create_dsq_exit_task, -+ .init = create_dsq_init, -+ .name = "create_dsq", -+}; -diff --git a/tools/testing/selftests/sched_ext/create_dsq.c b/tools/testing/selftests/sched_ext/create_dsq.c -new file mode 100644 -index 000000000000..fa946d9146d4 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/create_dsq.c -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include "create_dsq.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct create_dsq *skel; -+ -+ skel = create_dsq__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct create_dsq *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.create_dsq_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct create_dsq *skel = ctx; -+ -+ create_dsq__destroy(skel); -+} -+ -+struct scx_test create_dsq = { -+ .name = "create_dsq", -+ .description = "Create and destroy a dsq in a loop", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&create_dsq) -diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c -new file mode 100644 -index 000000000000..e97ad41d354a ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.bpf.c -@@ -0,0 +1,42 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ -+ if (cpu >= 0) { -+ /* -+ * If we dispatch to a bogus DSQ that will fall back to the -+ * builtin global DSQ, we fail gracefully. 
-+ */ -+ scx_bpf_dispatch_vtime(p, 0xcafef00d, SCX_SLICE_DFL, -+ p->scx.dsq_vtime, 0); -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(ddsp_bogus_dsq_fail_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops ddsp_bogus_dsq_fail_ops = { -+ .select_cpu = ddsp_bogus_dsq_fail_select_cpu, -+ .exit = ddsp_bogus_dsq_fail_exit, -+ .name = "ddsp_bogus_dsq_fail", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c -new file mode 100644 -index 000000000000..e65d22f23f3b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_bogus_dsq_fail.c -@@ -0,0 +1,57 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "ddsp_bogus_dsq_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel; -+ -+ skel = ddsp_bogus_dsq_fail__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.ddsp_bogus_dsq_fail_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct ddsp_bogus_dsq_fail *skel = ctx; -+ -+ ddsp_bogus_dsq_fail__destroy(skel); -+} -+ -+struct scx_test ddsp_bogus_dsq_fail = { -+ .name = "ddsp_bogus_dsq_fail", -+ .description = "Verify we gracefully fail, and fall back to using a " -+ "built-in DSQ, if we do a direct dispatch to an invalid" -+ " DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&ddsp_bogus_dsq_fail) -diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c -new file mode 100644 -index 000000000000..dde7e7dafbfb ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.bpf.c -@@ -0,0 +1,39 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(ddsp_vtimelocal_fail_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ -+ if (cpu >= 0) { -+ /* Shouldn't be allowed to vtime dispatch to a builtin DSQ. 
*/ -+ scx_bpf_dispatch_vtime(p, SCX_DSQ_LOCAL, SCX_SLICE_DFL, -+ p->scx.dsq_vtime, 0); -+ return cpu; -+ } -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(ddsp_vtimelocal_fail_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops ddsp_vtimelocal_fail_ops = { -+ .select_cpu = ddsp_vtimelocal_fail_select_cpu, -+ .exit = ddsp_vtimelocal_fail_exit, -+ .name = "ddsp_vtimelocal_fail", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c -new file mode 100644 -index 000000000000..abafee587cd6 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/ddsp_vtimelocal_fail.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include "ddsp_vtimelocal_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel; -+ -+ skel = ddsp_vtimelocal_fail__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.ddsp_vtimelocal_fail_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct ddsp_vtimelocal_fail *skel = ctx; -+ -+ ddsp_vtimelocal_fail__destroy(skel); -+} -+ -+struct scx_test ddsp_vtimelocal_fail = { -+ .name = "ddsp_vtimelocal_fail", -+ .description = "Verify we gracefully fail, and fall back to using a " -+ "built-in DSQ, if we do a direct vtime dispatch to a " -+ "built-in DSQ from DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&ddsp_vtimelocal_fail) -diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c -new file mode 100644 -index 000000000000..efb4672decb4 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/dsp_local_on.bpf.c -@@ -0,0 +1,65 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+const volatile s32 nr_cpus; -+ -+UEI_DEFINE(uei); -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_QUEUE); -+ __uint(max_entries, 8192); -+ __type(value, s32); -+} queue SEC(".maps"); -+ -+s32 BPF_STRUCT_OPS(dsp_local_on_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ s32 pid = p->pid; -+ -+ if (bpf_map_push_elem(&queue, &pid, 0)) -+ scx_bpf_error("Failed to enqueue %s[%d]", p->comm, p->pid); -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ s32 pid, target; -+ struct task_struct *p; -+ -+ if (bpf_map_pop_elem(&queue, &pid)) -+ return; -+ -+ p = bpf_task_from_pid(pid); -+ if (!p) -+ return; -+ -+ target = bpf_get_prandom_u32() % nr_cpus; -+ -+ scx_bpf_dispatch(p, SCX_DSQ_LOCAL_ON | target, SCX_SLICE_DFL, 0); -+ bpf_task_release(p); -+} -+ -+void BPF_STRUCT_OPS(dsp_local_on_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops dsp_local_on_ops = { -+ .select_cpu = dsp_local_on_select_cpu, -+ .enqueue = dsp_local_on_enqueue, -+ .dispatch = dsp_local_on_dispatch, -+ .exit = dsp_local_on_exit, -+ .name = "dsp_local_on", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/dsp_local_on.c b/tools/testing/selftests/sched_ext/dsp_local_on.c -new file mode 100644 -index 000000000000..472851b56854 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/dsp_local_on.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include "dsp_local_on.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct dsp_local_on *skel; -+ -+ skel = dsp_local_on__open(); -+ SCX_FAIL_IF(!skel, "Failed to open"); -+ -+ skel->rodata->nr_cpus = libbpf_num_possible_cpus(); -+ SCX_FAIL_IF(dsp_local_on__load(skel), "Failed to load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct dsp_local_on *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.dsp_local_on_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ /* Just sleeping is fine, plenty of scheduling events happening */ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct dsp_local_on *skel = ctx; -+ -+ dsp_local_on__destroy(skel); -+} -+ -+struct scx_test dsp_local_on = { -+ .name = "dsp_local_on", -+ .description = "Verify we can directly dispatch tasks to a local DSQs " -+ "from osp.dispatch()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&dsp_local_on) -diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c -new file mode 100644 -index 000000000000..b0b99531d5d5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.bpf.c -@@ -0,0 +1,21 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. 
-+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops enq_last_no_enq_fails_ops = { -+ .name = "enq_last_no_enq_fails", -+ /* Need to define ops.enqueue() with SCX_OPS_ENQ_LAST */ -+ .flags = SCX_OPS_ENQ_LAST, -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c -new file mode 100644 -index 000000000000..2a3eda5e2c0b ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_last_no_enq_fails.c -@@ -0,0 +1,60 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "enq_last_no_enq_fails.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct enq_last_no_enq_fails *skel; -+ -+ skel = enq_last_no_enq_fails__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct enq_last_no_enq_fails *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.enq_last_no_enq_fails_ops); -+ if (link) { -+ SCX_ERR("Incorrectly succeeded in to attaching scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct enq_last_no_enq_fails *skel = ctx; -+ -+ enq_last_no_enq_fails__destroy(skel); -+} -+ -+struct scx_test enq_last_no_enq_fails = { -+ .name = "enq_last_no_enq_fails", -+ .description = "Verify we fail to load a scheduler if we specify " -+ "the SCX_OPS_ENQ_LAST flag without defining " -+ "ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&enq_last_no_enq_fails) -diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c -new file mode 100644 -index 000000000000..b3dfc1033cd6 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.bpf.c -@@ -0,0 +1,43 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+/* Manually specify the signature until the kfunc is added to the scx repo. */ -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -+ bool *found) __ksym; -+ -+s32 BPF_STRUCT_OPS(enq_select_cpu_fails_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(enq_select_cpu_fails_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ /* -+ * Need to initialize the variable or the verifier will fail to load. -+ * Improving these semantics is actively being worked on. 
-+ */ -+ bool found = false; -+ -+ /* Can only call from ops.select_cpu() */ -+ scx_bpf_select_cpu_dfl(p, 0, 0, &found); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops enq_select_cpu_fails_ops = { -+ .select_cpu = enq_select_cpu_fails_select_cpu, -+ .enqueue = enq_select_cpu_fails_enqueue, -+ .name = "enq_select_cpu_fails", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c -new file mode 100644 -index 000000000000..dd1350e5f002 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/enq_select_cpu_fails.c -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "enq_select_cpu_fails.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct enq_select_cpu_fails *skel; -+ -+ skel = enq_select_cpu_fails__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct enq_select_cpu_fails *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.enq_select_cpu_fails_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ sleep(1); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct enq_select_cpu_fails *skel = ctx; -+ -+ enq_select_cpu_fails__destroy(skel); -+} -+ -+struct scx_test enq_select_cpu_fails = { -+ .name = "enq_select_cpu_fails", -+ .description = "Verify we fail to call scx_bpf_select_cpu_dfl() " -+ "from ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&enq_select_cpu_fails) -diff --git a/tools/testing/selftests/sched_ext/exit.bpf.c b/tools/testing/selftests/sched_ext/exit.bpf.c -new file mode 100644 -index 000000000000..ae12ddaac921 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit.bpf.c -@@ -0,0 +1,84 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+#include "exit_test.h" -+ -+const volatile int exit_point; -+UEI_DEFINE(uei); -+ -+#define EXIT_CLEANLY() scx_bpf_exit(exit_point, "%d", exit_point) -+ -+s32 BPF_STRUCT_OPS(exit_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ bool found; -+ -+ if (exit_point == EXIT_SELECT_CPU) -+ EXIT_CLEANLY(); -+ -+ return scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, &found); -+} -+ -+void BPF_STRUCT_OPS(exit_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ if (exit_point == EXIT_ENQUEUE) -+ EXIT_CLEANLY(); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+void BPF_STRUCT_OPS(exit_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (exit_point == EXIT_DISPATCH) -+ EXIT_CLEANLY(); -+ -+ scx_bpf_consume(SCX_DSQ_GLOBAL); -+} -+ -+void BPF_STRUCT_OPS(exit_enable, struct task_struct *p) -+{ -+ if (exit_point == EXIT_ENABLE) -+ EXIT_CLEANLY(); -+} -+ -+s32 BPF_STRUCT_OPS(exit_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ if (exit_point == EXIT_INIT_TASK) -+ EXIT_CLEANLY(); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(exit_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(exit_init) -+{ -+ if (exit_point == EXIT_INIT) -+ EXIT_CLEANLY(); -+ -+ return 0; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops exit_ops = { -+ .select_cpu = exit_select_cpu, -+ .enqueue = exit_enqueue, -+ .dispatch = exit_dispatch, -+ .init_task = exit_init_task, -+ .enable = exit_enable, -+ .exit = exit_exit, -+ .init = exit_init, -+ .name = "exit", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/exit.c b/tools/testing/selftests/sched_ext/exit.c -new file mode 100644 -index 000000000000..31bcd06e21cd ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit.c -@@ -0,0 +1,55 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "exit.bpf.skel.h" -+#include "scx_test.h" -+ -+#include "exit_test.h" -+ -+static enum scx_test_status run(void *ctx) -+{ -+ enum exit_test_case tc; -+ -+ for (tc = 0; tc < NUM_EXITS; tc++) { -+ struct exit *skel; -+ struct bpf_link *link; -+ char buf[16]; -+ -+ skel = exit__open(); -+ skel->rodata->exit_point = tc; -+ exit__load(skel); -+ link = bpf_map__attach_struct_ops(skel->maps.exit_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ exit__destroy(skel); -+ return SCX_TEST_FAIL; -+ } -+ -+ /* Assumes uei.kind is written last */ -+ while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); -+ SCX_EQ(skel->data->uei.exit_code, tc); -+ sprintf(buf, "%d", tc); -+ SCX_ASSERT(!strcmp(skel->data->uei.msg, buf)); -+ bpf_link__destroy(link); -+ exit__destroy(skel); -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+struct scx_test exit_test = { -+ .name = "exit", -+ .description = "Verify we can cleanly exit a scheduler in multiple places", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&exit_test) -diff --git a/tools/testing/selftests/sched_ext/exit_test.h b/tools/testing/selftests/sched_ext/exit_test.h -new file mode 100644 -index 000000000000..94f0268b9cb8 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/exit_test.h -@@ -0,0 +1,20 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __EXIT_TEST_H__ -+#define __EXIT_TEST_H__ -+ -+enum exit_test_case { -+ EXIT_SELECT_CPU, -+ EXIT_ENQUEUE, -+ EXIT_DISPATCH, -+ EXIT_ENABLE, -+ EXIT_INIT_TASK, -+ EXIT_INIT, -+ NUM_EXITS, -+}; -+ -+#endif // # __EXIT_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/hotplug.bpf.c b/tools/testing/selftests/sched_ext/hotplug.bpf.c -new file mode 100644 -index 000000000000..8f2601db39f3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug.bpf.c -@@ -0,0 +1,61 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+#include "hotplug_test.h" -+ -+UEI_DEFINE(uei); -+ -+void BPF_STRUCT_OPS(hotplug_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+static void exit_from_hotplug(s32 cpu, bool onlining) -+{ -+ /* -+ * Ignored, just used to verify that we can invoke blocking kfuncs -+ * from the hotplug path. -+ */ -+ scx_bpf_create_dsq(0, -1); -+ -+ s64 code = SCX_ECODE_ACT_RESTART | HOTPLUG_EXIT_RSN; -+ -+ if (onlining) -+ code |= HOTPLUG_ONLINING; -+ -+ scx_bpf_exit(code, "hotplug event detected (%d going %s)", cpu, -+ onlining ? 
"online" : "offline"); -+} -+ -+void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_online, s32 cpu) -+{ -+ exit_from_hotplug(cpu, true); -+} -+ -+void BPF_STRUCT_OPS_SLEEPABLE(hotplug_cpu_offline, s32 cpu) -+{ -+ exit_from_hotplug(cpu, false); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops hotplug_cb_ops = { -+ .cpu_online = hotplug_cpu_online, -+ .cpu_offline = hotplug_cpu_offline, -+ .exit = hotplug_exit, -+ .name = "hotplug_cbs", -+ .timeout_ms = 1000U, -+}; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops hotplug_nocb_ops = { -+ .exit = hotplug_exit, -+ .name = "hotplug_nocbs", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/hotplug.c b/tools/testing/selftests/sched_ext/hotplug.c -new file mode 100644 -index 000000000000..87bf220b1bce ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug.c -@@ -0,0 +1,168 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+#include "hotplug_test.h" -+#include "hotplug.bpf.skel.h" -+#include "scx_test.h" -+#include "util.h" -+ -+const char *online_path = "/sys/devices/system/cpu/cpu1/online"; -+ -+static bool is_cpu_online(void) -+{ -+ return file_read_long(online_path) > 0; -+} -+ -+static void toggle_online_status(bool online) -+{ -+ long val = online ? 1 : 0; -+ int ret; -+ -+ ret = file_write_long(online_path, val); -+ if (ret != 0) -+ fprintf(stderr, "Failed to bring CPU %s (%s)", -+ online ? "online" : "offline", strerror(errno)); -+} -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ if (!is_cpu_online()) -+ return SCX_TEST_SKIP; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status test_hotplug(bool onlining, bool cbs_defined) -+{ -+ struct hotplug *skel; -+ struct bpf_link *link; -+ long kind, code; -+ -+ SCX_ASSERT(is_cpu_online()); -+ -+ skel = hotplug__open_and_load(); -+ SCX_ASSERT(skel); -+ -+ /* Testing the offline -> online path, so go offline before starting */ -+ if (onlining) -+ toggle_online_status(0); -+ -+ if (cbs_defined) { -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_BPF); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | HOTPLUG_EXIT_RSN; -+ if (onlining) -+ code |= HOTPLUG_ONLINING; -+ } else { -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | -+ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); -+ } -+ -+ if (cbs_defined) -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_cb_ops); -+ else -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); -+ -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ hotplug__destroy(skel); -+ return SCX_TEST_FAIL; -+ } -+ -+ toggle_online_status(onlining ? 
1 : 0); -+ -+ while (!UEI_EXITED(skel, uei)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, kind); -+ SCX_EQ(UEI_REPORT(skel, uei), code); -+ -+ if (!onlining) -+ toggle_online_status(1); -+ -+ bpf_link__destroy(link); -+ hotplug__destroy(skel); -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status test_hotplug_attach(void) -+{ -+ struct hotplug *skel; -+ struct bpf_link *link; -+ enum scx_test_status status = SCX_TEST_PASS; -+ long kind, code; -+ -+ SCX_ASSERT(is_cpu_online()); -+ SCX_ASSERT(scx_hotplug_seq() > 0); -+ -+ skel = SCX_OPS_OPEN(hotplug_nocb_ops, hotplug); -+ SCX_ASSERT(skel); -+ -+ SCX_OPS_LOAD(skel, hotplug_nocb_ops, hotplug, uei); -+ -+ /* -+ * Take the CPU offline to increment the global hotplug seq, which -+ * should cause attach to fail due to us setting the hotplug seq above -+ */ -+ toggle_online_status(0); -+ link = bpf_map__attach_struct_ops(skel->maps.hotplug_nocb_ops); -+ -+ toggle_online_status(1); -+ -+ SCX_ASSERT(link); -+ while (!UEI_EXITED(skel, uei)) -+ sched_yield(); -+ -+ kind = SCX_KIND_VAL(SCX_EXIT_UNREG_KERN); -+ code = SCX_ECODE_VAL(SCX_ECODE_ACT_RESTART) | -+ SCX_ECODE_VAL(SCX_ECODE_RSN_HOTPLUG); -+ SCX_EQ(skel->data->uei.kind, kind); -+ SCX_EQ(UEI_REPORT(skel, uei), code); -+ -+ bpf_link__destroy(link); -+ hotplug__destroy(skel); -+ -+ return status; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ -+#define HP_TEST(__onlining, __cbs_defined) ({ \ -+ if (test_hotplug(__onlining, __cbs_defined) != SCX_TEST_PASS) \ -+ return SCX_TEST_FAIL; \ -+}) -+ -+ HP_TEST(true, true); -+ HP_TEST(false, true); -+ HP_TEST(true, false); -+ HP_TEST(false, false); -+ -+#undef HP_TEST -+ -+ return test_hotplug_attach(); -+} -+ -+static void cleanup(void *ctx) -+{ -+ toggle_online_status(1); -+} -+ -+struct scx_test hotplug_test = { -+ .name = "hotplug", -+ .description = "Verify hotplug behavior", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&hotplug_test) -diff --git a/tools/testing/selftests/sched_ext/hotplug_test.h b/tools/testing/selftests/sched_ext/hotplug_test.h -new file mode 100644 -index 000000000000..73d236f90787 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/hotplug_test.h -@@ -0,0 +1,15 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#ifndef __HOTPLUG_TEST_H__ -+#define __HOTPLUG_TEST_H__ -+ -+enum hotplug_test_flags { -+ HOTPLUG_EXIT_RSN = 1LLU << 0, -+ HOTPLUG_ONLINING = 1LLU << 1, -+}; -+ -+#endif // # __HOTPLUG_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.bpf.c b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c -new file mode 100644 -index 000000000000..47ea89a626c3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/init_enable_count.bpf.c -@@ -0,0 +1,53 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that verifies that we do proper counting of init, enable, etc -+ * callbacks. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 init_task_cnt, exit_task_cnt, enable_cnt, disable_cnt; -+u64 init_fork_cnt, init_transition_cnt; -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(cnt_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ __sync_fetch_and_add(&init_task_cnt, 1); -+ -+ if (args->fork) -+ __sync_fetch_and_add(&init_fork_cnt, 1); -+ else -+ __sync_fetch_and_add(&init_transition_cnt, 1); -+ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(cnt_exit_task, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&exit_task_cnt, 1); -+} -+ -+void BPF_STRUCT_OPS(cnt_enable, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&enable_cnt, 1); -+} -+ -+void BPF_STRUCT_OPS(cnt_disable, struct task_struct *p) -+{ -+ __sync_fetch_and_add(&disable_cnt, 1); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops init_enable_count_ops = { -+ .init_task = cnt_init_task, -+ .exit_task = cnt_exit_task, -+ .enable = cnt_enable, -+ .disable = cnt_disable, -+ .name = "init_enable_count", -+}; -diff --git a/tools/testing/selftests/sched_ext/init_enable_count.c b/tools/testing/selftests/sched_ext/init_enable_count.c -new file mode 100644 -index 000000000000..ef9da0a50846 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/init_enable_count.c -@@ -0,0 +1,166 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+#include "scx_test.h" -+#include "init_enable_count.bpf.skel.h" -+ -+#define SCHED_EXT 7 -+ -+static struct init_enable_count * -+open_load_prog(bool global) -+{ -+ struct init_enable_count *skel; -+ -+ skel = init_enable_count__open(); -+ SCX_BUG_ON(!skel, "Failed to open skel"); -+ -+ if (!global) -+ skel->struct_ops.init_enable_count_ops->flags |= __COMPAT_SCX_OPS_SWITCH_PARTIAL; -+ -+ SCX_BUG_ON(init_enable_count__load(skel), "Failed to load skel"); -+ -+ return skel; -+} -+ -+static enum scx_test_status run_test(bool global) -+{ -+ struct init_enable_count *skel; -+ struct bpf_link *link; -+ const u32 num_children = 5, num_pre_forks = 1024; -+ int ret, i, status; -+ struct sched_param param = {}; -+ pid_t pids[num_pre_forks]; -+ -+ skel = open_load_prog(global); -+ -+ /* -+ * Fork a bunch of children before we attach the scheduler so that we -+ * ensure (at least in practical terms) that there are more tasks that -+ * transition from SCHED_OTHER -> SCHED_EXT than there are tasks that -+ * take the fork() path either below or in other processes. 
-+ */ -+ for (i = 0; i < num_pre_forks; i++) { -+ pids[i] = fork(); -+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ for (i = 0; i < num_pre_forks; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for pre-forked child\n"); -+ -+ SCX_FAIL_IF(status != 0, "Pre-forked child %d exited with status %d\n", i, -+ status); -+ } -+ -+ bpf_link__destroy(link); -+ SCX_GE(skel->bss->init_task_cnt, num_pre_forks); -+ SCX_GE(skel->bss->exit_task_cnt, num_pre_forks); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.init_enable_count_ops); -+ SCX_FAIL_IF(!link, "Failed to attach struct_ops"); -+ -+ /* SCHED_EXT children */ -+ for (i = 0; i < num_children; i++) { -+ pids[i] = fork(); -+ SCX_FAIL_IF(pids[i] < 0, "Failed to fork child"); -+ -+ if (pids[i] == 0) { -+ ret = sched_setscheduler(0, SCHED_EXT, ¶m); -+ SCX_BUG_ON(ret, "Failed to set sched to sched_ext"); -+ -+ /* -+ * Reset to SCHED_OTHER for half of them. Counts for -+ * everything should still be the same regardless, as -+ * ops.disable() is invoked even if a task is still on -+ * SCHED_EXT before it exits. -+ */ -+ if (i % 2 == 0) { -+ ret = sched_setscheduler(0, SCHED_OTHER, ¶m); -+ SCX_BUG_ON(ret, "Failed to reset sched to normal"); -+ } -+ exit(0); -+ } -+ } -+ for (i = 0; i < num_children; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for SCX child\n"); -+ -+ SCX_FAIL_IF(status != 0, "SCX child %d exited with status %d\n", i, -+ status); -+ } -+ -+ /* SCHED_OTHER children */ -+ for (i = 0; i < num_children; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) -+ exit(0); -+ } -+ -+ for (i = 0; i < num_children; i++) { -+ SCX_FAIL_IF(waitpid(pids[i], &status, 0) != pids[i], -+ "Failed to wait for normal child\n"); -+ -+ SCX_FAIL_IF(status != 0, "Normal child %d exited with status %d\n", i, -+ status); -+ } -+ -+ bpf_link__destroy(link); -+ -+ SCX_GE(skel->bss->init_task_cnt, 2 * num_children); -+ SCX_GE(skel->bss->exit_task_cnt, 2 * num_children); -+ -+ if (global) { -+ SCX_GE(skel->bss->enable_cnt, 2 * num_children); -+ SCX_GE(skel->bss->disable_cnt, 2 * num_children); -+ } else { -+ SCX_EQ(skel->bss->enable_cnt, num_children); -+ SCX_EQ(skel->bss->disable_cnt, num_children); -+ } -+ /* -+ * We forked a ton of tasks before we attached the scheduler above, so -+ * this should be fine. Technically it could be flaky if a ton of forks -+ * are happening at the same time in other processes, but that should -+ * be exceedingly unlikely. 
-+ */ -+ SCX_GT(skel->bss->init_transition_cnt, skel->bss->init_fork_cnt); -+ SCX_GE(skel->bss->init_fork_cnt, 2 * num_children); -+ -+ init_enable_count__destroy(skel); -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ enum scx_test_status status; -+ -+ status = run_test(true); -+ if (status != SCX_TEST_PASS) -+ return status; -+ -+ return run_test(false); -+} -+ -+struct scx_test init_enable_count = { -+ .name = "init_enable_count", -+ .description = "Verify we do the correct amount of counting of init, " -+ "enable, etc callbacks.", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&init_enable_count) -diff --git a/tools/testing/selftests/sched_ext/maximal.bpf.c b/tools/testing/selftests/sched_ext/maximal.bpf.c -new file mode 100644 -index 000000000000..00bfa9cb95d3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maximal.bpf.c -@@ -0,0 +1,164 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler with every callback defined. -+ * -+ * This scheduler defines every callback. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+s32 BPF_STRUCT_OPS(maximal_select_cpu, struct task_struct *p, s32 prev_cpu, -+ u64 wake_flags) -+{ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(maximal_enqueue, struct task_struct *p, u64 enq_flags) -+{ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+void BPF_STRUCT_OPS(maximal_dequeue, struct task_struct *p, u64 deq_flags) -+{} -+ -+void BPF_STRUCT_OPS(maximal_dispatch, s32 cpu, struct task_struct *prev) -+{ -+ scx_bpf_consume(SCX_DSQ_GLOBAL); -+} -+ -+void BPF_STRUCT_OPS(maximal_runnable, struct task_struct *p, u64 enq_flags) -+{} -+ -+void BPF_STRUCT_OPS(maximal_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maximal_stopping, struct task_struct *p, bool runnable) -+{} -+ -+void BPF_STRUCT_OPS(maximal_quiescent, struct task_struct *p, u64 deq_flags) -+{} -+ -+bool BPF_STRUCT_OPS(maximal_yield, struct task_struct *from, -+ struct task_struct *to) -+{ -+ return false; -+} -+ -+bool BPF_STRUCT_OPS(maximal_core_sched_before, struct task_struct *a, -+ struct task_struct *b) -+{ -+ return false; -+} -+ -+void BPF_STRUCT_OPS(maximal_set_weight, struct task_struct *p, u32 weight) -+{} -+ -+void BPF_STRUCT_OPS(maximal_set_cpumask, struct task_struct *p, -+ const struct cpumask *cpumask) -+{} -+ -+void BPF_STRUCT_OPS(maximal_update_idle, s32 cpu, bool idle) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_acquire, s32 cpu, -+ struct scx_cpu_acquire_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_release, s32 cpu, -+ struct scx_cpu_release_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_online, s32 cpu) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cpu_offline, s32 cpu) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_init_task, struct task_struct *p, -+ struct scx_init_task_args *args) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_enable, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maximal_exit_task, struct task_struct *p, -+ struct scx_exit_task_args *args) -+{} -+ -+void BPF_STRUCT_OPS(maximal_disable, struct task_struct *p) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_cgroup_init, struct cgroup *cgrp, -+ struct scx_cgroup_init_args *args) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_exit, struct cgroup *cgrp) -+{} -+ -+s32 BPF_STRUCT_OPS(maximal_cgroup_prep_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{ -+ return 0; -+} -+ 
-+void BPF_STRUCT_OPS(maximal_cgroup_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_cancel_move, struct task_struct *p, -+ struct cgroup *from, struct cgroup *to) -+{} -+ -+void BPF_STRUCT_OPS(maximal_cgroup_set_weight, struct cgroup *cgrp, u32 weight) -+{} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(maximal_init) -+{ -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(maximal_exit, struct scx_exit_info *info) -+{} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maximal_ops = { -+ .select_cpu = maximal_select_cpu, -+ .enqueue = maximal_enqueue, -+ .dequeue = maximal_dequeue, -+ .dispatch = maximal_dispatch, -+ .runnable = maximal_runnable, -+ .running = maximal_running, -+ .stopping = maximal_stopping, -+ .quiescent = maximal_quiescent, -+ .yield = maximal_yield, -+ .core_sched_before = maximal_core_sched_before, -+ .set_weight = maximal_set_weight, -+ .set_cpumask = maximal_set_cpumask, -+ .update_idle = maximal_update_idle, -+ .cpu_acquire = maximal_cpu_acquire, -+ .cpu_release = maximal_cpu_release, -+ .cpu_online = maximal_cpu_online, -+ .cpu_offline = maximal_cpu_offline, -+ .init_task = maximal_init_task, -+ .enable = maximal_enable, -+ .exit_task = maximal_exit_task, -+ .disable = maximal_disable, -+ .cgroup_init = maximal_cgroup_init, -+ .cgroup_exit = maximal_cgroup_exit, -+ .cgroup_prep_move = maximal_cgroup_prep_move, -+ .cgroup_move = maximal_cgroup_move, -+ .cgroup_cancel_move = maximal_cgroup_cancel_move, -+ .cgroup_set_weight = maximal_cgroup_set_weight, -+ .init = maximal_init, -+ .exit = maximal_exit, -+ .name = "maximal", -+}; -diff --git a/tools/testing/selftests/sched_ext/maximal.c b/tools/testing/selftests/sched_ext/maximal.c -new file mode 100644 -index 000000000000..f38fc973c380 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maximal.c -@@ -0,0 +1,51 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include "maximal.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct maximal *skel; -+ -+ skel = maximal__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct maximal *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct maximal *skel = ctx; -+ -+ maximal__destroy(skel); -+} -+ -+struct scx_test maximal = { -+ .name = "maximal", -+ .description = "Verify we can load a scheduler with every callback defined", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&maximal) -diff --git a/tools/testing/selftests/sched_ext/maybe_null.bpf.c b/tools/testing/selftests/sched_ext/maybe_null.bpf.c -new file mode 100644 -index 000000000000..ad5e694226bb ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null.bpf.c -@@ -0,0 +1,26 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maybe_null_success_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (p != NULL) -+ vtime_test = p->scx.dsq_vtime; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_success = { -+ .dispatch = maybe_null_success_dispatch, -+ .enable = maybe_null_running, -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/maybe_null.c b/tools/testing/selftests/sched_ext/maybe_null.c -new file mode 100644 -index 000000000000..3f26b784f9c5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null.c -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ */ -+#include -+#include -+#include -+#include -+#include "maybe_null.bpf.skel.h" -+#include "maybe_null_fail.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct maybe_null *skel; -+ struct maybe_null_fail *fail_skel; -+ -+ skel = maybe_null__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load maybe_null skel"); -+ return SCX_TEST_FAIL; -+ } -+ maybe_null__destroy(skel); -+ -+ fail_skel = maybe_null_fail__open_and_load(); -+ if (fail_skel) { -+ maybe_null_fail__destroy(fail_skel); -+ SCX_ERR("Should failed to open and load maybe_null_fail skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+struct scx_test maybe_null = { -+ .name = "maybe_null", -+ .description = "Verify if PTR_MAYBE_NULL work for .dispatch", -+ .run = run, -+}; -+REGISTER_SCX_TEST(&maybe_null) -diff --git a/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c b/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c -new file mode 100644 -index 000000000000..1607fe07bead ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/maybe_null_fail.bpf.c -@@ -0,0 +1,25 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+u64 vtime_test; -+ -+void BPF_STRUCT_OPS(maybe_null_running, struct task_struct *p) -+{} -+ -+void BPF_STRUCT_OPS(maybe_null_fail_dispatch, s32 cpu, struct task_struct *p) -+{ -+ vtime_test = p->scx.dsq_vtime; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops maybe_null_fail = { -+ .dispatch = maybe_null_fail_dispatch, -+ .enable = maybe_null_running, -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/minimal.bpf.c b/tools/testing/selftests/sched_ext/minimal.bpf.c -new file mode 100644 -index 000000000000..6a7eccef0104 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/minimal.bpf.c -@@ -0,0 +1,21 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A completely minimal scheduler. -+ * -+ * This scheduler defines the absolute minimal set of struct sched_ext_ops -+ * fields: its name. It should _not_ fail to be loaded, and can be used to -+ * exercise the default scheduling paths in ext.c. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops minimal_ops = { -+ .name = "minimal", -+}; -diff --git a/tools/testing/selftests/sched_ext/minimal.c b/tools/testing/selftests/sched_ext/minimal.c -new file mode 100644 -index 000000000000..6c5db8ebbf8a ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/minimal.c -@@ -0,0 +1,58 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "minimal.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct minimal *skel; -+ -+ skel = minimal__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct minimal *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.minimal_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ return SCX_TEST_FAIL; -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct minimal *skel = ctx; -+ -+ minimal__destroy(skel); -+} -+ -+struct scx_test minimal = { -+ .name = "minimal", -+ .description = "Verify we can load a fully minimal scheduler", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&minimal) -diff --git a/tools/testing/selftests/sched_ext/prog_run.bpf.c b/tools/testing/selftests/sched_ext/prog_run.bpf.c -new file mode 100644 -index 000000000000..fd2c8f12af16 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/prog_run.bpf.c -@@ -0,0 +1,32 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates that we can invoke sched_ext kfuncs in -+ * BPF_PROG_TYPE_SYSCALL programs. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+ -+#include -+ -+UEI_DEFINE(uei); -+ -+char _license[] SEC("license") = "GPL"; -+ -+SEC("syscall") -+int BPF_PROG(prog_run_syscall) -+{ -+ scx_bpf_exit(0xdeadbeef, "Exited from PROG_RUN"); -+ return 0; -+} -+ -+void BPF_STRUCT_OPS(prog_run_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops prog_run_ops = { -+ .exit = prog_run_exit, -+ .name = "prog_run", -+}; -diff --git a/tools/testing/selftests/sched_ext/prog_run.c b/tools/testing/selftests/sched_ext/prog_run.c -new file mode 100644 -index 000000000000..3cd57ef8daaa ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/prog_run.c -@@ -0,0 +1,78 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "prog_run.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct prog_run *skel; -+ -+ skel = prog_run__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct prog_run *skel = ctx; -+ struct bpf_link *link; -+ int prog_fd, err = 0; -+ -+ prog_fd = bpf_program__fd(skel->progs.prog_run_syscall); -+ if (prog_fd < 0) { -+ SCX_ERR("Failed to get BPF_PROG_RUN prog"); -+ return SCX_TEST_FAIL; -+ } -+ -+ LIBBPF_OPTS(bpf_test_run_opts, topts); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.prog_run_ops); -+ if (!link) { -+ SCX_ERR("Failed to attach scheduler"); -+ close(prog_fd); -+ return SCX_TEST_FAIL; -+ } -+ -+ err = bpf_prog_test_run_opts(prog_fd, &topts); -+ SCX_EQ(err, 0); -+ -+ /* Assumes uei.kind is written last */ -+ while (skel->data->uei.kind == EXIT_KIND(SCX_EXIT_NONE)) -+ sched_yield(); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_UNREG_BPF)); -+ SCX_EQ(skel->data->uei.exit_code, 0xdeadbeef); -+ close(prog_fd); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct prog_run *skel = ctx; -+ -+ prog_run__destroy(skel); -+} -+ -+struct scx_test prog_run = { -+ .name = "prog_run", -+ .description = "Verify we can call into a scheduler with BPF_PROG_RUN, and invoke kfuncs", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&prog_run) -diff --git a/tools/testing/selftests/sched_ext/reload_loop.c b/tools/testing/selftests/sched_ext/reload_loop.c -new file mode 100644 -index 000000000000..5cfba2d6e056 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/reload_loop.c -@@ -0,0 +1,75 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "maximal.bpf.skel.h" -+#include "scx_test.h" -+ -+static struct maximal *skel; -+static pthread_t threads[2]; -+ -+bool force_exit = false; -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ skel = maximal__open_and_load(); -+ if (!skel) { -+ SCX_ERR("Failed to open and load skel"); -+ return SCX_TEST_FAIL; -+ } -+ -+ return SCX_TEST_PASS; -+} -+ -+static void *do_reload_loop(void *arg) -+{ -+ u32 i; -+ -+ for (i = 0; i < 1024 && !force_exit; i++) { -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.maximal_ops); -+ if (link) -+ bpf_link__destroy(link); -+ } -+ -+ return NULL; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ int err; -+ void *ret; -+ -+ err = pthread_create(&threads[0], NULL, do_reload_loop, NULL); -+ SCX_FAIL_IF(err, "Failed to create thread 0"); -+ -+ err = pthread_create(&threads[1], NULL, do_reload_loop, NULL); -+ SCX_FAIL_IF(err, "Failed to create thread 1"); -+ -+ SCX_FAIL_IF(pthread_join(threads[0], &ret), "thread 0 failed"); -+ SCX_FAIL_IF(pthread_join(threads[1], &ret), "thread 1 failed"); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ force_exit = true; -+ maximal__destroy(skel); -+} -+ -+struct scx_test reload_loop = { -+ .name = "reload_loop", -+ .description = "Stress test loading and unloading schedulers repeatedly in a tight loop", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&reload_loop) -diff --git a/tools/testing/selftests/sched_ext/runner.c b/tools/testing/selftests/sched_ext/runner.c -new file mode 100644 -index 000000000000..eab48c7ff309 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/runner.c -@@ -0,0 +1,201 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include -+#include "scx_test.h" -+ -+const char help_fmt[] = -+"The runner for sched_ext tests.\n" -+"\n" -+"The runner is statically linked against all testcases, and runs them all serially.\n" -+"It's required for the testcases to be serial, as only a single host-wide sched_ext\n" -+"scheduler may be loaded at any given time." 
-+"\n" -+"Usage: %s [-t TEST] [-h]\n" -+"\n" -+" -t TEST Only run tests whose name includes this string\n" -+" -s Include print output for skipped tests\n" -+" -q Don't print the test descriptions during run\n" -+" -h Display this help and exit\n"; -+ -+static volatile int exit_req; -+static bool quiet, print_skipped; -+ -+#define MAX_SCX_TESTS 2048 -+ -+static struct scx_test __scx_tests[MAX_SCX_TESTS]; -+static unsigned __scx_num_tests = 0; -+ -+static void sigint_handler(int simple) -+{ -+ exit_req = 1; -+} -+ -+static void print_test_preamble(const struct scx_test *test, bool quiet) -+{ -+ printf("===== START =====\n"); -+ printf("TEST: %s\n", test->name); -+ if (!quiet) -+ printf("DESCRIPTION: %s\n", test->description); -+ printf("OUTPUT:\n"); -+} -+ -+static const char *status_to_result(enum scx_test_status status) -+{ -+ switch (status) { -+ case SCX_TEST_PASS: -+ case SCX_TEST_SKIP: -+ return "ok"; -+ case SCX_TEST_FAIL: -+ return "not ok"; -+ default: -+ return ""; -+ } -+} -+ -+static void print_test_result(const struct scx_test *test, -+ enum scx_test_status status, -+ unsigned int testnum) -+{ -+ const char *result = status_to_result(status); -+ const char *directive = status == SCX_TEST_SKIP ? "SKIP " : ""; -+ -+ printf("%s %u %s # %s\n", result, testnum, test->name, directive); -+ printf("===== END =====\n"); -+} -+ -+static bool should_skip_test(const struct scx_test *test, const char * filter) -+{ -+ return !strstr(test->name, filter); -+} -+ -+static enum scx_test_status run_test(const struct scx_test *test) -+{ -+ enum scx_test_status status; -+ void *context = NULL; -+ -+ if (test->setup) { -+ status = test->setup(&context); -+ if (status != SCX_TEST_PASS) -+ return status; -+ } -+ -+ status = test->run(context); -+ -+ if (test->cleanup) -+ test->cleanup(context); -+ -+ return status; -+} -+ -+static bool test_valid(const struct scx_test *test) -+{ -+ if (!test) { -+ fprintf(stderr, "NULL test detected\n"); -+ return false; -+ } -+ -+ if (!test->name) { -+ fprintf(stderr, -+ "Test with no name found. Must specify test name.\n"); -+ return false; -+ } -+ -+ if (!test->description) { -+ fprintf(stderr, "Test %s requires description.\n", test->name); -+ return false; -+ } -+ -+ if (!test->run) { -+ fprintf(stderr, "Test %s has no run() callback\n", test->name); -+ return false; -+ } -+ -+ return true; -+} -+ -+int main(int argc, char **argv) -+{ -+ const char *filter = NULL; -+ unsigned testnum = 0, i; -+ unsigned passed = 0, skipped = 0, failed = 0; -+ int opt; -+ -+ signal(SIGINT, sigint_handler); -+ signal(SIGTERM, sigint_handler); -+ -+ libbpf_set_strict_mode(LIBBPF_STRICT_ALL); -+ -+ while ((opt = getopt(argc, argv, "qst:h")) != -1) { -+ switch (opt) { -+ case 'q': -+ quiet = true; -+ break; -+ case 's': -+ print_skipped = true; -+ break; -+ case 't': -+ filter = optarg; -+ break; -+ default: -+ fprintf(stderr, help_fmt, basename(argv[0])); -+ return opt != 'h'; -+ } -+ } -+ -+ for (i = 0; i < __scx_num_tests; i++) { -+ enum scx_test_status status; -+ struct scx_test *test = &__scx_tests[i]; -+ -+ if (filter && should_skip_test(test, filter)) { -+ /* -+ * Printing the skipped tests and their preambles can -+ * add a lot of noise to the runner output. Printing -+ * this is only really useful for CI, so let's skip it -+ * by default. 
-+ */ -+ if (print_skipped) { -+ print_test_preamble(test, quiet); -+ print_test_result(test, SCX_TEST_SKIP, ++testnum); -+ } -+ continue; -+ } -+ -+ print_test_preamble(test, quiet); -+ status = run_test(test); -+ print_test_result(test, status, ++testnum); -+ switch (status) { -+ case SCX_TEST_PASS: -+ passed++; -+ break; -+ case SCX_TEST_SKIP: -+ skipped++; -+ break; -+ case SCX_TEST_FAIL: -+ failed++; -+ break; -+ } -+ } -+ printf("\n\n=============================\n\n"); -+ printf("RESULTS:\n\n"); -+ printf("PASSED: %u\n", passed); -+ printf("SKIPPED: %u\n", skipped); -+ printf("FAILED: %u\n", failed); -+ -+ return 0; -+} -+ -+void scx_test_register(struct scx_test *test) -+{ -+ SCX_BUG_ON(!test_valid(test), "Invalid test found"); -+ SCX_BUG_ON(__scx_num_tests >= MAX_SCX_TESTS, "Maximum tests exceeded"); -+ -+ __scx_tests[__scx_num_tests++] = *test; -+} -diff --git a/tools/testing/selftests/sched_ext/scx_test.h b/tools/testing/selftests/sched_ext/scx_test.h -new file mode 100644 -index 000000000000..90b8d6915bb7 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/scx_test.h -@@ -0,0 +1,131 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 Tejun Heo -+ * Copyright (c) 2023 David Vernet -+ */ -+ -+#ifndef __SCX_TEST_H__ -+#define __SCX_TEST_H__ -+ -+#include -+#include -+#include -+ -+enum scx_test_status { -+ SCX_TEST_PASS = 0, -+ SCX_TEST_SKIP, -+ SCX_TEST_FAIL, -+}; -+ -+#define EXIT_KIND(__ent) __COMPAT_ENUM_OR_ZERO("scx_exit_kind", #__ent) -+ -+struct scx_test { -+ /** -+ * name - The name of the testcase. -+ */ -+ const char *name; -+ -+ /** -+ * description - A description of your testcase: what it tests and is -+ * meant to validate. -+ */ -+ const char *description; -+ -+ /* -+ * setup - Setup the test. -+ * @ctx: A pointer to a context object that will be passed to run and -+ * cleanup. -+ * -+ * An optional callback that allows a testcase to perform setup for its -+ * run. A test may return SCX_TEST_SKIP to skip the run. -+ */ -+ enum scx_test_status (*setup)(void **ctx); -+ -+ /* -+ * run - Run the test. -+ * @ctx: Context set in the setup() callback. If @ctx was not set in -+ * setup(), it is NULL. -+ * -+ * The main test. Callers should return one of: -+ * -+ * - SCX_TEST_PASS: Test passed -+ * - SCX_TEST_SKIP: Test should be skipped -+ * - SCX_TEST_FAIL: Test failed -+ * -+ * This callback must be defined. -+ */ -+ enum scx_test_status (*run)(void *ctx); -+ -+ /* -+ * cleanup - Perform cleanup following the test -+ * @ctx: Context set in the setup() callback. If @ctx was not set in -+ * setup(), it is NULL. -+ * -+ * An optional callback that allows a test to perform cleanup after -+ * being run. This callback is run even if the run() callback returns -+ * SCX_TEST_SKIP or SCX_TEST_FAIL. It is not run if setup() returns -+ * SCX_TEST_SKIP or SCX_TEST_FAIL. -+ */ -+ void (*cleanup)(void *ctx); -+}; -+ -+void scx_test_register(struct scx_test *test); -+ -+#define REGISTER_SCX_TEST(__test) \ -+ __attribute__((constructor)) \ -+ static void ___scxregister##__LINE__(void) \ -+ { \ -+ scx_test_register(__test); \ -+ } -+ -+#define SCX_ERR(__fmt, ...) \ -+ do { \ -+ fprintf(stderr, "ERR: %s:%d\n", __FILE__, __LINE__); \ -+ fprintf(stderr, __fmt"\n", ##__VA_ARGS__); \ -+ } while (0) -+ -+#define SCX_FAIL(__fmt, ...) \ -+ do { \ -+ SCX_ERR(__fmt, ##__VA_ARGS__); \ -+ return SCX_TEST_FAIL; \ -+ } while (0) -+ -+#define SCX_FAIL_IF(__cond, __fmt, ...) 
\ -+ do { \ -+ if (__cond) \ -+ SCX_FAIL(__fmt, ##__VA_ARGS__); \ -+ } while (0) -+ -+#define SCX_GT(_x, _y) SCX_FAIL_IF((_x) <= (_y), "Expected %s > %s (%lu > %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_GE(_x, _y) SCX_FAIL_IF((_x) < (_y), "Expected %s >= %s (%lu >= %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_LT(_x, _y) SCX_FAIL_IF((_x) >= (_y), "Expected %s < %s (%lu < %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_LE(_x, _y) SCX_FAIL_IF((_x) > (_y), "Expected %s <= %s (%lu <= %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_EQ(_x, _y) SCX_FAIL_IF((_x) != (_y), "Expected %s == %s (%lu == %lu)", \ -+ #_x, #_y, (u64)(_x), (u64)(_y)) -+#define SCX_ASSERT(_x) SCX_FAIL_IF(!(_x), "Expected %s to be true (%lu)", \ -+ #_x, (u64)(_x)) -+ -+#define SCX_ECODE_VAL(__ecode) ({ \ -+ u64 __val = 0; \ -+ bool __found = false; \ -+ \ -+ __found = __COMPAT_read_enum("scx_exit_code", #__ecode, &__val); \ -+ SCX_ASSERT(__found); \ -+ (s64)__val; \ -+}) -+ -+#define SCX_KIND_VAL(__kind) ({ \ -+ u64 __val = 0; \ -+ bool __found = false; \ -+ \ -+ __found = __COMPAT_read_enum("scx_exit_kind", #__kind, &__val); \ -+ SCX_ASSERT(__found); \ -+ __val; \ -+}) -+ -+#endif // # __SCX_TEST_H__ -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c -new file mode 100644 -index 000000000000..2ed2991afafe ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.bpf.c -@@ -0,0 +1,40 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+bool saw_local = false; -+ -+static bool task_is_test(const struct task_struct *p) -+{ -+ return !bpf_strncmp(p->comm, 9, "select_cpu"); -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dfl_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ const struct cpumask *idle_mask = scx_bpf_get_idle_cpumask(); -+ -+ if (task_is_test(p) && -+ bpf_cpumask_test_cpu(scx_bpf_task_cpu(p), idle_mask)) { -+ saw_local = true; -+ } -+ scx_bpf_put_idle_cpumask(idle_mask); -+ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, enq_flags); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dfl_ops = { -+ .enqueue = select_cpu_dfl_enqueue, -+ .name = "select_cpu_dfl", -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl.c b/tools/testing/selftests/sched_ext/select_cpu_dfl.c -new file mode 100644 -index 000000000000..a53a40c2d2f0 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl.c -@@ -0,0 +1,72 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dfl.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dfl *skel; -+ -+ skel = select_cpu_dfl__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dfl *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ SCX_ASSERT(!skel->bss->saw_local); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dfl *skel = ctx; -+ -+ select_cpu_dfl__destroy(skel); -+} -+ -+struct scx_test select_cpu_dfl = { -+ .name = "select_cpu_dfl", -+ .description = "Verify the default ops.select_cpu() dispatches tasks " -+ "when idles cores are found, and skips ops.enqueue()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dfl) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c -new file mode 100644 -index 000000000000..4bb5abb2d369 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.bpf.c -@@ -0,0 +1,89 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation, and with the SCX_OPS_ENQ_DFL_NO_DISPATCH ops flag -+ * specified. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+bool saw_local = false; -+ -+/* Per-task scheduling context */ -+struct task_ctx { -+ bool force_local; /* CPU changed by ops.select_cpu() */ -+}; -+ -+struct { -+ __uint(type, BPF_MAP_TYPE_TASK_STORAGE); -+ __uint(map_flags, BPF_F_NO_PREALLOC); -+ __type(key, int); -+ __type(value, struct task_ctx); -+} task_ctx_stor SEC(".maps"); -+ -+/* Manually specify the signature until the kfunc is added to the scx repo. 
*/ -+s32 scx_bpf_select_cpu_dfl(struct task_struct *p, s32 prev_cpu, u64 wake_flags, -+ bool *found) __ksym; -+ -+s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ struct task_ctx *tctx; -+ s32 cpu; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return -ESRCH; -+ } -+ -+ cpu = scx_bpf_select_cpu_dfl(p, prev_cpu, wake_flags, -+ &tctx->force_local); -+ -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_enqueue, struct task_struct *p, -+ u64 enq_flags) -+{ -+ u64 dsq_id = SCX_DSQ_GLOBAL; -+ struct task_ctx *tctx; -+ -+ tctx = bpf_task_storage_get(&task_ctx_stor, p, 0, 0); -+ if (!tctx) { -+ scx_bpf_error("task_ctx lookup failed"); -+ return; -+ } -+ -+ if (tctx->force_local) { -+ dsq_id = SCX_DSQ_LOCAL; -+ tctx->force_local = false; -+ saw_local = true; -+ } -+ -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, enq_flags); -+} -+ -+s32 BPF_STRUCT_OPS(select_cpu_dfl_nodispatch_init_task, -+ struct task_struct *p, struct scx_init_task_args *args) -+{ -+ if (bpf_task_storage_get(&task_ctx_stor, p, 0, -+ BPF_LOCAL_STORAGE_GET_F_CREATE)) -+ return 0; -+ else -+ return -ENOMEM; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dfl_nodispatch_ops = { -+ .select_cpu = select_cpu_dfl_nodispatch_select_cpu, -+ .enqueue = select_cpu_dfl_nodispatch_enqueue, -+ .init_task = select_cpu_dfl_nodispatch_init_task, -+ .name = "select_cpu_dfl_nodispatch", -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c -new file mode 100644 -index 000000000000..1d85bf4bf3a3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dfl_nodispatch.c -@@ -0,0 +1,72 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dfl_nodispatch.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel; -+ -+ skel = select_cpu_dfl_nodispatch__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dfl_nodispatch_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ SCX_ASSERT(skel->bss->saw_local); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dfl_nodispatch *skel = ctx; -+ -+ select_cpu_dfl_nodispatch__destroy(skel); -+} -+ -+struct scx_test select_cpu_dfl_nodispatch = { -+ .name = "select_cpu_dfl_nodispatch", -+ .description = "Verify behavior of scx_bpf_select_cpu_dfl() in " -+ "ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dfl_nodispatch) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c -new file mode 100644 -index 000000000000..f0b96a4a04b2 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.bpf.c -@@ -0,0 +1,41 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ u64 dsq_id = SCX_DSQ_LOCAL; -+ s32 cpu = prev_cpu; -+ -+ if (scx_bpf_test_and_clear_cpu_idle(cpu)) -+ goto dispatch; -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto dispatch; -+ -+ dsq_id = SCX_DSQ_GLOBAL; -+ cpu = prev_cpu; -+ -+dispatch: -+ scx_bpf_dispatch(p, dsq_id, SCX_SLICE_DFL, 0); -+ return cpu; -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_ops = { -+ .select_cpu = select_cpu_dispatch_select_cpu, -+ .name = "select_cpu_dispatch", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c -new file mode 100644 -index 000000000000..0309ca8785b3 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch.c -@@ -0,0 +1,70 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch.bpf.skel.h" -+#include "scx_test.h" -+ -+#define NUM_CHILDREN 1028 -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch *skel; -+ -+ skel = select_cpu_dispatch__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch *skel = ctx; -+ struct bpf_link *link; -+ pid_t pids[NUM_CHILDREN]; -+ int i, status; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ pids[i] = fork(); -+ if (pids[i] == 0) { -+ sleep(1); -+ exit(0); -+ } -+ } -+ -+ for (i = 0; i < NUM_CHILDREN; i++) { -+ SCX_EQ(waitpid(pids[i], &status, 0), pids[i]); -+ SCX_EQ(status, 0); -+ } -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch *skel = ctx; -+ -+ select_cpu_dispatch__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch = { -+ .name = "select_cpu_dispatch", -+ .description = "Test direct dispatching to built-in DSQs from " -+ "ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c -new file mode 100644 -index 000000000000..7b42ddce0f56 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.bpf.c -@@ -0,0 +1,37 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* Dispatching to a random DSQ should fail. */ -+ scx_bpf_dispatch(p, 0xcafef00d, SCX_SLICE_DFL, 0); -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dispatch_bad_dsq_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_bad_dsq_ops = { -+ .select_cpu = select_cpu_dispatch_bad_dsq_select_cpu, -+ .exit = select_cpu_dispatch_bad_dsq_exit, -+ .name = "select_cpu_dispatch_bad_dsq", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c -new file mode 100644 -index 000000000000..47eb6ed7627d ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_bad_dsq.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch_bad_dsq.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel; -+ -+ skel = select_cpu_dispatch_bad_dsq__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_bad_dsq_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch_bad_dsq *skel = ctx; -+ -+ select_cpu_dispatch_bad_dsq__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch_bad_dsq = { -+ .name = "select_cpu_dispatch_bad_dsq", -+ .description = "Verify graceful failure if we direct-dispatch to a " -+ "bogus DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch_bad_dsq) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c -new file mode 100644 -index 000000000000..653e3dc0b4dc ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.bpf.c -@@ -0,0 +1,38 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates the behavior of direct dispatching with a default -+ * select_cpu implementation. -+ * -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+UEI_DEFINE(uei); -+ -+s32 BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ /* Dispatching twice in a row is disallowed. */ -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ scx_bpf_dispatch(p, SCX_DSQ_GLOBAL, SCX_SLICE_DFL, 0); -+ -+ return prev_cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_dispatch_dbl_dsp_exit, struct scx_exit_info *ei) -+{ -+ UEI_RECORD(uei, ei); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_dispatch_dbl_dsp_ops = { -+ .select_cpu = select_cpu_dispatch_dbl_dsp_select_cpu, -+ .exit = select_cpu_dispatch_dbl_dsp_exit, -+ .name = "select_cpu_dispatch_dbl_dsp", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c -new file mode 100644 -index 000000000000..48ff028a3c46 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_dispatch_dbl_dsp.c -@@ -0,0 +1,56 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2023 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2023 David Vernet -+ * Copyright (c) 2023 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_dispatch_dbl_dsp.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel; -+ -+ skel = select_cpu_dispatch_dbl_dsp__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel = ctx; -+ struct bpf_link *link; -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_dispatch_dbl_dsp_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_EQ(skel->data->uei.kind, EXIT_KIND(SCX_EXIT_ERROR)); -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_dispatch_dbl_dsp *skel = ctx; -+ -+ select_cpu_dispatch_dbl_dsp__destroy(skel); -+} -+ -+struct scx_test select_cpu_dispatch_dbl_dsp = { -+ .name = "select_cpu_dispatch_dbl_dsp", -+ .description = "Verify graceful failure if we dispatch twice to a " -+ "DSQ in ops.select_cpu()", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_dispatch_dbl_dsp) -diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c -new file mode 100644 -index 000000000000..7f3ebf4fc2ea ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.bpf.c -@@ -0,0 +1,92 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * A scheduler that validates that enqueue flags are properly stored and -+ * applied at dispatch time when a task is directly dispatched from -+ * ops.select_cpu(). We validate this by using scx_bpf_dispatch_vtime(), and -+ * making the test a very basic vtime scheduler. -+ * -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+ -+#include -+ -+char _license[] SEC("license") = "GPL"; -+ -+volatile bool consumed; -+ -+static u64 vtime_now; -+ -+#define VTIME_DSQ 0 -+ -+static inline bool vtime_before(u64 a, u64 b) -+{ -+ return (s64)(a - b) < 0; -+} -+ -+static inline u64 task_vtime(const struct task_struct *p) -+{ -+ u64 vtime = p->scx.dsq_vtime; -+ -+ if (vtime_before(vtime, vtime_now - SCX_SLICE_DFL)) -+ return vtime_now - SCX_SLICE_DFL; -+ else -+ return vtime; -+} -+ -+s32 BPF_STRUCT_OPS(select_cpu_vtime_select_cpu, struct task_struct *p, -+ s32 prev_cpu, u64 wake_flags) -+{ -+ s32 cpu; -+ -+ cpu = scx_bpf_pick_idle_cpu(p->cpus_ptr, 0); -+ if (cpu >= 0) -+ goto ddsp; -+ -+ cpu = prev_cpu; -+ scx_bpf_test_and_clear_cpu_idle(cpu); -+ddsp: -+ scx_bpf_dispatch_vtime(p, VTIME_DSQ, SCX_SLICE_DFL, task_vtime(p), 0); -+ return cpu; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_dispatch, s32 cpu, struct task_struct *p) -+{ -+ if (scx_bpf_consume(VTIME_DSQ)) -+ consumed = true; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_running, struct task_struct *p) -+{ -+ if (vtime_before(vtime_now, p->scx.dsq_vtime)) -+ vtime_now = p->scx.dsq_vtime; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_stopping, struct task_struct *p, -+ bool runnable) -+{ -+ p->scx.dsq_vtime += (SCX_SLICE_DFL - p->scx.slice) * 100 / p->scx.weight; -+} -+ -+void BPF_STRUCT_OPS(select_cpu_vtime_enable, struct task_struct *p) -+{ -+ p->scx.dsq_vtime = vtime_now; -+} -+ -+s32 BPF_STRUCT_OPS_SLEEPABLE(select_cpu_vtime_init) -+{ -+ return scx_bpf_create_dsq(VTIME_DSQ, -1); -+} -+ -+SEC(".struct_ops.link") -+struct sched_ext_ops select_cpu_vtime_ops = { -+ .select_cpu = select_cpu_vtime_select_cpu, -+ .dispatch = select_cpu_vtime_dispatch, -+ .running = select_cpu_vtime_running, -+ .stopping = select_cpu_vtime_stopping, -+ .enable = select_cpu_vtime_enable, -+ .init = select_cpu_vtime_init, -+ .name = "select_cpu_vtime", -+ .timeout_ms = 1000U, -+}; -diff --git a/tools/testing/selftests/sched_ext/select_cpu_vtime.c b/tools/testing/selftests/sched_ext/select_cpu_vtime.c -new file mode 100644 -index 000000000000..b4629c2364f5 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/select_cpu_vtime.c -@@ -0,0 +1,59 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. 
-+ * Copyright (c) 2024 David Vernet -+ * Copyright (c) 2024 Tejun Heo -+ */ -+#include -+#include -+#include -+#include -+#include "select_cpu_vtime.bpf.skel.h" -+#include "scx_test.h" -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ struct select_cpu_vtime *skel; -+ -+ skel = select_cpu_vtime__open_and_load(); -+ SCX_FAIL_IF(!skel, "Failed to open and load skel"); -+ *ctx = skel; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ struct select_cpu_vtime *skel = ctx; -+ struct bpf_link *link; -+ -+ SCX_ASSERT(!skel->bss->consumed); -+ -+ link = bpf_map__attach_struct_ops(skel->maps.select_cpu_vtime_ops); -+ SCX_FAIL_IF(!link, "Failed to attach scheduler"); -+ -+ sleep(1); -+ -+ SCX_ASSERT(skel->bss->consumed); -+ -+ bpf_link__destroy(link); -+ -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup(void *ctx) -+{ -+ struct select_cpu_vtime *skel = ctx; -+ -+ select_cpu_vtime__destroy(skel); -+} -+ -+struct scx_test select_cpu_vtime = { -+ .name = "select_cpu_vtime", -+ .description = "Test doing direct vtime-dispatching from " -+ "ops.select_cpu(), to a non-built-in DSQ", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&select_cpu_vtime) -diff --git a/tools/testing/selftests/sched_ext/test_example.c b/tools/testing/selftests/sched_ext/test_example.c -new file mode 100644 -index 000000000000..ce36cdf03cdc ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/test_example.c -@@ -0,0 +1,49 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 Tejun Heo -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include "scx_test.h" -+ -+static bool setup_called = false; -+static bool run_called = false; -+static bool cleanup_called = false; -+ -+static int context = 10; -+ -+static enum scx_test_status setup(void **ctx) -+{ -+ setup_called = true; -+ *ctx = &context; -+ -+ return SCX_TEST_PASS; -+} -+ -+static enum scx_test_status run(void *ctx) -+{ -+ int *arg = ctx; -+ -+ SCX_ASSERT(setup_called); -+ SCX_ASSERT(!run_called && !cleanup_called); -+ SCX_EQ(*arg, context); -+ -+ run_called = true; -+ return SCX_TEST_PASS; -+} -+ -+static void cleanup (void *ctx) -+{ -+ SCX_BUG_ON(!run_called || cleanup_called, "Wrong callbacks invoked"); -+} -+ -+struct scx_test example = { -+ .name = "example", -+ .description = "Validate the basic function of the test suite itself", -+ .setup = setup, -+ .run = run, -+ .cleanup = cleanup, -+}; -+REGISTER_SCX_TEST(&example) -diff --git a/tools/testing/selftests/sched_ext/util.c b/tools/testing/selftests/sched_ext/util.c -new file mode 100644 -index 000000000000..e47769c91918 ---- /dev/null -+++ b/tools/testing/selftests/sched_ext/util.c -@@ -0,0 +1,71 @@ -+/* SPDX-License-Identifier: GPL-2.0 */ -+/* -+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates. -+ * Copyright (c) 2024 David Vernet -+ */ -+#include -+#include -+#include -+#include -+#include -+#include -+ -+/* Returns read len on success, or -errno on failure. */ -+static ssize_t read_text(const char *path, char *buf, size_t max_len) -+{ -+ ssize_t len; -+ int fd; -+ -+ fd = open(path, O_RDONLY); -+ if (fd < 0) -+ return -errno; -+ -+ len = read(fd, buf, max_len - 1); -+ -+ if (len >= 0) -+ buf[len] = 0; -+ -+ close(fd); -+ return len < 0 ? -errno : len; -+} -+ -+/* Returns written len on success, or -errno on failure. 
*/
-+static ssize_t write_text(const char *path, char *buf, ssize_t len)
-+{
-+	int fd;
-+	ssize_t written;
-+
-+	fd = open(path, O_WRONLY | O_APPEND);
-+	if (fd < 0)
-+		return -errno;
-+
-+	written = write(fd, buf, len);
-+	close(fd);
-+	return written < 0 ? -errno : written;
-+}
-+
-+long file_read_long(const char *path)
-+{
-+	char buf[128];
-+
-+
-+	if (read_text(path, buf, sizeof(buf)) <= 0)
-+		return -1;
-+
-+	return atol(buf);
-+}
-+
-+int file_write_long(const char *path, long val)
-+{
-+	char buf[64];
-+	int ret;
-+
-+	ret = sprintf(buf, "%lu", val);
-+	if (ret < 0)
-+		return ret;
-+
-+	if (write_text(path, buf, sizeof(buf)) <= 0)
-+		return -1;
-+
-+	return 0;
-+}
-diff --git a/tools/testing/selftests/sched_ext/util.h b/tools/testing/selftests/sched_ext/util.h
-new file mode 100644
-index 000000000000..bc13dfec1267
---- /dev/null
-+++ b/tools/testing/selftests/sched_ext/util.h
-@@ -0,0 +1,13 @@
-+/* SPDX-License-Identifier: GPL-2.0 */
-+/*
-+ * Copyright (c) 2024 Meta Platforms, Inc. and affiliates.
-+ * Copyright (c) 2024 David Vernet
-+ */
-+
-+#ifndef __SCX_TEST_UTIL_H__
-+#define __SCX_TEST_UTIL_H__
-+
-+long file_read_long(const char *path);
-+int file_write_long(const char *path, long val);
-+
-+#endif // __SCX_TEST_H__
---
-2.45.1.145.g83f1add914
-
diff --git a/sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip b/sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip
new file mode 100644
index 0000000..ea85a13
--- /dev/null
+++ b/sys-kernel/scx/REVERT-scx-1.0.14-builtin-preserve-enum-value.patch.skip
@@ -0,0 +1,13 @@
+Taken from https://github.com/sched-ext/scx/commit/7d9b2cc26473526883297df78e8eee3f2e7b6194.
+
+--- a/lib/scxtest/overrides.h
++++ b/lib/scxtest/overrides.h
+@@ -13,7 +13,7 @@
+ * that we want to get rid of that belongs here.
+ */
+ #define __builtin_preserve_field_info(x,y) 1
+-#define __builtin_preserve_enum_value(x,y) 1
++#define __builtin_preserve_enum_value(x,y,z) 1
+ 
+ #define bpf_addr_space_cast(var, dst_as, src_as)
+ 
-- 
2.53.0